#!/usr/bin/env python
"""Tool for finding dupes and orphans in iTunes music library"""

import sys
from optparse import OptionParser
import logging as log
import os
from xml.parsers import expat
import urllib
import re
from collections import defaultdict

# ``urlparse``/``unquote`` moved into ``urllib.parse`` in Python 3; import
# from whichever location exists so the tool runs on both major versions.
try:
    from urllib.parse import unquote, urlparse
except ImportError:
    from urllib import unquote
    from urlparse import urlparse

__version__ = '0.2'
__author__ = 'cj_ '
__all__ = ['ITunesLibrary']

# Defaults for the command line options and the ITunesLibrary methods.
MATCH_ARTIST = False
MATCH_ALBUM = False
ORPHANS = False
DUPES = False
REPORT = True
# expanduser() is used instead of os.environ['HOME'] so that importing this
# module does not raise KeyError when HOME is not set.
LIBRARY = os.path.join(os.path.expanduser('~'),
                       'Music/iTunes/iTunes Music Library.xml')
LOGLEVEL = log.INFO
# Passed positionally to logging.Formatter in parse_args().
LOGFORMAT = ('', '')
LOGSTREAM = sys.stdout
LOGFILE = None


def _native_str(text):
    """Return *text* as an ASCII-only native ``str``.

    Characters outside ASCII are replaced by the codec's ``'replace'``
    handler.  On Python 2 the encoded byte string already is ``str``; on
    Python 3 it is decoded again so that dictionary keys and values stay
    comparable with ordinary string literals.
    """
    encoded = text.encode('ascii', 'replace')
    if isinstance(encoded, str):
        return encoded
    return encoded.decode('ascii')


class PropertyList(object):

    """Parse an XML property list into nested dicts and lists.

    ``dict`` and ``array`` elements become ``dict`` and ``list`` objects,
    ``integer`` elements become ``int``, ``true``/``false`` become booleans
    and the remaining data elements (``string``, ``date``, ``data``) are kept
    as ASCII strings.  The parsed root container is available as ``struct``.
    """

    __slots__ = ['stack', 'indata', 'buffer', 'key']

    _structs = {'dict': dict, 'array': list}
    _datatypes = ('key', 'integer', 'string', 'date', 'data')
    _statics = {'true': True, 'false': False}

    def __init__(self, file):
        """Parse the property list read from the binary file object *file*.

        Raises AssertionError if no complete root container was parsed.
        """
        self.stack = None
        self.indata = False
        self.buffer = []
        self.key = None
        parser = expat.ParserCreate()
        parser.StartElementHandler = self.on_start_element
        parser.EndElementHandler = self.on_end_element
        parser.CharacterDataHandler = self.on_data
        parser.ParseFile(file)
        # Raised explicitly (not via ``assert``) so the check survives
        # ``python -O``; AssertionError is kept for backward compatibility.
        # ``not self.stack`` also covers input without any dict/array, where
        # the stack is still None.
        if not self.stack or len(self.stack) != 1:
            raise AssertionError('truncated property list')

    def on_start_element(self, name, attrs):
        """Open a container, start buffering text, or store a boolean."""
        if name in self._structs:
            val = self._structs[name]()
            if not self.stack:
                # First container: it is the root.  It stays at the bottom
                # of the stack after its closing tag pops the copy pushed
                # below, which is what leaves exactly one entry at the end.
                self.stack = [val]
            else:
                self.set_value(val)
            self.stack.insert(0, val)
        elif name in self._datatypes:
            self.indata = True
        elif name in self._statics:
            self.set_value(self._statics[name])

    def on_end_element(self, name):
        """Finish a data element or close the current container."""
        if name in self._datatypes:
            data = _native_str(''.join(self.buffer))
            self.indata = False
            self.buffer = []
            if name == 'key':
                # Remember the key until its value element arrives.
                self.key = data
            else:
                if name == 'integer':
                    data = int(data)
                self.set_value(data)
        elif name in self._structs:
            self.stack.pop(0)

    def on_data(self, data):
        """Collect character data while inside a data element."""
        if self.indata:
            self.buffer.append(data)

    def set_value(self, val):
        """Store *val* in the current container (by key or by appending)."""
        current = self.struct
        if isinstance(current, dict):
            current[self.key] = val
        elif isinstance(current, list):
            current.append(val)

    @property
    def struct(self):
        """The container currently being filled; the root after parsing."""
        return self.stack[0]


class ITunesLibrary(object):

    """iTunes library interface"""

    # File names that are never reported as orphans.
    _ignore = ('.DS_Store',)

    # Ordered (pattern, replacement) pairs applied by _normalize().  Order
    # matters: later rules operate on the output of earlier ones.
    _norm_map = [(r'(.+)\.([^.]{3})$', r'\1'),
                 ('[_-]', ' '),
                 ("[^a-z0-9 '&]", ''),
                 (r'(?:^|\s)(the|an|a)(?:\s|$)', ''),
                 (r'\s+', ''),
                 (r'^\d+([^0-9])', r'\1'),
                 ("'s", 'is'),
                 ("'re", 'are'),
                 ("don't", 'donot'),
                 ("i'll", 'iwill'),
                 ("i'm", 'im'),
                 ("'ll", 'all'),
                 ("'t", 'not'),
                 ("'d", 'did'),
                 ("'ve", 'have'),
                 ("(n'|&)", 'and'),
                 ('ing', 'in'),
                 ('lil', 'little'),
                 ('through', 'thru'),
                 ('wanna', 'wantto'),
                 ('about', 'bout'),
                 ('and?', 'n'),
                 ("'", ''),
                 (r'\s+', '')]
    _norm_map = [(re.compile(s), r) for s, r in _norm_map]

    def __init__(self, path):
        """Load the library XML file at *path*.

        Raises AssertionError if the file is not readable.
        """
        # Raised explicitly (not via ``assert``) so the check survives
        # ``python -O``; AssertionError is kept for backward compatibility.
        if not os.access(path, os.R_OK):
            raise AssertionError('library unreadable')
        log.debug('parsing %s', path)
        with open(path, 'rb') as handle:
            self.library = PropertyList(handle).struct
        log.debug('loaded %s tracks', len(self.tracks))

    def find_dupes(self, match_album=MATCH_ALBUM, match_artist=MATCH_ARTIST,
                   report=REPORT):
        """Finds possible duplicate songs in library

        Tracks are grouped by their normalized name, optionally combined
        with the normalized album and/or artist.  The number of groups with
        more than one track is logged; when *report* is true the members of
        each group are logged too.
        """
        log.debug('looking for possible dupes')
        seen = defaultdict(list)
        for track in self.tracks.values():
            name = self._normalize(track['Name'])
            if match_album and 'Album' in track:
                album = self._normalize(track['Album'])
            else:
                album = ''
            if match_artist and 'Artist' in track:
                artist = self._normalize(track['Artist'])
            else:
                artist = ''
            # Drop album/artist text embedded in the track name so that
            # e.g. "Artist - Title" and "Title" produce the same key.
            name = name.replace(album, '')
            name = name.replace(artist, '')
            seen[artist + album + name].append(track)
        # key= replaces the cmp-function form, which Python 3 removed.
        dupes = sorted((item for item in seen.items() if len(item[1]) > 1),
                       key=lambda item: item[0])
        log.info('found %s possible dupes', len(dupes))
        if report:
            log.info('possible dupes:')
            for _key, tracks in dupes:
                for track in tracks:
                    location = track.get('Location')
                    if location is None:
                        # No file URL recorded for this track; identify it
                        # by name instead of failing with KeyError.
                        log.info(track['Name'])
                    else:
                        log.info(self._urlpath(location))
                log.info('---')

    def find_orphans(self, report=REPORT):
        """Finds possible orphaned files inside iTunes music directory

        Every file below the library's 'Music Folder' that no track's
        'Location' refers to (compared case-insensitively) counts as an
        orphan.  The count is logged; when *report* is true the paths are
        logged as well.
        """
        music_dir = self._urlpath(self.library['Music Folder'])
        log.debug('scanning %s', music_dir)
        # Map lower-cased path -> real path for case-insensitive matching.
        files = dict((path.lower(), path)
                     for path, filename in self._walk(music_dir)
                     if filename not in self._ignore)
        log.debug('found %s files', len(files))
        log.debug('looking for possible orphans')
        for track in self.tracks.values():
            location = track.get('Location')
            if location is None:
                # A track without a file URL cannot claim any file.
                continue
            files.pop(self._urlpath(location).lower(), None)
        log.info('found %s possible orphans', len(files))
        if report:
            log.info('possible orphans:')
            for path in sorted(files.values()):
                log.info(path)

    @property
    def tracks(self):
        """The library's 'Tracks' mapping."""
        return self.library['Tracks']

    @classmethod
    def _normalize(cls, name):
        """Return *name* lower-cased and reduced by the _norm_map rules."""
        name = name.lower()
        for pattern, replace in cls._norm_map:
            name = pattern.sub(replace, name)
        return name

    @staticmethod
    def _walk(dir):
        """Yield (full path, file name) for every file below *dir*."""
        for basedir, _subdirs, filenames in os.walk(dir):
            for filename in filenames:
                yield os.path.join(basedir, filename), filename

    @staticmethod
    def _urlpath(url):
        """Return the percent-decoded path component of *url*."""
        return unquote(urlparse(url).path)

    def __repr__(self):
        return public_attrs(self)


def public_attrs(obj):
    """Return a repr-style string listing *obj*'s non-underscore attributes."""
    public = dict((key, val) for key, val in obj.__dict__.items()
                  if not key.startswith('_'))
    return '<%s object at 0x%x: %s>' % (
        obj.__class__.__name__, id(obj), public)


def parse_args():
    """Parse the command line and configure logging; return the options.

    Exits through the option parser's error handling when extra positional
    arguments are given or when neither dupes nor orphans were requested.
    """
    parser = OptionParser(version=__version__)
    countonly = not REPORT

    def toggle(default):
        # Each flag flips its default: a False default gets 'store_true'
        # and a True default gets 'store_false'.
        return ('store_%s' % (not default)).lower()

    parser.add_option(
        '-d', '--dupes', action=toggle(DUPES), default=DUPES,
        help='show possible duplicate media (default: %default)')
    parser.add_option(
        '-o', '--orphans', action=toggle(ORPHANS), default=ORPHANS,
        help='show orphaned files in music dir (default: %default)')
    parser.add_option(
        '-a', '--album', action=toggle(MATCH_ALBUM), default=MATCH_ALBUM,
        help='only show dupes if their album matches (default: %default)')
    parser.add_option(
        '-A', '--artist', action=toggle(MATCH_ARTIST), default=MATCH_ARTIST,
        help='only show dupes if artist matches (default: %default)')
    parser.add_option(
        '-l', '--library', default=LIBRARY,
        help='itunes library (default: %default)', metavar='FILE')
    parser.add_option(
        '-c', '--countonly', action=toggle(countonly), default=countonly,
        help='only show count of dupes/orphans (default: %default)')
    parser.add_option(
        '-D', '--debug', action='store_const', dest='loglevel',
        default=LOGLEVEL, const=log.DEBUG,
        help='enable debugging messages')
    parser.add_option(
        '-L', '--logfile', default=LOGFILE,
        help='log messages to file (default: %default)', metavar='FILE')
    opts, args = parser.parse_args()

    if args:
        parser.print_help()
        parser.error('invalid extra args')
    if not (opts.dupes or opts.orphans):
        parser.print_help()
        parser.error('you must specify -o or -d')

    # Always log to LOGSTREAM; additionally to a file when requested.
    handlers = [log.StreamHandler(LOGSTREAM)]
    if opts.logfile:
        handlers.append(log.FileHandler(opts.logfile))
    formatter = log.Formatter(*LOGFORMAT)
    for handler in handlers:
        handler.setFormatter(formatter)
        log.root.addHandler(handler)
    log.root.setLevel(opts.loglevel)

    return opts


def main():
    """Command line entry point; returns the process exit status."""
    opts = parse_args()
    report = not opts.countonly
    library = ITunesLibrary(path=opts.library)
    if opts.dupes:
        library.find_dupes(match_album=opts.album,
                           match_artist=opts.artist,
                           report=report)
    if opts.orphans:
        library.find_orphans(report=report)
    return 0


if __name__ == '__main__':
    sys.exit(main())