#!/usr/bin/env python

"""Tool for finding dupes and orphans in iTunes music library"""

import sys
from optparse import OptionParser
import logging as log
import os
from xml.parsers import expat
import urllib
from urlparse import urlparse
import re
from collections import defaultdict

# Module metadata. __version__ is what the command-line parser reports for
# --version; __all__ restricts star-imports of this module to ITunesLibrary.
__version__ = '0.2'
__author__ = 'cj_ <cjones@gruntle.org>'
__all__ = ['ITunesLibrary']

# Default settings. These seed the defaults of the command-line options and
# of the ITunesLibrary.find_dupes()/find_orphans() keyword arguments.

# Require the artist / album to match before two tracks count as dupes.
MATCH_ARTIST = False
MATCH_ALBUM = False

# Which checks run when the matching option is not given.
ORPHANS = False
DUPES = False

# Log every dupe/orphan found (True) or only the totals (False).
REPORT = True

# Default library location.  expanduser() is used rather than
# os.environ['HOME'] so that importing this module does not raise KeyError
# when HOME is not set; the resulting path is the same when it is.
LIBRARY = os.path.join(os.path.expanduser('~'),
                       'Music/iTunes/iTunes Music Library.xml')

# Logging defaults.  LOGFORMAT is unpacked into logging.Formatter as
# (fmt, datefmt); LOGSTREAM always receives messages, LOGFILE only if set.
LOGLEVEL = log.INFO
LOGFORMAT = ('', '')
LOGSTREAM = sys.stdout
LOGFILE = None

class PropertyList(object):

    """Minimal expat-based reader for XML property lists.

    The whole document is parsed in the constructor; afterwards the
    ``struct`` property holds the root container, built from plain dicts
    and lists.  ``<integer>`` becomes int, ``<real>`` becomes float,
    ``<true/>``/``<false/>`` become bools, and every other scalar
    (``string``, ``date``, ``data``) is kept as its text content.

    Text is encoded to ASCII with non-ASCII characters replaced by ``?``,
    so keys and string values are byte strings.
    """

    __slots__ = ['stack', 'indata', 'buffer', 'key']
    # Container elements and the Python type used to hold each.
    _structs = {'dict': dict, 'array': list}
    # Elements whose character data is collected.
    _datatypes = ('key', 'integer', 'real', 'string', 'date', 'data')
    # Empty elements that stand for a fixed value.
    _statics = {'true': True, 'false': False}
    # Scalar elements that are converted; the rest stay as text.
    _converters = {'integer': int, 'real': float}

    def __init__(self, file):
        """Parse the property list read from ``file``.

        ``file`` is an open file object, handed to expat's ParseFile().
        Raises ValueError when parsing did not leave exactly one root
        container (for example a document with no dict or array at all).
        """
        self.stack = None       # open containers, innermost first
        self.indata = False     # inside an element listed in _datatypes
        self.buffer = []        # character data chunks of that element
        self.key = None         # most recent <key>, used for dict values
        parser = expat.ParserCreate()
        parser.StartElementHandler = self.on_start_element
        parser.EndElementHandler = self.on_end_element
        parser.CharacterDataHandler = self.on_data
        parser.ParseFile(file)
        # An explicit check rather than an assert: it still runs under -O
        # and copes with a document that never opened a container.
        if not self.stack or len(self.stack) != 1:
            raise ValueError('truncated property list')

    def on_start_element(self, name, attrs):
        """expat callback: open a container or note a scalar/static."""
        if name in self._structs:
            val = self._structs[name]()
            if not self.stack:
                # First container is the root.  It is stored here and
                # pushed again below, so it is still on the stack after
                # its own end tag pops one copy.
                self.stack = [val]
            else:
                self.set_value(val)
            self.stack.insert(0, val)
        elif name in self._datatypes:
            self.indata = True
        elif name in self._statics:
            self.set_value(self._statics[name])

    def on_end_element(self, name):
        """expat callback: finish a scalar or close a container."""
        if name in self._datatypes:
            data = ''.join(self.buffer).encode('ascii', 'replace')
            self.indata = False
            self.buffer = []
            if name == 'key':
                self.key = data
            else:
                convert = self._converters.get(name)
                if convert is not None:
                    data = convert(data)
                self.set_value(data)
        elif name in self._structs:
            self.stack.pop(0)

    def on_data(self, data):
        """expat callback: collect text while inside a scalar element."""
        if self.indata:
            self.buffer.append(data)

    def set_value(self, val):
        """Store ``val`` in the innermost open container.

        Dicts store it under the most recent key; lists append it.
        """
        if isinstance(self.struct, dict):
            self.struct[self.key] = val
        elif isinstance(self.struct, list):
            self.struct.append(val)

    @property
    def struct(self):
        """Innermost open container; the root once parsing has finished."""
        return self.stack[0]


class ITunesLibrary(object):

    """iTunes library interface

    Loads an iTunes XML library file and reports, through the logging
    module, possible duplicate tracks and files under the library's music
    folder that no track points at.
    """

    # File names that are never reported as orphans.
    _ignore = ('.DS_Store',)

    # Ordered (pattern, replacement) rewrites applied by _normalize() to a
    # lower-cased title.  Later rules see the output of earlier ones, so the
    # order matters.
    _norm_map = [(r'(.+)\.([^.]{3})$', r'\1'),
                 (r'[_-]', ' '), (r"[^a-z0-9 '&]", ''),
                 (r'(?:^|\s)(the|an|a)(?:\s|$)', ''), (r'\s+', ''),
                 (r'^\d+([^0-9])', r'\1'), ("'s", 'is'), ("'re", 'are'),
                 ("don't", 'donot'), ("i'll", 'iwill'), ("i'm", 'im'),
                 ("'ll", 'all'), ("'t", 'not'), ("'d", 'did'), ("'ve", 'have'),
                 ("(n'|&)", 'and'), ('ing', 'in'), ('lil', 'little'),
                 ('through', 'thru'), ('wanna', 'wantto'), ('about', 'bout'),
                 ('and?', 'n'), ("'", ''), (r'\s+', '')]
    _norm_map = [(re.compile(s), r) for s, r in _norm_map]

    def __init__(self, path):
        """Parse the library file at ``path`` into ``self.library``.

        Raises AssertionError if ``path`` is not readable.
        """
        # NOTE: asserts are stripped under -O; open() below then reports an
        # unreadable file instead.
        assert os.access(path, os.R_OK), 'library unreadable'
        log.debug('parsing %s', path)
        handle = open(path, 'rb')
        try:
            self.library = PropertyList(handle).struct
        finally:
            handle.close()
        log.debug('loaded %s tracks', len(self.tracks))

    def find_dupes(self, match_album=MATCH_ALBUM, match_artist=MATCH_ARTIST,
                   report=REPORT):
        """Finds possible duplicate songs in library

        Tracks are grouped by their normalized name.  With ``match_album``
        and/or ``match_artist`` set, the normalized album/artist must match
        as well.  The number of groups is logged at INFO level; when
        ``report`` is true each group's file locations are logged too.
        Tracks without a 'Name' are skipped.
        """
        log.debug('looking for possible dupes')
        seen = defaultdict(list)
        for track in self.tracks.values():
            if 'Name' not in track:
                # Without a title there is nothing to compare on.
                continue
            name = self._normalize(track['Name'])
            album = artist = ''
            if match_album and 'Album' in track:
                album = self._normalize(track['Album'])
            if match_artist and 'Artist' in track:
                artist = self._normalize(track['Artist'])
            # Remove album/artist text embedded in the title itself.
            # Replacing an empty string leaves the name untouched.
            name = name.replace(album, '').replace(artist, '')
            # A tuple keeps the three parts distinct; concatenating them
            # could make tracks with different splits collide.
            seen[(artist, album, name)].append(track)

        dupes = [item for item in seen.items() if len(item[1]) > 1]
        dupes.sort(key=lambda item: item[0])
        log.info('found %s possible dupes', len(dupes))
        if report:
            log.info('possible dupes:')
            for _key, tracks in dupes:
                for track in tracks:
                    if 'Location' in track:
                        log.info(self._urlpath(track['Location']))
                    else:
                        # No path to show; identify the track by its title.
                        log.info('%s (no file location)', track['Name'])
                log.info('---')

    def find_orphans(self, report=REPORT):
        """Finds possible orphaned files inside iTunes music directory

        Walks the library's 'Music Folder' and drops every file that some
        track's 'Location' refers to; paths are compared lower-cased.  The
        number of remaining files is logged at INFO level, and the files
        themselves when ``report`` is true.
        """
        music_dir = self._urlpath(self.library['Music Folder'])
        log.debug('scanning %s', music_dir)
        # Lower-cased path -> path as found on disk.
        files = dict((path.lower(), path)
                     for path, filename in self._walk(music_dir)
                     if filename not in self._ignore)
        log.debug('found %s files', len(files))

        log.debug('looking for possible orphans')
        for track in self.tracks.values():
            location = track.get('Location')
            if location is None:
                # Presumably a track with no local file; it cannot account
                # for anything on disk.
                continue
            files.pop(self._urlpath(location).lower(), None)
        log.info('found %s possible orphans', len(files))

        if report:
            log.info('possible orphans:')
            for path in sorted(files.values()):
                log.info(path)

    @property
    def tracks(self):
        """The library's 'Tracks' mapping."""
        return self.library['Tracks']

    @classmethod
    def _normalize(cls, name):
        """Lower-case ``name`` and apply every _norm_map rewrite in order."""
        name = name.lower()
        for pattern, replace in cls._norm_map:
            name = pattern.sub(replace, name)
        return name

    @staticmethod
    def _walk(dir):
        """Yield (full path, file name) for every file below ``dir``."""
        for basedir, subdirs, filenames in os.walk(dir):
            for filename in filenames:
                yield os.path.join(basedir, filename), filename

    @staticmethod
    def _urlpath(url):
        """Return the percent-decoded path component of ``url``."""
        return urllib.unquote(urlparse(url).path)

    def __repr__(self):
        return public_attrs(self)


def public_attrs(obj):
    """Describe ``obj`` by class name, id and its non-underscore attributes."""
    visible = {}
    for name, value in obj.__dict__.items():
        # Attributes whose name starts with an underscore are left out.
        if name[0] != '_':
            visible[name] = value
    class_name = obj.__class__.__name__
    return '<%s object at 0x%x: %s>' % (class_name, id(obj), visible)

def parse_args(argv=None):
    """Parse command-line options and set up logging.

    ``argv`` is the list of arguments to parse; when None, optparse falls
    back to ``sys.argv[1:]``.

    Returns the optparse values object with the attributes ``dupes``,
    ``orphans``, ``album``, ``artist``, ``library``, ``countonly``,
    ``loglevel`` and ``logfile``.

    Prints the help text and exits through ``parser.error()`` when extra
    positional arguments are given or neither -d nor -o is selected.  As a
    side effect, handlers for LOGSTREAM (and the log file, if requested)
    are attached to the root logger and its level is set.
    """
    parser = OptionParser(version=__version__)
    countonly = not REPORT

    def toggle(default):
        """Return the optparse action that flips a boolean default."""
        if default:
            return 'store_false'
        return 'store_true'

    parser.add_option(
            '-d', '--dupes', action=toggle(DUPES), default=DUPES,
            help='show possible duplicate media (default: %default)')
    parser.add_option(
            '-o', '--orphans', action=toggle(ORPHANS), default=ORPHANS,
            help='show orphaned files in music dir (default: %default)')
    parser.add_option(
            '-a', '--album', action=toggle(MATCH_ALBUM), default=MATCH_ALBUM,
            help='only show dupes if their album matches (default: %default)')
    parser.add_option(
            '-A', '--artist', action=toggle(MATCH_ARTIST), default=MATCH_ARTIST,
            help='only show dupes if artist matches (default: %default)')
    parser.add_option(
            '-l', '--library', default=LIBRARY,
            help='itunes library (default: %default)', metavar='FILE')
    parser.add_option(
            '-c', '--countonly', action=toggle(countonly), default=countonly,
            help='only show count of dupes/orphans (default: %default)')
    parser.add_option(
            '-D', '--debug', action='store_const', dest='loglevel',
            default=LOGLEVEL, const=log.DEBUG, help='enable debugging messages')
    parser.add_option(
            '-L', '--logfile', default=LOGFILE,
            help='log messages to file (default: %default)', metavar='FILE')
    opts, args = parser.parse_args(argv)

    if args:
        parser.print_help()
        parser.error('invalid extra args')
    if not (opts.dupes or opts.orphans):
        parser.print_help()
        parser.error('you must specify -o or -d')

    # Every handler shares one formatter built from LOGFORMAT.
    handlers = [log.StreamHandler(LOGSTREAM)]
    if opts.logfile:
        handlers.append(log.FileHandler(opts.logfile))
    formatter = log.Formatter(*LOGFORMAT)
    for handler in handlers:
        handler.setFormatter(formatter)
        log.root.addHandler(handler)
    log.root.setLevel(opts.loglevel)

    return opts

def main():
    """Run the checks selected on the command line and return status 0."""
    options = parse_args()
    itunes = ITunesLibrary(path=options.library)
    # --countonly suppresses the per-item listing and leaves just totals.
    show_details = not options.countonly
    if options.dupes:
        itunes.find_dupes(
            match_album=options.album,
            match_artist=options.artist,
            report=show_details)
    if options.orphans:
        itunes.find_orphans(report=show_details)
    return 0

# Script entry point: run main() and use its return value as the exit status.
if __name__ == '__main__':
    sys.exit(main())
