#!/usr/bin/python
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2012
from __future__ import division, with_statement
import datetime
import json
from optparse import OptionParser
import os
import ox
import re
import shutil
import sys
import unicodedata
FILES = {
    'config': 'pandoraclient.config.jsonc',
    'errors': 'pandoraclient.errors.json',
    'files': 'pandoraclient.files.%s.json',
    'organize': 'pandoraclient.organize.jsonc',
    'sync': 'pandoraclient.sync.jsonc'
}

DIRECTORIES = ['Extras', 'Segments', 'Versions']
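
# The config file is assumed to look roughly like this (shape inferred from
# get_config() and the code below, which only reads 'url' and 'volumes'):
#
# {
#     "url": "https://pandora.example.com/api/",
#     "volumes": {"myvolume": "/media/myvolume/Movies/"}
# }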

def copy():
    pass

def execute_copy():
    pass

def organize():
    def add_directory(path):
        # prepend the top-level directory whose sort range contains path
        d = directories[-1]
        for i, directory in enumerate(directories):
            if ox.sort_string(path).lower() < ox.sort_string(directory).lower():
                d = directories[max(i - 1, 0)]
                break
        return '%s/%s' % (d, path)
    def remove_directory(path):
        # strip the top-level directory from path
        return '/'.join(path.split('/')[1:])
    def is_system_file(filename):
        return re.search(r'^\.(_|DS_Store$)', filename) is not None
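    # Volume layout, as assumed throughout: paths are three directories deep,
    # <top-level>/<group>/<item>/<file>, with optional Extras/Segments/Versions
    # subdirectories inside each item directory.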
    config = get_config()
    volume = config['volumes'].keys()[0]
    volume_path = config['volumes'][volume]
    if isinstance(volume_path, str):
        volume_path = volume_path.decode('utf-8')
    if volume_path[-1] != '/':
        volume_path += '/'
    api = ox.API(config['url'])
    FILES['files'] = FILES['files'] % volume
    cache = {}
    if os.path.exists(FILES['files']):
        data = json.load(open(FILES['files']))
        if data['path'] == volume_path:
            for file in data['files']:
                cache['%s %s %s' % (file['path'], str(file['size']), file['time'])] = file['hash']
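    # The cache maps "path size mtime" to a previously computed hash, so only
    # new or modified files have to be re-hashed with ox.oshash() below.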
    directories_by_path = {}
    files = []
    files_by_hash = {}
    files_by_item = {}
    files_by_path = {}
    issues = {
        'duplicate files': [],
        'empty directories': [],
        'incorrect filenames': [],
        'incorrect filenames (target exists)': [],
        'missing files': [],
        'missing subtitles': [],
        'missing video': [],
        'multiple versions': [],
        'non-canonical filenames': [],
        'non-canonical filenames (target exists)': [],
        'system files': [],
        'unexpected directories': [],
        'unexpected files': [],
        'unknown extensions': []
    }
    previous_path = ''
    print 'Scanning %s' % volume_path
    directories = sorted(os.listdir(volume_path))
    for absolute_path, dirnames, filenames in os.walk(volume_path, followlinks=True):
        relative_path = unicodedata.normalize('NFD', absolute_path)[len(volume_path):]
        parts = relative_path.split('/')
        length = len(parts)
        for filename in ox.sorted_strings(filenames):
            full_path = os.path.join(absolute_path, filename)
            filename = unicodedata.normalize('NFD', filename)
            if relative_path != previous_path and length == 3:
                print relative_path
                previous_path = relative_path
            path = os.path.join(relative_path, filename)
            if is_system_file(filename):
                issues['system files'].append(path)
            else:
                # unexpected files, unexpected directories
                parts = path.split('/')
                if length < 3:
                    issues['unexpected files'].append(path)
                if length <= 3:
                    file = {}
                    file['path'] = path
                    file['size'] = os.path.getsize(full_path)
                    file['time'] = datetime.datetime.utcfromtimestamp(os.stat(full_path).st_mtime).isoformat()
                    key = '%s %s %s' % (file['path'], str(file['size']), file['time'])
                    file['hash'] = cache[key] if key in cache else ox.oshash(full_path)
                    files.append(file)
                    if not file['hash'] in files_by_hash:
                        files_by_hash[file['hash']] = []
                    files_by_hash[file['hash']].append(file)
                    if not relative_path in files_by_path:
                        files_by_path[relative_path] = []
                    files_by_path[relative_path].append(path)
                elif parts[3] in DIRECTORIES:
                    path_key = '/'.join(parts[:3])
                    if not path_key in directories_by_path:
                        directories_by_path[path_key] = []
                    directory = '/'.join(parts[:4])
                    if not directory in directories_by_path[path_key]:
                        directories_by_path[path_key].append(directory)
                else:
                    issues['unexpected directories'].append(path)
        # empty directories, missing files
        if not filenames or not filter(lambda x: not is_system_file(x), filenames):
            if not dirnames:
                issues['empty directories'].append(relative_path)
            elif length == 3:
                issues['missing files'].append(relative_path)
    print 'Scanning for duplicates'
    for hash in files_by_hash:
        if len(files_by_hash[hash]) > 1:
            issues['duplicate files'].append(sorted(map(lambda x: x['path'], files_by_hash[hash])))
    print 'Scanning for inconsistencies'
    paths = api.getPath({'id': files_by_hash.keys()})['data']
    for file in files:
        file = dict(file, **ox.movie.parse_path(remove_directory(file['path'])))
        file['normalizedPath'] = add_directory(file['normalizedPath'])
        # unknown extensions
        if not file['type']:
            issues['unknown extensions'].append(file['path'])
        path_key = 'normalizedPath'
        if file['hash'] in paths and paths[file['hash']] != remove_directory(file['path']):
            file['correctedPath'] = add_directory(paths[file['hash']])
            path_key = 'correctedPath'
        if len(file['path'].split('/')) == 4:
            file['item'] = '%s/%s' % ('/'.join(file[path_key].split('/')[:-1]), file['title'])
            if not file['item'] in files_by_item:
                files_by_item[file['item']] = []
            files_by_item[file['item']].append(file)
    for item in sorted(files_by_item):
        item_files = files_by_item[item]
        # missing video, missing subtitles, multiple versions
        versions = ox.movie.parse_item_files(item_files)
        main_files = sum([version['files'] for version in versions if version['isMainVersion']], [])
        other_files = sum([version['files'] for version in versions if not version['isMainVersion']], [])
        if not main_files:
            issues['missing video'].append([file['path'] for file in item_files])
        else:
            video_files = [file for file in main_files if file['type'] == 'video']
            subtitle_files = [file for file in main_files if file['type'] == 'subtitle']
            if subtitle_files and len(subtitle_files) < len(video_files):
                issues['missing subtitles'].append([file['path'] for file in main_files])
        if other_files:
            issues['multiple versions'].append([file['path'] for file in main_files + other_files])
        # incorrect filenames, non-canonical filenames
        for version in versions:
            path_key = {}
            for file in version['files']:
                path_key[file['path']] = 'correctedPath' if 'correctedPath' in file else 'normalizedPath'
            rename = [[
                file['path'], file[path_key[file['path']]]
            ] for file in version['files'] if file['path'] != file[path_key[file['path']]]]
            languages = {'idx': [], 'srt': [], 'sub': []}
            for extension in languages:
                languages[extension] = set([file['language'] for file in version['files'] if file['extension'] == extension])
                if len(languages[extension]) == 1 and 'en' in languages[extension]:
                    # only english subtitles
                    regexp = r'\.en(?=\.%s$)' % extension
                    # don't add '.en'
                    rename = [paths for paths in rename if not re.search(regexp, paths[1])]
                    # remove '.en'
                    rename += [[
                        file['path'], re.sub(regexp, '', file[path_key[file['path']]])
                    ] for file in version['files'] if re.search(regexp, file[path_key[file['path']]]) and file['path'] != re.sub(regexp, '', file[path_key[file['path']]])]
            for paths in rename:
                issues['%s filenames' % (
                    'incorrect' if path_key[paths[0]] == 'correctedPath' else 'non-canonical'
                )].append(paths)
    print 'Scanning for conflicts'
    existing_paths = [file['path'] for file in files]
    for key in ['incorrect filenames', 'non-canonical filenames']:
        exists_key = '%s (target exists)' % key
        path_count = {}
        for path in [paths[1] for paths in issues[key]]:
            path_count[path] = 1 if not path in path_count else path_count[path] + 1
        for paths in [paths for paths in issues[key] if paths[0].lower() != paths[1].lower()]:
            if path_count[paths[1]] > 1:
                # multiple files with the same target path
                issues[key].remove(paths)
                if not paths in issues[exists_key]:
                    issues[exists_key].append(paths)
            elif not paths[1] in existing_paths:
                # target path does not exist, remove original path from existing paths
                existing_paths.remove(paths[0])
        for paths in [paths for paths in issues[key] if paths[0].lower() != paths[1].lower()]:
            if paths[1] in existing_paths:
                # target path exists
                issues[key].remove(paths)
                if not paths in issues[exists_key]:
                    issues[exists_key].append(paths)
    # orphaned directories
    new_path = {}
    rename_key = {}
    for key in ['incorrect filenames', 'non-canonical filenames']:
        for paths in issues[key]:
            new_path[paths[0]] = '/'.join(paths[1].split('/')[:-1])
            if not new_path[paths[0]] in rename_key:
                rename_key[new_path[paths[0]]] = key
    for path in [path for path in files_by_path if path in directories_by_path]:
        new_paths = []
        for path_file in files_by_path[path]:
            if path_file in new_path:
                new_paths.append(new_path[path_file])
        if len(new_paths) == len(files_by_path[path]) and len(set(new_paths)) == 1 and new_paths[0] != path:
            # per path, if all files get moved to the same path, move directories too
            for directory in directories_by_path[path]:
                new_directory = os.path.join(new_paths[0], directory.split('/')[-1])
                key = rename_key[new_paths[0]]
                exists_key = '%s (target exists)' % key
                issues[exists_key if os.path.exists(os.path.join(volume_path, new_directory)) else key].append([directory, new_directory])
    for key in ['incorrect filenames', 'non-canonical filenames']:
        for key in [key, '%s (target exists)' % key]:
            issues[key] = sorted(issues[key], key=lambda x: x[0].lower())
    for issue in issues:
        if issues[issue]:
            if isinstance(issues[issue][0], basestring):
                issues[issue] = sorted(issues[issue])
            else:
                issues[issue] = sorted(issues[issue], key=lambda x: x[0])
    keys = {
        'automatic': {
            'remove': [
                'empty directories',
                'system files',
                'unexpected files',
                'unknown extensions'
            ],
            'rename': [
                'incorrect filenames',
                'non-canonical filenames'
            ]
        },
        'manual': {
            'rename': [
                'incorrect filenames (target exists)',
                'non-canonical filenames (target exists)'
            ],
            'resolve': [
                'duplicate files',
                'missing files',
                'missing subtitles',
                'missing video',
                'multiple versions',
                'unexpected directories'
            ]
        }
    }
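    # Only the 'automatic' section is applied by execute_organize() below;
    # 'manual' issues are written out for review and have to be resolved by hand.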
    print 'Writing %s' % FILES['files']
    data = ['{']
    data.append(4 * ' ' + '"directories": ' + get_json(directories) + ',')
    data.append(4 * ' ' + '"files": [')
    for f, file in enumerate(files):
        data.append(8 * ' ' + get_json({
            'hash': file['hash'],
            'path': file['path'],
            'size': file['size'],
            'time': file['time']
        }, sort_keys=True) + (',' if f < len(files) - 1 else ''))
    data.append(4 * ' ' + '],')
    data.append(4 * ' ' + '"path": ' + get_json(volume_path) + ',')
    data.append(4 * ' ' + '"totals": {"files": %d, "size": %d' % (
        len(files), sum([file['size'] for file in files])
    ) + '}')
    data.append('}')
    write_file(FILES['files'], u'\n'.join(data))
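    # The file written above has this shape:
    # {
    #     "directories": [...],
    #     "files": [{"hash": "...", "path": "...", "size": ..., "time": "..."}, ...],
    #     "path": "<volume path>",
    #     "totals": {"files": ..., "size": ...}
    # }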
    print 'Writing %s' % FILES['organize']
    data = ['{']
    sections = sorted(keys)
    for s, section in enumerate(sections):
        data.append('')
        data.append(4 * ' ' + '"%s": {' % section)
        actions = sorted(keys[section])
        for a, action in enumerate(actions):
            data.append('')
            data.append(8 * ' ' + '"%s": [' % action)
            issue_keys = keys[section][action]
            for i, issue in enumerate(issue_keys):
                data.append('')
                data.append(12 * ' ' + '/* %s */' % issue)
                if issues[issue]:
                    for line in get_json(issues[issue], indent=4).split('\n')[1:-1]:
                        data.append(8 * ' ' + line)
                    if i < len(issue_keys) - 1:
                        data[-1] += ','
            data.append('')
            data.append(8 * ' ' + ']' + (',' if a < len(actions) - 1 else ''))
        data.append('')
        data.append(4 * ' ' + '}' + (',' if s < len(sections) - 1 else ''))
    data.append('}')
    write_file(FILES['organize'], u'\n'.join(data))
    print 'Next, edit %s and run pandoraclient organize -x' % FILES['organize']

def execute_organize():
    def get_empty_directories():
        empty_directories = []
        for absolute_path, dirnames, filenames in os.walk(volume_path, followlinks=True):
            if not dirnames and not filenames:
                empty_directories.append(absolute_path)
        return empty_directories
    def remove_file(path):
        print 'Removing "%s"' % path
        try:
            if os.path.isdir(path):
                os.rmdir(path)
            else:
                os.remove(path)
        except:
            raise IOError('Could not remove file')
    def rename_file(source, target):
        print 'Renaming "%s" to "%s"' % (source, target)
        if not os.path.exists(source):
            raise IOError('Source does not exist')
        elif os.path.exists(target):
            raise IOError('Target exists')
        else:
            ox.makedirs(os.path.dirname(target))
            shutil.move(source, target)
    config = get_config()
    volume = config['volumes'].keys()[0]
    volume_path = config['volumes'][volume]
    if not os.path.exists(FILES['organize']):
        sys.exit('%s not found' % FILES['organize'])
    data = ox.jsonc.load(open(FILES['organize']))
    old_empty_directories = get_empty_directories()
    remove = map(lambda x: os.path.join(volume_path, x), data['automatic']['remove'])
    rename = map(lambda x: map(lambda y: os.path.join(volume_path, y), x), data['automatic']['rename'])
    errors = []
    for path in remove:
        try:
            remove_file(path)
        except:
            errors.append('Could not remove "%s"' % path)
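    # Renames are applied in two passes via a temporary '.pandora' suffix,
    # presumably so that case-only renames and swapped targets don't collide
    # before all sources have been moved out of the way.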
    for paths in rename:
        source = paths[0]
        target = paths[1] + '.pandora'
        try:
            rename_file(source, target)
        except IOError as error:
            errors.append('Could not rename "%s" to "%s" (%s)' % (source, target, error))
    for paths in rename:
        source = paths[1] + '.pandora'
        target = paths[1]
        try:
            rename_file(source, target)
        except IOError as error:
            errors.append('Could not rename "%s" to "%s" (%s)' % (source, target, error))
    while True:
        new_empty_directories = [path for path in get_empty_directories() if not path in old_empty_directories]
        if new_empty_directories:
            for path in new_empty_directories:
                try:
                    remove_file(path)
                except:
                    errors.append('Could not remove "%s"' % path)
        else:
            break
    for error in errors:
        print error

def sync():
    def get_directories(files):
        directories = {}
        for file in files:
            # path = os.path.dirname(file['path'])
            path = '/'.join(file['path'].split('/')[:3])
            if not path in directories:
                directories[path] = {'files': []}
            directories[path]['files'].append(file)
        for path in directories:
            directories[path]['hashes'] = '\n'.join(sorted(
                [file['hash'] for file in directories[path]['files']]
            ))
            directories[path]['paths'] = '\n'.join(sorted(
                [file['path'] for file in directories[path]['files']]
            ))
        return directories
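    # Files are grouped by item directory (the first three path components);
    # 'hashes' and 'paths' are sorted, newline-joined fingerprints used below to
    # find directories with identical content or identical filenames.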
    if len(sys.argv) != 3:
        print 'Usage: %s sync targetfiles.json' % sys.argv[0]
        sys.exit(1)
    print 'Reading files'
    config = get_config()
    volume = config['volumes'].keys()[0]
    source_file = FILES['files'] % volume
    target_file = sys.argv[2]
    files = [json.load(open(file))['files'] for file in [source_file, target_file]]
    directories = [get_directories(file) for file in files]
    file_path_by_hash = [{}, {}]
    hashes = [{}, {}]
    paths = [{}, {}]
    for i in [0, 1]:
        for file in files[i]:
            if file['hash'] in file_path_by_hash[i]:
                print 'Duplicate file:'
                print file_path_by_hash[i][file['hash']]
                print file['path']
                sys.exit()
            file_path_by_hash[i][file['hash']] = file['path']
        for path in sorted(directories[i]):
            directory = directories[i][path]
            hashes[i][directory['hashes']] = path
            paths[i][directory['paths']] = path
    print 'Determining files to sync'
    sync = {
        'same files, different filenames': [],
        'same file, different filename': [],
        'unique directory': [],
        'unique file': []
    }
    for path in sorted(directories[0]):
        directory = directories[0][path]
        file_hashes = [
            directory['hashes'].split('\n'),
            directories[1][path]['hashes'].split('\n') if path in directories[1] else []
        ]
        if directory['hashes'] in hashes[1]:
            # same files
            if not directory['paths'] in paths[1]:
                # different filenames
                rename = []
                for file in directory['files']:
                    if file['path'] != file_path_by_hash[1][file['hash']]:
                        rename.append([file['path'], file_path_by_hash[1][file['hash']]])
                sync['same files, different filenames'].append(rename)
            for hash in file_hashes[0]:
                for i in [0, 1]:
                    del file_path_by_hash[i][hash]
        else:
            for hash in file_hashes[0]:
                if not hash in file_hashes[1]:
                    if hash in file_path_by_hash[1]:
                        sync['same file, different filename'].append([
                            file_path_by_hash[0][hash],
                            file_path_by_hash[1][hash]
                        ])
                        del file_path_by_hash[0][hash]
                        del file_path_by_hash[1][hash]
                    else:
                        sync['unique file'].append([
                            file_path_by_hash[0][hash],
                            None
                        ])
                        del file_path_by_hash[0][hash]
            for hash in [hash for hash in file_hashes[1] if hash in file_path_by_hash[1]]:
                sync['unique file'].append([
                    None,
                    file_path_by_hash[1][hash]
                ])
    # disabled alternative matching logic:
    '''
    elif path in directories[1] and directory['hashes'] != directories[1][path]['hashes']:
        # same directory, different files
        file_hashes_1 = directories[1][path]['hashes'].split('\n')
        if file_hashes_1[0] in file_path_by_hash[1]:
            # directory in b not handled in previous step
            sync['same directory, different files'].append([
                [file['path'] for file in directory['files']],
                [file['path'] for file in directories[1][path]['files']]
            ])
            for hash in file_hashes:
                del file_path_by_hash[0][hash]
            for hash in file_hashes_1:
                del file_path_by_hash[1][hash]
    elif len(
        [hash for hash in file_hashes if hash in file_path_by_hash[0] and not hash in file_path_by_hash[1]]
    ) == len(file_hashes):
        sync['unique directory'].append([directory['files'], None])
        for hash in file_hashes:
            del file_path_by_hash[0][hash]
    for path in sorted(directories[1]):
        directory = directories[1][path]
        file_hashes = directory['hashes'].split('\n')
        if len(
            [hash for hash in file_hashes if hash in file_path_by_hash[1] and not hash in file_path_by_hash[0]]
        ) == len(file_hashes):
            sync['unique directory'].append([None, directory['paths']])
            for hash in file_hashes:
                del file_path_by_hash[1][hash]
    for hash in file_path_by_hash[0]:
        if hash in file_path_by_hash[1]:
            sync['same file, different filenames'].append(
                [file_path_by_hash[0][hash], file_path_by_hash[1][hash]]
            )
    '''
    print 'Writing %s' % FILES['sync']
    data = ['[']
    data.append('')
    for key in [
        'same files, different filenames',
        'same directory, different files',
        'same file, different filename',
        'unique directory',
        'unique file'
    ]:
        data.append('')
        data.append(4 * ' ' + '/* %s */' % key)
        # 'same directory, different files' is only produced by the disabled
        # matching logic above, so it defaults to an empty list here
        for paths in sync.get(key, []):
            data.append('')
            if key == 'same files, different filenames':
                data.append(4 * ' ' + '/* rename in b */')
                data.append(',\n'.join([4 * ' ' + get_json(['b', 'rename'] + x) for x in paths]) + ',')
                data.append(4 * ' ' + '/* rename in a */')
                data.append(',\n'.join([4 * ' ' + '// ' + get_json(['a', 'rename'] + list(reversed(x))) for x in paths]) + ',')
            elif key == 'same directory, different files':
                data.append(4 * ' ' + '/* remove in b, copy from a to b */')
                data.append(',\n'.join([4 * ' ' + get_json(['b', 'remove', x]) for x in paths[1]]) + ',')
                data.append(',\n'.join([4 * ' ' + get_json(['a', 'copy', x]) for x in paths[0]]) + ',')
                data.append(4 * ' ' + '/* remove in a, copy from b to a */')
                data.append(',\n'.join([4 * ' ' + '// ' + get_json(['a', 'remove', x]) for x in paths[0]]) + ',')
                data.append(',\n'.join([4 * ' ' + '// ' + get_json(['b', 'copy', x]) for x in paths[1]]) + ',')
            elif key == 'same file, different filename':
                data.append(4 * ' ' + '/* rename in b */')
                #print get_json(paths, indent=4).encode('utf-8')
                data.append(4 * ' ' + get_json(['b', 'rename'] + paths) + ',')
                data.append(4 * ' ' + '/* rename in a */')
                data.append(4 * ' ' + '// ' + get_json(['a', 'rename'] + list(reversed(paths))) + ',')
            else:
                # 'unique directory' entries hold lists of paths, 'unique file'
                # entries a single path (with None on the other side)
                copy = ['a', 'b'] if paths[0] else ['b', 'a']
                i = 0 if paths[0] else 1
                unique_paths = paths[i] if isinstance(paths[i], list) else [paths[i]]
                data.append(4 * ' ' + '/* copy from %s to %s */' % (copy[0], copy[1]))
                data.append(',\n'.join([4 * ' ' + get_json([copy[0], 'copy', x]) for x in unique_paths]) + ',')
                data.append(4 * ' ' + '/* remove in %s */' % copy[0])
                data.append(',\n'.join([4 * ' ' + '// ' + get_json([copy[0], 'remove', x]) for x in unique_paths]) + ',')
    data[-1] = data[-1][:-1]
    data.append('')
    data.append(']')
    write_file(FILES['sync'], u'\n'.join(data))
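    # Each entry in the file written above is an ["a"|"b", action, path(s)]
    # tuple, e.g. ["b", "rename", "old/path", "new/path"]; the commented-out
    # '//' lines are the inverse operations, to be enabled by hand if the sync
    # should go the other way.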

def execute_sync():
    pass

def update():
    pass

def execute_update():
    pass

def get_config():
    if not os.path.exists(FILES['config']):
        sys.exit('%s not found' % FILES['config'])
    with open(FILES['config']) as f:
        config = ox.jsonc.load(f)
    return config

def get_json(data, indent=None, sort_keys=False):
    # return json.dumps(data, ensure_ascii=False, indent=indent, sort_keys=sort_keys)
    return json.dumps(data, indent=indent, sort_keys=sort_keys)

def write_file(filename, data):
    with open(filename, 'w') as f:
        # f.write(data.encode('utf-8'))
        f.write(data)

if __name__ == '__main__':
    actions = ['copy', 'organize', 'sync', 'update']
    action_string = '%s or %s' % (', '.join(actions[:-1]), actions[-1])
    usage = 'usage: %prog action [volume] [options]'
    description = 'Action: %s' % action_string
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        '-e', '--encode', action='store_true', dest='encode',
        help='encode only, do not upload'
    )
    parser.add_option(
        '-v', '--verbose', action='store_true', dest='verbose',
        help='verbose output'
    )
    parser.add_option(
        '-x', '--execute', action='store_true', dest='execute',
        help='execute %s' % action_string
    )
    (opts, args) = parser.parse_args()
    if len(args) == 0:
        sys.exit(parser.get_usage())
    action = args[0]
    args = args[1:]
    if action == 'copy':
        if not opts.execute:
            copy()
        else:
            execute_copy()
    elif action == 'organize':
        if not opts.execute:
            organize()
        else:
            execute_organize()
    elif action == 'sync':
        if not opts.execute:
            sync()
        else:
            execute_sync()
    elif action == 'update':
        if not opts.execute:
            update()
        else:
            execute_update()
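
# Typical workflow, as suggested by the prompts above:
#     pandoraclient organize      # scan the volume, write pandoraclient.organize.jsonc
#     (edit pandoraclient.organize.jsonc)
#     pandoraclient organize -x   # apply the edited changes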