#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
# OXDb Import client, crawls the filesystem and gathers information about
# movies
#
import os
import sys
import re
import urllib
import time

try:
  from hashlib import md5 as _new_md5
except ImportError:
  # Python < 2.5 only ships the stand-alone md5 module.
  import md5
  _new_md5 = md5.new

try:
  import simplejson
except ImportError:
  # The standard library json module exposes the same API.
  import json as simplejson

try:
  from scrapeit.utils import read_url
except ImportError:
  # Nothing in this file calls read_url directly, so the local helpers
  # (hashing, crawling, title clean-up) stay usable without scrapeit.
  read_url = None


class OXDb:
  '''Thin client for the OXDb backend of one archive.

  Every method forwards to self._remote_call(action[, params]) and picks one
  key out of the returned mapping.

  NOTE(review): _remote_call is not defined in this file; presumably it does
  the HTTP/JSON round trip (urllib, simplejson and read_url are imported but
  otherwise unused) -- confirm where it is supposed to come from.
  '''

  def __init__(self, archive):
    self.archive = archive

  def getBase(self):
    '''Fetch the archive base path from the backend.

    The path is stored in self.basePath and returned; a trailing '/' is
    appended when the backend did not send one.
    '''
    base = self._remote_call('base')['base']
    if not base.endswith('/'):
      base = base + '/'
    self.basePath = base
    return self.basePath

  def files(self):
    '''Return the 'files' entry of the backend's 'list' response.'''
    return self._remote_call('list')['files']

  def addFile(self, params):
    '''Send params with the 'add' action and return the 'result' entry.'''
    return self._remote_call('add', params)['result']

  def removeFile(self, params):
    '''Send params with the 'remove' action and return the 'result' entry.'''
    return self._remote_call('remove', params)['result']


def oxdb_md5sum(fname):
  '''Returns an md5 hash for file

  The result is the hex digest of the file content, or None if fname does
  not exist.
  '''
  if not os.path.exists(fname):
    return None
  digest = _new_md5()
  f = open(fname, 'rb')
  try:
    # read in small chunks so large movie files never sit in memory at once
    while True:
      chunk = f.read(8096)
      if not chunk:
        break
      digest.update(chunk)
  finally:
    f.close()
  return digest.hexdigest()


# File extensions picked up by oxdb_spider(); the comparison is case sensitive.
_oxdb_extensions = (
  '.avi', '.mov', '.ogg', '.ogm', '.mkv', '.mpg', '.wmv', '.mp4v', '.mp4',
  '.rm', '.mpeg', '.rmvb',
  '.mp3', '.wav',
  '.srt', '.sub', '.idx', '.rar',
  '.jpg', '.png',
)


def _oxdb_file_blacklist(f):
  '''Return True for file names the crawler must ignore (dot files).'''
  return f.startswith('.')


def oxdb_spider(archive_base):
  '''Collect the paths of all known media files below archive_base.

  os.walk() already descends into every sub directory, so each directory is
  visited exactly once and every matching file is listed exactly once.
  '''
  oxdb_files = []
  for root, dirs, files in os.walk(archive_base):
    for name in files:
      if _oxdb_file_blacklist(name):
        continue
      if os.path.splitext(name)[1] in _oxdb_extensions:
        oxdb_files.append(os.path.join(root, name))
  return oxdb_files


# Trailing filename parts that are never part of the title.
_known_oxdb_extensions = ['Interview']
_known_oxdb_extensions_reg = [
  r"\d\d\dx\d\d\d",
  r"S\d\dE\d\d",
  r"S\d\dE\d\d-E\d\d",
  r"Season .*",
  r"Episode .*",
  r"khz$",
]
# compiled once at import time instead of on every lookup
_known_oxdb_extensions_re = [
  re.compile(reg, re.IGNORECASE) for reg in _known_oxdb_extensions_reg
]


def _in_known_oxdb_extensions(term):
  '''
    used to remove parts that are known to not be part of the title

    Returns True when term is listed in _known_oxdb_extensions or when one of
    the _known_oxdb_extensions_reg patterns matches anywhere inside it
    (case insensitive).
  '''
  if term in _known_oxdb_extensions:
    return True
  for pattern in _known_oxdb_extensions_re:
    if pattern.search(term):
      return True
  return False


def oxdb_title(title):
  '''
    normalize filename to get movie title

    The last dot separated part (the file extension) is always dropped.  After
    that trailing parts are removed, but never the first one, as long as they
    are a known non-title part, start with 'Part ' or are exactly 2 or 4
    characters long (presumably language codes and years).  A name without
    any dot yields ''.
  '''
  parts = title.split('.')[:-1]
  while len(parts) > 1:
    last = parts[-1]
    if not (_in_known_oxdb_extensions(last)
            or last.startswith('Part ')
            or len(last) in (2, 4)):
      break
    parts.pop()
  return ".".join(parts)


def oxdb_import_files(archive):
  '''Synchronise the files on disk with the backend of archive.

  Files the backend already lists with the same size are skipped, other
  files are added (unless modified within the last 5 minutes) and backend
  entries whose md5sum was not seen on disk are removed.

  Returns a dict with the counters 'skipped', 'added' and 'remove'.
  '''
  stats = {'skipped': 0, 'added': 0, 'remove': 0}
  oxdb_backend = OXDb(archive)
  base = oxdb_backend.getBase()
  print(base)
  files = oxdb_spider(base)
  oxdb_files = oxdb_backend.files()
  md5sum_on_disk = set()
  for path in files:
    meta = oxdb_file_stats(path)
    # remove base: strip the leading prefix only, the same text may occur
    # again further down the path
    if path.startswith(base):
      rel_path = path[len(base):]
    else:
      rel_path = path
    if rel_path in oxdb_files and oxdb_files[rel_path]['size'] == meta['size']:
      stats['skipped'] += 1
      md5sum_on_disk.add(oxdb_files[rel_path]['md5sum'])
    else:
      meta = oxdb_file_metadata(meta)
      meta['path'] = rel_path.encode('utf-8')
      #ignore files changed in the last 5 minutes
      if time.time() - meta['date'] > 300:
        print('%s %s' % (oxdb_backend.addFile(meta), rel_path))
        stats['added'] += 1
      else:
        print('to hot, skipping for now %s' % (rel_path,))
      # remember the hash even when skipped, so the entry is not removed below
      md5sum_on_disk.add(meta['md5sum'])
  for name in oxdb_files:
    if oxdb_files[name]['md5sum'] not in md5sum_on_disk:
      print('remove %s' % (name,))
      oxdb_backend.removeFile({'md5sum': oxdb_files[name]['md5sum']})
      stats['remove'] += 1
  print(stats)
  return stats


def oxdb_file_stats(fname):
  '''Return a dict with 'path', 'size' (bytes) and 'date' (mtime) of fname.'''
  stat = os.stat(fname)
  return {
    'path': fname,
    'size': stat.st_size,
    'date': stat.st_mtime,
  }


def oxdb_file_metadata(meta):
  '''
    extend meta (as returned by oxdb_file_stats) with the remaining fields

    'video', 'audio', 'length' and 'bpp' are placeholders for now, 'md5sum'
    is computed from the file at meta['path'].  meta is changed in place and
    returned.
  '''
  meta['video'] = ''
  meta['audio'] = ''
  meta['length'] = 0
  meta['bpp'] = 0
  meta['md5sum'] = oxdb_md5sum(meta['path'])
  #FIXME: use midentify or other to get more info about file
  #TODO: title (oxdb_title of the file name) and director (name of the parent
  #      directory) are not stored yet
  return meta


# if invoked on command line, run the import.
if __name__ == '__main__':
  # oxdb_import_files() requires the archive to work on, so calling it
  # without arguments fails with a TypeError.
  # NOTE(review): the archive is taken from the first command line argument;
  # confirm that this is where it is meant to come from.
  if len(sys.argv) < 2:
    sys.exit('usage: %s ARCHIVE' % sys.argv[0])
  oxdb_import_files(sys.argv[1])