#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
# OXDb Import client, crawls the filesystem and gathers information about
# movies
#
import hashlib
import os
import sys
import re
import urllib

try:
  import md5  # legacy Python 2 module; hashing below goes through hashlib
except ImportError:
  md5 = None

try:
  from htmlentitydefs import name2codepoint
except ImportError:  # Python 3 location of the same table
  from html.entities import name2codepoint

try:
  from scrapeit.utils import read_url
except ImportError:
  # None of the helpers in this section call read_url, so a missing scrapeit
  # package should not stop them from being importable.
  read_url = None

try:
  _text_type = unicode
  _unichr = unichr
except NameError:  # Python 3: str is the unicode type
  _text_type = str
  _unichr = chr


_oxdb_file_blacklist = ['.DS_Store']
_oxdb_extensions = [
  '.avi', '.ogg', '.ogm', '.mkv', '.mpg', '.wmv', '.mp4v', '.mp4',
  '.srt', '.sub', '.idx',
]

_known_oxdb_extensions = ['Interview']
# Each entry is searched (not anchored) case-insensitively.  The comma after
# the "S01E02-E03" pattern used to be missing, which silently glued it to
# "Season .*" and made both patterns useless.
_known_oxdb_extensions_reg = [
  r"\d\d\dx\d\d\d",
  r"S\d\dE\d\d",
  r"S\d\dE\d\d-E\d\d",
  r"Season .*",
  r"Episode .*",
  r'khz$',
]


def _in_known_oxdb_extensions(term):
  """Return True if term is known to not be part of the title.

  A term qualifies when it is listed verbatim in _known_oxdb_extensions or
  when any pattern of _known_oxdb_extensions_reg is found in it, ignoring
  case.
  """
  if term in _known_oxdb_extensions:
    return True
  return any(re.search(pattern, term, re.IGNORECASE)
             for pattern in _known_oxdb_extensions_reg)


def oxdb_filenameUmlaute(string):
  """Return string as unicode with ae/oe/ue (and Ae/Oe/Ue) turned into umlauts.

  The substitution is purely textual: every occurrence of the two-letter
  sequence is replaced, wherever it appears in the string.
  """
  string = u"%s" % string
  for ascii_pair, umlaut in (
      (u'ae', u'ä'), (u'oe', u'ö'), (u'ue', u'ü'),
      (u'Ae', u'Ä'), (u'Oe', u'Ö'), (u'Ue', u'Ü')):
    string = string.replace(ascii_pair, umlaut)
  return string


def oxdb_director(director):
  """Return the director name taken from the parent folder of a file path.

  ' & ' between names is normalised to ', '.
  """
  director = os.path.basename(os.path.dirname(director))
  # str.replace returns a new string; the result was previously thrown away.
  return director.replace(' & ', ', ')


def oxdb_title(title):
  """Normalize a filename to get the movie title.

  The title is the basename up to the first '.' that is not next to a space,
  so "Dr. Strangelove.1964.avi" yields "Dr. Strangelove".
  """
  title = os.path.basename(title)
  # Protect dots that touch a space so the split below leaves them alone.
  title = title.replace('. ', '_dot__space_')
  title = title.replace(' .', '_space__dot_')
  title = title.split('.')[0]
  title = title.replace('_dot__space_', '. ')
  title = title.replace('_space__dot_', ' .')
  return title


def oxdb_id(title, director):
  """Return '0x' followed by the MD5 hex digest of u"director/title" (UTF-8)."""
  key = u"%s/%s" % (director, title)
  return '0x%s' % hashlib.md5(key.encode('utf-8')).hexdigest()


def formatNumber(n, sep=','):
  """Separate the digits of n into groups of three with sep.

  A leading minus sign and a fractional part (after '.') are kept as they
  are; only the integer digits are grouped.
  """
  text = str(n)
  sign = ''
  if text.startswith('-'):
    sign, text = '-', text[1:]
  integer, dot, fraction = text.partition('.')
  groups = []
  while len(integer) > 3:
    groups.insert(0, integer[-3:])
    integer = integer[:-3]
  groups.insert(0, integer)
  return sign + sep.join(groups) + dot + fraction


def oxdb_runtimeformat(runtime):
  """Format a runtime given in seconds for stats.

  Returns '' for 0, "N sec" below one minute and "N min" (seconds dropped)
  below 900 minutes; longer runtimes are spelled out in hours, days and
  years, with a year counted as 365 days.
  """
  if runtime == 0:
    return ''
  if runtime < 60:
    return "%s sec" % runtime
  minutes = int(runtime // 60)
  seconds = runtime % 60
  # NOTE(review): the 900 minute cut-off is kept from the original code.
  if minutes < 900:
    return "%s min" % minutes
  hours, minutes = divmod(minutes, 60)
  if hours < 24:
    return "%s hours %s minutes %s seconds" % (hours, minutes, seconds)
  days, hours = divmod(hours, 24)
  if days < 365:
    return "%s days %s hours %s minutes %s seconds" % (
      days, hours, minutes, seconds)
  years, days = divmod(days, 365)
  return "%s years %s days %s hours %s minutes %s seconds" % (
    years, days, hours, minutes, seconds)


def oxdb_lengthformat(mseconds):
  """Format milliseconds as HH:MM:SS, or D:HH:MM:SS from 24 hours on."""
  seconds = mseconds // 1000
  minutes, seconds = divmod(seconds, 60)
  hours, minutes = divmod(minutes, 60)
  # ">= 24": exactly one day used to come out as "24:00:00".
  if hours >= 24:
    days, hours = divmod(hours, 24)
    return "%d:%02d:%02d:%02d" % (days, hours, minutes, seconds)
  return "%02d:%02d:%02d" % (hours, minutes, seconds)


def oxdb_format(number, long_name, short):
  """Format the value like a 'human-readable' file size.

  (i.e. 13 KB, 4.1 MB, 102 bytes, etc).

  number    - number to format; a false value gives "0 <long_name>s".
  long_name - long unit name used below 1024, i.e. byte
  short     - short unit name used with the K/M/G/T prefixes, i.e. B
  """
  if not number:
    return "0 %ss" % long_name
  number = float(number)
  kilo = 1024
  mega = kilo * 1024
  giga = mega * 1024
  tera = giga * 1024
  if number < kilo:
    plural = '' if number == 1 else 's'
    return "%d %s%s" % (number, long_name, plural)
  if number < mega:
    return "%d K%s" % (number / kilo, short)
  if number < giga:
    return "%.1f M%s" % (number / mega, short)
  if number < tera:
    return "%.2f G%s" % (number / giga, short)
  return "%.3f T%s" % (number / tera, short)


def oxdb_filesizeformat(number):
  """Format number with byte units (bytes, KB, MB, ...)."""
  return oxdb_format(number, 'byte', 'B')


def oxdb_bitformat(number):
  """Format number with bit units (bits, Kb, Mb, ...)."""
  return oxdb_format(number, 'bit', 'b')


def oxdb_pixelformat(number):
  """Format number with pixel units (pixels, Kpx, Mpx, ...)."""
  return oxdb_format(number, 'pixel', 'px')


# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')


def _bytechr(value):
  """Return chr(value) for byte strings, widening to unicode above 255."""
  if value > 255:
    return _unichr(value)
  return chr(value)


def htmldecode(text):
  """Decode HTML entities in the given text.

  Numeric (decimal and hexadecimal) and named references are replaced by
  their character.  Unknown names and numeric references outside the valid
  code point range are left exactly as they were.
  """
  if isinstance(text, _text_type):
    uchr = _unichr
  else:
    uchr = _bytechr

  def entitydecode(match):
    entity = match.group(1)
    try:
      if entity.startswith('#x'):
        return uchr(int(entity[2:], 16))
      if entity.startswith('#'):
        return uchr(int(entity[1:]))
    except (ValueError, OverflowError):
      # Not a representable code point: keep the reference untouched.
      return match.group(0)
    if entity in name2codepoint:
      return uchr(name2codepoint[entity])
    return match.group(0)

  return charrefpat.sub(entitydecode, text)


def highlight(text, query):
  """Wrap case-insensitive occurrences of query in text.

  NOTE(review): the replacement template is just the matched group, so the
  text currently comes back unchanged; presumably markup around '\\1' was
  lost at some point - confirm the intended template before changing it.
  """
  if query:
    found = re.compile("(%s)" % re.escape(query), re.IGNORECASE).findall(text)
    for hit in found:
      text = re.sub("(%s)" % re.escape(hit), '\\1', text)
  return text


def imdb2oxdb(imdb):
  """Return the oxdb id for an IMDb number string.

  Values that already start with '0x' or 'ox' are returned unchanged;
  anything else is parsed as a decimal integer and written as '0x' plus six
  upper-case hex digits.
  """
  if imdb.startswith(('0x', 'ox')):
    return imdb
  return "0x%06X" % int(imdb)


def oxdb2imdb(oxdb):
  """Return the seven digit IMDb number for an eight character oxdb id.

  This is the inverse of imdb2oxdb.  Ids of any other length are returned
  unchanged.
  """
  if len(oxdb) == 8:
    if oxdb[:2] in ('0x', '0X'):
      # float() cannot parse the hexadecimal form imdb2oxdb produces.
      return "%07d" % int(oxdb, 16)
    return "%07d" % float(oxdb)
  return oxdb


def trimString(string, length):
  """Shorten string to length characters, keeping its start and last 10.

  The middle is replaced by '...'.  When length is too small to hold the
  '...' and the ten trailing characters, the string is simply cut off.
  """
  if len(string) <= length:
    return string
  if length < 13:
    return string[:max(length, 0)]
  return string[:length - 13] + '...' + string[-10:]


languages = (
  'be', 'bg', 'ba', 'wo', 'bn', 'bo', 'bh', 'bi', 'ji', 'br',
  'ja', 'ru', 'rw', 'tl', 'rm', 'rn', 'ro', 'gu', 'jw', 'gd',
  'ga', 'sv', 'gn', 'gl', 'om', 'tn', 'fa', 'oc', 'ss', 'or',
  'hy', 'hr', 'sw', 'hu', 'hi', 'su', 'ha', 'ps', 'pt', 'sk',
  'pa', 'pl', 'el', 'eo', 'en', 'zh', 'sm', 'eu', 'et', 'sa',
  'es', 'mg', 'uz', 'ml', 'mo', 'mn', 'mi', 'as', 'mk', 'ur',
  'mt', 'ms', 'mr', 'my', 'fr', 'fy', 'ia', 'zu', 'fi', 'fj',
  'fo', 'nl', 'no', 'na', 'ne', 'xh', 'co', 'ca', 'cy', 'cs',
  'ka', 'kk', 'sr', 'sq', 'ko', 'kn', 'km', 'kl', 'ks', 'si',
  'sh', 'so', 'sn', 'ku', 'sl', 'ky', 'sg', 'sd', 'yo', 'de',
  'da', 'dz', 'la', 'ln', 'lo', 'tt', 'tr', 'ts', 'lv', 'to',
  'lt', 'tk', 'th', 'ti', 'tg', 'te', 'ta', 'aa', 'ab', 'uk',
  'af', 'vi', 'is', 'am', 'it', 'iw', 'vo', 'ik', 'ar', 'in',
  'ay', 'az', 'ie', 'qu', 'st', 'tw')