oxdbarchive/oxdbarchive/oxdb_utils.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
# OXDb Import client, crawls the filesystem and gathers information about 
# movies
#

import md5
import os
import sys
import re
import urllib

from scrapeit.utils import read_url

_oxdb_file_blacklist = ['.DS_Store']
_oxdb_extensions = [
  '.avi', '.ogg', '.ogm', '.mkv', '.mpg', '.wmv', '.mp4v', '.mp4',
  '.srt', '.sub', '.idx'
]

_known_oxdb_extensions = ['Interview']
_known_oxdb_extensions_reg = ["\d\d\dx\d\d\d", "S\d\dE\d\d", "S\d\dE\d\d-E\d\d" "Season .*", "Episode .*", 'khz$']

def _in_known_oxdb_extensions(term):
  '''
    used to remove parts that are known to not be part of the title
  '''
  if term in _known_oxdb_extensions:
    return True
  for reg in _known_oxdb_extensions_reg:
    if re.compile(reg, re.IGNORECASE).findall(term):
      return True
  return False
  
def oxdb_filenameUmlaute(string):
  '''
    replace the two letter transliterations ae, oe, ue (and Ae, Oe, Ue)
    with the matching umlaut; every occurrence is replaced and the result
    is a unicode string
  '''
  result = u"%s" % string
  # applied in this order, lower case first
  substitutions = (
    (u'ae', u'ä'),
    (u'oe', u'ö'),
    (u'ue', u'ü'),
    (u'Ae', u'Ä'),
    (u'Oe', u'Ö'),
    (u'Ue', u'Ü'),
  )
  for plain, umlaut in substitutions:
    result = result.replace(plain, umlaut)
  return result
  
def oxdb_director(director):
  '''
    get the director name(s) from a file path

    the name of the directory that directly contains the file is used;
    ' & ' between names is replaced with ', '
  '''
  director = os.path.basename(os.path.dirname(director))
  # str.replace returns a new string, so the result has to be returned
  return director.replace(' & ', ', ')
  
def oxdb_title(title):
  '''
    get the movie title from a file name

    only the base name of the path is used and everything after the first
    dot is cut off; dots that are next to a space (as in 'Dr. X') are kept
  '''
  # dots next to a space are hidden behind placeholders during the split
  placeholders = (
    ('. ', '_dot__space_'),
    (' .', '_space__dot_'),
  )
  name = os.path.basename(title)
  for literal, token in placeholders:
    name = name.replace(literal, token)
  name = name.split('.', 1)[0]
  for literal, token in placeholders:
    name = name.replace(token, literal)
  return name


def oxdb_id(title, director):
  '''
    build the oxdb id for a title/director pair

    the id is '0x' followed by the hex md5 digest of the utf-8 encoded
    string "director/title"
  '''
  # the md5 module is deprecated in favour of hashlib (and gone in newer
  # pythons); fall back to it only where hashlib does not exist
  try:
    from hashlib import md5 as _md5
  except ImportError:
    from md5 import new as _md5
  key = u"%s/%s" % (director, title)
  return '0x%s' % _md5(key.encode('utf-8')).hexdigest()

def formatNumber(n, sep=','):
  '''
    separate number with thousand comma

    n   - number (or string of digits) to format
    sep - separator put between groups of three digits

    a leading sign is kept in front of the first group and only the part
    before a decimal point is grouped, i.e. 1234567 -> '1,234,567'
  '''
  text = str(n)
  sign = ''
  if text[:1] in ('-', '+'):
    sign, text = text[0], text[1:]
  # pieces[0] is the integer part, an optional pieces[1] the fraction
  pieces = text.split('.', 1)
  digits = pieces[0]
  groups = []
  # cut groups of three digits off the right end
  while len(digits) > 3:
    groups.insert(0, digits[-3:])
    digits = digits[:-3]
  groups.insert(0, digits)
  pieces[0] = sep.join(groups)
  return sign + '.'.join(pieces)
  
def oxdb_runtimeformat(runtime):
  '''
    format runtime for stats

    runtime is a number of seconds; 0 gives an empty string, less than a
    minute is shown in seconds, less than 900 minutes in minutes only and
    everything above as hours, days and years with the remainders
  '''
  if runtime == 0:
    return ''
  if runtime < 60:
    return "%s sec" % runtime
  total_minutes = int(runtime / 60)
  if total_minutes < 900:
    return "%s min" % total_minutes
  # split the total into its components, smallest unit first
  seconds = runtime % 60
  minutes = total_minutes % 60
  total_hours = int(total_minutes / 60)
  hours = total_hours % 24
  total_days = int(total_hours / 24)
  days = total_days % 365
  years = int(total_days / 365)
  if total_hours < 24:
    return "%s hours %s minutes %s seconds" % (total_hours, minutes, seconds)
  if total_days < 365:
    return "%s days %s hours %s minutes %s seconds" % (total_days, hours, minutes, seconds)
  return "%s years %s days %s hours %s minutes %s seconds" % (years, days, hours, minutes, seconds)
  
def oxdb_lengthformat(mseconds):
  """
    Format mseconds in a nice way

    mseconds - length in milliseconds

    Returns "HH:MM:SS" for lengths below one day and "D:HH:MM:SS" from
    24 hours on; fractions of a second are dropped.
  """
  seconds = int(mseconds // 1000)
  minutes, seconds = divmod(seconds, 60)
  hours, minutes = divmod(minutes, 60)
  # >= so that exactly 24 hours is shown as one day, not as "24:00:00"
  if hours >= 24:
    days, hours = divmod(hours, 24)
    return "%d:%02d:%02d:%02d" % (days, hours, minutes, seconds)
  return "%02d:%02d:%02d" % (hours, minutes, seconds)

"""
Format the value like a 'human-readable' file size (e.g. 13 KB, 4.1 MB, 102
bytes, etc).
  number - number to format.
  long_name - long name. i.e. byte
  short - short name, i.e. B
"""
def oxdb_format(number, long_name, short):
  '''
    format number with a 1024 based unit prefix

    number    - number to format
    long_name - unit name used below 1024, an 's' is appended unless the
                value is 1
    short     - unit abbreviation used after the K, M, G or T prefix

    anything false (0, None, '') gives "0 <long_name>s"
  '''
  if not number:
    return "0 %ss" % long_name
  value = float(number)
  kilo = 1024
  if value < kilo:
    plural = ''
    if value != 1:
      plural = 's'
    return "%d %s%s" % (value, long_name, plural)
  # (exclusive upper bound, divisor, output template) per prefix
  scales = (
    (kilo ** 2, kilo, "%d K%s"),
    (kilo ** 3, kilo ** 2, "%.1f M%s"),
    (kilo ** 4, kilo ** 3, "%.2f G%s"),
  )
  for limit, divisor, template in scales:
    if value < limit:
      return template % (value / divisor, short)
  return "%.3f T%s" % (value / (kilo ** 4), short)

def oxdb_filesizeformat(number):
  '''
    format number with oxdb_format using 'byte' / 'B' as unit names
  '''
  return oxdb_format(number, long_name='byte', short='B')

def oxdb_bitformat(number):
  '''
    format number with oxdb_format using 'bit' / 'b' as unit names
  '''
  return oxdb_format(number, long_name='bit', short='b')

def oxdb_pixelformat(number):
  '''
    format number with oxdb_format using 'pixel' / 'px' as unit names
  '''
  return oxdb_format(number, long_name='pixel', short='px')
  

from htmlentitydefs import name2codepoint

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
# Group 1 is the reference without the leading '&'; the trailing ';' is
# optional.
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')

def htmldecode(text):
  """
    Decode HTML entities in the given text.

    Numeric references (&#65; and &#x41;) and named references listed in
    name2codepoint are replaced with their character. Unknown names and
    numeric references that can not be turned into a character (value out
    of range) are left in the text unchanged.
  """
  if isinstance(text, unicode):
    uchr = unichr
  else:
    # byte strings keep code points up to 255 as single bytes
    def uchr(value):
      if value > 255:
        return unichr(value)
      return chr(value)
  def entitydecode(match):
    entity = match.group(1)
    try:
      if entity.startswith('#x'):
        return uchr(int(entity[2:], 16))
      if entity.startswith('#'):
        return uchr(int(entity[1:]))
      if entity in name2codepoint:
        return uchr(name2codepoint[entity])
    except (ValueError, OverflowError):
      # code point outside of what unichr accepts, keep the reference
      pass
    return match.group(0)
  return charrefpat.sub(entitydecode, text)

def highlight(text, query):
  '''
    wrap every occurrence of query in text in a textHighlight span

    query is taken literally (not as a regular expression) and matched
    ignoring case, each match keeps its original spelling. text is
    returned unchanged if query is empty.
  '''
  if not query:
    return text
  # one pass over the text: every match is wrapped exactly once and the
  # inserted markup is never searched again
  pattern = re.compile("(%s)" % re.escape(query), re.IGNORECASE)
  return pattern.sub('<span class="textHighlight">\\1</span>', text)

def imdb2oxdb(imdb):
  '''
    turn an imdb id (string of decimal digits) into '0x' followed by the
    number as upper case hex, padded to at least 6 digits; ids that
    already start with '0x' or 'ox' are returned unchanged
  '''
  for prefix in ('0x', 'ox'):
    if imdb.startswith(prefix):
      return imdb
  return "0x%06X" % int(imdb)
  
def oxdb2imdb(oxdb):
  '''
    turn an 8 character oxdb id ('0x' + 6 hex digits) into the 7 digit,
    zero padded imdb id; ids of any other length are returned unchanged
  '''
  if len(oxdb) == 8:
    if oxdb[:2].lower() == '0x':
      # float() only reads hex where the C library happens to support
      # it, int() with base 16 works everywhere
      return "%07d" % int(oxdb, 16)
    # 8 character ids without hex prefix are handled as before
    return "%07d" % float(oxdb)
  return oxdb
  
def trimString(string, length):
  '''
    shorten string to at most length characters

    strings that fit are returned unchanged. longer ones are cut in the
    middle: the start of the string, '...' and the last 10 characters.
    if length is below 13 there is no room for that, the string is then
    cut at the end ('...' is appended if length is above 3).
  '''
  if len(string) <= length:
    return string
  if length < 13:
    # length - 13 would be a negative index and make the result longer
    # than the input
    if length > 3:
      return string[:length - 3] + '...'
    return string[:max(length, 0)]
  return string[:length - 13] + '...' + string[-10:]

# Tuple of two-letter language codes, in no particular order.
# NOTE(review): these look like ISO 639-1 codes including some withdrawn
# ones ('iw', 'in', 'ji', 'jw', 'mo', 'sh') - confirm against the users of
# this tuple, which are not part of this module.
languages = ('be', 'bg', 'ba', 'wo', 'bn', 'bo', 'bh', 'bi', 'ji', 'br', 'ja', 
 'ru', 'rw', 'tl', 'rm', 'rn', 'ro', 'gu', 'jw', 'gd', 'ga', 'sv', 'gn', 'gl',
 'om', 'tn', 'fa', 'oc', 'ss', 'or', 'hy', 'hr', 'sw', 'hu', 'hi', 'su', 'ha', 
 'ps', 'pt', 'sk', 'pa', 'pl', 'el', 'eo', 'en', 'zh', 'sm', 'eu', 'et', 'sa', 
 'es', 'mg', 'uz', 'ml', 'mo', 'mn', 'mi', 'as', 'mk', 'ur', 'mt', 'ms', 'mr',
 'my', 'fr', 'fy', 'ia', 'zu', 'fi', 'fj', 'fo', 'nl', 'no', 'na', 'ne', 'xh',
 'co', 'ca', 'cy', 'cs', 'ka', 'kk', 'sr', 'sq', 'ko', 'kn', 'km', 'kl', 'ks', 
 'si', 'sh', 'so', 'sn', 'ku', 'sl', 'ky', 'sg', 'sd', 'yo', 'de', 'da', 'dz',
 'la', 'ln', 'lo', 'tt', 'tr', 'ts', 'lv', 'to', 'lt', 'tk', 'th', 'ti', 'tg',
 'te', 'ta', 'aa', 'ab', 'uk', 'af', 'vi', 'is', 'am', 'it', 'iw', 'vo', 'ik', 
 'ar', 'in', 'ay', 'az', 'ie', 'qu', 'st', 'tw')
oxdb archive, backend 219 2007-07-10 12:31:08 +00:00			`#!/usr/bin/env python`
			`# -- coding: utf-8 --`
			`# -- Mode: Python; --`
			`# vi:si:et:sw=2:sts=2:ts=2`
			`# OXDb Import client, crawls the filesystem and gathers information about`
			`# movies`
			`#`

			`import md5`
			`import os`
			`import sys`
			`import re`
			`import urllib`

			`from scrapeit.utils import read_url`

			`_oxdb_file_blacklist = ['.DS_Store']`
			`_oxdb_extensions = [`
			`'.avi', '.ogg', '.ogm', '.mkv', '.mpg', '.wmv', '.mp4v', '.mp4',`
			`'.srt', '.sub', '.idx'`
			`]`

			`_known_oxdb_extensions = ['Interview']`
			`_known_oxdb_extensions_reg = ["\d\d\dx\d\d\d", "S\d\dE\d\d", "S\d\dE\d\d-E\d\d" "Season .", "Episode .", 'khz$']`

			`def _in_known_oxdb_extensions(term):`
			`'''`
			`used to remove parts that are known to not be part of the title`
			`'''`
			`if term in _known_oxdb_extensions:`
			`return True`
			`for reg in _known_oxdb_extensions_reg:`
			`if re.compile(reg, re.IGNORECASE).findall(term):`
			`return True`
			`return False`

			`def oxdb_filenameUmlaute(string):`
			`string = u"%s" % string`
			`string = string.replace(u'ae', u'ä')`
			`string = string.replace(u'oe', u'ö')`
			`string = string.replace(u'ue', u'ü')`
			`string = string.replace(u'Ae', u'Ä')`
			`string = string.replace(u'Oe', u'Ö')`
			`string = string.replace(u'Ue', u'Ü')`
			`return string`

			`def oxdb_director(director):`
			`director = os.path.basename(os.path.dirname(director))`
			`director.replace(' & ', ', ')`
			`return director`

			`def oxdb_title(title):`
			`'''`
			`normalize filename to get movie title`
			`'''`
			`title = os.path.basename(title).replace('. ', '_dot__space_')`
			`title = title.replace(' .', '_space__dot_')`
			`title = title.split('.')[0]`
			`title = title.replace('_dot__space_', '. ')`
			`title = title.replace('_space__dot_', ' .')`
			`return title`


			`def oxdb_id(title, director):`
			`key = u"%s/%s" % (director,title)`
			`oxdb_id = '0x%s' % md5.new(key.encode('utf-8')).hexdigest()`
			`return oxdb_id`

			`'''`
			`seperate number with thousand comma`
			`'''`
			`def formatNumber(n, sep=','):`
			`ln = list(str(n))`
			`ln.reverse()`
			`newn = []`
			`while len(ln) > 3:`
			`newn.extend(ln[:3])`
			`newn.append(sep)`
			`ln = ln[3:]`
			`newn.extend(ln)`
			`newn.reverse()`
			`return "".join(newn)`

			`'''`
			`format runtime for stats`
			`'''`
			`def oxdb_runtimeformat(runtime):`
			`if runtime == 0:`
			`return ''`
			`if runtime < 60:`
			`return "%s sec" % runtime`
			`minutes = int(runtime / 60)`
			`seconds = runtime % 60`
			`if minutes < 900:`
			`return "%s min" % minutes`
			`hours = int(minutes / 60)`
			`minutes = minutes % 60`
			`if hours < 24:`
			`return "%s hours %s minutes %s seconds" % (hours, minutes, seconds)`
			`days = int(hours / 24)`
			`hours = hours % 24`
			`if days < 365:`
			`return "%s days %s hours %s minutes %s seconds" % (days, hours, minutes, seconds)`
			`years = int(days / 365)`
			`days = days % 365`
			`return "%s years %s days %s hours %s minutes %s seconds" % (years, days, hours, minutes, seconds)`

			`def oxdb_lengthformat(mseconds):`
			`"""`
			`Format mseconds in a nice way`
			`"""`
			`seconds = mseconds/1000`
			`minutes = int(seconds / 60)`
			`seconds = seconds % 60`
			`hours = int(minutes / 60)`
			`minutes = minutes % 60`
			`if hours > 24:`
			`days = int(hours / 24)`
			`hours = hours % 24`
			`return "%d:%02d:%02d:%02d" % (days, hours, minutes, seconds)`
			`return "%02d:%02d:%02d" % (hours, minutes, seconds)`

			`"""`
			`Format the value like a 'human-readable' file size (i.e. 13 KB, 4.1 MB, 102`
			`bytes, etc).`
			`number - number to format.`
			`long_name - long name. i.e. byte`
			`short - short name, i.e. B`
			`"""`
			`def oxdb_format(number, long_name, short):`
			`if not number:`
			`return "0 %ss" % long_name`
			`number = float(number)`
			`if number < 1024:`
			`return "%d %s%s" % (number, long_name, number != 1 and 's' or '')`
			`if number < 1024 * 1024:`
			`return "%d K%s" % ((number / 1024), short)`
			`if number < 1024 * 1024 * 1024:`
			`return "%.1f M%s" % (number / (1024 * 1024), short)`
			`if number < 1024 * 1024 * 1024 * 1024:`
			`return "%.2f G%s" % (number / (1024 * 1024 * 1024), short)`
			`return "%.3f T%s" % (number / (1024 * 1024 * 1024 * 1024), short)`

			`def oxdb_filesizeformat(number):`
			`return oxdb_format(number, 'byte', 'B')`

			`def oxdb_bitformat(number):`
			`return oxdb_format(number, 'bit', 'b')`

			`def oxdb_pixelformat(number):`
			`return oxdb_format(number, 'pixel', 'px')`


			`from htmlentitydefs import name2codepoint`

			`# This pattern matches a character entity reference (a decimal numeric`
			`# references, a hexadecimal numeric reference, or a named reference).`
			`charrefpat = re.compile(r'&(#(\d+\|x[\da-fA-F]+)\|[\w.:-]+);?')`

			`def htmldecode(text):`
			`"""Decode HTML entities in the given text."""`
			`if type(text) is unicode:`
			`uchr = unichr`
			`else:`
			`uchr = lambda value: value > 255 and unichr(value) or chr(value)`
			`def entitydecode(match, uchr=uchr):`
			`entity = match.group(1)`
			`if entity.startswith('#x'):`
			`return uchr(int(entity[2:], 16))`
			`elif entity.startswith('#'):`
			`return uchr(int(entity[1:]))`
			`elif entity in name2codepoint:`
			`return uchr(name2codepoint[entity])`
			`else:`
			`return match.group(0)`
			`return charrefpat.sub(entitydecode, text)`

			`def highlight(text, query):`
			`if query:`
			`m = re.compile("(%s)" % re.escape(query), re.IGNORECASE).findall(text)`
			`for i in m:`
			`text = re.sub("(%s)" % re.escape(i), '<span class="textHighlight">\\1</span>', text)`
			`return text`

			`def imdb2oxdb(imdb):`
			`if imdb.startswith('0x') or imdb.startswith('ox') :`
			`return imdb`
			`return "0x%06X" % int(imdb)`

			`def oxdb2imdb(oxdb):`
			`if len(oxdb) == 8:`
			`return "%07d" % float(oxdb)`
			`return oxdb`

			`def trimString(string, length):`
			`if len(string) > length:`
			`string = string[:length - 13] + '...' + string[-10:]`
			`return string`

			`languages = ('be', 'bg', 'ba', 'wo', 'bn', 'bo', 'bh', 'bi', 'ji', 'br', 'ja',`
			`'ru', 'rw', 'tl', 'rm', 'rn', 'ro', 'gu', 'jw', 'gd', 'ga', 'sv', 'gn', 'gl',`
			`'om', 'tn', 'fa', 'oc', 'ss', 'or', 'hy', 'hr', 'sw', 'hu', 'hi', 'su', 'ha',`
			`'ps', 'pt', 'sk', 'pa', 'pl', 'el', 'eo', 'en', 'zh', 'sm', 'eu', 'et', 'sa',`
			`'es', 'mg', 'uz', 'ml', 'mo', 'mn', 'mi', 'as', 'mk', 'ur', 'mt', 'ms', 'mr',`
			`'my', 'fr', 'fy', 'ia', 'zu', 'fi', 'fj', 'fo', 'nl', 'no', 'na', 'ne', 'xh',`
			`'co', 'ca', 'cy', 'cs', 'ka', 'kk', 'sr', 'sq', 'ko', 'kn', 'km', 'kl', 'ks',`
			`'si', 'sh', 'so', 'sn', 'ku', 'sl', 'ky', 'sg', 'sd', 'yo', 'de', 'da', 'dz',`
			`'la', 'ln', 'lo', 'tt', 'tr', 'ts', 'lv', 'to', 'lt', 'tk', 'th', 'ti', 'tg',`
			`'te', 'ta', 'aa', 'ab', 'uk', 'af', 'vi', 'is', 'am', 'it', 'iw', 'vo', 'ik',`
			`'ar', 'in', 'ay', 'az', 'ie', 'qu', 'st', 'tw')`