cablegates/pandora/item/utils.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
#
from decimal import Decimal
import os
import re
import hashlib
import unicodedata

from django.conf import settings
import ox
import ox.iso
from ox.normalize import normalizeName, normalizeTitle


def parse_decimal(string):
    '''
      convert a ratio such as "16:9" or "4/3", or a plain number such as
      "25", to a Decimal

      ':' is treated like '/'. A value without any separator is divided by 1.
      Only the first two '/'-separated fields are used.
    '''
    fields = string.replace(':', '/').split('/')
    numerator = fields[0]
    # no separator present: behave as if the value was written "<value>/1"
    denominator = fields[1] if len(fields) > 1 else '1'
    return Decimal(numerator) / Decimal(denominator)


def plural_key(term):
    '''
      return the plural form of term

      irregular plurals are looked up in a table, everything else just
      gets an 's' appended
    '''
    irregular = {
        'country': 'countries',
    }
    if term in irregular:
        return irregular[term]
    return term + 's'


def oxid(title, directors, year='', seriesTitle='', episodeTitle='', season=0, episode=0):
    '''
      return the old style id: u"0x" followed by hex digits of SHA-1 hashes

      without seriesTitle the id is the full hash of title, directors
      (joined with ', ') and year, joined by newlines.
      with seriesTitle the id consists of the first 20 digits of the hash of
      seriesTitle and the two digit season, followed by the first 20 digits
      of the hash of the two digit episode, episodeTitle, directors and year.
    '''
    def sha1_hex(lines):
        return hashlib.sha1(u"\n".join(lines).encode('utf-8')).hexdigest()

    director = ', '.join(directors)
    digest = sha1_hex([title, director, year])
    if seriesTitle:
        series_half = sha1_hex([seriesTitle, "%02d" % season])[:20]
        episode_half = sha1_hex(["%02d" % episode, episodeTitle, director, year])[:20]
        digest = series_half + episode_half
    return u"0x" + digest


def oxdb_id(title, director=[], year='', season='', episode='', episode_title='', episode_director=[], episode_year=''):
    '''
      return the new style id: u'0x' followed by 16 upper case hex digits

      without episode: 8 digits of the hash of the directors (joined with
      ', ') followed by 8 digits of the hash of title and year.
      with episode: 8 digits of the hash of directors, title, year and season
      followed by 8 digits of the hash of episode, episode directors,
      episode title and episode year.
      year, season, episode and episode_year are passed through str().
    '''
    # new id function, will replace oxid()
    def short_hash(text):
        return hashlib.sha1(text.encode('utf-8')).hexdigest().upper()[:8]

    # the list defaults are only read here, never modified
    directors = ', '.join(director)
    episode_directors = ', '.join(episode_director)
    if episode:
        series_fields = [directors, title, str(year), str(season)]
        episode_fields = [str(episode), episode_directors, episode_title, str(episode_year)]
        first_half = short_hash('\n'.join(series_fields))
        second_half = short_hash('\n'.join(episode_fields))
    else:
        first_half = short_hash(directors)
        second_half = short_hash('\n'.join([title, str(year)]))
    return u'0x' + first_half + second_half


def oxdb_directors(director):
    '''
      return the list of directors encoded in the parent folder name of the
      given path

      a trailing '_' of the folder name is turned into '.', the name is split
      on '; ' and each part is passed through normalizeName. The placeholders
      'Series', 'Unknown Director' and 'Various Directors' are removed from
      the names, names that end up empty are dropped.
    '''
    folder = os.path.basename(os.path.dirname(director))
    if folder.endswith('_'):
        folder = "%s." % folder[:-1]
    names = [normalizeName(name) for name in folder.split('; ')]

    placeholders = ('Series', 'Unknown Director', 'Various Directors')
    directors = []
    for name in names:
        name = name.strip()
        for placeholder in placeholders:
            name = name.replace(placeholder, '')
        if name:
            directors.append(name)
    return directors


def oxdb_title(_title, searchTitle=False):
    '''
      normalize filename to get item title

      only the basename of _title is used, the title is its first
      dot-separated field. If the name contains "Season <n>.Episode <m>"
      the result is 'Title (SxxEyy) Episode Title', or '"Title" Episode Title'
      if searchTitle is set. A "(YYYY)" found by ox.findRe is removed if the
      title ends with it, the result is passed through normalizeTitle.

      NOTE(review): for names that match the season/episode pattern but have
      fewer dot-separated fields than the lookups below expect (e.g. no
      extension at all) the negative indices raise IndexError.
    '''
    dot_placeholders = (
        ('... ', '_dot_dot_dot_'),
        ('. ', '_dot__space_'),
        (' .', '_space__dot_'),
    )
    filename = os.path.basename(_title)
    # protect dots that are part of the title so they survive the split on '.'
    for dotted, placeholder in dot_placeholders:
        filename = filename.replace(dotted, placeholder)
    fields = filename.split('.')
    # "A_ B" in a filename stands for "A: B"
    title = re.sub(r'([A-Za-z0-9])_ ', r'\1: ', fields[0])
    season_episode = re.findall(r'Season (\d+).Episode (\d+)', filename)
    if season_episode:
        season, episode = season_episode[0]
        tag = "S%02dE%02d" % (int(season), int(episode))
        # the episode title is the second to last field, or the one before
        # that if the second to last field is a "Part" marker
        if 'Part' in fields[-2] and 'Episode' not in fields[-3]:
            stitle = fields[-3]
        else:
            stitle = fields[-2]
        if stitle.startswith('Episode '):
            stitle = ''
        if searchTitle:
            title = '"%s" %s' % (title, stitle)
        else:
            title = ('%s (%s) %s' % (title, tag, stitle)).strip()
    # put the protected dots back
    for dotted, placeholder in dot_placeholders:
        title = title.replace(placeholder, dotted)
    year = ox.findRe(title, r'(\(\d{4}\))')
    if year and title.endswith(year):
        title = title[:-len(year)].strip()
    return normalizeTitle(title)


def oxdb_year(data):
    '''
      look for four digits enclosed in dots (".YYYY.") in data

      the lookup is done by ox.findRe, which also decides what is returned
      if nothing matches
    '''
    dotted_year = r'\.(\d{4})\.'
    return ox.findRe(data, dotted_year)


def oxdb_series_title(path):
    '''
      return the series title for path, u'' if there is none

      paths starting with 'Series' use their basename, all other paths use
      the part of oxdb_title(path) in front of " (S"
    '''
    if path.startswith('Series'):
        return os.path.basename(path)
    title = oxdb_title(path)
    marker = " (S"
    if marker in title:
        return title.split(marker)[0]
    return u''


def oxdb_episode_title(path):
    '''
      return the text between "Episode <n>." and the next dot that is
      followed by a letter, u'' if path has no such part
    '''
    found = re.search(r'.Episode \d+?\.(.*?)\.[a-zA-Z]', path)
    if found is None:
        return u''
    return found.group(1)


def oxdb_season_episode(path):
    '''
      return (season, episode) as ints, parsed from the basename of path

      the following forms are tried in this order:
        "Season <n>.Episode <m>"  -> (n, m)
        "Episode <m>"             -> (0, m)
        "S<nn>E<mm>"              -> (nn, mm), only if both are still 0
      (0, 0) is returned if none of them is found.
    '''
    season = 0
    episode = 0
    path = os.path.basename(path)
    se = re.search(r'Season (\d+).Episode (\d+)', path)
    if se:
        season = int(se.group(1))
        episode = int(se.group(2))
    else:
        # the quantifier has to be greedy: a lazy one at the very end of the
        # pattern captures a single digit only ("Episode 12" -> 1)
        ep = re.search(r'.Episode (\d+)', path)
        if ep:
            episode = int(ep.group(1))
    if season == 0 and episode == 0:
        se = re.search(r'S(\d\d)E(\d\d)', path)
        if se:
            season = int(se.group(1))
            episode = int(se.group(2))
    return (season, episode)


def oxdb_part(path):
    '''
      return the part number found in path, matching is case insensitive

      "part <n>." is looked for first, then "cd <n>.".
      NOTE(review): a number found in path is returned as string, while the
      default for paths without part number is the int 1.
    '''
    lowered = path.lower()
    for pattern in (r'part\s*?(\d+)\.', r'cd\s*?(\d+)\.'):
        found = re.search(pattern, lowered)
        if found:
            return found.group(1)
    return 1


def parse_path(path):
    '''
        collect item metadata from a path

        expects path in the form
            L/Last, First/Title (YYYY)
            M/McCarthy, Thomas/The Visitor (2007)
            G/Godard, Jean-Luc/Histoire(s) du cinema_ Toutes les histoires (1988)

        returns a dict that always contains 'title'. If settings.USE_IMDB is
        set it also contains 'directors', 'episode_title', 'season',
        'episode', 'series_title', 'imdbId' and 'oxdbId', plus 'year' if a
        year in parentheses was found in path.
    '''
    info = {'title': oxdb_title(path)}
    if not settings.USE_IMDB:
        return info
    # imported here so that ox.web.imdb is only needed if USE_IMDB is set.
    # This import makes `ox` a local name of this function, ox can not be
    # used above this line.
    import ox.web.imdb
    search_title = oxdb_title(path, True)
    directors = oxdb_directors(path)
    info['directors'] = directors
    year = ox.findRe(path, r'\((\d{4})\)')
    if year:
        info['year'] = year

    #FIXME: only include these if it is actually a series
    info['episode_title'] = oxdb_episode_title(path)
    season, episode = oxdb_season_episode(path)
    info['season'] = season
    info['episode'] = episode
    info['series_title'] = oxdb_series_title(path)

    info['imdbId'] = ox.web.imdb.guess(search_title, ', '.join(directors), timeout=-1)
    info['oxdbId'] = oxdb_id(info['title'], directors, info.get('year', ''),
                             season, episode,
                             episode_title=info['episode_title'],
                             episode_director=[],
                             episode_year='')
    return info


def sort_string(string):
    '''
      return string prepared for sorting: the capital thorn is replaced
      by 'Th', the result is NFKD normalized
    '''
    # u'\u00de' is the capital thorn
    without_thorn = string.replace(u'\u00de', 'Th')
    return unicodedata.normalize('NFKD', without_thorn)


def sort_title(title):
    '''
      return the value used to sort by title

      byte strings are decoded as UTF-8, the characters ' ! ¿ ¡ , . ; - " : * [ ]
      are removed, the result is passed through sort_string and all numbers
      are zero padded to 10 digits. Leading and trailing whitespace is
      stripped.
    '''
    # decode first: the patterns below contain non-ASCII characters and must
    # only be applied to text, not to the bytes of an encoded string
    if isinstance(title, bytes):
        title = title.decode('utf-8')

    # remove punctuation that should not influence the sort order
    title = re.sub(u'[\'!¿¡,.;\\-":*\\[\\]]', '', title)
    title = sort_string(title)

    #pad numbered titles, so that they sort by value ("2" before "10")
    title = re.sub(r'(\d+)', lambda x: '%010d' % int(x.group(0)), title)
    return title.strip()

def get_positions(ids, pos):
    '''
    map each value of pos to its index in ids

    values of pos that are not in ids are left out of the result,
    ids has to provide an index() method like a list

    >>> get_positions([1,2,3,4], [2,4])
    {2: 1, 4: 3}
    >>> get_positions([1,2,3,4], [5])
    {}
    '''
    positions = {}
    for i in pos:
        try:
            positions[i] = ids.index(i)
        except ValueError:
            # i is not in ids, skip it. All other errors are real problems
            # and are not hidden.
            pass
    return positions
foo 2009-06-08 16:08:59 +00:00			`#!/usr/bin/env python`
			`# -- coding: utf-8 --`
			`# vi:si:et:sw=4:sts=4:ts=4`
			`#`
actions, move parse_decimal 2010-12-22 15:17:38 +00:00			`from decimal import Decimal`
foo 2009-06-08 16:08:59 +00:00			`import os`
			`import re`
			`import hashlib`
use unicodedata.normalize for sort_title 2010-11-14 18:58:33 +00:00			`import unicodedata`
foo 2009-06-08 16:08:59 +00:00
make systems without imdb work 2011-01-28 08:48:38 +00:00			`from django.conf import settings`
oxlib->ox, oxweb->ox.web 2010-07-07 22:46:41 +00:00			`import ox`
			`import ox.iso`
cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00			`from ox.normalize import normalizeName, normalizeTitle`
foo 2009-06-08 16:08:59 +00:00
uploads, transcodes, /ra 2010-08-24 17:16:33 +00:00
actions, move parse_decimal 2010-12-22 15:17:38 +00:00			`def parse_decimal(string):`
			`string = string.replace(':', '/')`
			`if '/' not in string:`
			`string = '%s/1' % string`
			`d = string.split('/')`
			`return Decimal(d[0]) / Decimal(d[1])`

cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
uploads, transcodes, /ra 2010-08-24 17:16:33 +00:00			`def plural_key(term):`
			`return {`
			`'country': 'countries',`
			`}.get(term, term + 's')`

cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
- simplify metadata db structure - better sort values if field is empty - switch to python-ox, faster imdb import - move files backend into own app 2010-07-12 14:56:14 +00:00			`def oxid(title, directors, year='', seriesTitle='', episodeTitle='', season=0, episode=0):`
			`director = ', '.join(directors)`
foo 2009-06-08 16:08:59 +00:00			`oxid_value = u"\n".join([title, director, year])`
			`oxid = hashlib.sha1(oxid_value.encode('utf-8')).hexdigest()`
			`if seriesTitle:`
			`oxid_value = u"\n".join([seriesTitle, "%02d" % season])`
			`oxid = hashlib.sha1(oxid_value.encode('utf-8')).hexdigest()[:20]`
			`oxid_value = u"\n".join(["%02d" % episode, episodeTitle, director, year])`
			`oxid += hashlib.sha1(oxid_value.encode('utf-8')).hexdigest()[:20]`
			`return u"0x" + oxid`

cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
more plural cleanup 2011-01-04 06:50:52 +00:00			`def oxdb_id(title, director=[], year='', season='', episode='', episode_title='', episode_director=[], episode_year=''):`
adding new 0xDB Id function 2010-09-17 14:11:37 +00:00			`# new id function, will replace oxid()`
			`def get_hash(string):`
			`return hashlib.sha1(string.encode('utf-8')).hexdigest().upper()`
more plural cleanup 2011-01-04 06:50:52 +00:00			`director = ', '.join(director)`
			`episode_director = ', '.join(episode_director)`
adding new 0xDB Id function 2010-09-17 14:11:37 +00:00			`if not episode:`
year, episode, season can be ints 2010-12-07 18:35:55 +00:00			`oxdb_id = get_hash(director)[:8] + get_hash('\n'.join([title, str(year)]))[:8]`
adding new 0xDB Id function 2010-09-17 14:11:37 +00:00			`else:`
year, episode, season can be ints 2010-12-07 18:35:55 +00:00			`oxdb_id = get_hash('\n'.join([director, title, str(year), str(season)]))[:8] + \`
			`get_hash('\n'.join([str(episode), episode_director, episode_title, str(episode_year)]))[:8]`
adding new 0xDB Id function 2010-09-17 14:11:37 +00:00			`return u'0x' + oxdb_id`

cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
- simplify metadata db structure - better sort values if field is empty - switch to python-ox, faster imdb import - move files backend into own app 2010-07-12 14:56:14 +00:00			`def oxdb_directors(director):`
unbreak parse_path 2010-12-08 00:30:45 +00:00			`director = os.path.basename(os.path.dirname(director))`
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`if director.endswith('_'):`
			`director = "%s." % director[:-1]`
- simplify metadata db structure - better sort values if field is empty - switch to python-ox, faster imdb import - move files backend into own app 2010-07-12 14:56:14 +00:00			`directors = [normalizeName(d) for d in director.split('; ')]`
cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
- simplify metadata db structure - better sort values if field is empty - switch to python-ox, faster imdb import - move files backend into own app 2010-07-12 14:56:14 +00:00			`def cleanup(director):`
			`director = director.strip()`
			`director = director.replace('Series', '')`
			`director = director.replace('Unknown Director', '')`
			`director = director.replace('Various Directors', '')`
			`return director`
			`directors = filter(None, [cleanup(d) for d in directors])`
			`return directors`
testing interface, more work on backend 2009-10-04 22:00:08 +00:00
cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`def oxdb_title(_title, searchTitle = False):`
			`'''`
rename Movie to Item 2010-09-23 16:01:48 +00:00			`normalize filename to get item title`
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`'''`
			`_title = os.path.basename(_title)`
			`_title = _title.replace('... ', '_dot_dot_dot_')`
			`_title = _title.replace('. ', '_dot__space_')`
			`_title = _title.replace(' .', '_space__dot_')`
			`title = _title.split('.')[0]`
A_ should also be A: 2010-12-23 09:02:15 +00:00			`title = re.sub('([A-Za-z0-9])_ ', '\\1: ', title)`
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`se = re.compile('Season (\d+).Episode (\d+)').findall(_title)`
			`if se:`
			`se = "S%02dE%02d" % (int(se[0][0]), int(se[0][1]))`
			`if 'Part' in _title.split('.')[-2] and 'Episode' not in _title.split('.')[-3]:`
			`stitle = _title.split('.')[-3]`
			`else:`
			`stitle = _title.split('.')[-2]`
			`if stitle.startswith('Episode '):`
cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00			`stitle = ''`
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`if searchTitle:`
			`title = '"%s" %s' % (title, stitle)`
			`else:`
			`title = '%s (%s) %s' % (title, se, stitle)`
			`title = title.strip()`
			`title = title.replace('_dot_dot_dot_', '... ')`
			`title = title.replace('_dot__space_', '. ')`
			`title = title.replace('_space__dot_', ' .')`
update files archive api 2010-08-07 14:31:20 +00:00			`year = ox.findRe(title, '(\(\d{4}\))')`
adjust oxdb_directors/oxdb_title to new folder layout 2010-12-01 12:21:23 +00:00			`if year and title.endswith(year):`
update files archive api 2010-08-07 14:31:20 +00:00			`title = title[:-len(year)].strip()`
			`title = normalizeTitle(title)`
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`return title`

cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`def oxdb_year(data):`
oxlib->ox, oxweb->ox.web 2010-07-07 22:46:41 +00:00			`return ox.findRe(data, '\.(\d{4})\.')`
testing interface, more work on backend 2009-10-04 22:00:08 +00:00
cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`def oxdb_series_title(path):`
			`seriesTitle = u''`
			`if path.startswith('Series'):`
unbreak parse_path 2010-12-08 00:30:45 +00:00			`seriesTitle = os.path.basename(path)`
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`else:`
			`t = oxdb_title(path)`
			`if " (S" in t:`
			`seriesTitle = t.split(" (S")[0]`
			`return seriesTitle`

cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`def oxdb_episode_title(path):`
			`episodeTitle = u''`
			`ep = re.compile('.Episode \d+?\.(.*?)\.[a-zA-Z]').findall(path)`
			`if ep:`
file import cleanup 2010-01-22 23:57:06 +00:00			`episodeTitle = ep[0]`
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`return episodeTitle`

cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`def oxdb_season_episode(path):`
			`season = 0`
			`episode = 0`
			`path = os.path.basename(path)`
			`se = re.compile('Season (\d+).Episode (\d+)').findall(path)`
			`if se:`
			`season = int(se[0][0])`
			`episode = int(se[0][1])`
			`else:`
			`ep = re.compile('.Episode (\d+?)').findall(path)`
			`if ep:`
			`episode = int(ep[0][0])`
			`if season == 0 and episode == 0:`
			`se = re.compile('S(\d\d)E(\d\d)').findall(path)`
			`if se:`
			`season = int(se[0][0])`
			`episode = int(se[0][1])`
			`return (season, episode)`

cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`def oxdb_part(path):`
			`part = 1`
			`path = path.lower()`
			`p = re.compile('part\s*?(\d+)\.').findall(path)`
			`if p:`
			`part = p[0]`
			`else:`
			`p = re.compile('cd\s*?(\d+)\.').findall(path)`
			`if p:`
			`part = p[0]`
			`return part`

cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
update files archive api 2010-08-07 14:31:20 +00:00			`def parse_path(path):`
unbreak parse_path 2010-12-08 00:30:45 +00:00			`'''`
			`expects path in the form`
			`L/Last, First/Title (YYYY)`
			`M/McCarthy, Thomas/The Visitor (2007)`
			`G/Godard, Jean-Luc/Histoire(s) du cinema_ Toutes les histoires (1988)`
			`'''`
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`r = {}`
			`r['title'] = oxdb_title(path)`
make systems without imdb work 2011-01-28 08:48:38 +00:00			`if not settings.USE_IMDB:`
			`return r`
			`import ox.web.imdb`
			`search_title = oxdb_title(path, True)`
			`r['directors'] = oxdb_directors(path)`
update files archive api 2010-08-07 14:31:20 +00:00			`year = ox.findRe(path, '\((\d{4})\)')`
			`if year:`
			`r['year'] = year`

			`#FIXME: only include it its actually a series`
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`r['episode_title'] = oxdb_episode_title(path)`
			`r['season'], r['episode'] = oxdb_season_episode(path)`
- simplify metadata db structure - better sort values if field is empty - switch to python-ox, faster imdb import - move files backend into own app 2010-07-12 14:56:14 +00:00			`r['series_title'] = oxdb_series_title(path)`
update files archive api 2010-08-07 14:31:20 +00:00
- simplify metadata db structure - better sort values if field is empty - switch to python-ox, faster imdb import - move files backend into own app 2010-07-12 14:56:14 +00:00			`r['imdbId'] = ox.web.imdb.guess(search_title, ', '.join(r['directors']), timeout=-1)`
use oxdb_id for parse_path too. pytohn-ox needs /home/j 2010-11-30 23:33:42 +00:00			`r['oxdbId'] = oxdb_id(r['title'], r['directors'], r.get('year', ''),`
			`r.get('season', ''), r.get('episode', ''),`
			`episode_title=r['episode_title'],`
use imdb/icon/expose timeline 2011-01-16 13:28:57 +00:00			`episode_director=[],`
use oxdb_id for parse_path too. pytohn-ox needs /home/j 2010-11-30 23:33:42 +00:00			`episode_year='')`
testing interface, more work on backend 2009-10-04 22:00:08 +00:00			`return r`

cleanup imports and syntax warnings 2011-01-01 11:44:42 +00:00
sort cleanup 2011-01-03 19:45:56 +00:00			`def sort_string(string):`
			`string = string.replace(u'Þ', 'Th')`
			`return unicodedata.normalize('NFKD', string)`


stream, videosupport 2010-09-03 13:28:44 +00:00			`def sort_title(title):`
			`#title`
			`title = re.sub(u'[\'!¿¡,\.;\-"\:\*\[\]]', '', title)`
use unicodedata.normalize for sort_title 2010-11-14 18:58:33 +00:00
			`#title = title.replace(u'Æ', 'Ae')`
unicode 2010-11-26 17:16:57 +00:00			`if isinstance(title, str):`
			`title = unicode(title)`
sort cleanup 2011-01-03 19:45:56 +00:00			`title = sort_string(title)`
use unicodedata.normalize for sort_title 2010-11-14 18:58:33 +00:00
stream, videosupport 2010-09-03 13:28:44 +00:00			`#pad numbered titles`
			`title = re.sub('(\d+)', lambda x: '%010d' % int(x.group(0)), title)`
			`return title.strip()`
even more improvements to lists 2011-01-13 19:40:50 +00:00
			`def get_positions(ids, pos):`
			`'''`
			`>>> get_positions([1,2,3,4], [2,4])`
			`{2: 1, 4: 3}`
			`'''`
			`positions = {}`
			`for i in pos:`
			`try:`
			`positions[i] = ids.index(i)`
			`except:`
			`pass`
			`return positions`