diff --git a/ox/__init__.py b/ox/__init__.py
index 7c1f959..06225a9 100644
--- a/ox/__init__.py
+++ b/ox/__init__.py
@@ -14,5 +14,6 @@ from image import *
 from js import *
 from location import *
 from normalize import *
+from sort import *
 from text import *
 from torrent import *
\ No newline at end of file
diff --git a/ox/file.py b/ox/file.py
index 1d3e2ef..3636a0b 100644
--- a/ox/file.py
+++ b/ox/file.py
@@ -88,3 +88,53 @@ def makedirs(path):
         except OSError, e:
             if e.errno != 17:
                 raise
+
+def copy_file(source, target, verbose=False):
+    """Copy the contents of source to target using read_file/write_file."""
+    if verbose:
+        print 'copying', source, 'to', target
+    write_file(target, read_file(source))
+
+def read_file(file, verbose=False):
+    """Return the entire contents of file."""
+    if verbose:
+        print 'reading', file
+    # the with block closes the handle even if read() raises
+    with open(file) as f:
+        return f.read()
+
+def read_json(file, verbose=False):
+    """Return the contents of file parsed as JSON."""
+    return json.loads(read_file(file, verbose=verbose))
+
+def write_file(file, data, verbose=False):
+    """Write data to file, creating its directory first; return len(data)."""
+    if verbose:
+        print 'writing', file
+    write_path(file)
+    # the with block closes the handle even if write() raises
+    with open(file, 'w') as f:
+        f.write(data)
+    return len(data)
+
+def write_json(file, data, indent=0, sort_keys=False, verbose=False):
+    """Write data to file as JSON; return the length of the JSON text."""
+    data = json.dumps(data, indent=indent, sort_keys=sort_keys)
+    return write_file(file, data, verbose=verbose)
+
+def write_link(source, target, verbose=False):
+    """Symlink target to source, replacing an existing file or link."""
+    if verbose:
+        print 'linking', source, 'to', target
+    write_path(target)
+    # lexists, unlike exists, is also true for a dangling symlink, which
+    # would otherwise make os.symlink fail because target already exists
+    if os.path.lexists(target):
+        os.unlink(target)
+    os.symlink(source, target)
+
+def write_path(file):
+    """Create the directory that contains file if it does not exist."""
+    path = os.path.split(file)[0]
+    if path and not os.path.exists(path):
+        os.makedirs(path)
diff --git a/ox/normalize.py b/ox/normalize.py
index f2a4016..61cdc28 100644
--- a/ox/normalize.py
+++ b/ox/normalize.py
@@ -37,6 +37,40 @@ _noarticles = (
     'i was',
 )
 
+ARTICLES = list(set([
+    # def sg, def pl, indef sg, indef pl. all m/f/n
+    'der', 'die', 'das', 'ein', 'eine', # de
+    'the', 'a', 'an', # en
+    'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas', # es
+    'le', "l'", 'la', 'les', 'un', 'une', 'des', # fr
+    'il', 'lo', "l'", 'la', 'i', 'gli', 'le', # it
+    'de', 'het', 'een', # nl
+    'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas' # pt
+]))
+NAME_PREFIXES = [
+    'al', 'da', 'de', 'del', 'dem', 'den', 'der', 'di', 'du',
+    'e', 'el', 'la', 'the', 'van', 'vom', 'von', 'y', 'zu'
+]
+NAME_MIDFIXES = ['und']
+NAME_SUFFIXES = ['jr', 'jr.', 'sr', 'sr.']
+
+def get_sort_title(title):
+    """Return title with a leading article moved to the end.
+
+    >>> get_sort_title('Die Hard')
+    'Hard, Die'
+
+    """
+    lowered = title.lower()
+    for article in ARTICLES:
+        # elided articles such as "l'" attach to the next word directly
+        gap = 0 if article.endswith("'") else 1
+        if lowered.startswith(article + ' ' * gap):
+            length = len(article)
+            return title[length + gap:] + ', ' + title[:length]
+    return title
+
+
 def canonicalTitle(title):
     """Return the title in the canonic format 'Movie Title, The'.
 
diff --git a/ox/text.py b/ox/text.py
index 3a7d565..3f333b0 100644
--- a/ox/text.py
+++ b/ox/text.py
@@ -4,6 +4,97 @@ import math
 import re
 
 
+ARTICLES = list(set([
+    # def sg, def pl, indef sg, indef pl (each m/f/n)
+    'der', 'die', 'das', 'ein', 'eine', # de
+    'the', 'a', 'an', # en
+    'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas', # es
+    'le', "l'", 'la', 'les', 'un', 'une', 'des', # fr
+    'il', 'lo', "l'", 'la', 'i', 'gli', 'le', # it
+    'de', 'het', 'een', # nl
+    'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas' # pt
+    # some _disabled because of collisions
+]))
+PREFIXES = [
+    'al', 'da', 'de', 'del', 'dem', 'den', 'der', 'di', 'du',
+    'e', 'el', 'la', 'the', 'van', 'vom', 'von', 'y', 'zu'
+]
+MIDFIXES = ['und']
+SUFFIXES = ['jr', 'jr.', 'sr', 'sr.']
+
+def get_sort_name(name):
+    """Return name in 'Last, First' order for sorting.
+
+    A trailing suffix ('Jr.') and any prefixes ('van', 'de', ...) stay
+    attached to the last name; a midfix ('und') pulls itself and the word
+    before it into the last name as well. If no first names remain, the
+    last name is returned without a trailing separator.
+
+    >>> get_sort_name('Alfred Hitchcock')
+    'Hitchcock, Alfred'
+
+    >>> get_sort_name('Jean-Luc Godard')
+    'Godard, Jean-Luc'
+
+    >>> get_sort_name('Rainer Werner Fassbinder')
+    'Fassbinder, Rainer Werner'
+
+    >>> get_sort_name('Brian De Palma')
+    'De Palma, Brian'
+
+    >>> get_sort_name('Johan van der Keuken')
+    'van der Keuken, Johan'
+
+    >>> get_sort_name('Edward D. Wood Jr.')
+    'Wood Jr., Edward D.'
+
+    >>> get_sort_name('Madonna')
+    'Madonna'
+
+    """
+    def add_name():
+        if len(first_names):
+            last_names.insert(0, first_names.pop())
+    def find_name(names):
+        return len(first_names) and first_names[-1].lower() in names
+    first_names = name.split(' ')
+    last_names = []
+    if find_name(SUFFIXES):
+        add_name()
+    add_name()
+    if find_name(MIDFIXES):
+        add_name()
+        add_name()
+    while find_name(PREFIXES):
+        add_name()
+    sort_name = ' '.join(last_names)
+    # only append the separator when first names are left over, so that
+    # 'Madonna' does not become 'Madonna, '
+    if first_names:
+        sort_name += ', ' + ' '.join(first_names)
+    return sort_name
+
+def get_sort_title(title):
+    """Return title with a leading article moved to the end.
+
+    >>> get_sort_title('Themroc')
+    'Themroc'
+
+    >>> get_sort_title('Die Hard')
+    'Hard, Die'
+
+    >>> get_sort_title("L'Avventura")
+    "Avventura, L'"
+
+    """
+    lowered = title.lower()
+    for article in ARTICLES:
+        # elided articles such as "l'" attach to the next word directly
+        gap = 0 if article.endswith("'") else 1
+        if lowered.startswith(article + ' ' * gap):
+            length = len(article)
+            return title[length + gap:] + ', ' + title[:length]
+    return title
 def findRe(string, regexp):
     result = re.compile(regexp, re.DOTALL).findall(string)