From bb35daa95c5d56174d486ba476c24d09440f750a Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Tue, 14 Aug 2012 16:12:43 +0200 Subject: [PATCH] replace all CammelCase with under_score in ox --- README | 10 ++--- ox/html.py | 14 +++---- ox/location.py | 2 +- ox/movie.py | 18 ++++----- ox/normalize.py | 42 ++++++++++----------- ox/oembed.py | 6 +-- ox/srt.py | 4 +- ox/text.py | 80 ++++++++++++++++++++-------------------- ox/web/allmovie.py | 22 +++++------ ox/web/amazon.py | 16 ++++---- ox/web/criterion.py | 26 ++++++------- ox/web/duckduckgo.py | 4 +- ox/web/epguides.py | 6 +-- ox/web/flixter.py | 2 +- ox/web/freebase.py | 4 +- ox/web/google.py | 4 +- ox/web/imdb.py | 14 +++---- ox/web/impawards.py | 22 +++++------ ox/web/itunes.py | 64 ++++++++++++++++---------------- ox/web/lyricsfly.py | 10 ++--- ox/web/metacritic.py | 6 +-- ox/web/mininova.py | 20 +++++----- ox/web/movieposterdb.py | 4 +- ox/web/opensubtitles.py | 4 +- ox/web/rottentomatoes.py | 16 ++++---- ox/web/siteparser.py | 6 +-- ox/web/spiegel.py | 6 +-- ox/web/thepiratebay.py | 24 ++++++------ ox/web/tv.py | 10 ++--- ox/web/vimeo.py | 2 +- ox/web/wikipedia.py | 18 ++++----- 31 files changed, 242 insertions(+), 244 deletions(-) diff --git a/README b/README index 3749823..c45ba3c 100644 --- a/README +++ b/README @@ -10,12 +10,12 @@ Depends: Usage: import ox - data = ox.cache.readUrl('http:/...') - text = ox.stripTags(data) - ox.normalizeNewlines(text) - ox.formatBytes(len(data)) + data = ox.cache.read_url('http:/...') + text = ox.strip_tags(data) + ox.normalize_newlines(text) + ox.format_bytes(len(data)) - ox.formatBytes(1234567890) + ox.format_bytes(1234567890) '1.15 GB' import ox.web.imdb diff --git a/ox/html.py b/ox/html.py index 6950e01..20aeb45 100644 --- a/ox/html.py +++ b/ox/html.py @@ -56,15 +56,15 @@ def strip_tags(value): stripTags = strip_tags -def stripSpacesBetweenTags(value): +def strip_spaces_between_tags(value): "Returns the given HTML with spaces between tags normalized to a single space" return re.sub(r'>\s+<', '> <', value) -def stripEntities(value): +def strip_entities(value): "Returns the given HTML with all entities (&something;) stripped" return re.sub(r'&(?:\w+|#\d);', '', value) -def fixAmpersands(value): +def fix_ampersands(value): "Returns the given HTML with all unencoded ampersands encoded correctly" return unencoded_ampersands_re.sub('&', value) @@ -113,11 +113,11 @@ def clean_html(text): * Removes stuff like "
<p>&nbsp;&nbsp;</p>
", but only if it's at the bottom of the text. """ - from text import normalizeNewlines - text = normalizeNewlines(text) + from text import normalize_newlines + text = normalize_newlines(text) text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text) text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text) - text = fixAmpersands(text) + text = fix_ampersands(text) # Remove all target="" attributes from tags. text = link_target_attribute_re.sub('\\1', text) # Trim stupid HTML such as
. @@ -168,8 +168,6 @@ def decode_html(html): return match.group(0) return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ') -decodeHtml = decode_html - def highlight(text, query, hlClass="hl"): """ >>> highlight('me & you and $&%', 'and') diff --git a/ox/location.py b/ox/location.py index 3efc479..101ce7c 100644 --- a/ox/location.py +++ b/ox/location.py @@ -18,7 +18,7 @@ def latlngspan2latlng(lat, lng, latSpan, lngSpan): lat_ne = lat + latSpan, lng_ne = lng + latSpan ) -def parseLocationString(location_string): +def parse_location_string(location_string): l = location_string.split('+') if len(l) == 1: l = location_string.split(';') diff --git a/ox/movie.py b/ox/movie.py index f9649b2..a0966e6 100644 --- a/ox/movie.py +++ b/ox/movie.py @@ -8,8 +8,8 @@ import hashlib import os import re -from normalize import normalizeName -from text import get_sort_name, findRe +from normalize import normalize_name +from text import get_sort_name, find_re __all__ = ['parse_movie_path', 'create_movie_path', 'get_oxid'] @@ -308,14 +308,14 @@ def parse_movie_path(path): if title.endswith('_'): title = title[:-1] + '.' - year = findRe(title, '(\(\d{4}\))') + year = find_re(title, '(\(\d{4}\))') if not year: - year = findRe(title, '(\(\d{4}-\d*\))') + year = find_re(title, '(\(\d{4}-\d*\))') if year and title.endswith(year): title = title[:-len(year)].strip() year = year[1:-1] if '-' in year: - year = findRe(year, '\d{4}') + year = find_re(year, '\d{4}') #director if len(parts) == 4: @@ -323,7 +323,7 @@ def parse_movie_path(path): if director.endswith('_'): director = "%s." % director[:-1] director = director.split('; ') - director = [normalizeName(d).strip() for d in director] + director = [normalize_name(d).strip() for d in director] director = filter(lambda d: d not in ('Unknown Director', 'Various Directors'), director) else: director = [] @@ -338,13 +338,13 @@ def parse_movie_path(path): language = '' #season/episode/episodeTitle - season = findRe(parts[-1], '\.Season (\d+)\.') + season = find_re(parts[-1], '\.Season (\d+)\.') if season: season = int(season) else: season = None - episode = findRe(parts[-1], '\.Episode (\d+)\.') + episode = find_re(parts[-1], '\.Episode (\d+)\.') if episode: episode = int(episode) else: @@ -373,7 +373,7 @@ def parse_movie_path(path): title = u'%s %s' % (title, episodeTitle) #part - part = findRe(parts[-1], '\.Part (\d+)\.') + part = find_re(parts[-1], '\.Part (\d+)\.') if part: part = int(part) else: diff --git a/ox/normalize.py b/ox/normalize.py index f2a4016..5359d95 100644 --- a/ox/normalize.py +++ b/ox/normalize.py @@ -37,13 +37,13 @@ _noarticles = ( 'i was', ) -def canonicalTitle(title): +def canonical_title(title): """Return the title in the canonic format 'Movie Title, The'. - >>> canonicalTitle('The Movie Title') + >>> canonical_title('The Movie Title') 'Movie Title, The' - >>> canonicalTitle('Los Angeles Plays Itself') + >>> canonical_title('Los Angeles Plays Itself') 'Los Angeles Plays Itself' """ try: @@ -72,10 +72,10 @@ def canonicalTitle(title): ## break return title -def normalizeTitle(title): +def normalize_title(title): """Return the title in the normal "The Title" format. - >>> normalizeTitle('Movie Title, The') + >>> normalize_title('Movie Title, The') 'The Movie Title' """ stitle = title.split(', ') @@ -85,14 +85,14 @@ def normalizeTitle(title): title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1])) return title -def normalizeImdbId(imdbId): +def normalize_imdbid(imdbId): """Return 7 digit imdbId. 
- >>> normalizeImdbId('http://www.imdb.com/title/tt0159206/') + >>> normalize_imdbid('http://www.imdb.com/title/tt0159206/') '0159206' - >>> normalizeImdbId(159206) + >>> normalize_imdbid(159206) '0159206' - >>> normalizeImdbId('tt0159206') + >>> normalize_imdbid('tt0159206') '0159206' """ if isinstance(imdbId, basestring): @@ -106,20 +106,20 @@ def normalizeImdbId(imdbId): _sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van', 'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al') -def canonicalName(name): +def canonical_name(name): """Return the given name in canonical "Surname, Name" format. It assumes that name is in the 'Name Surname' format. - >>> canonicalName('Jean Luc Godard') + >>> canonical_name('Jean Luc Godard') 'Godard, Jean Luc' - >>> canonicalName('Ivan Ivanov-Vano') + >>> canonical_name('Ivan Ivanov-Vano') 'Ivanov-Vano, Ivan' - >>> canonicalName('Gus Van Sant') + >>> canonical_name('Gus Van Sant') 'Van Sant, Gus' - >>> canonicalName('Brian De Palma') + >>> canonical_name('Brian De Palma') 'De Palma, Brian' """ @@ -167,19 +167,19 @@ def canonicalName(name): name = '%s, %s' % (sname[-1], ' '.join(sname[:-1])) return name -def normalizeName(name): +def normalize_name(name): """Return a name in the normal "Name Surname" format. - >>> normalizeName('Godard, Jean Luc') + >>> normalize_name('Godard, Jean Luc') 'Jean Luc Godard' - >>> normalizeName('Ivanov-Vano, Ivan') + >>> normalize_name('Ivanov-Vano, Ivan') 'Ivan Ivanov-Vano' - >>> normalizeName('Van Sant, Gus') + >>> normalize_name('Van Sant, Gus') 'Gus Van Sant' - >>> normalizeName('De Palma, Brian') + >>> normalize_name('De Palma, Brian') 'Brian De Palma' """ sname = name.split(', ') @@ -187,12 +187,12 @@ def normalizeName(name): name = '%s %s' % (sname[1], sname[0]) return name -def normalizePath(path): +def normalize_path(path): path = path.replace(':', '_').replace('/', '_') if path.endswith('.'): path = path[:-1] + '_' return path -def stripAccents(s): +def strip_accents(s): if isinstance(s, str): s = unicode(s) return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')) diff --git a/ox/oembed.py b/ox/oembed.py index 822ff14..26e100d 100644 --- a/ox/oembed.py +++ b/ox/oembed.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # ci:si:et:sw=4:sts=4:ts=4 import re -from text import findRe +from text import find_re import cache from utils import json, ET @@ -13,14 +13,14 @@ def get_embed_code(url, maxwidth=None, maxheight=None): json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('').findall(html)) xml_oembed = filter(lambda l: 'xml+oembed' in l, re.compile('').findall(html)) if json_oembed: - oembed_url = findRe(json_oembed[0], 'href="(.*?)"') + oembed_url = find_re(json_oembed[0], 'href="(.*?)"') if maxwidth: oembed_url += '&maxwidth=%d' % maxwidth if maxheight: oembed_url += '&maxheight=%d' % maxheight embed = json.loads(cache.readUrl(oembed_url)) elif xml_oembed: - oembed_url = findRe(json_oembed[0], 'href="(.*?)"') + oembed_url = find_re(json_oembed[0], 'href="(.*?)"') if maxwidth: oembed_url += '&maxwidth=%d' % maxwidth if maxheight: diff --git a/ox/srt.py b/ox/srt.py index 83459e1..5b87a2f 100644 --- a/ox/srt.py +++ b/ox/srt.py @@ -11,7 +11,7 @@ import ox __all__ = [] -def _detectEncoding(fp): +def _detect_encoding(fp): bomDict={ # bytepattern : name (0x00, 0x00, 0xFE, 0xFF): "utf_32_be", (0xFF, 0xFE, 0x00, 0x00): "utf_32_le", @@ -63,7 +63,7 @@ def load(filename, offset=0): return offset + ox.time2ms(t.replace(',', '.')) / 1000 with open(filename) as f: - encoding = 
_detectEncoding(f) + encoding = _detect_encoding(f) data = f.read() try: data = unicode(data, encoding) diff --git a/ox/text.py b/ox/text.py index d6625dc..860744c 100644 --- a/ox/text.py +++ b/ox/text.py @@ -257,24 +257,24 @@ def get_sort_title(title): return title[length + spaces:] + ', ' + title[:length] return title -def findRe(string, regexp): +def find_re(string, regexp): result = re.compile(regexp, re.DOTALL).findall(string) if result: return result[0].strip() return '' -def findString(string, string0='', string1 = ''): +def find_string(string, string0='', string1 = ''): """Return the string between string0 and string1. If string0 or string1 is left out, begining or end of string is used. - >>> findString('i am not there', string1=' not there') + >>> find_string('i am not there', string1=' not there') 'i am' - >>> findString('i am not there', 'i am ', ' there') + >>> find_string('i am not there', 'i am ', ' there') 'not' - >>> findString('i am not there', 'i am not t') + >>> find_string('i am not there', 'i am not t') 'here' """ @@ -286,7 +286,7 @@ def findString(string, string0='', string1 = ''): string1 = re.escape(string1) else: string1 = '$' - return findRe(string, string0 + '(.*?)' + string1) + return find_re(string, string0 + '(.*?)' + string1) def parse_useragent(useragent): data = {} @@ -319,7 +319,7 @@ def parse_useragent(useragent): break; return data -def removeSpecialCharacters(text): +def remove_special_characters(text): """ Removes special characters inserted by Word. """ @@ -346,22 +346,22 @@ def wrap(text, width): text.split(' ') ) -def wrapString(string, length=80, separator='\n', balance=False): +def wrap_string(string, length=80, separator='\n', balance=False): ''' - >>> wrapString(u"Anticonstitutionellement, Paris s'eveille", 16) + >>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16) u"Anticonstitution\\nellement, Paris \\ns'eveille" - >>> wrapString(u'All you can eat', 12, '\\n', True) + >>> wrap_string(u'All you can eat', 12, '\\n', True) u'All you \\ncan eat' ''' words = string.split(' ') if balance: # balance lines: test if same number of lines # can be achieved with a shorter line length - lines = wrapString(string, length, separator, False).split(separator) + lines = wrap_string(string, length, separator, False).split(separator) if len(lines) > 1: while length > max(map(lambda x : len(x), words)): length -= 1 - if len(wrapString(string, length, separator, False).split(separator)) > len(lines): + if len(wrap_string(string, length, separator, False).split(separator)) > len(lines): length += 1 break lines = [''] @@ -382,12 +382,12 @@ def wrapString(string, length=80, separator='\n', balance=False): lines[len(lines) - 1] += u' ' return separator.join(lines).strip() -def truncateString(string, length, padding='...', position='right'): - # >>> truncateString('anticonstitutionellement', 16, '...', 'left') +def truncate_string(string, length, padding='...', position='right'): + # >>> truncate_string('anticonstitutionellement', 16, '...', 'left') # '...utionellement' - # >>> truncateString('anticonstitutionellement', 16, '...', 'center') + # >>> truncate_string('anticonstitutionellement', 16, '...', 'center') # 'anticon...lement' - # >>> truncateString('anticonstitutionellement', 16, '...', 'right') + # >>> truncate_string('anticonstitutionellement', 16, '...', 'right') # 'anticonstitut...' 
stringLength = len(string); paddingLength = len(padding) @@ -402,12 +402,12 @@ def truncateString(string, length, padding='...', position='right'): string = '%s%s' % (string[:length - paddingLength], padding) return string; -def truncateWords(s, num): +def truncate_words(s, num): """Truncates a string after a certain number of chacters, but ends with a word - >>> truncateString('Truncates a string after a certain number of chacters, but ends with a word', 23) + >>> truncate_string('Truncates a string after a certain number of chacters, but ends with a word', 23) 'Truncates a string...' - >>> truncateString('Truncates a string', 23) + >>> truncate_string('Truncates a string', 23) 'Truncates a string' """ @@ -422,25 +422,25 @@ def truncateWords(s, num): ts += "..." return ts.strip() -def trimString(string, num): +def trim_string(string, num): """Truncates a string after a certain number of chacters, adding ... at -10 characters - >>> trimString('Truncates a string after a certain number of chacters', 23) + >>> trim_string('Truncates a string after a certain number of chacters', 23) 'Truncates ...f chacters' - >>> trimString('Truncates a string', 23) + >>> trim_string('Truncates a string', 23) 'Truncates a string' """ if len(string) > num: string = string[:num - 13] + '...' + string[-10:] return string -def getValidFilename(s): +def get_valid_filename(s): """ Returns the given string converted to a string that can be used for a clean filename. Specifically, leading and trailing spaces are removed; all non-filename-safe characters are removed. - >>> getValidFilename("john's portrait in 2004.jpg") + >>> get_valid_filename("john's portrait in 2004.jpg") 'john_s_portrait_in_2004.jpg' """ s = s.strip() @@ -449,34 +449,34 @@ def getValidFilename(s): s = s.replace('__', '_').replace('__', '_') return s -def getTextList(list_, last_word='or'): +def get_text_list(list_, last_word='or'): """ - >>> getTextList([u'a', u'b', u'c', u'd']) + >>> get_text_list([u'a', u'b', u'c', u'd']) u'a, b, c or d' - >>> getTextList([u'a', u'b', u'c'], 'and') + >>> get_text_list([u'a', u'b', u'c'], 'and') u'a, b and c' - >>> getTextList([u'a', u'b'], 'and') + >>> get_text_list([u'a', u'b'], 'and') u'a and b' - >>> getTextList([u'a']) + >>> get_text_list([u'a']) u'a' - >>> getTextList([]) + >>> get_text_list([]) '' """ if len(list_) == 0: return '' if len(list_) == 1: return list_[0] return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1]) -def getListText(text, last_word='or'): +def get_list_text(text, last_word='or'): """ - >>> getListText(u'a, b, c or d') + >>> get_list_text(u'a, b, c or d') [u'a', u'b', u'c', u'd'] - >>> getListText(u'a, b and c', u'and') + >>> get_list_text(u'a, b and c', u'and') [u'a', u'b', u'c'] - >>> getListText(u'a and b', u'and') + >>> get_list_text(u'a and b', u'and') [u'a', u'b'] - >>> getListText(u'a') + >>> get_list_text(u'a') [u'a'] - >>> getListText(u'') + >>> get_list_text(u'') [] """ list_ = [] @@ -490,7 +490,7 @@ def getListText(text, last_word='or'): list_.append(last[1].strip()) return list_ -def normalizeNewlines(text): +def normalize_newlines(text): return re.sub(r'\r\n|\r|\n', '\n', text) def recapitalize(text): @@ -514,7 +514,7 @@ def phone2numeric(phone): 'y': '9', 'x': '9'}.get(m.group(0).lower()) return letters.sub(char2number, phone) -def compressString(s): +def compress_string(s): import cStringIO, gzip zbuf = cStringIO.StringIO() zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) @@ -523,13 +523,13 @@ def compressString(s): 
return zbuf.getvalue() smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)') -def smartSplit(text): +def smart_split(text): """ Generator that splits a string by spaces, leaving quoted phrases together. Supports both single and double quotes, and supports escaping quotes with backslashes. In the output, strings will keep their initial and trailing quote marks. - >>> list(smartSplit('This is "a person\\'s" test.')) + >>> list(smart_split('This is "a person\\'s" test.')) ['This', 'is', '"a person\\'s"', 'test.'] """ for bit in smart_split_re.finditer(text): diff --git a/ox/web/allmovie.py b/ox/web/allmovie.py index aca56b1..76e961b 100644 --- a/ox/web/allmovie.py +++ b/ox/web/allmovie.py @@ -3,7 +3,7 @@ import re import time -from ox import strip_tags, findRe +from ox import strip_tags, find_re from ox.cache import read_url @@ -28,22 +28,22 @@ def getData(id): } html = read_url(data["url"], unicode=True) data['aka'] = parseList(html, 'AKA') - data['category'] = findRe(html, '
category
.*?
(.*?)
') + data['category'] = find_re(html, '
category
.*?
(.*?)
') data['countries'] = parseList(html, 'countries') data['director'] = parseEntry(html, 'directed by') data['genres'] = parseList(html, 'genres') data['keywords'] = parseList(html, 'keywords') - data['posters'] = [findRe(html, '(.*?)')).strip() + data['synopsis'] = strip_tags(find_re(html, '
(.*?)
')).strip() data['themes'] = parseList(html, 'themes') data['types'] = parseList(html, 'types') - data['year'] = findRe(html, '.*?(\d+)') + data['year'] = find_re(html, '.*?(\d+)') #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('
(.*?)')).strip() + data['review'] = strip_tags(find_re(html, '
(.*?)
')).strip() return data def getUrl(id): return "http://allmovie.com/work/%s" % id def parseEntry(html, title): - html = findRe(html, '
%s
.*?
(.*?)
' % title) + html = find_re(html, '
%s
.*?
(.*?)
' % title) return strip_tags(html).strip() def parseList(html, title): - html = findRe(html, '
%s
.*?
(.*?)
' % title.lower()) + html = find_re(html, '
%s
.*?
(.*?)
' % title.lower()) r = map(lambda x: strip_tags(x), re.compile('
  • (.*?)
  • ', re.DOTALL).findall(html)) if not r and html: r = [strip_tags(html)] @@ -74,11 +74,11 @@ def parseTable(html): lambda x: strip_tags(x).strip().replace(' ', ''), x.split('-') ), - findRe(html, '
    (.*?)').split('')[:-1] + find_re(html, '
    (.*?)').split('')[:-1] ) def parseText(html, title): - return strip_tags(findRe(html, '%s.*?

    (.*?)' % title)).strip() + return strip_tags(find_re(html, '%s.*?

    (.*?)' % title)).strip() if __name__ == '__main__': print getData('129689') diff --git a/ox/web/amazon.py b/ox/web/amazon.py index f1a9595..75351e7 100644 --- a/ox/web/amazon.py +++ b/ox/web/amazon.py @@ -3,7 +3,7 @@ import re from urllib import quote -from ox import findRe, strip_tags, decodeHtml +from ox import find_re, strip_tags, decode_html from ox.cache import read_url @@ -12,7 +12,7 @@ def findISBN(title, author): url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q) data = read_url(url, unicode=True) links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data) - id = findRe(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/') + id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/') data = getData(id) if author in data['authors']: return data @@ -24,13 +24,13 @@ def getData(id): def findData(key): - return findRe(data, '

  • %s:(.*?)
  • '% key).strip() + return find_re(data, '
  • %s:(.*?)
  • '% key).strip() r = {} r['amazon'] = url - r['title'] = findRe(data, '(.*?)(.*?)(.*?).*?\(Author\)', re.DOTALL).findall(data) - r['authors'] = filter(lambda x: len(x)>1, [decodeHtml(a) for a in r['authors']]) + r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']]) t = re.compile('>(.*?)
    \(Translator\)').findall(data) if t: r['translator'] = t @@ -38,15 +38,15 @@ def getData(id): r['language'] = findData('Language') r['isbn-10'] = findData('ISBN-10') r['isbn-13'] = findData('ISBN-13').replace('-', '') - r['dimensions'] = findRe(data, '
  • .*?Product Dimensions:.*?(.*?)
  • ') + r['dimensions'] = find_re(data, '
  • .*?Product Dimensions:.*?(.*?)
  • ') r['pages'] = findData('Paperback') if not r['pages']: r['pages'] = findData('Hardcover') - r['review'] = strip_tags(findRe(data, '

    Review

    .*?
    (.*?)
    ').replace('
    ', '\n')).strip() + r['review'] = strip_tags(find_re(data, '

    Review

    .*?
    (.*?)
    ').replace('
    ', '\n')).strip() - r['description'] = strip_tags(findRe(data, '

    Product Description

    .*?
    (.*?)
    ').replace('
    ', '\n')).strip() + r['description'] = strip_tags(find_re(data, '

    Product Description

    .*?
    (.*?)
    ').replace('
    ', '\n')).strip() r['cover'] = re.findall('src="(.*?)" id="prodImage"', data) if r['cover']: diff --git a/ox/web/criterion.py b/ox/web/criterion.py index b99b0b4..850b8b5 100644 --- a/ox/web/criterion.py +++ b/ox/web/criterion.py @@ -5,7 +5,7 @@ import re import ox.cache from ox.cache import read_url from ox.html import strip_tags -from ox.text import findRe, removeSpecialCharacters +from ox.text import find_re, remove_special_characters import imdb @@ -33,40 +33,40 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False): html = read_url(data["url"], timeout=timeout, unicode=True) except: html = ox.cache.read_url(data["url"], timeout=timeout) - data["number"] = findRe(html, "
  • Spine #(\d+)") + data["number"] = find_re(html, "
  • Spine #(\d+)") - data["title"] = findRe(html, "(.*?)")) - results = findRe(html, '
    (.*?)
    ') + data["director"] = strip_tags(find_re(html, "

    (.*?)

    ")) + results = find_re(html, '
    (.*?)
    ') results = re.compile("
  • (.*?)
  • ").findall(results) data["country"] = results[0] data["year"] = results[1] - data["synopsis"] = strip_tags(findRe(html, "

    SYNOPSIS: (.*?)

    ")) + data["synopsis"] = strip_tags(find_re(html, "

    SYNOPSIS: (.*?)

    ")) - result = findRe(html, "
    (.*?)
    ") + result = find_re(html, "
    (.*?)
    ") if 'Blu-Ray' in result or 'Essential Art House DVD' in result: r = re.compile('

    Other Editions

    (.*?)
    ', re.DOTALL).findall(html) if r: result = r[0] - result = findRe(result, "(.*?)' % id) - result = findRe(result, "src=\"(.*?)\"") + result = find_re(html_, '(.*?)' % id) + result = find_re(result, "src=\"(.*?)\"") if result: data["posters"] = [result.replace("_w100", "")] else: data["posters"] = [] - result = findRe(html, "\"Film(.*?)')) - r['imdb'] = findRe(data, '

    .*?

    ') + r['title'] = strip_tags(find_re(data, '

    (.*?)

    ')) + r['imdb'] = find_re(data, '

    .*?

    ') r['episodes'] = {} #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) (.*?)').findall(data): diff --git a/ox/web/flixter.py b/ox/web/flixter.py index 2feffa4..5cf3a8e 100644 --- a/ox/web/flixter.py +++ b/ox/web/flixter.py @@ -5,7 +5,7 @@ import re from lxml.html import document_fromstring from ox.cache import read_url -from ox import findRe, strip_tags +from ox import find_re, strip_tags from ox.web.imdb import ImdbCombined diff --git a/ox/web/freebase.py b/ox/web/freebase.py index cf22404..d3a5313 100644 --- a/ox/web/freebase.py +++ b/ox/web/freebase.py @@ -3,7 +3,7 @@ import json from ox.cache import read_url -from ox import findRe +from ox import find_re class Imdb(dict): def __init__(self, id, timeout=-1): @@ -36,7 +36,7 @@ class Imdb(dict): if 'nytimes' in self: self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-')) - self['amgId'] = findRe(self['nytimes'], 'movie/(\d+)/') + self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/') diff --git a/ox/web/google.py b/ox/web/google.py index eb0a542..34c6e31 100644 --- a/ox/web/google.py +++ b/ox/web/google.py @@ -4,7 +4,7 @@ import re import urllib import ox -from ox import strip_tags, decodeHtml +from ox import strip_tags, decode_html DEFAULT_MAX_RESULTS = 10 DEFAULT_TIMEOUT = 24*60*60 @@ -34,7 +34,7 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT): for a in re.compile( '(.*?).*?(.*?)<\/span>' ).findall(data): - results.append((strip_tags(decodeHtml(a[1])), a[0], strip_tags(decodeHtml(a[2])))) + results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2])))) if len(results) >= max_results: break return results diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 3d86338..0e55b14 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -8,8 +8,8 @@ import time import unicodedata import ox -from ox import findRe, strip_tags -from ox.normalize import normalizeTitle, normalizeImdbId +from ox import find_re, strip_tags +from ox.normalize import normalize_title, normalize_imdbid import ox.cache from siteparser import SiteParser @@ -50,7 +50,7 @@ class Imdb(SiteParser): 'page': 'business', 're': [ '
    Budget
    \s*?\$(.*?)Gross\s*?\$(.*?)(.*?) \(')) - data['year'] = findRe(html, '\((.*?)\)') + data['title'] = strip_tags(find_re(html, '

    (.*?) \(')) + data['year'] = find_re(html, '\((.*?)\)') data['posters'] = [] - poster = findRe(html, '')) + 1 + pages = int(find_re(html, '')) + 1 for page in range(pages, 0, -1): for id in getIdsByPage(page): if not id in ids: @@ -81,7 +81,7 @@ def getIdsByPage(page): def getUrl(id): url = u"http://www.impawards.com/%s.html" % id html = read_url(url, unicode=True) - if findRe(html, "No Movie Posters on This Page"): + if find_re(html, "No Movie Posters on This Page"): url = u"http://www.impawards.com/%s_ver1.html" % id return url diff --git a/ox/web/itunes.py b/ox/web/itunes.py index 30b3094..951b121 100644 --- a/ox/web/itunes.py +++ b/ox/web/itunes.py @@ -4,9 +4,9 @@ import re import urllib from ox.cache import read_url -from ox.html import decodeHtml, strip_tags -from ox.text import findRe -from ox.text import findString +from ox.html import decode_html, strip_tags +from ox.text import find_re +from ox.text import find_string # to sniff itunes traffic, use something like @@ -65,26 +65,26 @@ def parseXmlDict(xml): strings = xml.split('') for string in strings: if string.find('') != -1: - key = findRe(string, '(.*?)') - type = findRe(string, '<(.*?)>') + key = find_re(string, '(.*?)') + type = find_re(string, '<(.*?)>') if type == 'true/': value = True else: - value = findRe(string, '<%s>(.*?)' % (type, type)) + value = find_re(string, '<%s>(.*?)' % (type, type)) if type == 'integer': value = int(value) elif type == 'string': - value = decodeHtml(value) + value = decode_html(value) values[key] = value return values def parseCast(xml, title): list = [] try: - strings = findRe(xml, '%s(.*?)' % title[:-1].upper()).split('') + strings = find_re(xml, '%s(.*?)' % title[:-1].upper()).split('') strings.pop() for string in strings: - list.append(findRe(string, '(.*?)')) + list.append(find_re(string, '(.*?)')) return list except: return list @@ -92,12 +92,12 @@ def parseCast(xml, title): def parseMovies(xml, title): list = [] try: - strings = findRe(xml, '%s(.*?)' % title[:-1].upper()).split('') + strings = find_re(xml, '%s(.*?)' % title[:-1].upper()).split('') strings.pop() for string in strings: list.append({ - 'id': findRe(string, 'viewMovie\?id=(.*?)&'), - 'title': findRe(string, '(.*?)') + 'id': find_re(string, 'viewMovie\?id=(.*?)&'), + 'title': find_re(string, '(.*?)') }) return list except: @@ -114,24 +114,24 @@ class ItunesAlbum: def getId(self): url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist}) xml = read_url(url, headers = ITUNES_HEADERS) - id = findRe(xml, 'viewAlbum\?id=(.*?)&') + id = find_re(xml, 'viewAlbum\?id=(.*?)&') return id def getData(self): data = {'id': self.id} url = composeUrl('viewAlbum', {'id': self.id}) xml = read_url(url, None, ITUNES_HEADERS) - data['albumName'] = findRe(xml, '(.*?)') - data['artistName'] = findRe(xml, '(.*?)') - data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"') - data['genre'] = findRe(xml, 'Genre:(.*?)<') - data['releaseDate'] = findRe(xml, 'Released(.*?)<') - data['review'] = strip_tags(findRe(xml, 'REVIEW.*?(.*?)')) + data['albumName'] = find_re(xml, '(.*?)') + data['artistName'] = find_re(xml, '(.*?)') + data['coverUrl'] = find_re(xml, 'reflection="." 
url="(.*?)"') + data['genre'] = find_re(xml, 'Genre:(.*?)<') + data['releaseDate'] = find_re(xml, 'Released(.*?)<') + data['review'] = strip_tags(find_re(xml, 'REVIEW.*?(.*?)')) data['tracks'] = [] - strings = findRe(xml, 'items.*?(.*?)$').split('') + strings = find_re(xml, 'items.*?(.*?)$').split('') for string in strings: data['tracks'].append(parseXmlDict(string)) - data['type'] = findRe(xml, 'listType(.*?)<') + data['type'] = find_re(xml, 'listType(.*?)<') return data class ItunesMovie: @@ -145,7 +145,7 @@ class ItunesMovie: def getId(self): url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director}) xml = read_url(url, headers = ITUNES_HEADERS) - id = findRe(xml, 'viewMovie\?id=(.*?)&') + id = find_re(xml, 'viewMovie\?id=(.*?)&') return id def getData(self): @@ -156,21 +156,21 @@ class ItunesMovie: f.write(xml) f.close() data['actors'] = parseCast(xml, 'actors') - string = findRe(xml, 'Average Rating:(.*?)') + string = find_re(xml, 'Average Rating:(.*?)') data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5 data['directors'] = parseCast(xml, 'directors') - data['format'] = findRe(xml, 'Format:(.*?)<') - data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<')) - data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY.*?(.*?)')) - data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"') + data['format'] = find_re(xml, 'Format:(.*?)<') + data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<')) + data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY.*?(.*?)')) + data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"') data['producers'] = parseCast(xml, 'producers') - data['rated'] = findRe(xml, 'Rated(.*?)<') + data['rated'] = find_re(xml, 'Rated(.*?)<') data['relatedMovies'] = parseMovies(xml, 'related movies') - data['releaseDate'] = findRe(xml, 'Released(.*?)<') - data['runTime'] = findRe(xml, 'Run Time:(.*?)<') + data['releaseDate'] = find_re(xml, 'Released(.*?)<') + data['runTime'] = find_re(xml, 'Run Time:(.*?)<') data['screenwriters'] = parseCast(xml, 'screenwriters') - data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&') - data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"') + data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&') + data['trailerUrl'] = find_re(xml, 'autoplay="." 
url="(.*?)"') return data if __name__ == '__main__': diff --git a/ox/web/lyricsfly.py b/ox/web/lyricsfly.py index a9e9d87..7b8e9bb 100644 --- a/ox/web/lyricsfly.py +++ b/ox/web/lyricsfly.py @@ -1,20 +1,20 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 from ox.cache import read_url -from ox.html import decodeHtml -from ox.text import findRe +from ox.html import decode_html +from ox.text import find_re def getLyrics(title, artist): html = read_url('http://lyricsfly.com/api/') - key = findRe(html, '(.*?)') + key = find_re(html, '(.*?)') url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title) xml = read_url(url) - lyrics = findRe(xml, '(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com') + lyrics = find_re(xml, '(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com') lyrics = lyrics.replace('\n', '').replace('\r', '') lyrics = lyrics.replace('[br]', '\n').strip() lyrics.replace('\n\n\n', '\n\n') - lyrics = decodeHtml(lyrics.replace('&', '&')) + lyrics = decode_html(lyrics.replace('&', '&')) return lyrics if __name__ == '__main__': diff --git a/ox/web/metacritic.py b/ox/web/metacritic.py index ef27871..8d25855 100644 --- a/ox/web/metacritic.py +++ b/ox/web/metacritic.py @@ -5,7 +5,7 @@ from urllib import quote from lxml.html import document_fromstring from ox.cache import read_url -from ox import findRe, strip_tags +from ox import find_re, strip_tags def getUrl(id): return 'http://www.metacritic.com/movie/%s' % id @@ -16,14 +16,14 @@ def getId(url): def getUrlByImdb(imdb): url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb data = read_url(url) - metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"') + metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"') return metacritic_url or None def getMetacriticShowUrl(title): title = quote(title) url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title data = read_url(url) - return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?') + return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?') def getData(url): data = read_url(url, unicode=True) diff --git a/ox/web/mininova.py b/ox/web/mininova.py index bcbb2e1..c555051 100644 --- a/ox/web/mininova.py +++ b/ox/web/mininova.py @@ -6,8 +6,8 @@ import socket from urllib import quote from ox.cache import read_url -from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, int_value, normalizeNewlines -from ox.normalize import normalizeImdbId +from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, int_value, normalize_newlines +from ox.normalize import normalize_imdbid import ox from torrent import Torrent @@ -20,7 +20,7 @@ def _parseResultsPage(data, max_results=10): torrentDate = row[0] torrentExtra = row[1] torrentId = row[2] - torrentTitle = decodeHtml(row[3]).strip() + torrentTitle = decode_html(row[3]).strip() torrentLink = "http://www.mininova.org/tor/" + torrentId privateTracker = 'priv.gif' in torrentExtra if not privateTracker: @@ -38,13 +38,13 @@ def findMovieByImdb(imdbId): '''find torrents on mininova for a given imdb id ''' results = [] - imdbId = normalizeImdbId(imdbId) + imdbId = normalize_imdbid(imdbId) data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True) return _parseResultsPage(data) def getId(mininovaId): mininovaId = unicode(mininovaId) - d = findRe(mininovaId, "/(\d+)") + d = find_re(mininovaId, "/(\d+)") if d: return d mininovaId = mininovaId.split('/') @@ -81,14 
+81,14 @@ def getData(mininovaId): for d in re.compile('

    .(.*?):(.*?)

    ', re.DOTALL).findall(data): key = d[0].lower().strip() key = _key_map.get(key, key) - value = decodeHtml(strip_tags(d[1].strip())) + value = decode_html(strip_tags(d[1].strip())) torrent[key] = value - torrent[u'title'] = findRe(data, '(.*?):.*?') - torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})') - torrent[u'description'] = findRe(data, '
    (.*?)
    ') + torrent[u'title'] = find_re(data, '(.*?):.*?') + torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})') + torrent[u'description'] = find_re(data, '
    (.*?)
    ') if torrent['description']: - torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip() + torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip() t = read_url(torrent[u'torrent_link']) torrent[u'torrent_info'] = getTorrentInfo(t) return torrent diff --git a/ox/web/movieposterdb.py b/ox/web/movieposterdb.py index 0f35541..27f5638 100644 --- a/ox/web/movieposterdb.py +++ b/ox/web/movieposterdb.py @@ -4,7 +4,7 @@ import re from ox.cache import read_url -from ox import findRe +from ox import find_re def getData(id): ''' @@ -33,7 +33,7 @@ def getPostersByUrl(url, group=True, timeout=-1): results = re.compile('
    ', re.DOTALL).findall(html) for result in results: html = read_url(result, timeout=timeout, unicode=True) - posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"')) + posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"')) return posters def getUrl(id): diff --git a/ox/web/opensubtitles.py b/ox/web/opensubtitles.py index 41f2d20..1b35599 100644 --- a/ox/web/opensubtitles.py +++ b/ox/web/opensubtitles.py @@ -4,7 +4,7 @@ import re import feedparser from ox.cache import read_url -from ox import findRe, strip_tags +from ox import find_re, strip_tags from ox import langCode2To3, langTo3Code def findSubtitlesByImdb(imdb, parts = 1, language = "eng"): @@ -26,7 +26,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"): if opensubtitleId: opensubtitleId = opensubtitleId[0] else: - opensubtitleId = findRe(data, '/en/subtitles/(.*?)/') + opensubtitleId = find_re(data, '/en/subtitles/(.*?)/') return opensubtitleId def downloadSubtitleById(opensubtitle_id): diff --git a/ox/web/rottentomatoes.py b/ox/web/rottentomatoes.py index 1c059f9..cc7b041 100644 --- a/ox/web/rottentomatoes.py +++ b/ox/web/rottentomatoes.py @@ -3,7 +3,7 @@ import re from ox.cache import getHeaders, read_url -from ox import findRe, strip_tags +from ox import find_re, strip_tags def getUrlByImdb(imdb): @@ -22,16 +22,16 @@ def getUrlByImdb(imdb): return None def get_og(data, key): - return findRe(data, '(.*?)') + r['title'] = find_re(data, '

    (.*?)

    ') if '(' in r['title']: - r['year'] = findRe(r['title'], '\((\d*?)\)') + r['year'] = find_re(r['title'], '\((\d*?)\)') r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip() - r['summary'] = strip_tags(findRe(data, '

    (.*?)

    ')).strip() + r['summary'] = strip_tags(find_re(data, '

    (.*?)

    ')).strip() r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace(' ', ' ').replace(' ', ' ') if not r['summary']: r['summary'] = get_og(data, 'description') @@ -40,9 +40,9 @@ def getData(url): meter = filter(lambda m: m[1].isdigit(), meter) if meter: r['tomatometer'] = meter[0][1] - r['rating'] = findRe(data, 'Average Rating: ([\d.]+)/10') - r['user_score'] = findRe(data, '(\d+)') - r['user_rating'] = findRe(data, 'Average Rating: ([\d.]+)/5') + r['rating'] = find_re(data, 'Average Rating: ([\d.]+)/10') + r['user_score'] = find_re(data, '(\d+)') + r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5') poster = get_og(data, 'image') if poster and not 'poster_default.gif' in poster: r['posters'] = [poster] diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py index 9444e8d..f821cab 100644 --- a/ox/web/siteparser.py +++ b/ox/web/siteparser.py @@ -3,7 +3,7 @@ import re from ..cache import read_url -from .. import strip_tags, decodeHtml +from .. import strip_tags, decode_html from ..utils import datetime @@ -11,8 +11,8 @@ def cleanup(key, data, data_type): if data: if isinstance(data[0], basestring): #FIXME: some types need strip_tags - #data = [strip_tags(decodeHtml(p)).strip() for p in data] - data = [decodeHtml(p).strip() for p in data] + #data = [strip_tags(decode_html(p)).strip() for p in data] + data = [decode_html(p).strip() for p in data] elif isinstance(data[0], list) or isinstance(data[0], tuple): data = [cleanup(key, p, data_type) for p in data] while len(data) == 1 and not isinstance(data, basestring): diff --git a/ox/web/spiegel.py b/ox/web/spiegel.py index 78c0370..1c968f6 100644 --- a/ox/web/spiegel.py +++ b/ox/web/spiegel.py @@ -5,7 +5,7 @@ import re import time import ox.cache -from ox.html import decodeHtml, strip_tags +from ox.html import decode_html, strip_tags import ox.net @@ -44,8 +44,8 @@ def getNews(year, month, day): new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2]) else: new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17]) - # fix decodeHtml - # new['description'] = formatString(decodeHtml(description)) + # fix decode_html + # new['description'] = formatString(decode_html(description)) new['description'] = formatString(description) new['imageUrl'] = imageUrl new['section'] = formatSection(section) diff --git a/ox/web/thepiratebay.py b/ox/web/thepiratebay.py index 1054d6d..cc89fae 100644 --- a/ox/web/thepiratebay.py +++ b/ox/web/thepiratebay.py @@ -6,8 +6,8 @@ import socket from urllib import quote, urlencode from urllib2 import URLError -from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, normalizeNewlines -from ox.normalize import normalizeImdbId +from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, normalize_newlines +from ox.normalize import normalize_imdbid import ox from torrent import Torrent @@ -38,7 +38,7 @@ def findMovies(query, max_results=10): for row in re.compile(regexp, re.DOTALL).findall(data): torrentType = row[0] torrentLink = "http://thepiratebay.org" + row[1] - torrentTitle = decodeHtml(row[2]) + torrentTitle = decode_html(row[2]) # 201 = Movies , 202 = Movie DVDR, 205 TV Shows if torrentType in ['201']: results.append((torrentTitle, torrentLink, '')) @@ -48,15 +48,15 @@ def findMovies(query, max_results=10): return results def findMovieByImdb(imdb): - return findMovies("tt" + normalizeImdbId(imdb)) + return findMovies("tt" + normalize_imdbid(imdb)) def getId(piratebayId): 
if piratebayId.startswith('http://torrents.thepiratebay.org/'): piratebayId = piratebayId.split('org/')[1] - d = findRe(piratebayId, "tor/(\d+)") + d = find_re(piratebayId, "tor/(\d+)") if d: piratebayId = d - d = findRe(piratebayId, "torrent/(\d+)") + d = find_re(piratebayId, "torrent/(\d+)") if d: piratebayId = d return piratebayId @@ -80,21 +80,21 @@ def getData(piratebayId): torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId data = read_url(torrent['comment_link'], unicode=True) - torrent[u'title'] = findRe(data, '(.*?) \(download torrent\) - TPB') + torrent[u'title'] = find_re(data, '(.*?) \(download torrent\) - TPB') if not torrent[u'title']: return None - torrent[u'title'] = decodeHtml(torrent[u'title']).strip() - torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})') + torrent[u'title'] = decode_html(torrent[u'title']).strip() + torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})') title = quote(torrent['title'].encode('utf-8')) torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title) for d in re.compile('dt>(.*?):.*?(.*?)', re.DOTALL).findall(data): key = d[0].lower().strip() key = _key_map.get(key, key) - value = decodeHtml(strip_tags(d[1].strip())) + value = decode_html(strip_tags(d[1].strip())) torrent[key] = value - torrent[u'description'] = findRe(data, '
    (.*?)
    ') + torrent[u'description'] = find_re(data, '
    (.*?)
    ') if torrent[u'description']: - torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip() + torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip() t = _read_url(torrent[u'torrent_link']) torrent[u'torrent_info'] = getTorrentInfo(t) return torrent diff --git a/ox/web/tv.py b/ox/web/tv.py index 735a771..f5a36f1 100644 --- a/ox/web/tv.py +++ b/ox/web/tv.py @@ -3,7 +3,7 @@ import re import time -from ox import strip_tags, findRe +from ox import strip_tags, find_re from ox.cache import read_url @@ -16,11 +16,11 @@ def getEpisodeData(url): ''' data = read_url(url, unicode=True) r = {} - r['description'] = strip_tags(findRe(data, 'div id="main-col">.*?
    (.*?)(.*?)') - r['title'] = findRe(data, '.*?: (.*?) - TV.com ') + r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?
    (.*?)(.*?)') + r['title'] = find_re(data, '.*?: (.*?) - TV.com ') #episode score - r['episode score'] = findRe(data, '(.*?)') + r['episode score'] = find_re(data, '(.*?)') match = re.compile('Episode Number: (\d*?)    Season Num: (\d*?)    First Aired: (.*?)  ').findall(data) if match: diff --git a/ox/web/vimeo.py b/ox/web/vimeo.py index ff20041..cf2257c 100644 --- a/ox/web/vimeo.py +++ b/ox/web/vimeo.py @@ -5,7 +5,7 @@ from StringIO import StringIO import xml.etree.ElementTree as ET from ox.cache import read_url -from ox import findString, findRe +from ox import find_string, find_re def getData(id): diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py index 8c11ff5..48a5c5a 100644 --- a/ox/web/wikipedia.py +++ b/ox/web/wikipedia.py @@ -5,7 +5,7 @@ from urllib import urlencode from ox.utils import json from ox.cache import read_url -from ox import findRe, decodeHtml +from ox import find_re, decode_html def getId(url): @@ -54,7 +54,7 @@ def getMovieData(wikipediaUrl): if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl) data = getWikiData(wikipediaUrl) - filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''') + filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''') filmbox = {} _box = filmbox_data.strip().split('|') for row in _box: @@ -72,12 +72,12 @@ def getMovieData(wikipediaUrl): if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit(): del filmbox['amg_id'] if 'Allmovie movie' in data: - filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)') + filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)') elif 'Allmovie title' in data: - filmbox['amg_id'] = findRe(data, 'Allmovie title\|.*?(\d+)') + filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)') if 'Official website' in data: - filmbox['website'] = findRe(data, 'Official website\|(.*?)}').strip() + filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip() r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data) if r: @@ -99,17 +99,17 @@ def getMovieData(wikipediaUrl): if r: filmbox['rottentomatoes_id'] = r[0].replace('id=', '') if 'google video' in data: - filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)[\|}]') + filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]') if 'DEFAULTSORT' in data: - filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''') + filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''') return filmbox def getImageUrl(name): url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20') data = read_url(url, unicode=True) - url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"') + url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"') if not url: - url = findRe(data, 'href="(//upload.wikimedia.org/.*?)"') + url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"') if url: url = 'http:' + url return url
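Note: ox/html.py keeps the legacy alias `stripTags = strip_tags`, but the `decodeHtml = decode_html` alias is dropped by this patch, so external code written against the old camelCase names needs updating. A minimal, hypothetical compatibility shim (not part of this patch; module paths and function names are taken from the hunks above) could look like:

    # Hypothetical shim, not included in this patch: re-export the new snake_case
    # functions under their old camelCase names for callers that have not been
    # ported yet, mirroring the `stripTags = strip_tags` alias kept in ox/html.py.
    from ox.text import find_re, find_string, normalize_newlines, wrap_string
    from ox.html import decode_html, strip_tags
    from ox.normalize import normalize_name, normalize_title, normalize_imdbid

    findRe = find_re
    findString = find_string
    normalizeNewlines = normalize_newlines
    wrapString = wrap_string
    decodeHtml = decode_html
    stripTags = strip_tags
    normalizeName = normalize_name
    normalizeTitle = normalize_title
    normalizeImdbId = normalize_imdbid

Once all call sites use the new names, removing the shim is just a matter of deleting these aliases.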