replace all CamelCase with under_score in ox

committed by j on 2012-08-14 16:12:43 +02:00
parent 2de989e188
commit bb35daa95c
31 changed files with 242 additions and 244 deletions
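
The rename is mechanical: each camelCase identifier becomes its lower_case_with_underscores form (readUrl → read_url, stripTags → strip_tags, findRe → find_re). As a rough sketch of the conversion rule (illustrative only; camel_to_snake is a hypothetical helper, and whatever script, if any, produced this commit is not shown here):

    import re

    def camel_to_snake(name):
        # insert '_' before each upper-case letter that follows a
        # lower-case letter or digit, then lower-case the result
        return re.sub(r'(?<=[a-z0-9])([A-Z])', r'_\1', name).lower()

    assert camel_to_snake('readUrl') == 'read_url'
    assert camel_to_snake('stripTags') == 'strip_tags'
    assert camel_to_snake('normalizeNewlines') == 'normalize_newlines'

A few names were mapped by hand rather than by this rule, e.g. normalizeImdbId becomes normalize_imdbid below, not normalize_imdb_id.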

README (10 changes)

@@ -10,12 +10,12 @@ Depends:
 Usage:
 import ox
-data = ox.cache.readUrl('http:/...')
-text = ox.stripTags(data)
-ox.normalizeNewlines(text)
-ox.formatBytes(len(data))
-ox.formatBytes(1234567890)
+data = ox.cache.read_url('http:/...')
+text = ox.strip_tags(data)
+ox.normalize_newlines(text)
+ox.format_bytes(len(data))
+ox.format_bytes(1234567890)
 '1.15 GB'
 import ox.web.imdb


@@ -56,15 +56,15 @@ def strip_tags(value):
 stripTags = strip_tags
-def stripSpacesBetweenTags(value):
+def strip_spaces_between_tags(value):
 "Returns the given HTML with spaces between tags normalized to a single space"
 return re.sub(r'>\s+<', '> <', value)
-def stripEntities(value):
+def strip_entities(value):
 "Returns the given HTML with all entities (&something;) stripped"
 return re.sub(r'&(?:\w+|#\d);', '', value)
-def fixAmpersands(value):
+def fix_ampersands(value):
 "Returns the given HTML with all unencoded ampersands encoded correctly"
 return unencoded_ampersands_re.sub('&amp;', value)

@@ -113,11 +113,11 @@ def clean_html(text):
 * Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
 bottom of the text.
 """
-from text import normalizeNewlines
-text = normalizeNewlines(text)
+from text import normalize_newlines
+text = normalize_newlines(text)
 text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
 text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
-text = fixAmpersands(text)
+text = fix_ampersands(text)
 # Remove all target="" attributes from <a> tags.
 text = link_target_attribute_re.sub('\\1', text)
 # Trim stupid HTML such as <br clear="all">.

@@ -168,8 +168,6 @@ def decode_html(html):
 return match.group(0)
 return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
-decodeHtml = decode_html
 def highlight(text, query, hlClass="hl"):
 """
 >>> highlight('me &amp; you and &#36;&#38;%', 'and')
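
Note how the two backwards-compatibility aliases fare differently in the hunks above: stripTags = strip_tags is kept as a context line, while decodeHtml = decode_html is deleted, so code that still calls decodeHtml breaks after this commit (callers inside the repository are updated in the hunks below). A transitional shim that keeps the old names alive with a warning could look like this (a sketch only; _deprecated_alias is hypothetical and not part of ox):

    import warnings

    def _deprecated_alias(new_func, old_name):
        # forward to the renamed function, warning callers that still
        # use the old camelCase name
        def wrapper(*args, **kwargs):
            warnings.warn('%s() is deprecated, use %s()' % (old_name, new_func.__name__),
                          DeprecationWarning, stacklevel=2)
            return new_func(*args, **kwargs)
        return wrapper

    # e.g.: decodeHtml = _deprecated_alias(decode_html, 'decodeHtml')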


@@ -18,7 +18,7 @@ def latlngspan2latlng(lat, lng, latSpan, lngSpan):
 lat_ne = lat + latSpan, lng_ne = lng + latSpan
 )
-def parseLocationString(location_string):
+def parse_location_string(location_string):
 l = location_string.split('+')
 if len(l) == 1:
 l = location_string.split(';')


@@ -8,8 +8,8 @@ import hashlib
 import os
 import re
-from normalize import normalizeName
-from text import get_sort_name, findRe
+from normalize import normalize_name
+from text import get_sort_name, find_re
 __all__ = ['parse_movie_path', 'create_movie_path', 'get_oxid']

@@ -308,14 +308,14 @@ def parse_movie_path(path):
 if title.endswith('_'):
 title = title[:-1] + '.'
-year = findRe(title, '(\(\d{4}\))')
+year = find_re(title, '(\(\d{4}\))')
 if not year:
-year = findRe(title, '(\(\d{4}-\d*\))')
+year = find_re(title, '(\(\d{4}-\d*\))')
 if year and title.endswith(year):
 title = title[:-len(year)].strip()
 year = year[1:-1]
 if '-' in year:
-year = findRe(year, '\d{4}')
+year = find_re(year, '\d{4}')
 #director
 if len(parts) == 4:

@@ -323,7 +323,7 @@ def parse_movie_path(path):
 if director.endswith('_'):
 director = "%s." % director[:-1]
 director = director.split('; ')
-director = [normalizeName(d).strip() for d in director]
+director = [normalize_name(d).strip() for d in director]
 director = filter(lambda d: d not in ('Unknown Director', 'Various Directors'), director)
 else:
 director = []

@@ -338,13 +338,13 @@ def parse_movie_path(path):
 language = ''
 #season/episode/episodeTitle
-season = findRe(parts[-1], '\.Season (\d+)\.')
+season = find_re(parts[-1], '\.Season (\d+)\.')
 if season:
 season = int(season)
 else:
 season = None
-episode = findRe(parts[-1], '\.Episode (\d+)\.')
+episode = find_re(parts[-1], '\.Episode (\d+)\.')
 if episode:
 episode = int(episode)
 else:

@@ -373,7 +373,7 @@ def parse_movie_path(path):
 title = u'%s %s' % (title, episodeTitle)
 #part
-part = findRe(parts[-1], '\.Part (\d+)\.')
+part = find_re(parts[-1], '\.Part (\d+)\.')
 if part:
 part = int(part)
 else:


@@ -37,13 +37,13 @@ _noarticles = (
 'i was',
 )
-def canonicalTitle(title):
+def canonical_title(title):
 """Return the title in the canonic format 'Movie Title, The'.
->>> canonicalTitle('The Movie Title')
+>>> canonical_title('The Movie Title')
 'Movie Title, The'
->>> canonicalTitle('Los Angeles Plays Itself')
+>>> canonical_title('Los Angeles Plays Itself')
 'Los Angeles Plays Itself'
 """
 try:

@@ -72,10 +72,10 @@ def canonicalTitle(title):
 ## break
 return title
-def normalizeTitle(title):
+def normalize_title(title):
 """Return the title in the normal "The Title" format.
->>> normalizeTitle('Movie Title, The')
+>>> normalize_title('Movie Title, The')
 'The Movie Title'
 """
 stitle = title.split(', ')

@@ -85,14 +85,14 @@ def normalizeTitle(title):
 title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
 return title
-def normalizeImdbId(imdbId):
+def normalize_imdbid(imdbId):
 """Return 7 digit imdbId.
->>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
+>>> normalize_imdbid('http://www.imdb.com/title/tt0159206/')
 '0159206'
->>> normalizeImdbId(159206)
+>>> normalize_imdbid(159206)
 '0159206'
->>> normalizeImdbId('tt0159206')
+>>> normalize_imdbid('tt0159206')
 '0159206'
 """
 if isinstance(imdbId, basestring):

@@ -106,20 +106,20 @@ def normalizeImdbId(imdbId):
 _sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
 'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al')
-def canonicalName(name):
+def canonical_name(name):
 """Return the given name in canonical "Surname, Name" format.
 It assumes that name is in the 'Name Surname' format.
->>> canonicalName('Jean Luc Godard')
+>>> canonical_name('Jean Luc Godard')
 'Godard, Jean Luc'
->>> canonicalName('Ivan Ivanov-Vano')
+>>> canonical_name('Ivan Ivanov-Vano')
 'Ivanov-Vano, Ivan'
->>> canonicalName('Gus Van Sant')
+>>> canonical_name('Gus Van Sant')
 'Van Sant, Gus'
->>> canonicalName('Brian De Palma')
+>>> canonical_name('Brian De Palma')
 'De Palma, Brian'
 """

@@ -167,19 +167,19 @@ def canonicalName(name):
 name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
 return name
-def normalizeName(name):
+def normalize_name(name):
 """Return a name in the normal "Name Surname" format.
->>> normalizeName('Godard, Jean Luc')
+>>> normalize_name('Godard, Jean Luc')
 'Jean Luc Godard'
->>> normalizeName('Ivanov-Vano, Ivan')
+>>> normalize_name('Ivanov-Vano, Ivan')
 'Ivan Ivanov-Vano'
->>> normalizeName('Van Sant, Gus')
+>>> normalize_name('Van Sant, Gus')
 'Gus Van Sant'
->>> normalizeName('De Palma, Brian')
+>>> normalize_name('De Palma, Brian')
 'Brian De Palma'
 """
 sname = name.split(', ')

@@ -187,12 +187,12 @@ def normalizeName(name):
 name = '%s %s' % (sname[1], sname[0])
 return name
-def normalizePath(path):
+def normalize_path(path):
 path = path.replace(':', '_').replace('/', '_')
 if path.endswith('.'): path = path[:-1] + '_'
 return path
-def stripAccents(s):
+def strip_accents(s):
 if isinstance(s, str):
 s = unicode(s)
 return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))


@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # ci:si:et:sw=4:sts=4:ts=4
 import re
-from text import findRe
+from text import find_re
 import cache
 from utils import json, ET

@@ -13,14 +13,14 @@ def get_embed_code(url, maxwidth=None, maxheight=None):
 json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('<link.*?>').findall(html))
 xml_oembed = filter(lambda l: 'xml+oembed' in l, re.compile('<link.*?>').findall(html))
 if json_oembed:
-oembed_url = findRe(json_oembed[0], 'href="(.*?)"')
+oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
 if maxwidth:
 oembed_url += '&maxwidth=%d' % maxwidth
 if maxheight:
 oembed_url += '&maxheight=%d' % maxheight
 embed = json.loads(cache.readUrl(oembed_url))
 elif xml_oembed:
-oembed_url = findRe(json_oembed[0], 'href="(.*?)"')
+oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
 if maxwidth:
 oembed_url += '&maxwidth=%d' % maxwidth
 if maxheight:


@@ -11,7 +11,7 @@ import ox
 __all__ = []
-def _detectEncoding(fp):
+def _detect_encoding(fp):
 bomDict={ # bytepattern : name
 (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
 (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",

@@ -63,7 +63,7 @@ def load(filename, offset=0):
 return offset + ox.time2ms(t.replace(',', '.')) / 1000
 with open(filename) as f:
-encoding = _detectEncoding(f)
+encoding = _detect_encoding(f)
 data = f.read()
 try:
 data = unicode(data, encoding)


@@ -257,24 +257,24 @@ def get_sort_title(title):
 return title[length + spaces:] + ', ' + title[:length]
 return title
-def findRe(string, regexp):
+def find_re(string, regexp):
 result = re.compile(regexp, re.DOTALL).findall(string)
 if result:
 return result[0].strip()
 return ''
-def findString(string, string0='', string1 = ''):
+def find_string(string, string0='', string1 = ''):
 """Return the string between string0 and string1.
 If string0 or string1 is left out, begining or end of string is used.
->>> findString('i am not there', string1=' not there')
+>>> find_string('i am not there', string1=' not there')
 'i am'
->>> findString('i am not there', 'i am ', ' there')
+>>> find_string('i am not there', 'i am ', ' there')
 'not'
->>> findString('i am not there', 'i am not t')
+>>> find_string('i am not there', 'i am not t')
 'here'
 """

@@ -286,7 +286,7 @@ def findString(string, string0='', string1 = ''):
 string1 = re.escape(string1)
 else:
 string1 = '$'
-return findRe(string, string0 + '(.*?)' + string1)
+return find_re(string, string0 + '(.*?)' + string1)
 def parse_useragent(useragent):
 data = {}

@@ -319,7 +319,7 @@ def parse_useragent(useragent):
 break;
 return data
-def removeSpecialCharacters(text):
+def remove_special_characters(text):
 """
 Removes special characters inserted by Word.
 """

@@ -346,22 +346,22 @@ def wrap(text, width):
 text.split(' ')
 )
-def wrapString(string, length=80, separator='\n', balance=False):
+def wrap_string(string, length=80, separator='\n', balance=False):
 '''
->>> wrapString(u"Anticonstitutionellement, Paris s'eveille", 16)
+>>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16)
 u"Anticonstitution\\nellement, Paris \\ns'eveille"
->>> wrapString(u'All you can eat', 12, '\\n', True)
+>>> wrap_string(u'All you can eat', 12, '\\n', True)
 u'All you \\ncan eat'
 '''
 words = string.split(' ')
 if balance:
 # balance lines: test if same number of lines
 # can be achieved with a shorter line length
-lines = wrapString(string, length, separator, False).split(separator)
+lines = wrap_string(string, length, separator, False).split(separator)
 if len(lines) > 1:
 while length > max(map(lambda x : len(x), words)):
 length -= 1
-if len(wrapString(string, length, separator, False).split(separator)) > len(lines):
+if len(wrap_string(string, length, separator, False).split(separator)) > len(lines):
 length += 1
 break
 lines = ['']

@@ -382,12 +382,12 @@ def wrapString(string, length=80, separator='\n', balance=False):
 lines[len(lines) - 1] += u' '
 return separator.join(lines).strip()
-def truncateString(string, length, padding='...', position='right'):
-# >>> truncateString('anticonstitutionellement', 16, '...', 'left')
+def truncate_string(string, length, padding='...', position='right'):
+# >>> truncate_string('anticonstitutionellement', 16, '...', 'left')
 # '...utionellement'
-# >>> truncateString('anticonstitutionellement', 16, '...', 'center')
+# >>> truncate_string('anticonstitutionellement', 16, '...', 'center')
 # 'anticon...lement'
-# >>> truncateString('anticonstitutionellement', 16, '...', 'right')
+# >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
 # 'anticonstitut...'
 stringLength = len(string);
 paddingLength = len(padding)

@@ -402,12 +402,12 @@ def truncateString(string, length, padding='...', position='right'):
 string = '%s%s' % (string[:length - paddingLength], padding)
 return string;
-def truncateWords(s, num):
+def truncate_words(s, num):
 """Truncates a string after a certain number of chacters, but ends with a word
->>> truncateString('Truncates a string after a certain number of chacters, but ends with a word', 23)
+>>> truncate_string('Truncates a string after a certain number of chacters, but ends with a word', 23)
 'Truncates a string...'
->>> truncateString('Truncates a string', 23)
+>>> truncate_string('Truncates a string', 23)
 'Truncates a string'
 """

@@ -422,25 +422,25 @@ def truncateWords(s, num):
 ts += "..."
 return ts.strip()
-def trimString(string, num):
+def trim_string(string, num):
 """Truncates a string after a certain number of chacters, adding ... at -10 characters
->>> trimString('Truncates a string after a certain number of chacters', 23)
+>>> trim_string('Truncates a string after a certain number of chacters', 23)
 'Truncates ...f chacters'
->>> trimString('Truncates a string', 23)
+>>> trim_string('Truncates a string', 23)
 'Truncates a string'
 """
 if len(string) > num:
 string = string[:num - 13] + '...' + string[-10:]
 return string
-def getValidFilename(s):
+def get_valid_filename(s):
 """
 Returns the given string converted to a string that can be used for a clean
 filename. Specifically, leading and trailing spaces are removed;
 all non-filename-safe characters are removed.
->>> getValidFilename("john's portrait in 2004.jpg")
+>>> get_valid_filename("john's portrait in 2004.jpg")
 'john_s_portrait_in_2004.jpg'
 """
 s = s.strip()

@@ -449,34 +449,34 @@ def getValidFilename(s):
 s = s.replace('__', '_').replace('__', '_')
 return s
-def getTextList(list_, last_word='or'):
+def get_text_list(list_, last_word='or'):
 """
->>> getTextList([u'a', u'b', u'c', u'd'])
+>>> get_text_list([u'a', u'b', u'c', u'd'])
 u'a, b, c or d'
->>> getTextList([u'a', u'b', u'c'], 'and')
+>>> get_text_list([u'a', u'b', u'c'], 'and')
 u'a, b and c'
->>> getTextList([u'a', u'b'], 'and')
+>>> get_text_list([u'a', u'b'], 'and')
 u'a and b'
->>> getTextList([u'a'])
+>>> get_text_list([u'a'])
 u'a'
->>> getTextList([])
+>>> get_text_list([])
 ''
 """
 if len(list_) == 0: return ''
 if len(list_) == 1: return list_[0]
 return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1])
-def getListText(text, last_word='or'):
+def get_list_text(text, last_word='or'):
 """
->>> getListText(u'a, b, c or d')
+>>> get_list_text(u'a, b, c or d')
 [u'a', u'b', u'c', u'd']
->>> getListText(u'a, b and c', u'and')
+>>> get_list_text(u'a, b and c', u'and')
 [u'a', u'b', u'c']
->>> getListText(u'a and b', u'and')
+>>> get_list_text(u'a and b', u'and')
 [u'a', u'b']
->>> getListText(u'a')
+>>> get_list_text(u'a')
 [u'a']
->>> getListText(u'')
+>>> get_list_text(u'')
 []
 """
 list_ = []

@@ -490,7 +490,7 @@ def getListText(text, last_word='or'):
 list_.append(last[1].strip())
 return list_
-def normalizeNewlines(text):
+def normalize_newlines(text):
 return re.sub(r'\r\n|\r|\n', '\n', text)
 def recapitalize(text):

@@ -514,7 +514,7 @@ def phone2numeric(phone):
 'y': '9', 'x': '9'}.get(m.group(0).lower())
 return letters.sub(char2number, phone)
-def compressString(s):
+def compress_string(s):
 import cStringIO, gzip
 zbuf = cStringIO.StringIO()
 zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)

@@ -523,13 +523,13 @@ def compressString(s):
 return zbuf.getvalue()
 smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
-def smartSplit(text):
+def smart_split(text):
 """
 Generator that splits a string by spaces, leaving quoted phrases together.
 Supports both single and double quotes, and supports escaping quotes with
 backslashes. In the output, strings will keep their initial and trailing
 quote marks.
->>> list(smartSplit('This is "a person\\'s" test.'))
+>>> list(smart_split('This is "a person\\'s" test.'))
 ['This', 'is', '"a person\\'s"', 'test.']
 """
 for bit in smart_split_re.finditer(text):


@@ -3,7 +3,7 @@
 import re
 import time
-from ox import strip_tags, findRe
+from ox import strip_tags, find_re
 from ox.cache import read_url

@@ -28,22 +28,22 @@ def getData(id):
 }
 html = read_url(data["url"], unicode=True)
 data['aka'] = parseList(html, 'AKA')
-data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
+data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
 data['countries'] = parseList(html, 'countries')
 data['director'] = parseEntry(html, 'directed by')
 data['genres'] = parseList(html, 'genres')
 data['keywords'] = parseList(html, 'keywords')
-data['posters'] = [findRe(html, '<img src="(http://cps-.*?)"')]
+data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
 data['produced'] = parseList(html, 'produced by')
-data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"')
+data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
 data['released'] = parseEntry(html, 'released by')
 data['releasedate'] = parseList(html, 'release date')
 data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
 data['set'] = parseEntry(html, 'set in')
-data['synopsis'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
+data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
 data['themes'] = parseList(html, 'themes')
 data['types'] = parseList(html, 'types')
-data['year'] = findRe(html, '<span class="year">.*?(\d+)')
+data['year'] = find_re(html, '<span class="year">.*?(\d+)')
 #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
 data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
 #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)

@@ -51,18 +51,18 @@ def getData(id):
 #html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
 #data['credits'] = parseTable(html)
 html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
-data['review'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
+data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
 return data
 def getUrl(id):
 return "http://allmovie.com/work/%s" % id
 def parseEntry(html, title):
-html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
+html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
 return strip_tags(html).strip()
 def parseList(html, title):
-html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
+html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
 r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
 if not r and html:
 r = [strip_tags(html)]

@@ -74,11 +74,11 @@ def parseTable(html):
 lambda x: strip_tags(x).strip().replace('&nbsp;', ''),
 x.split('<td width="305">-')
 ),
-findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
+find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
 )
 def parseText(html, title):
-return strip_tags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
+return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
 if __name__ == '__main__':
 print getData('129689')


@@ -3,7 +3,7 @@
 import re
 from urllib import quote
-from ox import findRe, strip_tags, decodeHtml
+from ox import find_re, strip_tags, decode_html
 from ox.cache import read_url

@@ -12,7 +12,7 @@ def findISBN(title, author):
 url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
 data = read_url(url, unicode=True)
 links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
-id = findRe(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
+id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
 data = getData(id)
 if author in data['authors']:
 return data

@@ -24,13 +24,13 @@ def getData(id):
 def findData(key):
-return findRe(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
+return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
 r = {}
 r['amazon'] = url
-r['title'] = findRe(data, '<span id="btAsinTitle" style="">(.*?)<span')
+r['title'] = find_re(data, '<span id="btAsinTitle" style="">(.*?)<span')
 r['authors'] = re.compile('<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL).findall(data)
-r['authors'] = filter(lambda x: len(x)>1, [decodeHtml(a) for a in r['authors']])
+r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']])
 t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
 if t:
 r['translator'] = t

@@ -38,15 +38,15 @@ def getData(id):
 r['language'] = findData('Language')
 r['isbn-10'] = findData('ISBN-10')
 r['isbn-13'] = findData('ISBN-13').replace('-', '')
-r['dimensions'] = findRe(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
+r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
 r['pages'] = findData('Paperback')
 if not r['pages']:
 r['pages'] = findData('Hardcover')
-r['review'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
-r['description'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
+r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
+r['description'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
 r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
 if r['cover']:


@@ -5,7 +5,7 @@ import re
 import ox.cache
 from ox.cache import read_url
 from ox.html import strip_tags
-from ox.text import findRe, removeSpecialCharacters
+from ox.text import find_re, remove_special_characters
 import imdb

@@ -33,40 +33,40 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
 html = read_url(data["url"], timeout=timeout, unicode=True)
 except:
 html = ox.cache.read_url(data["url"], timeout=timeout)
-data["number"] = findRe(html, "<li>Spine #(\d+)")
-data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
+data["number"] = find_re(html, "<li>Spine #(\d+)")
+data["title"] = find_re(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
 data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
-data["director"] = strip_tags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
-results = findRe(html, '<div class="left_column">(.*?)</div>')
+data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
+results = find_re(html, '<div class="left_column">(.*?)</div>')
 results = re.compile("<li>(.*?)</li>").findall(results)
 data["country"] = results[0]
 data["year"] = results[1]
-data["synopsis"] = strip_tags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
-result = findRe(html, "<div class=\"purchase\">(.*?)</div>")
+data["synopsis"] = strip_tags(find_re(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
+result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
 if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
 r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
 if r:
 result = r[0]
-result = findRe(result, "<a href=\"(.*?)\"")
+result = find_re(result, "<a href=\"(.*?)\"")
 if not "/boxsets/" in result:
 data["posters"] = [result]
 else:
 html_ = read_url(result, unicode=True)
-result = findRe(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
-result = findRe(result, "src=\"(.*?)\"")
+result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
+result = find_re(result, "src=\"(.*?)\"")
 if result:
 data["posters"] = [result.replace("_w100", "")]
 else:
 data["posters"] = []
-result = findRe(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
+result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
 if result:
 data["stills"] = [result]
 data["trailers"] = []
 else:
-data["stills"] = filter(lambda x: x, [findRe(html, "\"thumbnailURL\", \"(.*?)\"")])
-data["trailers"] = filter(lambda x: x, [findRe(html, "\"videoURL\", \"(.*?)\"")])
+data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
+data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
 if timeout == ox.cache.cache_timeout:
 timeout = -1


@@ -3,7 +3,7 @@
 import re
 import urllib
 import ox
-from ox import strip_tags, decodeHtml
+from ox import strip_tags, decode_html
 from ox.utils import json
 from ox.cache import read_url

@@ -17,6 +17,6 @@ def find(query, timeout=ox.cache.cache_timeout):
 results = []
 regex = '<a .*?class="l le" href="(.+?)">(.*?)</a>.*?<div class="cra">(.*?)</div>'
 for r in re.compile(regex, re.DOTALL).findall(data):
-results.append((strip_tags(decodeHtml(r[1])), r[0], strip_tags(decodeHtml(r[2]))))
+results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
 return results


@@ -3,7 +3,7 @@
 import re
 import time
-from ox import strip_tags, findRe
+from ox import strip_tags, find_re
 from ox.cache import read_url
 import google

@@ -23,8 +23,8 @@ def getShowUrl(title):
 def getShowData(url):
 data = read_url(url, unicode=True)
 r = {}
-r['title'] = strip_tags(findRe(data, '<h1>(.*?)</h1>'))
-r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
+r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
+r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
 r['episodes'] = {}
 #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
 for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):


@@ -5,7 +5,7 @@ import re
 from lxml.html import document_fromstring
 from ox.cache import read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 from ox.web.imdb import ImdbCombined


@@ -3,7 +3,7 @@
 import json
 from ox.cache import read_url
-from ox import findRe
+from ox import find_re
 class Imdb(dict):
 def __init__(self, id, timeout=-1):

@@ -36,7 +36,7 @@ class Imdb(dict):
 if 'nytimes' in self:
 self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
-self['amgId'] = findRe(self['nytimes'], 'movie/(\d+)/')
+self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')


@@ -4,7 +4,7 @@ import re
 import urllib
 import ox
-from ox import strip_tags, decodeHtml
+from ox import strip_tags, decode_html
 DEFAULT_MAX_RESULTS = 10
 DEFAULT_TIMEOUT = 24*60*60

@@ -34,7 +34,7 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
 for a in re.compile(
 '<a href="(\S+?)" class=l .*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>'
 ).findall(data):
-results.append((strip_tags(decodeHtml(a[1])), a[0], strip_tags(decodeHtml(a[2]))))
+results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
 if len(results) >= max_results:
 break
 return results


@@ -8,8 +8,8 @@ import time
 import unicodedata
 import ox
-from ox import findRe, strip_tags
-from ox.normalize import normalizeTitle, normalizeImdbId
+from ox import find_re, strip_tags
+from ox.normalize import normalize_title, normalize_imdbid
 import ox.cache
 from siteparser import SiteParser

@@ -50,7 +50,7 @@ class Imdb(SiteParser):
 'page': 'business',
 're': [
 '<h5>Budget</h5>\s*?\$(.*?)<br',
-lambda data: findRe(ox.decodeHtml(data).replace(',', ''), '\d+')
+lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
 ],
 'type': 'int'
 },

@@ -141,7 +141,7 @@ class Imdb(SiteParser):
 'page': 'business',
 're': [
 '<h5>Gross</h5>\s*?\$(.*?)<br',
-lambda data: findRe(data.replace(',', ''), '\d+')
+lambda data: find_re(data.replace(',', ''), '\d+')
 ],
 'type': 'int'
 },

@@ -314,7 +314,7 @@ class Imdb(SiteParser):
 if 'runtime' in self and self['runtime']:
 if 'min' in self['runtime']: base=60
 else: base=1
-self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
+self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
 if 'runtime' in self and not self['runtime']:
 del self['runtime']
 if 'votes' in self: self['votes'] = self['votes'].replace(',', '')

@@ -551,7 +551,7 @@ def getMovieId(title, director='', year='', timeout=-1):
 #print google_query
 results = google.find(google_query, timeout=timeout)
 if results:
-return findRe(results[0][1], 'title/tt(\d{7})')
+return find_re(results[0][1], 'title/tt(\d{7})')
 #or nothing
 return ''

@@ -567,7 +567,7 @@ def getMoviePoster(imdbId):
 if 'posterId' in info:
 url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId)
 data = read_url(url)
-poster = findRe(data, 'img id="primary-img".*?src="(.*?)"')
+poster = find_re(data, 'img id="primary-img".*?src="(.*?)"')
 return poster
 elif 'series' in info:
 return getMoviePoster(info['series'])


@@ -4,7 +4,7 @@ import re
 from ox.cache import read_url
 from ox.html import strip_tags
-from ox.text import findRe
+from ox.text import find_re
 def getData(id):

@@ -22,13 +22,13 @@ def getData(id):
 'url': getUrl(id)
 }
 html = read_url(data['url'], unicode=True)
-data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
+data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
 if not data['imdbId']:
 data['imdbId'] = _id_map.get(id, '')
-data['title'] = strip_tags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
-data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
+data['title'] = strip_tags(find_re(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
+data['year'] = find_re(html, '\(<a href="alpha1.html">(.*?)</a>\)')
 data['posters'] = []
-poster = findRe(html, '<img src="(posters.*?)"')
+poster = find_re(html, '<img src="(posters.*?)"')
 if poster:
 poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
 data['posters'].append(poster)

@@ -37,13 +37,13 @@ def getData(id):
 result = result.replace('_xlg.html', '.html')
 url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
 html = read_url(url, unicode=True)
-result = findRe(html, '<a href = (\w*?_xlg.html)')
+result = find_re(html, '<a href = (\w*?_xlg.html)')
 if result:
 url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
 html = read_url(url, unicode=True)
-poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
+poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
 else:
-poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)"'))
+poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
 data['posters'].append(poster)
 return data

@@ -54,7 +54,7 @@ def getId(url):
 split = split[4][:-5].split('_')
 if split[-1] == 'xlg':
 split.pop()
-if findRe(split[-1], 'ver\d+$'):
+if find_re(split[-1], 'ver\d+$'):
 split.pop()
 id = '%s/%s' % (year, '_'.join(split))
 return id

@@ -62,7 +62,7 @@ def getId(url):
 def getIds():
 ids = []
 html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
-pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
+pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
 for page in range(pages, 0, -1):
 for id in getIdsByPage(page):
 if not id in ids:

@@ -81,7 +81,7 @@ def getIdsByPage(page):
 def getUrl(id):
 url = u"http://www.impawards.com/%s.html" % id
 html = read_url(url, unicode=True)
-if findRe(html, "No Movie Posters on This Page"):
+if find_re(html, "No Movie Posters on This Page"):
 url = u"http://www.impawards.com/%s_ver1.html" % id
 return url


@@ -4,9 +4,9 @@ import re
 import urllib
 from ox.cache import read_url
-from ox.html import decodeHtml, strip_tags
-from ox.text import findRe
-from ox.text import findString
+from ox.html import decode_html, strip_tags
+from ox.text import find_re
+from ox.text import find_string
 # to sniff itunes traffic, use something like

@@ -65,26 +65,26 @@ def parseXmlDict(xml):
 strings = xml.split('<key>')
 for string in strings:
 if string.find('</key>') != -1:
-key = findRe(string, '(.*?)</key>')
-type = findRe(string, '</key><(.*?)>')
+key = find_re(string, '(.*?)</key>')
+type = find_re(string, '</key><(.*?)>')
 if type == 'true/':
 value = True
 else:
-value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
+value = find_re(string, '<%s>(.*?)</%s>' % (type, type))
 if type == 'integer':
 value = int(value)
 elif type == 'string':
-value = decodeHtml(value)
+value = decode_html(value)
 values[key] = value
 return values
 def parseCast(xml, title):
 list = []
 try:
-strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
+strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
 strings.pop()
 for string in strings:
-list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
+list.append(find_re(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
 return list
 except:
 return list

@@ -92,12 +92,12 @@ def parseCast(xml, title):
 def parseMovies(xml, title):
 list = []
 try:
-strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
+strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
 strings.pop()
 for string in strings:
 list.append({
-'id': findRe(string, 'viewMovie\?id=(.*?)&'),
-'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
+'id': find_re(string, 'viewMovie\?id=(.*?)&'),
+'title': find_re(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
 })
 return list
 except:

@@ -114,24 +114,24 @@ class ItunesAlbum:
 def getId(self):
 url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
 xml = read_url(url, headers = ITUNES_HEADERS)
-id = findRe(xml, 'viewAlbum\?id=(.*?)&')
+id = find_re(xml, 'viewAlbum\?id=(.*?)&')
 return id
 def getData(self):
 data = {'id': self.id}
 url = composeUrl('viewAlbum', {'id': self.id})
 xml = read_url(url, None, ITUNES_HEADERS)
-data['albumName'] = findRe(xml, '<B>(.*?)</B>')
-data['artistName'] = findRe(xml, '<b>(.*?)</b>')
-data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
-data['genre'] = findRe(xml, 'Genre:(.*?)<')
-data['releaseDate'] = findRe(xml, 'Released(.*?)<')
-data['review'] = strip_tags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
+data['albumName'] = find_re(xml, '<B>(.*?)</B>')
+data['artistName'] = find_re(xml, '<b>(.*?)</b>')
+data['coverUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
+data['genre'] = find_re(xml, 'Genre:(.*?)<')
+data['releaseDate'] = find_re(xml, 'Released(.*?)<')
+data['review'] = strip_tags(find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
 data['tracks'] = []
-strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
+strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
 for string in strings:
 data['tracks'].append(parseXmlDict(string))
-data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
+data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
 return data
 class ItunesMovie:

@@ -145,7 +145,7 @@ class ItunesMovie:
 def getId(self):
 url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
 xml = read_url(url, headers = ITUNES_HEADERS)
-id = findRe(xml, 'viewMovie\?id=(.*?)&')
+id = find_re(xml, 'viewMovie\?id=(.*?)&')
 return id
 def getData(self):

@@ -156,21 +156,21 @@ class ItunesMovie:
 f.write(xml)
 f.close()
 data['actors'] = parseCast(xml, 'actors')
-string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
+string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
 data['averageRating'] = string.count('rating_star_000033.png') + string.count('&#189;') * 0.5
 data['directors'] = parseCast(xml, 'directors')
-data['format'] = findRe(xml, 'Format:(.*?)<')
-data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
-data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
-data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
+data['format'] = find_re(xml, 'Format:(.*?)<')
+data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
+data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
+data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
 data['producers'] = parseCast(xml, 'producers')
-data['rated'] = findRe(xml, 'Rated(.*?)<')
+data['rated'] = find_re(xml, 'Rated(.*?)<')
 data['relatedMovies'] = parseMovies(xml, 'related movies')
-data['releaseDate'] = findRe(xml, 'Released(.*?)<')
-data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
+data['releaseDate'] = find_re(xml, 'Released(.*?)<')
+data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
 data['screenwriters'] = parseCast(xml, 'screenwriters')
-data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
-data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
+data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
+data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
 return data
 if __name__ == '__main__':


@@ -1,20 +1,20 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 from ox.cache import read_url
-from ox.html import decodeHtml
-from ox.text import findRe
+from ox.html import decode_html
+from ox.text import find_re
 def getLyrics(title, artist):
 html = read_url('http://lyricsfly.com/api/')
-key = findRe(html, '<font color=green><b>(.*?)</b></font>')
+key = find_re(html, '<font color=green><b>(.*?)</b></font>')
 url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
 xml = read_url(url)
-lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
+lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
 lyrics = lyrics.replace('\n', '').replace('\r', '')
 lyrics = lyrics.replace('[br]', '\n').strip()
 lyrics.replace('\n\n\n', '\n\n')
-lyrics = decodeHtml(lyrics.replace('&amp;', '&'))
+lyrics = decode_html(lyrics.replace('&amp;', '&'))
 return lyrics
 if __name__ == '__main__':


@@ -5,7 +5,7 @@ from urllib import quote
 from lxml.html import document_fromstring
 from ox.cache import read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 def getUrl(id):
 return 'http://www.metacritic.com/movie/%s' % id

@@ -16,14 +16,14 @@ def getId(url):
 def getUrlByImdb(imdb):
 url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
 data = read_url(url)
-metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"')
+metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
 return metacritic_url or None
 def getMetacriticShowUrl(title):
 title = quote(title)
 url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
 data = read_url(url)
-return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
+return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
 def getData(url):
 data = read_url(url, unicode=True)


@@ -6,8 +6,8 @@ import socket
 from urllib import quote
 
 from ox.cache import read_url
-from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, int_value, normalizeNewlines
-from ox.normalize import normalizeImdbId
+from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, int_value, normalize_newlines
+from ox.normalize import normalize_imdbid
 import ox
 
 from torrent import Torrent
@@ -20,7 +20,7 @@ def _parseResultsPage(data, max_results=10):
         torrentDate = row[0]
         torrentExtra = row[1]
         torrentId = row[2]
-        torrentTitle = decodeHtml(row[3]).strip()
+        torrentTitle = decode_html(row[3]).strip()
         torrentLink = "http://www.mininova.org/tor/" + torrentId
         privateTracker = 'priv.gif' in torrentExtra
         if not privateTracker:
@@ -38,13 +38,13 @@ def findMovieByImdb(imdbId):
     '''find torrents on mininova for a given imdb id
     '''
     results = []
-    imdbId = normalizeImdbId(imdbId)
+    imdbId = normalize_imdbid(imdbId)
     data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
     return _parseResultsPage(data)
 
 def getId(mininovaId):
     mininovaId = unicode(mininovaId)
-    d = findRe(mininovaId, "/(\d+)")
+    d = find_re(mininovaId, "/(\d+)")
     if d:
         return d
     mininovaId = mininovaId.split('/')
@@ -81,14 +81,14 @@ def getData(mininovaId):
     for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
         key = d[0].lower().strip()
         key = _key_map.get(key, key)
-        value = decodeHtml(strip_tags(d[1].strip()))
+        value = decode_html(strip_tags(d[1].strip()))
         torrent[key] = value
 
-    torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
-    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
-    torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
+    torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
+    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
+    torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
     if torrent['description']:
-        torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
+        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
     t = read_url(torrent[u'torrent_link'])
     torrent[u'torrent_info'] = getTorrentInfo(t)
     return torrent
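The description cleanup chains three renamed helpers in a fixed order: drop tags, decode entities, then normalize line endings. A quick sketch with a made-up input, assuming strip_tags/decode_html/normalize_newlines behave as their names suggest:

    from ox import strip_tags, decode_html, normalize_newlines

    raw = '<p>Cops &amp; Robbers</p>\r\nPublic domain print.'
    normalize_newlines(decode_html(strip_tags(raw))).strip()
    # -> 'Cops & Robbers\nPublic domain print.'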

@@ -4,7 +4,7 @@
 import re
 
 from ox.cache import read_url
-from ox import findRe
+from ox import find_re
 
 def getData(id):
     '''
@@ -33,7 +33,7 @@ def getPostersByUrl(url, group=True, timeout=-1):
         results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
         for result in results:
             html = read_url(result, timeout=timeout, unicode=True)
-            posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
+            posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
     return posters
 
 def getUrl(id):

@@ -4,7 +4,7 @@ import re
 
 import feedparser
 from ox.cache import read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 from ox import langCode2To3, langTo3Code
 
 def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
@@ -26,7 +26,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
     if opensubtitleId:
         opensubtitleId = opensubtitleId[0]
     else:
-        opensubtitleId = findRe(data, '/en/subtitles/(.*?)/')
+        opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
     return opensubtitleId
 
 def downloadSubtitleById(opensubtitle_id):

@@ -3,7 +3,7 @@
 import re
 
 from ox.cache import getHeaders, read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 
 
 def getUrlByImdb(imdb):
@@ -22,16 +22,16 @@ def getUrlByImdb(imdb):
     return None
 
 def get_og(data, key):
-    return findRe(data, '<meta property="og:%s".*?content="(.*?)"' % key)
+    return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
 
 def getData(url):
     data = read_url(url)
     r = {}
-    r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
+    r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
     if '(' in r['title']:
-        r['year'] = findRe(r['title'], '\((\d*?)\)')
+        r['year'] = find_re(r['title'], '\((\d*?)\)')
         r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
-    r['summary'] = strip_tags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
+    r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
     r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
     if not r['summary']:
         r['summary'] = get_og(data, 'description')
@@ -40,9 +40,9 @@ def getData(url):
     meter = filter(lambda m: m[1].isdigit(), meter)
     if meter:
         r['tomatometer'] = meter[0][1]
-    r['rating'] = findRe(data, 'Average Rating: <span>([\d.]+)/10</span>')
-    r['user_score'] = findRe(data, '<span class="meter popcorn numeric ">(\d+)</span>')
-    r['user_rating'] = findRe(data, 'Average Rating: ([\d.]+)/5')
+    r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
+    r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
+    r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
     poster = get_og(data, 'image')
     if poster and not 'poster_default.gif' in poster:
         r['posters'] = [poster]
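get_og is a one-line find_re wrapper over Open Graph meta tags, which is what makes the summary and poster fallbacks above single calls. Sketch with a made-up tag:

    data = '<meta property="og:image" content="http://example.com/p.jpg" />'
    get_og(data, 'image')
    # -> 'http://example.com/p.jpg'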

@@ -3,7 +3,7 @@
 import re
 
 from ..cache import read_url
-from .. import strip_tags, decodeHtml
+from .. import strip_tags, decode_html
 from ..utils import datetime
 
@@ -11,8 +11,8 @@ def cleanup(key, data, data_type):
     if data:
         if isinstance(data[0], basestring):
             #FIXME: some types need strip_tags
-            #data = [strip_tags(decodeHtml(p)).strip() for p in data]
-            data = [decodeHtml(p).strip() for p in data]
+            #data = [strip_tags(decode_html(p)).strip() for p in data]
+            data = [decode_html(p).strip() for p in data]
         elif isinstance(data[0], list) or isinstance(data[0], tuple):
             data = [cleanup(key, p, data_type) for p in data]
     while len(data) == 1 and not isinstance(data, basestring):

@@ -5,7 +5,7 @@ import re
 import time
 
 import ox.cache
-from ox.html import decodeHtml, strip_tags
+from ox.html import decode_html, strip_tags
 import ox.net
 
@@ -44,8 +44,8 @@ def getNews(year, month, day):
             new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
         else:
             new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
-        # fix decodeHtml
-        # new['description'] = formatString(decodeHtml(description))
+        # fix decode_html
+        # new['description'] = formatString(decode_html(description))
         new['description'] = formatString(description)
         new['imageUrl'] = imageUrl
         new['section'] = formatSection(section)
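The date slicing in getNews only works for a fixed layout; the offsets imply something like 'dd.mm.yyyy, hh:mm'. A worked check with an assumed sample value (not from the commit):

    dateString = '14.08.2012, 16:12'
    '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2],
                        dateString[12:14], dateString[15:17])
    # -> '2012-08-14 16:12'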

@@ -6,8 +6,8 @@ import socket
 from urllib import quote, urlencode
 from urllib2 import URLError
 
-from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, normalizeNewlines
-from ox.normalize import normalizeImdbId
+from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, normalize_newlines
+from ox.normalize import normalize_imdbid
 import ox
 
 from torrent import Torrent
@@ -38,7 +38,7 @@ def findMovies(query, max_results=10):
     for row in re.compile(regexp, re.DOTALL).findall(data):
         torrentType = row[0]
         torrentLink = "http://thepiratebay.org" + row[1]
-        torrentTitle = decodeHtml(row[2])
+        torrentTitle = decode_html(row[2])
         # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
         if torrentType in ['201']:
             results.append((torrentTitle, torrentLink, ''))
@@ -48,15 +48,15 @@ def findMovies(query, max_results=10):
     return results
 
 def findMovieByImdb(imdb):
-    return findMovies("tt" + normalizeImdbId(imdb))
+    return findMovies("tt" + normalize_imdbid(imdb))
 
 def getId(piratebayId):
     if piratebayId.startswith('http://torrents.thepiratebay.org/'):
         piratebayId = piratebayId.split('org/')[1]
-    d = findRe(piratebayId, "tor/(\d+)")
+    d = find_re(piratebayId, "tor/(\d+)")
     if d:
         piratebayId = d
-    d = findRe(piratebayId, "torrent/(\d+)")
+    d = find_re(piratebayId, "torrent/(\d+)")
     if d:
         piratebayId = d
     return piratebayId
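getId normalizes a bare id or a torrent-page URL; the two find_re passes pick up 'tor/<id>' and 'torrent/<id>' forms, and anything else falls through unchanged. Made-up ids:

    getId('http://thepiratebay.org/torrent/6123456')  # -> '6123456'
    getId('6123456')                                   # -> '6123456'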
@@ -80,21 +80,21 @@ def getData(piratebayId):
     torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
 
     data = read_url(torrent['comment_link'], unicode=True)
-    torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
+    torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
     if not torrent[u'title']:
         return None
-    torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
-    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
+    torrent[u'title'] = decode_html(torrent[u'title']).strip()
+    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
     title = quote(torrent['title'].encode('utf-8'))
     torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
     for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
         key = d[0].lower().strip()
         key = _key_map.get(key, key)
-        value = decodeHtml(strip_tags(d[1].strip()))
+        value = decode_html(strip_tags(d[1].strip()))
         torrent[key] = value
-    torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
+    torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
     if torrent[u'description']:
-        torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
+        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
     t = _read_url(torrent[u'torrent_link'])
     torrent[u'torrent_info'] = getTorrentInfo(t)
     return torrent

@@ -3,7 +3,7 @@
 import re
 import time
 
-from ox import strip_tags, findRe
+from ox import strip_tags, find_re
 from ox.cache import read_url
 
@@ -16,11 +16,11 @@ def getEpisodeData(url):
     '''
     data = read_url(url, unicode=True)
     r = {}
-    r['description'] = strip_tags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
-    r['show'] = findRe(data, '<h1>(.*?)</h1>')
-    r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
+    r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
+    r['show'] = find_re(data, '<h1>(.*?)</h1>')
+    r['title'] = find_re(data, '<title>.*?: (.*?) - TV.com </title>')
     #episode score
-    r['episode score'] = findRe(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
+    r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
 
     match = re.compile('Episode Number: (\d*?) &nbsp;&nbsp; Season Num: (\d*?) &nbsp;&nbsp; First Aired: (.*?) &nbsp').findall(data)
     if match:

@@ -5,7 +5,7 @@ from StringIO import StringIO
 import xml.etree.ElementTree as ET
 
 from ox.cache import read_url
-from ox import findString, findRe
+from ox import find_string, find_re
 
 def getData(id):

@@ -5,7 +5,7 @@ from urllib import urlencode
 
 from ox.utils import json
 from ox.cache import read_url
-from ox import findRe, decodeHtml
+from ox import find_re, decode_html
 
 def getId(url):
@@ -54,7 +54,7 @@ def getMovieData(wikipediaUrl):
     if not wikipediaUrl.startswith('http'):
         wikipediaUrl = getUrl(wikipediaUrl)
     data = getWikiData(wikipediaUrl)
-    filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
+    filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
     filmbox = {}
     _box = filmbox_data.strip().split('|')
     for row in _box:
@@ -72,12 +72,12 @@ def getMovieData(wikipediaUrl):
     if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
         del filmbox['amg_id']
     if 'Allmovie movie' in data:
-        filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)')
+        filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
     elif 'Allmovie title' in data:
-        filmbox['amg_id'] = findRe(data, 'Allmovie title\|.*?(\d+)')
+        filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
 
     if 'Official website' in data:
-        filmbox['website'] = findRe(data, 'Official website\|(.*?)}').strip()
+        filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
 
     r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
     if r:
@@ -99,17 +99,17 @@ def getMovieData(wikipediaUrl):
     if r:
         filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
     if 'google video' in data:
-        filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)[\|}]')
+        filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
     if 'DEFAULTSORT' in data:
-        filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
+        filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
     return filmbox
 
 def getImageUrl(name):
     url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
     data = read_url(url, unicode=True)
-    url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
+    url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
     if not url:
-        url = findRe(data, 'href="(//upload.wikimedia.org/.*?)"')
+        url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
         if url:
             url = 'http:' + url
     return url
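getImageUrl relies on find_re returning an empty string on a miss: when the absolute-URL pattern fails, it retries with a protocol-relative pattern and prepends 'http:'. Sketch against a made-up page:

    data = '<a href="//upload.wikimedia.org/wikipedia/commons/b.jpg">'
    url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')  # -> ''
    if not url:
        url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
        if url:
            url = 'http:' + url
    # url -> 'http://upload.wikimedia.org/wikipedia/commons/b.jpg'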