replace all CamelCase with under_score in ox

j 2012-08-14 16:12:43 +02:00
parent 2de989e188
commit bb35daa95c
31 changed files with 242 additions and 244 deletions
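The rename is almost entirely mechanical: an underscore is inserted before each interior capital and the result is lowercased, though a few names are flattened by hand (normalizeImdbId becomes normalize_imdbid, not normalize_imdb_id) and some modules keep the old spelling as an alias (stripTags = strip_tags, decodeHtml = decode_html below). A minimal sketch of the rule, assuming a hypothetical camel_to_snake helper that is not part of ox:

import re

def camel_to_snake(name):
    # insert '_' before every upper-case letter that follows a
    # lower-case letter or digit, then lower-case the result
    return re.sub(r'(?<=[a-z0-9])(?=[A-Z])', '_', name).lower()

assert camel_to_snake('stripTags') == 'strip_tags'
assert camel_to_snake('findRe') == 'find_re'
assert camel_to_snake('normalizeNewlines') == 'normalize_newlines'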

README
View File

@@ -10,12 +10,12 @@ Depends:
Usage:
import ox
data = ox.cache.readUrl('http:/...')
text = ox.stripTags(data)
ox.normalizeNewlines(text)
ox.formatBytes(len(data))
data = ox.cache.read_url('http:/...')
text = ox.strip_tags(data)
ox.normalize_newlines(text)
ox.format_bytes(len(data))
ox.formatBytes(1234567890)
ox.format_bytes(1234567890)
'1.15 GB'
import ox.web.imdb

View File

@@ -56,15 +56,15 @@ def strip_tags(value):
stripTags = strip_tags
def stripSpacesBetweenTags(value):
def strip_spaces_between_tags(value):
"Returns the given HTML with spaces between tags normalized to a single space"
return re.sub(r'>\s+<', '> <', value)
def stripEntities(value):
def strip_entities(value):
"Returns the given HTML with all entities (&something;) stripped"
return re.sub(r'&(?:\w+|#\d);', '', value)
def fixAmpersands(value):
def fix_ampersands(value):
"Returns the given HTML with all unencoded ampersands encoded correctly"
return unencoded_ampersands_re.sub('&amp;', value)
@@ -113,11 +113,11 @@ def clean_html(text):
* Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
bottom of the text.
"""
from text import normalizeNewlines
text = normalizeNewlines(text)
from text import normalize_newlines
text = normalize_newlines(text)
text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
text = fixAmpersands(text)
text = fix_ampersands(text)
# Remove all target="" attributes from <a> tags.
text = link_target_attribute_re.sub('\\1', text)
# Trim stupid HTML such as <br clear="all">.
@@ -168,8 +168,6 @@ def decode_html(html):
return match.group(0)
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
decodeHtml = decode_html
def highlight(text, query, hlClass="hl"):
"""
>>> highlight('me &amp; you and &#36;&#38;%', 'and')

View File

@@ -18,7 +18,7 @@ def latlngspan2latlng(lat, lng, latSpan, lngSpan):
lat_ne = lat + latSpan, lng_ne = lng + latSpan
)
def parseLocationString(location_string):
def parse_location_string(location_string):
l = location_string.split('+')
if len(l) == 1:
l = location_string.split(';')

View File

@@ -8,8 +8,8 @@ import hashlib
import os
import re
from normalize import normalizeName
from text import get_sort_name, findRe
from normalize import normalize_name
from text import get_sort_name, find_re
__all__ = ['parse_movie_path', 'create_movie_path', 'get_oxid']
@@ -308,14 +308,14 @@ def parse_movie_path(path):
if title.endswith('_'):
title = title[:-1] + '.'
year = findRe(title, '(\(\d{4}\))')
year = find_re(title, '(\(\d{4}\))')
if not year:
year = findRe(title, '(\(\d{4}-\d*\))')
year = find_re(title, '(\(\d{4}-\d*\))')
if year and title.endswith(year):
title = title[:-len(year)].strip()
year = year[1:-1]
if '-' in year:
year = findRe(year, '\d{4}')
year = find_re(year, '\d{4}')
#director
if len(parts) == 4:
@@ -323,7 +323,7 @@ def parse_movie_path(path):
if director.endswith('_'):
director = "%s." % director[:-1]
director = director.split('; ')
director = [normalizeName(d).strip() for d in director]
director = [normalize_name(d).strip() for d in director]
director = filter(lambda d: d not in ('Unknown Director', 'Various Directors'), director)
else:
director = []
@@ -338,13 +338,13 @@ def parse_movie_path(path):
language = ''
#season/episode/episodeTitle
season = findRe(parts[-1], '\.Season (\d+)\.')
season = find_re(parts[-1], '\.Season (\d+)\.')
if season:
season = int(season)
else:
season = None
episode = findRe(parts[-1], '\.Episode (\d+)\.')
episode = find_re(parts[-1], '\.Episode (\d+)\.')
if episode:
episode = int(episode)
else:
@@ -373,7 +373,7 @@ def parse_movie_path(path):
title = u'%s %s' % (title, episodeTitle)
#part
part = findRe(parts[-1], '\.Part (\d+)\.')
part = find_re(parts[-1], '\.Part (\d+)\.')
if part:
part = int(part)
else:

View File

@@ -37,13 +37,13 @@ _noarticles = (
'i was',
)
def canonicalTitle(title):
def canonical_title(title):
"""Return the title in the canonic format 'Movie Title, The'.
>>> canonicalTitle('The Movie Title')
>>> canonical_title('The Movie Title')
'Movie Title, The'
>>> canonicalTitle('Los Angeles Plays Itself')
>>> canonical_title('Los Angeles Plays Itself')
'Los Angeles Plays Itself'
"""
try:
@@ -72,10 +72,10 @@ def canonicalTitle(title):
## break
return title
def normalizeTitle(title):
def normalize_title(title):
"""Return the title in the normal "The Title" format.
>>> normalizeTitle('Movie Title, The')
>>> normalize_title('Movie Title, The')
'The Movie Title'
"""
stitle = title.split(', ')
@@ -85,14 +85,14 @@ def normalizeTitle(title):
title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
return title
def normalizeImdbId(imdbId):
def normalize_imdbid(imdbId):
"""Return 7 digit imdbId.
>>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
>>> normalize_imdbid('http://www.imdb.com/title/tt0159206/')
'0159206'
>>> normalizeImdbId(159206)
>>> normalize_imdbid(159206)
'0159206'
>>> normalizeImdbId('tt0159206')
>>> normalize_imdbid('tt0159206')
'0159206'
"""
if isinstance(imdbId, basestring):
@@ -106,20 +106,20 @@ def normalizeImdbId(imdbId):
_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al')
def canonicalName(name):
def canonical_name(name):
"""Return the given name in canonical "Surname, Name" format.
It assumes that name is in the 'Name Surname' format.
>>> canonicalName('Jean Luc Godard')
>>> canonical_name('Jean Luc Godard')
'Godard, Jean Luc'
>>> canonicalName('Ivan Ivanov-Vano')
>>> canonical_name('Ivan Ivanov-Vano')
'Ivanov-Vano, Ivan'
>>> canonicalName('Gus Van Sant')
>>> canonical_name('Gus Van Sant')
'Van Sant, Gus'
>>> canonicalName('Brian De Palma')
>>> canonical_name('Brian De Palma')
'De Palma, Brian'
"""
@@ -167,19 +167,19 @@ def canonicalName(name):
name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
return name
def normalizeName(name):
def normalize_name(name):
"""Return a name in the normal "Name Surname" format.
>>> normalizeName('Godard, Jean Luc')
>>> normalize_name('Godard, Jean Luc')
'Jean Luc Godard'
>>> normalizeName('Ivanov-Vano, Ivan')
>>> normalize_name('Ivanov-Vano, Ivan')
'Ivan Ivanov-Vano'
>>> normalizeName('Van Sant, Gus')
>>> normalize_name('Van Sant, Gus')
'Gus Van Sant'
>>> normalizeName('De Palma, Brian')
>>> normalize_name('De Palma, Brian')
'Brian De Palma'
"""
sname = name.split(', ')
@@ -187,12 +187,12 @@ def normalizeName(name):
name = '%s %s' % (sname[1], sname[0])
return name
def normalizePath(path):
def normalize_path(path):
path = path.replace(':', '_').replace('/', '_')
if path.endswith('.'): path = path[:-1] + '_'
return path
def stripAccents(s):
def strip_accents(s):
if isinstance(s, str):
s = unicode(s)
return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
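Doctest-style checks for two of the renamed helpers above; these are illustrative and not part of the commit:

>>> normalize_path('a:b/c.')
'a_b_c_'
>>> strip_accents(u'Café')
u'Cafe'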

View File

@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# ci:si:et:sw=4:sts=4:ts=4
import re
from text import findRe
from text import find_re
import cache
from utils import json, ET
@@ -13,14 +13,14 @@ def get_embed_code(url, maxwidth=None, maxheight=None):
json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('<link.*?>').findall(html))
xml_oembed = filter(lambda l: 'xml+oembed' in l, re.compile('<link.*?>').findall(html))
if json_oembed:
oembed_url = findRe(json_oembed[0], 'href="(.*?)"')
oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
if maxwidth:
oembed_url += '&maxwidth=%d' % maxwidth
if maxheight:
oembed_url += '&maxheight=%d' % maxheight
embed = json.loads(cache.readUrl(oembed_url))
elif xml_oembed:
oembed_url = findRe(json_oembed[0], 'href="(.*?)"')
oembed_url = find_re(xml_oembed[0], 'href="(.*?)"')
if maxwidth:
oembed_url += '&maxwidth=%d' % maxwidth
if maxheight:

View File

@@ -11,7 +11,7 @@ import ox
__all__ = []
def _detectEncoding(fp):
def _detect_encoding(fp):
bomDict={ # bytepattern : name
(0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
(0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
@@ -63,7 +63,7 @@ def load(filename, offset=0):
return offset + ox.time2ms(t.replace(',', '.')) / 1000
with open(filename) as f:
encoding = _detectEncoding(f)
encoding = _detect_encoding(f)
data = f.read()
try:
data = unicode(data, encoding)

View File

@@ -257,24 +257,24 @@ def get_sort_title(title):
return title[length + spaces:] + ', ' + title[:length]
return title
def findRe(string, regexp):
def find_re(string, regexp):
result = re.compile(regexp, re.DOTALL).findall(string)
if result:
return result[0].strip()
return ''
def findString(string, string0='', string1 = ''):
def find_string(string, string0='', string1 = ''):
"""Return the string between string0 and string1.
If string0 or string1 is left out, beginning or end of string is used.
>>> findString('i am not there', string1=' not there')
>>> find_string('i am not there', string1=' not there')
'i am'
>>> findString('i am not there', 'i am ', ' there')
>>> find_string('i am not there', 'i am ', ' there')
'not'
>>> findString('i am not there', 'i am not t')
>>> find_string('i am not there', 'i am not t')
'here'
"""
@@ -286,7 +286,7 @@ def findString(string, string0='', string1 = ''):
string1 = re.escape(string1)
else:
string1 = '$'
return findRe(string, string0 + '(.*?)' + string1)
return find_re(string, string0 + '(.*?)' + string1)
def parse_useragent(useragent):
data = {}
@@ -319,7 +319,7 @@ def parse_useragent(useragent):
break;
return data
def removeSpecialCharacters(text):
def remove_special_characters(text):
"""
Removes special characters inserted by Word.
"""
@@ -346,22 +346,22 @@ def wrap(text, width):
text.split(' ')
)
def wrapString(string, length=80, separator='\n', balance=False):
def wrap_string(string, length=80, separator='\n', balance=False):
'''
>>> wrapString(u"Anticonstitutionellement, Paris s'eveille", 16)
>>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16)
u"Anticonstitution\\nellement, Paris \\ns'eveille"
>>> wrapString(u'All you can eat', 12, '\\n', True)
>>> wrap_string(u'All you can eat', 12, '\\n', True)
u'All you \\ncan eat'
'''
words = string.split(' ')
if balance:
# balance lines: test if same number of lines
# can be achieved with a shorter line length
lines = wrapString(string, length, separator, False).split(separator)
lines = wrap_string(string, length, separator, False).split(separator)
if len(lines) > 1:
while length > max(map(lambda x : len(x), words)):
length -= 1
if len(wrapString(string, length, separator, False).split(separator)) > len(lines):
if len(wrap_string(string, length, separator, False).split(separator)) > len(lines):
length += 1
break
lines = ['']
@@ -382,12 +382,12 @@ def wrapString(string, length=80, separator='\n', balance=False):
lines[len(lines) - 1] += u' '
return separator.join(lines).strip()
def truncateString(string, length, padding='...', position='right'):
# >>> truncateString('anticonstitutionellement', 16, '...', 'left')
def truncate_string(string, length, padding='...', position='right'):
# >>> truncate_string('anticonstitutionellement', 16, '...', 'left')
# '...utionellement'
# >>> truncateString('anticonstitutionellement', 16, '...', 'center')
# >>> truncate_string('anticonstitutionellement', 16, '...', 'center')
# 'anticon...lement'
# >>> truncateString('anticonstitutionellement', 16, '...', 'right')
# >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
# 'anticonstitut...'
stringLength = len(string);
paddingLength = len(padding)
@@ -402,12 +402,12 @@ def truncateString(string, length, padding='...', position='right'):
string = '%s%s' % (string[:length - paddingLength], padding)
return string;
def truncateWords(s, num):
def truncate_words(s, num):
"""Truncates a string after a certain number of chacters, but ends with a word
>>> truncateString('Truncates a string after a certain number of chacters, but ends with a word', 23)
>>> truncate_words('Truncates a string after a certain number of chacters, but ends with a word', 23)
'Truncates a string...'
>>> truncateString('Truncates a string', 23)
>>> truncate_words('Truncates a string', 23)
'Truncates a string'
"""
@@ -422,25 +422,25 @@ def truncateWords(s, num):
ts += "..."
return ts.strip()
def trimString(string, num):
def trim_string(string, num):
"""Truncates a string after a certain number of chacters, adding ... at -10 characters
>>> trimString('Truncates a string after a certain number of chacters', 23)
>>> trim_string('Truncates a string after a certain number of chacters', 23)
'Truncates ...f chacters'
>>> trimString('Truncates a string', 23)
>>> trim_string('Truncates a string', 23)
'Truncates a string'
"""
if len(string) > num:
string = string[:num - 13] + '...' + string[-10:]
return string
def getValidFilename(s):
def get_valid_filename(s):
"""
Returns the given string converted to a string that can be used for a clean
filename. Specifically, leading and trailing spaces are removed;
all non-filename-safe characters are removed.
>>> getValidFilename("john's portrait in 2004.jpg")
>>> get_valid_filename("john's portrait in 2004.jpg")
'john_s_portrait_in_2004.jpg'
"""
s = s.strip()
@@ -449,34 +449,34 @@ def getValidFilename(s):
s = s.replace('__', '_').replace('__', '_')
return s
def getTextList(list_, last_word='or'):
def get_text_list(list_, last_word='or'):
"""
>>> getTextList([u'a', u'b', u'c', u'd'])
>>> get_text_list([u'a', u'b', u'c', u'd'])
u'a, b, c or d'
>>> getTextList([u'a', u'b', u'c'], 'and')
>>> get_text_list([u'a', u'b', u'c'], 'and')
u'a, b and c'
>>> getTextList([u'a', u'b'], 'and')
>>> get_text_list([u'a', u'b'], 'and')
u'a and b'
>>> getTextList([u'a'])
>>> get_text_list([u'a'])
u'a'
>>> getTextList([])
>>> get_text_list([])
''
"""
if len(list_) == 0: return ''
if len(list_) == 1: return list_[0]
return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1])
def getListText(text, last_word='or'):
def get_list_text(text, last_word='or'):
"""
>>> getListText(u'a, b, c or d')
>>> get_list_text(u'a, b, c or d')
[u'a', u'b', u'c', u'd']
>>> getListText(u'a, b and c', u'and')
>>> get_list_text(u'a, b and c', u'and')
[u'a', u'b', u'c']
>>> getListText(u'a and b', u'and')
>>> get_list_text(u'a and b', u'and')
[u'a', u'b']
>>> getListText(u'a')
>>> get_list_text(u'a')
[u'a']
>>> getListText(u'')
>>> get_list_text(u'')
[]
"""
list_ = []
@@ -490,7 +490,7 @@ def getListText(text, last_word='or'):
list_.append(last[1].strip())
return list_
def normalizeNewlines(text):
def normalize_newlines(text):
return re.sub(r'\r\n|\r|\n', '\n', text)
def recapitalize(text):
@@ -514,7 +514,7 @@ def phone2numeric(phone):
'y': '9', 'x': '9'}.get(m.group(0).lower())
return letters.sub(char2number, phone)
def compressString(s):
def compress_string(s):
import cStringIO, gzip
zbuf = cStringIO.StringIO()
zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
@@ -523,13 +523,13 @@ def compressString(s):
return zbuf.getvalue()
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smartSplit(text):
def smart_split(text):
"""
Generator that splits a string by spaces, leaving quoted phrases together.
Supports both single and double quotes, and supports escaping quotes with
backslashes. In the output, strings will keep their initial and trailing
quote marks.
>>> list(smartSplit('This is "a person\\'s" test.'))
>>> list(smart_split('This is "a person\\'s" test.'))
['This', 'is', '"a person\\'s"', 'test.']
"""
for bit in smart_split_re.finditer(text):
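find_re returns the first regexp match, stripped, or '' when nothing matches, which is why nearly every scraper in this commit leans on it. An illustrative check, not part of the diff (the pattern reappears in the criterion scraper further down):

>>> find_re('<li>Spine #399</li>', '<li>Spine #(\d+)')
'399'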

View File

@@ -3,7 +3,7 @@
import re
import time
from ox import strip_tags, findRe
from ox import strip_tags, find_re
from ox.cache import read_url
@@ -28,22 +28,22 @@ def getData(id):
}
html = read_url(data["url"], unicode=True)
data['aka'] = parseList(html, 'AKA')
data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
data['countries'] = parseList(html, 'countries')
data['director'] = parseEntry(html, 'directed by')
data['genres'] = parseList(html, 'genres')
data['keywords'] = parseList(html, 'keywords')
data['posters'] = [findRe(html, '<img src="(http://cps-.*?)"')]
data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
data['produced'] = parseList(html, 'produced by')
data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"')
data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
data['released'] = parseEntry(html, 'released by')
data['releasedate'] = parseList(html, 'release date')
data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
data['set'] = parseEntry(html, 'set in')
data['synopsis'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['themes'] = parseList(html, 'themes')
data['types'] = parseList(html, 'types')
data['year'] = findRe(html, '<span class="year">.*?(\d+)')
data['year'] = find_re(html, '<span class="year">.*?(\d+)')
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
@@ -51,18 +51,18 @@ def getData(id):
#html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
#data['credits'] = parseTable(html)
html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
data['review'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
return data
def getUrl(id):
return "http://allmovie.com/work/%s" % id
def parseEntry(html, title):
html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
return strip_tags(html).strip()
def parseList(html, title):
html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
if not r and html:
r = [strip_tags(html)]
@@ -74,11 +74,11 @@ def parseTable(html):
lambda x: strip_tags(x).strip().replace('&nbsp;', ''),
x.split('<td width="305">-')
),
findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
)
def parseText(html, title):
return strip_tags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
if __name__ == '__main__':
print getData('129689')

View File

@@ -3,7 +3,7 @@
import re
from urllib import quote
from ox import findRe, strip_tags, decodeHtml
from ox import find_re, strip_tags, decode_html
from ox.cache import read_url
@@ -12,7 +12,7 @@ def findISBN(title, author):
url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
data = read_url(url, unicode=True)
links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
id = findRe(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
data = getData(id)
if author in data['authors']:
return data
@@ -24,13 +24,13 @@ def getData(id):
def findData(key):
return findRe(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
r = {}
r['amazon'] = url
r['title'] = findRe(data, '<span id="btAsinTitle" style="">(.*?)<span')
r['title'] = find_re(data, '<span id="btAsinTitle" style="">(.*?)<span')
r['authors'] = re.compile('<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL).findall(data)
r['authors'] = filter(lambda x: len(x)>1, [decodeHtml(a) for a in r['authors']])
r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']])
t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
if t:
r['translator'] = t
@@ -38,15 +38,15 @@ def getData(id):
r['language'] = findData('Language')
r['isbn-10'] = findData('ISBN-10')
r['isbn-13'] = findData('ISBN-13').replace('-', '')
r['dimensions'] = findRe(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
r['pages'] = findData('Paperback')
if not r['pages']:
r['pages'] = findData('Hardcover')
r['review'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['description'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['description'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
if r['cover']:

View File

@@ -5,7 +5,7 @@ import re
import ox.cache
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import findRe, removeSpecialCharacters
from ox.text import find_re, remove_special_characters
import imdb
@@ -33,40 +33,40 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
html = read_url(data["url"], timeout=timeout, unicode=True)
except:
html = ox.cache.read_url(data["url"], timeout=timeout)
data["number"] = findRe(html, "<li>Spine #(\d+)")
data["number"] = find_re(html, "<li>Spine #(\d+)")
data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
data["title"] = find_re(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
data["director"] = strip_tags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
results = findRe(html, '<div class="left_column">(.*?)</div>')
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
results = find_re(html, '<div class="left_column">(.*?)</div>')
results = re.compile("<li>(.*?)</li>").findall(results)
data["country"] = results[0]
data["year"] = results[1]
data["synopsis"] = strip_tags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
data["synopsis"] = strip_tags(find_re(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
result = findRe(html, "<div class=\"purchase\">(.*?)</div>")
result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
if r:
result = r[0]
result = findRe(result, "<a href=\"(.*?)\"")
result = find_re(result, "<a href=\"(.*?)\"")
if not "/boxsets/" in result:
data["posters"] = [result]
else:
html_ = read_url(result, unicode=True)
result = findRe(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
result = findRe(result, "src=\"(.*?)\"")
result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
result = find_re(result, "src=\"(.*?)\"")
if result:
data["posters"] = [result.replace("_w100", "")]
else:
data["posters"] = []
result = findRe(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
if result:
data["stills"] = [result]
data["trailers"] = []
else:
data["stills"] = filter(lambda x: x, [findRe(html, "\"thumbnailURL\", \"(.*?)\"")])
data["trailers"] = filter(lambda x: x, [findRe(html, "\"videoURL\", \"(.*?)\"")])
data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
if timeout == ox.cache.cache_timeout:
timeout = -1

View File

@@ -3,7 +3,7 @@
import re
import urllib
import ox
from ox import strip_tags, decodeHtml
from ox import strip_tags, decode_html
from ox.utils import json
from ox.cache import read_url
@@ -17,6 +17,6 @@ def find(query, timeout=ox.cache.cache_timeout):
results = []
regex = '<a .*?class="l le" href="(.+?)">(.*?)</a>.*?<div class="cra">(.*?)</div>'
for r in re.compile(regex, re.DOTALL).findall(data):
results.append((strip_tags(decodeHtml(r[1])), r[0], strip_tags(decodeHtml(r[2]))))
results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
return results

View File

@@ -3,7 +3,7 @@
import re
import time
from ox import strip_tags, findRe
from ox import strip_tags, find_re
from ox.cache import read_url
import google
@@ -23,8 +23,8 @@ def getShowUrl(title):
def getShowData(url):
data = read_url(url, unicode=True)
r = {}
r['title'] = strip_tags(findRe(data, '<h1>(.*?)</h1>'))
r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
r['episodes'] = {}
#1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):

View File

@@ -5,7 +5,7 @@ import re
from lxml.html import document_fromstring
from ox.cache import read_url
from ox import findRe, strip_tags
from ox import find_re, strip_tags
from ox.web.imdb import ImdbCombined

View File

@@ -3,7 +3,7 @@
import json
from ox.cache import read_url
from ox import findRe
from ox import find_re
class Imdb(dict):
def __init__(self, id, timeout=-1):
@@ -36,7 +36,7 @@ class Imdb(dict):
if 'nytimes' in self:
self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
self['amgId'] = findRe(self['nytimes'], 'movie/(\d+)/')
self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')

View File

@@ -4,7 +4,7 @@ import re
import urllib
import ox
from ox import strip_tags, decodeHtml
from ox import strip_tags, decode_html
DEFAULT_MAX_RESULTS = 10
DEFAULT_TIMEOUT = 24*60*60
@@ -34,7 +34,7 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
for a in re.compile(
'<a href="(\S+?)" class=l .*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>'
).findall(data):
results.append((strip_tags(decodeHtml(a[1])), a[0], strip_tags(decodeHtml(a[2]))))
results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
if len(results) >= max_results:
break
return results

View File

@@ -8,8 +8,8 @@ import time
import unicodedata
import ox
from ox import findRe, strip_tags
from ox.normalize import normalizeTitle, normalizeImdbId
from ox import find_re, strip_tags
from ox.normalize import normalize_title, normalize_imdbid
import ox.cache
from siteparser import SiteParser
@@ -50,7 +50,7 @@ class Imdb(SiteParser):
'page': 'business',
're': [
'<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: findRe(ox.decodeHtml(data).replace(',', ''), '\d+')
lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
],
'type': 'int'
},
@@ -141,7 +141,7 @@ class Imdb(SiteParser):
'page': 'business',
're': [
'<h5>Gross</h5>\s*?\$(.*?)<br',
lambda data: findRe(data.replace(',', ''), '\d+')
lambda data: find_re(data.replace(',', ''), '\d+')
],
'type': 'int'
},
@@ -314,7 +314,7 @@ class Imdb(SiteParser):
if 'runtime' in self and self['runtime']:
if 'min' in self['runtime']: base=60
else: base=1
self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
if 'runtime' in self and not self['runtime']:
del self['runtime']
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
@@ -551,7 +551,7 @@ def getMovieId(title, director='', year='', timeout=-1):
#print google_query
results = google.find(google_query, timeout=timeout)
if results:
return findRe(results[0][1], 'title/tt(\d{7})')
return find_re(results[0][1], 'title/tt(\d{7})')
#or nothing
return ''
@@ -567,7 +567,7 @@ def getMoviePoster(imdbId):
if 'posterId' in info:
url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = read_url(url)
poster = findRe(data, 'img id="primary-img".*?src="(.*?)"')
poster = find_re(data, 'img id="primary-img".*?src="(.*?)"')
return poster
elif 'series' in info:
return getMoviePoster(info['series'])

View File

@@ -4,7 +4,7 @@ import re
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import findRe
from ox.text import find_re
def getData(id):
@@ -22,13 +22,13 @@ def getData(id):
'url': getUrl(id)
}
html = read_url(data['url'], unicode=True)
data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
if not data['imdbId']:
data['imdbId'] = _id_map.get(id, '')
data['title'] = strip_tags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
data['title'] = strip_tags(find_re(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = find_re(html, '\(<a href="alpha1.html">(.*?)</a>\)')
data['posters'] = []
poster = findRe(html, '<img src="(posters.*?)"')
poster = find_re(html, '<img src="(posters.*?)"')
if poster:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
data['posters'].append(poster)
@@ -37,13 +37,13 @@ def getData(id):
result = result.replace('_xlg.html', '.html')
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = read_url(url, unicode=True)
result = findRe(html, '<a href = (\w*?_xlg.html)')
result = find_re(html, '<a href = (\w*?_xlg.html)')
if result:
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = read_url(url, unicode=True)
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
else:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)"'))
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
data['posters'].append(poster)
return data
@@ -54,7 +54,7 @@ def getId(url):
split = split[4][:-5].split('_')
if split[-1] == 'xlg':
split.pop()
if findRe(split[-1], 'ver\d+$'):
if find_re(split[-1], 'ver\d+$'):
split.pop()
id = '%s/%s' % (year, '_'.join(split))
return id
@@ -62,7 +62,7 @@ def getId(url):
def getIds():
ids = []
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
for page in range(pages, 0, -1):
for id in getIdsByPage(page):
if not id in ids:
@@ -81,7 +81,7 @@ def getIdsByPage(page):
def getUrl(id):
url = u"http://www.impawards.com/%s.html" % id
html = read_url(url, unicode=True)
if findRe(html, "No Movie Posters on This Page"):
if find_re(html, "No Movie Posters on This Page"):
url = u"http://www.impawards.com/%s_ver1.html" % id
return url

View File

@@ -4,9 +4,9 @@ import re
import urllib
from ox.cache import read_url
from ox.html import decodeHtml, strip_tags
from ox.text import findRe
from ox.text import findString
from ox.html import decode_html, strip_tags
from ox.text import find_re
from ox.text import find_string
# to sniff itunes traffic, use something like
@@ -65,26 +65,26 @@ def parseXmlDict(xml):
strings = xml.split('<key>')
for string in strings:
if string.find('</key>') != -1:
key = findRe(string, '(.*?)</key>')
type = findRe(string, '</key><(.*?)>')
key = find_re(string, '(.*?)</key>')
type = find_re(string, '</key><(.*?)>')
if type == 'true/':
value = True
else:
value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
value = find_re(string, '<%s>(.*?)</%s>' % (type, type))
if type == 'integer':
value = int(value)
elif type == 'string':
value = decodeHtml(value)
value = decode_html(value)
values[key] = value
return values
def parseCast(xml, title):
list = []
try:
strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
strings.pop()
for string in strings:
list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
list.append(find_re(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
return list
except:
return list
@@ -92,12 +92,12 @@ def parseCast(xml, title):
def parseMovies(xml, title):
list = []
try:
strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
strings.pop()
for string in strings:
list.append({
'id': findRe(string, 'viewMovie\?id=(.*?)&'),
'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
'id': find_re(string, 'viewMovie\?id=(.*?)&'),
'title': find_re(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
})
return list
except:
@@ -114,24 +114,24 @@ class ItunesAlbum:
def getId(self):
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
xml = read_url(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewAlbum\?id=(.*?)&')
id = find_re(xml, 'viewAlbum\?id=(.*?)&')
return id
def getData(self):
data = {'id': self.id}
url = composeUrl('viewAlbum', {'id': self.id})
xml = read_url(url, None, ITUNES_HEADERS)
data['albumName'] = findRe(xml, '<B>(.*?)</B>')
data['artistName'] = findRe(xml, '<b>(.*?)</b>')
data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
data['genre'] = findRe(xml, 'Genre:(.*?)<')
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
data['review'] = strip_tags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['albumName'] = find_re(xml, '<B>(.*?)</B>')
data['artistName'] = find_re(xml, '<b>(.*?)</b>')
data['coverUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
data['genre'] = find_re(xml, 'Genre:(.*?)<')
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
data['review'] = strip_tags(find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['tracks'] = []
strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
for string in strings:
data['tracks'].append(parseXmlDict(string))
data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
return data
class ItunesMovie:
@@ -145,7 +145,7 @@ class ItunesMovie:
def getId(self):
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
xml = read_url(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewMovie\?id=(.*?)&')
id = find_re(xml, 'viewMovie\?id=(.*?)&')
return id
def getData(self):
@@ -156,21 +156,21 @@
f.write(xml)
f.close()
data['actors'] = parseCast(xml, 'actors')
string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
data['averageRating'] = string.count('rating_star_000033.png') + string.count('&#189;') * 0.5
data['directors'] = parseCast(xml, 'directors')
data['format'] = findRe(xml, 'Format:(.*?)<')
data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
data['format'] = find_re(xml, 'Format:(.*?)<')
data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
data['producers'] = parseCast(xml, 'producers')
data['rated'] = findRe(xml, 'Rated(.*?)<')
data['rated'] = find_re(xml, 'Rated(.*?)<')
data['relatedMovies'] = parseMovies(xml, 'related movies')
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
data['screenwriters'] = parseCast(xml, 'screenwriters')
data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
return data
if __name__ == '__main__':

View File

@@ -1,20 +1,20 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox.cache import read_url
from ox.html import decodeHtml
from ox.text import findRe
from ox.html import decode_html
from ox.text import find_re
def getLyrics(title, artist):
html = read_url('http://lyricsfly.com/api/')
key = findRe(html, '<font color=green><b>(.*?)</b></font>')
key = find_re(html, '<font color=green><b>(.*?)</b></font>')
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
xml = read_url(url)
lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = lyrics.replace('\n', '').replace('\r', '')
lyrics = lyrics.replace('[br]', '\n').strip()
lyrics = lyrics.replace('\n\n\n', '\n\n')
lyrics = decodeHtml(lyrics.replace('&amp;', '&'))
lyrics = decode_html(lyrics.replace('&amp;', '&'))
return lyrics
if __name__ == '__main__':

View File

@@ -5,7 +5,7 @@ from urllib import quote
from lxml.html import document_fromstring
from ox.cache import read_url
from ox import findRe, strip_tags
from ox import find_re, strip_tags
def getUrl(id):
return 'http://www.metacritic.com/movie/%s' % id
@@ -16,14 +16,14 @@ def getId(url):
def getUrlByImdb(imdb):
url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
data = read_url(url)
metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"')
metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
return metacritic_url or None
def getMetacriticShowUrl(title):
title = quote(title)
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
data = read_url(url)
return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
def getData(url):
data = read_url(url, unicode=True)

View File

@@ -6,8 +6,8 @@ import socket
from urllib import quote
from ox.cache import read_url
from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, int_value, normalizeNewlines
from ox.normalize import normalizeImdbId
from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, int_value, normalize_newlines
from ox.normalize import normalize_imdbid
import ox
from torrent import Torrent
@@ -20,7 +20,7 @@ def _parseResultsPage(data, max_results=10):
torrentDate = row[0]
torrentExtra = row[1]
torrentId = row[2]
torrentTitle = decodeHtml(row[3]).strip()
torrentTitle = decode_html(row[3]).strip()
torrentLink = "http://www.mininova.org/tor/" + torrentId
privateTracker = 'priv.gif' in torrentExtra
if not privateTracker:
@@ -38,13 +38,13 @@ def findMovieByImdb(imdbId):
'''find torrents on mininova for a given imdb id
'''
results = []
imdbId = normalizeImdbId(imdbId)
imdbId = normalize_imdbid(imdbId)
data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
return _parseResultsPage(data)
def getId(mininovaId):
mininovaId = unicode(mininovaId)
d = findRe(mininovaId, "/(\d+)")
d = find_re(mininovaId, "/(\d+)")
if d:
return d
mininovaId = mininovaId.split('/')
@@ -81,14 +81,14 @@ def getData(mininovaId):
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(strip_tags(d[1].strip()))
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent

View File

@@ -4,7 +4,7 @@
import re
from ox.cache import read_url
from ox import findRe
from ox import find_re
def getData(id):
'''
@@ -33,7 +33,7 @@ def getPostersByUrl(url, group=True, timeout=-1):
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
for result in results:
html = read_url(result, timeout=timeout, unicode=True)
posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
return posters
def getUrl(id):

View File

@@ -4,7 +4,7 @@ import re
import feedparser
from ox.cache import read_url
from ox import findRe, strip_tags
from ox import find_re, strip_tags
from ox import langCode2To3, langTo3Code
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
@@ -26,7 +26,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
if opensubtitleId:
opensubtitleId = opensubtitleId[0]
else:
opensubtitleId = findRe(data, '/en/subtitles/(.*?)/')
opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
return opensubtitleId
def downloadSubtitleById(opensubtitle_id):

View File

@@ -3,7 +3,7 @@
import re
from ox.cache import getHeaders, read_url
from ox import findRe, strip_tags
from ox import find_re, strip_tags
def getUrlByImdb(imdb):
@@ -22,16 +22,16 @@ def getUrlByImdb(imdb):
return None
def get_og(data, key):
return findRe(data, '<meta property="og:%s".*?content="(.*?)"' % key)
return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
def getData(url):
data = read_url(url)
r = {}
r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
if '(' in r['title']:
r['year'] = findRe(r['title'], '\((\d*?)\)')
r['year'] = find_re(r['title'], '\((\d*?)\)')
r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
r['summary'] = strip_tags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
if not r['summary']:
r['summary'] = get_og(data, 'description')
@@ -40,9 +40,9 @@ def getData(url):
meter = filter(lambda m: m[1].isdigit(), meter)
if meter:
r['tomatometer'] = meter[0][1]
r['rating'] = findRe(data, 'Average Rating: <span>([\d.]+)/10</span>')
r['user_score'] = findRe(data, '<span class="meter popcorn numeric ">(\d+)</span>')
r['user_rating'] = findRe(data, 'Average Rating: ([\d.]+)/5')
r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
poster = get_og(data, 'image')
if poster and not 'poster_default.gif' in poster:
r['posters'] = [poster]

View File

@@ -3,7 +3,7 @@
import re
from ..cache import read_url
from .. import strip_tags, decodeHtml
from .. import strip_tags, decode_html
from ..utils import datetime
@@ -11,8 +11,8 @@ def cleanup(key, data, data_type):
if data:
if isinstance(data[0], basestring):
#FIXME: some types need strip_tags
#data = [strip_tags(decodeHtml(p)).strip() for p in data]
data = [decodeHtml(p).strip() for p in data]
#data = [strip_tags(decode_html(p)).strip() for p in data]
data = [decode_html(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data]
while len(data) == 1 and not isinstance(data, basestring):

View File

@@ -5,7 +5,7 @@ import re
import time
import ox.cache
from ox.html import decodeHtml, strip_tags
from ox.html import decode_html, strip_tags
import ox.net
@@ -44,8 +44,8 @@ def getNews(year, month, day):
new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
else:
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
# fix decodeHtml
# new['description'] = formatString(decodeHtml(description))
# fix decode_html
# new['description'] = formatString(decode_html(description))
new['description'] = formatString(description)
new['imageUrl'] = imageUrl
new['section'] = formatSection(section)

View File

@@ -6,8 +6,8 @@ import socket
from urllib import quote, urlencode
from urllib2 import URLError
from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, normalizeNewlines
from ox.normalize import normalizeImdbId
from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, normalize_newlines
from ox.normalize import normalize_imdbid
import ox
from torrent import Torrent
@@ -38,7 +38,7 @@ def findMovies(query, max_results=10):
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
torrentLink = "http://thepiratebay.org" + row[1]
torrentTitle = decodeHtml(row[2])
torrentTitle = decode_html(row[2])
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
if torrentType in ['201']:
results.append((torrentTitle, torrentLink, ''))
@@ -48,15 +48,15 @@ def findMovies(query, max_results=10):
return results
def findMovieByImdb(imdb):
return findMovies("tt" + normalizeImdbId(imdb))
return findMovies("tt" + normalize_imdbid(imdb))
def getId(piratebayId):
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
piratebayId = piratebayId.split('org/')[1]
d = findRe(piratebayId, "tor/(\d+)")
d = find_re(piratebayId, "tor/(\d+)")
if d:
piratebayId = d
d = findRe(piratebayId, "torrent/(\d+)")
d = find_re(piratebayId, "torrent/(\d+)")
if d:
piratebayId = d
return piratebayId
@@ -80,21 +80,21 @@ def getData(piratebayId):
torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
data = read_url(torrent['comment_link'], unicode=True)
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'title'] = decode_html(torrent[u'title']).strip()
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
title = quote(torrent['title'].encode('utf-8'))
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(strip_tags(d[1].strip()))
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = _read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent

View File

@@ -3,7 +3,7 @@
import re
import time
from ox import strip_tags, findRe
from ox import strip_tags, find_re
from ox.cache import read_url
@@ -16,11 +16,11 @@ def getEpisodeData(url):
'''
data = read_url(url, unicode=True)
r = {}
r['description'] = strip_tags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['show'] = findRe(data, '<h1>(.*?)</h1>')
r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['show'] = find_re(data, '<h1>(.*?)</h1>')
r['title'] = find_re(data, '<title>.*?: (.*?) - TV.com </title>')
#episode score
r['episode score'] = findRe(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
match = re.compile('Episode Number: (\d*?) &nbsp;&nbsp; Season Num: (\d*?) &nbsp;&nbsp; First Aired: (.*?) &nbsp').findall(data)
if match:

View File

@@ -5,7 +5,7 @@ from StringIO import StringIO
import xml.etree.ElementTree as ET
from ox.cache import read_url
from ox import findString, findRe
from ox import find_string, find_re
def getData(id):

View File

@@ -5,7 +5,7 @@ from urllib import urlencode
from ox.utils import json
from ox.cache import read_url
from ox import findRe, decodeHtml
from ox import find_re, decode_html
def getId(url):
@@ -54,7 +54,7 @@ def getMovieData(wikipediaUrl):
if not wikipediaUrl.startswith('http'):
wikipediaUrl = getUrl(wikipediaUrl)
data = getWikiData(wikipediaUrl)
filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
filmbox = {}
_box = filmbox_data.strip().split('|')
for row in _box:
@@ -72,12 +72,12 @@ def getMovieData(wikipediaUrl):
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
del filmbox['amg_id']
if 'Allmovie movie' in data:
filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)')
filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
elif 'Allmovie title' in data:
filmbox['amg_id'] = findRe(data, 'Allmovie title\|.*?(\d+)')
filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
if 'Official website' in data:
filmbox['website'] = findRe(data, 'Official website\|(.*?)}').strip()
filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
if r:
@@ -99,17 +99,17 @@ def getMovieData(wikipediaUrl):
if r:
filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
if 'google video' in data:
filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)[\|}]')
filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
if 'DEFAULTSORT' in data:
filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
return filmbox
def getImageUrl(name):
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
data = read_url(url, unicode=True)
url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
if not url:
url = findRe(data, 'href="(//upload.wikimedia.org/.*?)"')
url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
if url:
url = 'http:' + url
return url