replace all CamelCase with under_score in ox
This commit is contained in:
parent 2de989e188
commit bb35daa95c
31 changed files with 242 additions and 244 deletions
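The rename is mechanical: camelCase public names become their snake_case equivalents. A minimal sketch of the conversion rule, assuming a plain regex helper (illustrative only, not part of ox itself; a few names, such as normalizeImdbId becoming normalize_imdbid, were collapsed by hand rather than split at every hump):

    import re

    def camel_to_snake(name):
        # insert an underscore before each upper-case letter that follows
        # a lower-case letter or digit, then lower-case the whole name
        return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', name).lower()

    assert camel_to_snake('stripTags') == 'strip_tags'
    assert camel_to_snake('normalizeNewlines') == 'normalize_newlines'
    assert camel_to_snake('findRe') == 'find_re'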
README | 10
@@ -10,12 +10,12 @@ Depends:
 Usage:
     import ox
 
-    data = ox.cache.readUrl('http:/...')
-    text = ox.stripTags(data)
-    ox.normalizeNewlines(text)
-    ox.formatBytes(len(data))
+    data = ox.cache.read_url('http:/...')
+    text = ox.strip_tags(data)
+    ox.normalize_newlines(text)
+    ox.format_bytes(len(data))
 
-    ox.formatBytes(1234567890)
+    ox.format_bytes(1234567890)
     '1.15 GB'
 
     import ox.web.imdb
ox/html.py | 14
@@ -56,15 +56,15 @@ def strip_tags(value):
 
 stripTags = strip_tags
 
-def stripSpacesBetweenTags(value):
+def strip_spaces_between_tags(value):
     "Returns the given HTML with spaces between tags normalized to a single space"
     return re.sub(r'>\s+<', '> <', value)
 
-def stripEntities(value):
+def strip_entities(value):
     "Returns the given HTML with all entities (&something;) stripped"
     return re.sub(r'&(?:\w+|#\d);', '', value)
 
-def fixAmpersands(value):
+def fix_ampersands(value):
     "Returns the given HTML with all unencoded ampersands encoded correctly"
     return unencoded_ampersands_re.sub('&amp;', value)
 
@@ -113,11 +113,11 @@ def clean_html(text):
     * Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
       bottom of the text.
     """
-    from text import normalizeNewlines
-    text = normalizeNewlines(text)
+    from text import normalize_newlines
+    text = normalize_newlines(text)
     text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
     text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
-    text = fixAmpersands(text)
+    text = fix_ampersands(text)
     # Remove all target="" attributes from <a> tags.
     text = link_target_attribute_re.sub('\\1', text)
     # Trim stupid HTML such as <br clear="all">.
@@ -168,8 +168,6 @@ def decode_html(html):
         return match.group(0)
     return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
 
-decodeHtml = decode_html
-
 def highlight(text, query, hlClass="hl"):
     """
     >>> highlight('me & you and $&%', 'and')
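ox/html.py also shows the compatibility strategy: the stripTags = strip_tags alias is kept so existing imports keep working, while the decodeHtml = decode_html alias is dropped in the last hunk. A hedged sketch of an alternative alias that would also warn callers on each use (hypothetical — this commit uses a bare assignment, not a wrapper):

    import warnings

    def _deprecated_alias(new_func, old_name):
        # hypothetical helper: forward old camelCase calls to the new
        # snake_case function and emit a DeprecationWarning per call
        def wrapper(*args, **kwargs):
            warnings.warn('%s() is deprecated, use %s()' % (old_name, new_func.__name__),
                          DeprecationWarning, stacklevel=2)
            return new_func(*args, **kwargs)
        return wrapper

    # stripTags = _deprecated_alias(strip_tags, 'stripTags')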
@@ -18,7 +18,7 @@ def latlngspan2latlng(lat, lng, latSpan, lngSpan):
         lat_ne = lat + latSpan, lng_ne = lng + latSpan
     )
 
-def parseLocationString(location_string):
+def parse_location_string(location_string):
     l = location_string.split('+')
     if len(l) == 1:
         l = location_string.split(';')
ox/movie.py | 18
@@ -8,8 +8,8 @@ import hashlib
 import os
 import re
 
-from normalize import normalizeName
-from text import get_sort_name, findRe
+from normalize import normalize_name
+from text import get_sort_name, find_re
 
 __all__ = ['parse_movie_path', 'create_movie_path', 'get_oxid']
 
@@ -308,14 +308,14 @@ def parse_movie_path(path):
     if title.endswith('_'):
         title = title[:-1] + '.'
 
-    year = findRe(title, '(\(\d{4}\))')
+    year = find_re(title, '(\(\d{4}\))')
     if not year:
-        year = findRe(title, '(\(\d{4}-\d*\))')
+        year = find_re(title, '(\(\d{4}-\d*\))')
     if year and title.endswith(year):
         title = title[:-len(year)].strip()
        year = year[1:-1]
        if '-' in year:
-            year = findRe(year, '\d{4}')
+            year = find_re(year, '\d{4}')
 
     #director
     if len(parts) == 4:
@@ -323,7 +323,7 @@ def parse_movie_path(path):
         if director.endswith('_'):
             director = "%s." % director[:-1]
         director = director.split('; ')
-        director = [normalizeName(d).strip() for d in director]
+        director = [normalize_name(d).strip() for d in director]
         director = filter(lambda d: d not in ('Unknown Director', 'Various Directors'), director)
     else:
         director = []
@@ -338,13 +338,13 @@ def parse_movie_path(path):
         language = ''
 
     #season/episode/episodeTitle
-    season = findRe(parts[-1], '\.Season (\d+)\.')
+    season = find_re(parts[-1], '\.Season (\d+)\.')
     if season:
         season = int(season)
     else:
         season = None
 
-    episode = findRe(parts[-1], '\.Episode (\d+)\.')
+    episode = find_re(parts[-1], '\.Episode (\d+)\.')
     if episode:
         episode = int(episode)
     else:
@@ -373,7 +373,7 @@ def parse_movie_path(path):
         title = u'%s %s' % (title, episodeTitle)
 
     #part
-    part = findRe(parts[-1], '\.Part (\d+)\.')
+    part = find_re(parts[-1], '\.Part (\d+)\.')
     if part:
         part = int(part)
     else:
ox/normalize.py
@@ -37,13 +37,13 @@ _noarticles = (
     'i was',
 )
 
-def canonicalTitle(title):
+def canonical_title(title):
     """Return the title in the canonic format 'Movie Title, The'.
 
-    >>> canonicalTitle('The Movie Title')
+    >>> canonical_title('The Movie Title')
     'Movie Title, The'
 
-    >>> canonicalTitle('Los Angeles Plays Itself')
+    >>> canonical_title('Los Angeles Plays Itself')
     'Los Angeles Plays Itself'
     """
     try:
@@ -72,10 +72,10 @@ def canonicalTitle(title):
 ##            break
     return title
 
-def normalizeTitle(title):
+def normalize_title(title):
     """Return the title in the normal "The Title" format.
 
-    >>> normalizeTitle('Movie Title, The')
+    >>> normalize_title('Movie Title, The')
     'The Movie Title'
     """
     stitle = title.split(', ')
@@ -85,14 +85,14 @@ def normalizeTitle(title):
     title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
     return title
 
-def normalizeImdbId(imdbId):
+def normalize_imdbid(imdbId):
     """Return 7 digit imdbId.
 
-    >>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
+    >>> normalize_imdbid('http://www.imdb.com/title/tt0159206/')
     '0159206'
-    >>> normalizeImdbId(159206)
+    >>> normalize_imdbid(159206)
     '0159206'
-    >>> normalizeImdbId('tt0159206')
+    >>> normalize_imdbid('tt0159206')
     '0159206'
     """
     if isinstance(imdbId, basestring):
@@ -106,20 +106,20 @@ def normalizeImdbId(imdbId):
 _sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
                    'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al')
 
-def canonicalName(name):
+def canonical_name(name):
     """Return the given name in canonical "Surname, Name" format.
     It assumes that name is in the 'Name Surname' format.
 
-    >>> canonicalName('Jean Luc Godard')
+    >>> canonical_name('Jean Luc Godard')
     'Godard, Jean Luc'
 
-    >>> canonicalName('Ivan Ivanov-Vano')
+    >>> canonical_name('Ivan Ivanov-Vano')
     'Ivanov-Vano, Ivan'
 
-    >>> canonicalName('Gus Van Sant')
+    >>> canonical_name('Gus Van Sant')
     'Van Sant, Gus'
 
-    >>> canonicalName('Brian De Palma')
+    >>> canonical_name('Brian De Palma')
     'De Palma, Brian'
     """
 
@@ -167,19 +167,19 @@ def canonicalName(name):
     name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
     return name
 
-def normalizeName(name):
+def normalize_name(name):
     """Return a name in the normal "Name Surname" format.
 
-    >>> normalizeName('Godard, Jean Luc')
+    >>> normalize_name('Godard, Jean Luc')
     'Jean Luc Godard'
 
-    >>> normalizeName('Ivanov-Vano, Ivan')
+    >>> normalize_name('Ivanov-Vano, Ivan')
     'Ivan Ivanov-Vano'
 
-    >>> normalizeName('Van Sant, Gus')
+    >>> normalize_name('Van Sant, Gus')
     'Gus Van Sant'
 
-    >>> normalizeName('De Palma, Brian')
+    >>> normalize_name('De Palma, Brian')
     'Brian De Palma'
     """
     sname = name.split(', ')
@@ -187,12 +187,12 @@ def normalizeName(name):
     name = '%s %s' % (sname[1], sname[0])
     return name
 
-def normalizePath(path):
+def normalize_path(path):
     path = path.replace(':', '_').replace('/', '_')
     if path.endswith('.'): path = path[:-1] + '_'
     return path
 
-def stripAccents(s):
+def strip_accents(s):
     if isinstance(s, str):
         s = unicode(s)
     return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
ox/oembed.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # ci:si:et:sw=4:sts=4:ts=4
 import re
-from text import findRe
+from text import find_re
 import cache
 from utils import json, ET
 
@@ -13,14 +13,14 @@ def get_embed_code(url, maxwidth=None, maxheight=None):
     json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('<link.*?>').findall(html))
     xml_oembed = filter(lambda l: 'xml+oembed' in l, re.compile('<link.*?>').findall(html))
     if json_oembed:
-        oembed_url = findRe(json_oembed[0], 'href="(.*?)"')
+        oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
         if maxwidth:
             oembed_url += '&maxwidth=%d' % maxwidth
         if maxheight:
             oembed_url += '&maxheight=%d' % maxheight
         embed = json.loads(cache.readUrl(oembed_url))
     elif xml_oembed:
-        oembed_url = findRe(json_oembed[0], 'href="(.*?)"')
+        oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
         if maxwidth:
             oembed_url += '&maxwidth=%d' % maxwidth
         if maxheight:
ox/srt.py
@@ -11,7 +11,7 @@ import ox
 __all__ = []
 
 
-def _detectEncoding(fp):
+def _detect_encoding(fp):
     bomDict={ # bytepattern : name
               (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
               (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
@@ -63,7 +63,7 @@ def load(filename, offset=0):
         return offset + ox.time2ms(t.replace(',', '.')) / 1000
 
     with open(filename) as f:
-        encoding = _detectEncoding(f)
+        encoding = _detect_encoding(f)
         data = f.read()
         try:
             data = unicode(data, encoding)

ox/text.py | 80
@@ -257,24 +257,24 @@ def get_sort_title(title):
         return title[length + spaces:] + ', ' + title[:length]
     return title
 
-def findRe(string, regexp):
+def find_re(string, regexp):
     result = re.compile(regexp, re.DOTALL).findall(string)
     if result:
         return result[0].strip()
     return ''
 
-def findString(string, string0='', string1 = ''):
+def find_string(string, string0='', string1 = ''):
     """Return the string between string0 and string1.
 
     If string0 or string1 is left out, begining or end of string is used.
 
-    >>> findString('i am not there', string1=' not there')
+    >>> find_string('i am not there', string1=' not there')
     'i am'
 
-    >>> findString('i am not there', 'i am ', ' there')
+    >>> find_string('i am not there', 'i am ', ' there')
     'not'
 
-    >>> findString('i am not there', 'i am not t')
+    >>> find_string('i am not there', 'i am not t')
     'here'
 
     """
@@ -286,7 +286,7 @@ def findString(string, string0='', string1 = ''):
         string1 = re.escape(string1)
     else:
         string1 = '$'
-    return findRe(string, string0 + '(.*?)' + string1)
+    return find_re(string, string0 + '(.*?)' + string1)
 
 def parse_useragent(useragent):
     data = {}
@@ -319,7 +319,7 @@ def parse_useragent(useragent):
             break;
     return data
 
-def removeSpecialCharacters(text):
+def remove_special_characters(text):
     """
     Removes special characters inserted by Word.
     """
@@ -346,22 +346,22 @@ def wrap(text, width):
     text.split(' ')
 )
 
-def wrapString(string, length=80, separator='\n', balance=False):
+def wrap_string(string, length=80, separator='\n', balance=False):
     '''
-    >>> wrapString(u"Anticonstitutionellement, Paris s'eveille", 16)
+    >>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16)
     u"Anticonstitution\\nellement, Paris \\ns'eveille"
-    >>> wrapString(u'All you can eat', 12, '\\n', True)
+    >>> wrap_string(u'All you can eat', 12, '\\n', True)
     u'All you \\ncan eat'
     '''
     words = string.split(' ')
     if balance:
         # balance lines: test if same number of lines
         # can be achieved with a shorter line length
-        lines = wrapString(string, length, separator, False).split(separator)
+        lines = wrap_string(string, length, separator, False).split(separator)
         if len(lines) > 1:
             while length > max(map(lambda x : len(x), words)):
                 length -= 1
-                if len(wrapString(string, length, separator, False).split(separator)) > len(lines):
+                if len(wrap_string(string, length, separator, False).split(separator)) > len(lines):
                     length += 1
                     break
     lines = ['']
@@ -382,12 +382,12 @@ def wrapString(string, length=80, separator='\n', balance=False):
         lines[len(lines) - 1] += u' '
     return separator.join(lines).strip()
 
-def truncateString(string, length, padding='...', position='right'):
-    # >>> truncateString('anticonstitutionellement', 16, '...', 'left')
+def truncate_string(string, length, padding='...', position='right'):
+    # >>> truncate_string('anticonstitutionellement', 16, '...', 'left')
     # '...utionellement'
-    # >>> truncateString('anticonstitutionellement', 16, '...', 'center')
+    # >>> truncate_string('anticonstitutionellement', 16, '...', 'center')
     # 'anticon...lement'
-    # >>> truncateString('anticonstitutionellement', 16, '...', 'right')
+    # >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
     # 'anticonstitut...'
     stringLength = len(string);
     paddingLength = len(padding)
@@ -402,12 +402,12 @@ def truncateString(string, length, padding='...', position='right'):
         string = '%s%s' % (string[:length - paddingLength], padding)
     return string;
 
-def truncateWords(s, num):
+def truncate_words(s, num):
     """Truncates a string after a certain number of chacters, but ends with a word
 
-    >>> truncateString('Truncates a string after a certain number of chacters, but ends with a word', 23)
+    >>> truncate_string('Truncates a string after a certain number of chacters, but ends with a word', 23)
     'Truncates a string...'
-    >>> truncateString('Truncates a string', 23)
+    >>> truncate_string('Truncates a string', 23)
     'Truncates a string'
 
     """
@@ -422,25 +422,25 @@ def truncateWords(s, num):
         ts += "..."
     return ts.strip()
 
-def trimString(string, num):
+def trim_string(string, num):
     """Truncates a string after a certain number of chacters, adding ... at -10 characters
 
-    >>> trimString('Truncates a string after a certain number of chacters', 23)
+    >>> trim_string('Truncates a string after a certain number of chacters', 23)
     'Truncates ...f chacters'
-    >>> trimString('Truncates a string', 23)
+    >>> trim_string('Truncates a string', 23)
     'Truncates a string'
     """
     if len(string) > num:
        string = string[:num - 13] + '...' + string[-10:]
     return string
 
-def getValidFilename(s):
+def get_valid_filename(s):
     """
     Returns the given string converted to a string that can be used for a clean
     filename. Specifically, leading and trailing spaces are removed;
     all non-filename-safe characters are removed.
 
-    >>> getValidFilename("john's portrait in 2004.jpg")
+    >>> get_valid_filename("john's portrait in 2004.jpg")
     'john_s_portrait_in_2004.jpg'
     """
     s = s.strip()
@@ -449,34 +449,34 @@ def getValidFilename(s):
     s = s.replace('__', '_').replace('__', '_')
     return s
 
-def getTextList(list_, last_word='or'):
+def get_text_list(list_, last_word='or'):
     """
-    >>> getTextList([u'a', u'b', u'c', u'd'])
+    >>> get_text_list([u'a', u'b', u'c', u'd'])
     u'a, b, c or d'
-    >>> getTextList([u'a', u'b', u'c'], 'and')
+    >>> get_text_list([u'a', u'b', u'c'], 'and')
     u'a, b and c'
-    >>> getTextList([u'a', u'b'], 'and')
+    >>> get_text_list([u'a', u'b'], 'and')
     u'a and b'
-    >>> getTextList([u'a'])
+    >>> get_text_list([u'a'])
     u'a'
-    >>> getTextList([])
+    >>> get_text_list([])
     ''
     """
     if len(list_) == 0: return ''
     if len(list_) == 1: return list_[0]
     return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1])
 
-def getListText(text, last_word='or'):
+def get_list_text(text, last_word='or'):
     """
-    >>> getListText(u'a, b, c or d')
+    >>> get_list_text(u'a, b, c or d')
     [u'a', u'b', u'c', u'd']
-    >>> getListText(u'a, b and c', u'and')
+    >>> get_list_text(u'a, b and c', u'and')
     [u'a', u'b', u'c']
-    >>> getListText(u'a and b', u'and')
+    >>> get_list_text(u'a and b', u'and')
     [u'a', u'b']
-    >>> getListText(u'a')
+    >>> get_list_text(u'a')
     [u'a']
-    >>> getListText(u'')
+    >>> get_list_text(u'')
     []
     """
     list_ = []
@@ -490,7 +490,7 @@ def getListText(text, last_word='or'):
         list_.append(last[1].strip())
     return list_
 
-def normalizeNewlines(text):
+def normalize_newlines(text):
     return re.sub(r'\r\n|\r|\n', '\n', text)
 
 def recapitalize(text):
@@ -514,7 +514,7 @@ def phone2numeric(phone):
             'y': '9', 'x': '9'}.get(m.group(0).lower())
     return letters.sub(char2number, phone)
 
-def compressString(s):
+def compress_string(s):
     import cStringIO, gzip
     zbuf = cStringIO.StringIO()
     zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
@@ -523,13 +523,13 @@ def compressString(s):
     return zbuf.getvalue()
 
 smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
-def smartSplit(text):
+def smart_split(text):
     """
     Generator that splits a string by spaces, leaving quoted phrases together.
     Supports both single and double quotes, and supports escaping quotes with
     backslashes. In the output, strings will keep their initial and trailing
     quote marks.
-    >>> list(smartSplit('This is "a person\\'s" test.'))
+    >>> list(smart_split('This is "a person\\'s" test.'))
     ['This', 'is', '"a person\\'s"', 'test.']
     """
     for bit in smart_split_re.finditer(text):
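Most of the renamed helpers in ox/text.py and ox/normalize.py carry doctests, and the diff renames the calls inside those doctests as well, so the rename can be checked mechanically. A usage sketch, assuming the package is importable from the working tree:

    import doctest
    import ox.text, ox.normalize, ox.html

    # the renamed doctests must still resolve and pass under the new names
    for mod in (ox.text, ox.normalize, ox.html):
        result = doctest.testmod(mod)
        print('%s: %d tests, %d failures' % (mod.__name__, result.attempted, result.failed))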
ox/web/allmovie.py
@@ -3,7 +3,7 @@
 import re
 import time
 
-from ox import strip_tags, findRe
+from ox import strip_tags, find_re
 from ox.cache import read_url
 
 
@@ -28,22 +28,22 @@ def getData(id):
     }
     html = read_url(data["url"], unicode=True)
     data['aka'] = parseList(html, 'AKA')
-    data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
+    data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
     data['countries'] = parseList(html, 'countries')
     data['director'] = parseEntry(html, 'directed by')
     data['genres'] = parseList(html, 'genres')
     data['keywords'] = parseList(html, 'keywords')
-    data['posters'] = [findRe(html, '<img src="(http://cps-.*?)"')]
+    data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
     data['produced'] = parseList(html, 'produced by')
-    data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"')
+    data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
     data['released'] = parseEntry(html, 'released by')
     data['releasedate'] = parseList(html, 'release date')
     data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
     data['set'] = parseEntry(html, 'set in')
-    data['synopsis'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
+    data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
     data['themes'] = parseList(html, 'themes')
     data['types'] = parseList(html, 'types')
-    data['year'] = findRe(html, '<span class="year">.*?(\d+)')
+    data['year'] = find_re(html, '<span class="year">.*?(\d+)')
     #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
     data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
     #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
@@ -51,18 +51,18 @@ def getData(id):
     #html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
     #data['credits'] = parseTable(html)
     html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
-    data['review'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
+    data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
     return data
 
 def getUrl(id):
     return "http://allmovie.com/work/%s" % id
 
 def parseEntry(html, title):
-    html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
+    html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
     return strip_tags(html).strip()
 
 def parseList(html, title):
-    html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
+    html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
     r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
     if not r and html:
         r = [strip_tags(html)]
@@ -74,11 +74,11 @@ def parseTable(html):
             lambda x: strip_tags(x).strip().replace('&nbsp;', ''),
             x.split('<td width="305">-')
         ),
-        findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
+        find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
     )
 
 def parseText(html, title):
-    return strip_tags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
+    return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
 
 if __name__ == '__main__':
     print getData('129689')
ox/web/amazon.py
@@ -3,7 +3,7 @@
 import re
 from urllib import quote
 
-from ox import findRe, strip_tags, decodeHtml
+from ox import find_re, strip_tags, decode_html
 from ox.cache import read_url
 
 
@@ -12,7 +12,7 @@ def findISBN(title, author):
     url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
     data = read_url(url, unicode=True)
     links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
-    id = findRe(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
+    id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
     data = getData(id)
     if author in data['authors']:
         return data
@@ -24,13 +24,13 @@ def getData(id):
 
 
     def findData(key):
-        return findRe(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
+        return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
 
     r = {}
     r['amazon'] = url
-    r['title'] = findRe(data, '<span id="btAsinTitle" style="">(.*?)<span')
+    r['title'] = find_re(data, '<span id="btAsinTitle" style="">(.*?)<span')
     r['authors'] = re.compile('<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL).findall(data)
-    r['authors'] = filter(lambda x: len(x)>1, [decodeHtml(a) for a in r['authors']])
+    r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']])
     t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
     if t:
         r['translator'] = t
@@ -38,15 +38,15 @@ def getData(id):
     r['language'] = findData('Language')
     r['isbn-10'] = findData('ISBN-10')
     r['isbn-13'] = findData('ISBN-13').replace('-', '')
-    r['dimensions'] = findRe(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
+    r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
 
     r['pages'] = findData('Paperback')
     if not r['pages']:
         r['pages'] = findData('Hardcover')
 
-    r['review'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
+    r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
 
-    r['description'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
+    r['description'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
 
     r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
     if r['cover']:
ox/web/criterion.py
@@ -5,7 +5,7 @@ import re
 import ox.cache
 from ox.cache import read_url
 from ox.html import strip_tags
-from ox.text import findRe, removeSpecialCharacters
+from ox.text import find_re, remove_special_characters
 
 import imdb
 
@@ -33,40 +33,40 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
         html = read_url(data["url"], timeout=timeout, unicode=True)
     except:
         html = ox.cache.read_url(data["url"], timeout=timeout)
-    data["number"] = findRe(html, "<li>Spine #(\d+)")
+    data["number"] = find_re(html, "<li>Spine #(\d+)")
 
-    data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
+    data["title"] = find_re(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
     data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
-    data["director"] = strip_tags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
-    results = findRe(html, '<div class="left_column">(.*?)</div>')
+    data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
+    results = find_re(html, '<div class="left_column">(.*?)</div>')
     results = re.compile("<li>(.*?)</li>").findall(results)
     data["country"] = results[0]
     data["year"] = results[1]
-    data["synopsis"] = strip_tags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
+    data["synopsis"] = strip_tags(find_re(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
 
-    result = findRe(html, "<div class=\"purchase\">(.*?)</div>")
+    result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
     if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
         r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
         if r:
             result = r[0]
-    result = findRe(result, "<a href=\"(.*?)\"")
+    result = find_re(result, "<a href=\"(.*?)\"")
     if not "/boxsets/" in result:
         data["posters"] = [result]
     else:
         html_ = read_url(result, unicode=True)
-        result = findRe(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
-        result = findRe(result, "src=\"(.*?)\"")
+        result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
+        result = find_re(result, "src=\"(.*?)\"")
         if result:
             data["posters"] = [result.replace("_w100", "")]
         else:
             data["posters"] = []
-    result = findRe(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
+    result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
     if result:
         data["stills"] = [result]
         data["trailers"] = []
     else:
-        data["stills"] = filter(lambda x: x, [findRe(html, "\"thumbnailURL\", \"(.*?)\"")])
-        data["trailers"] = filter(lambda x: x, [findRe(html, "\"videoURL\", \"(.*?)\"")])
+        data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
+        data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
 
     if timeout == ox.cache.cache_timeout:
         timeout = -1
@@ -3,7 +3,7 @@
 import re
 import urllib
 import ox
-from ox import strip_tags, decodeHtml
+from ox import strip_tags, decode_html
 from ox.utils import json
 from ox.cache import read_url
 
@@ -17,6 +17,6 @@ def find(query, timeout=ox.cache.cache_timeout):
     results = []
     regex = '<a .*?class="l le" href="(.+?)">(.*?)</a>.*?<div class="cra">(.*?)</div>'
     for r in re.compile(regex, re.DOTALL).findall(data):
-        results.append((strip_tags(decodeHtml(r[1])), r[0], strip_tags(decodeHtml(r[2]))))
+        results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
     return results
 
ox/web/epguides.py
@@ -3,7 +3,7 @@
 import re
 import time
 
-from ox import strip_tags, findRe
+from ox import strip_tags, find_re
 from ox.cache import read_url
 
 import google
@@ -23,8 +23,8 @@ def getShowUrl(title):
 def getShowData(url):
     data = read_url(url, unicode=True)
     r = {}
-    r['title'] = strip_tags(findRe(data, '<h1>(.*?)</h1>'))
-    r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
+    r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
+    r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
     r['episodes'] = {}
     #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
     for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
@@ -5,7 +5,7 @@ import re
 from lxml.html import document_fromstring
 
 from ox.cache import read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 from ox.web.imdb import ImdbCombined
 
 
@@ -3,7 +3,7 @@
 import json
 
 from ox.cache import read_url
-from ox import findRe
+from ox import find_re
 
 class Imdb(dict):
     def __init__(self, id, timeout=-1):
@@ -36,7 +36,7 @@ class Imdb(dict):
 
         if 'nytimes' in self:
             self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
-            self['amgId'] = findRe(self['nytimes'], 'movie/(\d+)/')
+            self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')
 
 
 
ox/web/google.py
@@ -4,7 +4,7 @@ import re
 import urllib
 
 import ox
-from ox import strip_tags, decodeHtml
+from ox import strip_tags, decode_html
 
 DEFAULT_MAX_RESULTS = 10
 DEFAULT_TIMEOUT = 24*60*60
@@ -34,7 +34,7 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
     for a in re.compile(
         '<a href="(\S+?)" class=l .*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>'
         ).findall(data):
-        results.append((strip_tags(decodeHtml(a[1])), a[0], strip_tags(decodeHtml(a[2]))))
+        results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
         if len(results) >= max_results:
             break
     return results
ox/web/imdb.py
@@ -8,8 +8,8 @@ import time
 import unicodedata
 
 import ox
-from ox import findRe, strip_tags
-from ox.normalize import normalizeTitle, normalizeImdbId
+from ox import find_re, strip_tags
+from ox.normalize import normalize_title, normalize_imdbid
 import ox.cache
 
 from siteparser import SiteParser
@@ -50,7 +50,7 @@ class Imdb(SiteParser):
         'page': 'business',
         're': [
             '<h5>Budget</h5>\s*?\$(.*?)<br',
-            lambda data: findRe(ox.decodeHtml(data).replace(',', ''), '\d+')
+            lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
         ],
         'type': 'int'
     },
@@ -141,7 +141,7 @@ class Imdb(SiteParser):
         'page': 'business',
         're': [
             '<h5>Gross</h5>\s*?\$(.*?)<br',
-            lambda data: findRe(data.replace(',', ''), '\d+')
+            lambda data: find_re(data.replace(',', ''), '\d+')
         ],
         'type': 'int'
     },
@@ -314,7 +314,7 @@ class Imdb(SiteParser):
         if 'runtime' in self and self['runtime']:
             if 'min' in self['runtime']: base=60
             else: base=1
-            self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
+            self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
         if 'runtime' in self and not self['runtime']:
             del self['runtime']
         if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
@@ -551,7 +551,7 @@ def getMovieId(title, director='', year='', timeout=-1):
     #print google_query
     results = google.find(google_query, timeout=timeout)
     if results:
-        return findRe(results[0][1], 'title/tt(\d{7})')
+        return find_re(results[0][1], 'title/tt(\d{7})')
     #or nothing
     return ''
 
@@ -567,7 +567,7 @@ def getMoviePoster(imdbId):
     if 'posterId' in info:
         url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId)
         data = read_url(url)
-        poster = findRe(data, 'img id="primary-img".*?src="(.*?)"')
+        poster = find_re(data, 'img id="primary-img".*?src="(.*?)"')
         return poster
     elif 'series' in info:
         return getMoviePoster(info['series'])
ox/web/impawards.py
@@ -4,7 +4,7 @@ import re
 
 from ox.cache import read_url
 from ox.html import strip_tags
-from ox.text import findRe
+from ox.text import find_re
 
 
 def getData(id):
@@ -22,13 +22,13 @@ def getData(id):
         'url': getUrl(id)
     }
     html = read_url(data['url'], unicode=True)
-    data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
+    data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
     if not data['imdbId']:
         data['imdbId'] = _id_map.get(id, '')
-    data['title'] = strip_tags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
-    data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
+    data['title'] = strip_tags(find_re(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
+    data['year'] = find_re(html, '\(<a href="alpha1.html">(.*?)</a>\)')
     data['posters'] = []
-    poster = findRe(html, '<img src="(posters.*?)"')
+    poster = find_re(html, '<img src="(posters.*?)"')
     if poster:
         poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
         data['posters'].append(poster)
@@ -37,13 +37,13 @@ def getData(id):
         result = result.replace('_xlg.html', '.html')
         url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
         html = read_url(url, unicode=True)
-        result = findRe(html, '<a href = (\w*?_xlg.html)')
+        result = find_re(html, '<a href = (\w*?_xlg.html)')
         if result:
             url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
             html = read_url(url, unicode=True)
-            poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
+            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
         else:
-            poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)"'))
+            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
         data['posters'].append(poster)
 
     return data
@@ -54,7 +54,7 @@ def getId(url):
     split = split[4][:-5].split('_')
     if split[-1] == 'xlg':
         split.pop()
-    if findRe(split[-1], 'ver\d+$'):
+    if find_re(split[-1], 'ver\d+$'):
         split.pop()
     id = '%s/%s' % (year, '_'.join(split))
     return id
@@ -62,7 +62,7 @@ def getId(url):
 def getIds():
     ids = []
     html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
-    pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
+    pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
     for page in range(pages, 0, -1):
         for id in getIdsByPage(page):
             if not id in ids:
@@ -81,7 +81,7 @@ def getIdsByPage(page):
 def getUrl(id):
     url = u"http://www.impawards.com/%s.html" % id
     html = read_url(url, unicode=True)
-    if findRe(html, "No Movie Posters on This Page"):
+    if find_re(html, "No Movie Posters on This Page"):
         url = u"http://www.impawards.com/%s_ver1.html" % id
     return url
 
ox/web/itunes.py
@@ -4,9 +4,9 @@ import re
 import urllib
 
 from ox.cache import read_url
-from ox.html import decodeHtml, strip_tags
-from ox.text import findRe
-from ox.text import findString
+from ox.html import decode_html, strip_tags
+from ox.text import find_re
+from ox.text import find_string
 
 
 # to sniff itunes traffic, use something like
@@ -65,26 +65,26 @@ def parseXmlDict(xml):
     strings = xml.split('<key>')
     for string in strings:
         if string.find('</key>') != -1:
-            key = findRe(string, '(.*?)</key>')
-            type = findRe(string, '</key><(.*?)>')
+            key = find_re(string, '(.*?)</key>')
+            type = find_re(string, '</key><(.*?)>')
             if type == 'true/':
                 value = True
             else:
-                value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
+                value = find_re(string, '<%s>(.*?)</%s>' % (type, type))
                 if type == 'integer':
                     value = int(value)
                 elif type == 'string':
-                    value = decodeHtml(value)
+                    value = decode_html(value)
             values[key] = value
     return values
 
 def parseCast(xml, title):
     list = []
     try:
-        strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
+        strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
         strings.pop()
         for string in strings:
-            list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
+            list.append(find_re(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
         return list
     except:
         return list
@@ -92,12 +92,12 @@ def parseCast(xml, title):
 def parseMovies(xml, title):
     list = []
     try:
-        strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
+        strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
         strings.pop()
         for string in strings:
             list.append({
-                'id': findRe(string, 'viewMovie\?id=(.*?)&'),
-                'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
+                'id': find_re(string, 'viewMovie\?id=(.*?)&'),
+                'title': find_re(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
             })
         return list
     except:
@@ -114,24 +114,24 @@ class ItunesAlbum:
     def getId(self):
         url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
         xml = read_url(url, headers = ITUNES_HEADERS)
-        id = findRe(xml, 'viewAlbum\?id=(.*?)&')
+        id = find_re(xml, 'viewAlbum\?id=(.*?)&')
        return id
 
     def getData(self):
         data = {'id': self.id}
         url = composeUrl('viewAlbum', {'id': self.id})
         xml = read_url(url, None, ITUNES_HEADERS)
-        data['albumName'] = findRe(xml, '<B>(.*?)</B>')
-        data['artistName'] = findRe(xml, '<b>(.*?)</b>')
-        data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
-        data['genre'] = findRe(xml, 'Genre:(.*?)<')
-        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
-        data['review'] = strip_tags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
+        data['albumName'] = find_re(xml, '<B>(.*?)</B>')
+        data['artistName'] = find_re(xml, '<b>(.*?)</b>')
+        data['coverUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
+        data['genre'] = find_re(xml, 'Genre:(.*?)<')
+        data['releaseDate'] = find_re(xml, 'Released(.*?)<')
+        data['review'] = strip_tags(find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
         data['tracks'] = []
-        strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
+        strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
         for string in strings:
             data['tracks'].append(parseXmlDict(string))
-        data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
+        data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
         return data
 
 class ItunesMovie:
@@ -145,7 +145,7 @@ class ItunesMovie:
     def getId(self):
         url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
         xml = read_url(url, headers = ITUNES_HEADERS)
-        id = findRe(xml, 'viewMovie\?id=(.*?)&')
+        id = find_re(xml, 'viewMovie\?id=(.*?)&')
         return id
 
     def getData(self):
@@ -156,21 +156,21 @@ class ItunesMovie:
             f.write(xml)
             f.close()
         data['actors'] = parseCast(xml, 'actors')
-        string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
+        string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
         data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
         data['directors'] = parseCast(xml, 'directors')
-        data['format'] = findRe(xml, 'Format:(.*?)<')
-        data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
-        data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
-        data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
+        data['format'] = find_re(xml, 'Format:(.*?)<')
+        data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
+        data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
+        data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
         data['producers'] = parseCast(xml, 'producers')
-        data['rated'] = findRe(xml, 'Rated(.*?)<')
+        data['rated'] = find_re(xml, 'Rated(.*?)<')
         data['relatedMovies'] = parseMovies(xml, 'related movies')
-        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
-        data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
+        data['releaseDate'] = find_re(xml, 'Released(.*?)<')
+        data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
         data['screenwriters'] = parseCast(xml, 'screenwriters')
-        data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
-        data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
+        data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
+        data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
         return data
 
 if __name__ == '__main__':
ox/web/lyricsfly.py
@@ -1,20 +1,20 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 from ox.cache import read_url
-from ox.html import decodeHtml
-from ox.text import findRe
+from ox.html import decode_html
+from ox.text import find_re
 
 
 def getLyrics(title, artist):
     html = read_url('http://lyricsfly.com/api/')
-    key = findRe(html, '<font color=green><b>(.*?)</b></font>')
+    key = find_re(html, '<font color=green><b>(.*?)</b></font>')
     url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
     xml = read_url(url)
-    lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
+    lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
     lyrics = lyrics.replace('\n', '').replace('\r', '')
     lyrics = lyrics.replace('[br]', '\n').strip()
     lyrics.replace('\n\n\n', '\n\n')
-    lyrics = decodeHtml(lyrics.replace('&amp;', '&'))
+    lyrics = decode_html(lyrics.replace('&amp;', '&'))
     return lyrics
 
 if __name__ == '__main__':
ox/web/metacritic.py
@@ -5,7 +5,7 @@ from urllib import quote
 from lxml.html import document_fromstring
 
 from ox.cache import read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 
 def getUrl(id):
     return 'http://www.metacritic.com/movie/%s' % id
@@ -16,14 +16,14 @@ def getId(url):
 def getUrlByImdb(imdb):
     url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
     data = read_url(url)
-    metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"')
+    metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
     return metacritic_url or None
 
 def getMetacriticShowUrl(title):
     title = quote(title)
     url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
     data = read_url(url)
-    return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
+    return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
 
 def getData(url):
     data = read_url(url, unicode=True)
ox/web/mininova.py
@@ -6,8 +6,8 @@ import socket
 from urllib import quote
 
 from ox.cache import read_url
-from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, int_value, normalizeNewlines
-from ox.normalize import normalizeImdbId
+from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, int_value, normalize_newlines
+from ox.normalize import normalize_imdbid
 import ox
 
 from torrent import Torrent
@@ -20,7 +20,7 @@ def _parseResultsPage(data, max_results=10):
         torrentDate = row[0]
         torrentExtra = row[1]
         torrentId = row[2]
-        torrentTitle = decodeHtml(row[3]).strip()
+        torrentTitle = decode_html(row[3]).strip()
         torrentLink = "http://www.mininova.org/tor/" + torrentId
         privateTracker = 'priv.gif' in torrentExtra
         if not privateTracker:
@@ -38,13 +38,13 @@ def findMovieByImdb(imdbId):
     '''find torrents on mininova for a given imdb id
     '''
     results = []
-    imdbId = normalizeImdbId(imdbId)
+    imdbId = normalize_imdbid(imdbId)
     data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
     return _parseResultsPage(data)
 
 def getId(mininovaId):
     mininovaId = unicode(mininovaId)
-    d = findRe(mininovaId, "/(\d+)")
+    d = find_re(mininovaId, "/(\d+)")
     if d:
         return d
     mininovaId = mininovaId.split('/')
@@ -81,14 +81,14 @@ def getData(mininovaId):
     for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
         key = d[0].lower().strip()
         key = _key_map.get(key, key)
-        value = decodeHtml(strip_tags(d[1].strip()))
+        value = decode_html(strip_tags(d[1].strip()))
         torrent[key] = value
 
-    torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
-    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
-    torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
+    torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
+    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
+    torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
     if torrent['description']:
-        torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
+        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
     t = read_url(torrent[u'torrent_link'])
     torrent[u'torrent_info'] = getTorrentInfo(t)
     return torrent
ox/web/movieposterdb.py
@@ -4,7 +4,7 @@
 import re
 
 from ox.cache import read_url
-from ox import findRe
+from ox import find_re
 
 def getData(id):
     '''
@@ -33,7 +33,7 @@ def getPostersByUrl(url, group=True, timeout=-1):
     results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
     for result in results:
         html = read_url(result, timeout=timeout, unicode=True)
-        posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
+        posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
     return posters
 
 def getUrl(id):
ox/web/opensubtitles.py
@@ -4,7 +4,7 @@ import re
 
 import feedparser
 from ox.cache import read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 from ox import langCode2To3, langTo3Code
 
 def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
@@ -26,7 +26,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
     if opensubtitleId:
         opensubtitleId = opensubtitleId[0]
     else:
-        opensubtitleId = findRe(data, '/en/subtitles/(.*?)/')
+        opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
     return opensubtitleId
 
 def downloadSubtitleById(opensubtitle_id):
ox/web/rottentomatoes.py
@@ -3,7 +3,7 @@
 import re
 
 from ox.cache import getHeaders, read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 
 
 def getUrlByImdb(imdb):
@@ -22,16 +22,16 @@ def getUrlByImdb(imdb):
     return None
 
 def get_og(data, key):
-    return findRe(data, '<meta property="og:%s".*?content="(.*?)"' % key)
+    return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
 
 def getData(url):
     data = read_url(url)
     r = {}
-    r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
+    r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
     if '(' in r['title']:
-        r['year'] = findRe(r['title'], '\((\d*?)\)')
+        r['year'] = find_re(r['title'], '\((\d*?)\)')
         r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
-    r['summary'] = strip_tags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
+    r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
     r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
     if not r['summary']:
         r['summary'] = get_og(data, 'description')
@@ -40,9 +40,9 @@ def getData(url):
     meter = filter(lambda m: m[1].isdigit(), meter)
     if meter:
         r['tomatometer'] = meter[0][1]
-    r['rating'] = findRe(data, 'Average Rating: <span>([\d.]+)/10</span>')
-    r['user_score'] = findRe(data, '<span class="meter popcorn numeric ">(\d+)</span>')
-    r['user_rating'] = findRe(data, 'Average Rating: ([\d.]+)/5')
+    r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
+    r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
+    r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
     poster = get_og(data, 'image')
     if poster and not 'poster_default.gif' in poster:
         r['posters'] = [poster]
ox/web/siteparser.py
@@ -3,7 +3,7 @@
 import re
 
 from ..cache import read_url
-from .. import strip_tags, decodeHtml
+from .. import strip_tags, decode_html
 from ..utils import datetime
 
 
@@ -11,8 +11,8 @@ def cleanup(key, data, data_type):
     if data:
         if isinstance(data[0], basestring):
             #FIXME: some types need strip_tags
-            #data = [strip_tags(decodeHtml(p)).strip() for p in data]
-            data = [decodeHtml(p).strip() for p in data]
+            #data = [strip_tags(decode_html(p)).strip() for p in data]
+            data = [decode_html(p).strip() for p in data]
         elif isinstance(data[0], list) or isinstance(data[0], tuple):
             data = [cleanup(key, p, data_type) for p in data]
         while len(data) == 1 and not isinstance(data, basestring):
ox/web/spiegel.py
@@ -5,7 +5,7 @@ import re
 import time
 
 import ox.cache
-from ox.html import decodeHtml, strip_tags
+from ox.html import decode_html, strip_tags
 import ox.net
 
 
@@ -44,8 +44,8 @@ def getNews(year, month, day):
             new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
         else:
             new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
-        # fix decodeHtml
-        # new['description'] = formatString(decodeHtml(description))
+        # fix decode_html
+        # new['description'] = formatString(decode_html(description))
         new['description'] = formatString(description)
         new['imageUrl'] = imageUrl
         new['section'] = formatSection(section)
ox/web/thepiratebay.py
@@ -6,8 +6,8 @@ import socket
 from urllib import quote, urlencode
 from urllib2 import URLError
 
-from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, normalizeNewlines
-from ox.normalize import normalizeImdbId
+from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, normalize_newlines
+from ox.normalize import normalize_imdbid
 import ox
 
 from torrent import Torrent
@@ -38,7 +38,7 @@ def findMovies(query, max_results=10):
     for row in re.compile(regexp, re.DOTALL).findall(data):
         torrentType = row[0]
         torrentLink = "http://thepiratebay.org" + row[1]
-        torrentTitle = decodeHtml(row[2])
+        torrentTitle = decode_html(row[2])
         # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
         if torrentType in ['201']:
             results.append((torrentTitle, torrentLink, ''))
@@ -48,15 +48,15 @@ def findMovies(query, max_results=10):
     return results
 
 def findMovieByImdb(imdb):
-    return findMovies("tt" + normalizeImdbId(imdb))
+    return findMovies("tt" + normalize_imdbid(imdb))
 
 def getId(piratebayId):
     if piratebayId.startswith('http://torrents.thepiratebay.org/'):
         piratebayId = piratebayId.split('org/')[1]
-    d = findRe(piratebayId, "tor/(\d+)")
+    d = find_re(piratebayId, "tor/(\d+)")
     if d:
         piratebayId = d
-    d = findRe(piratebayId, "torrent/(\d+)")
+    d = find_re(piratebayId, "torrent/(\d+)")
     if d:
         piratebayId = d
     return piratebayId
@@ -80,21 +80,21 @@ def getData(piratebayId):
     torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
 
     data = read_url(torrent['comment_link'], unicode=True)
-    torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
+    torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
     if not torrent[u'title']:
         return None
-    torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
-    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
+    torrent[u'title'] = decode_html(torrent[u'title']).strip()
+    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
     title = quote(torrent['title'].encode('utf-8'))
     torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
     for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
         key = d[0].lower().strip()
         key = _key_map.get(key, key)
-        value = decodeHtml(strip_tags(d[1].strip()))
+        value = decode_html(strip_tags(d[1].strip()))
         torrent[key] = value
-    torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
+    torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
     if torrent[u'description']:
-        torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
+        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
     t = _read_url(torrent[u'torrent_link'])
     torrent[u'torrent_info'] = getTorrentInfo(t)
     return torrent
ox/web/tv.py | 10
@@ -3,7 +3,7 @@
 import re
 import time
 
-from ox import strip_tags, findRe
+from ox import strip_tags, find_re
 from ox.cache import read_url
 
 
@@ -16,11 +16,11 @@ def getEpisodeData(url):
     '''
     data = read_url(url, unicode=True)
     r = {}
-    r['description'] = strip_tags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
-    r['show'] = findRe(data, '<h1>(.*?)</h1>')
-    r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
+    r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
+    r['show'] = find_re(data, '<h1>(.*?)</h1>')
+    r['title'] = find_re(data, '<title>.*?: (.*?) - TV.com </title>')
     #episode score
-    r['episode score'] = findRe(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
+    r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
 
     match = re.compile('Episode Number: (\d*?) Season Num: (\d*?) First Aired: (.*?)  ').findall(data)
     if match:
@@ -5,7 +5,7 @@ from StringIO import StringIO
 import xml.etree.ElementTree as ET
 
 from ox.cache import read_url
-from ox import findString, findRe
+from ox import find_string, find_re
 
 
 def getData(id):
ox/web/wikipedia.py
@@ -5,7 +5,7 @@ from urllib import urlencode
 
 from ox.utils import json
 from ox.cache import read_url
-from ox import findRe, decodeHtml
+from ox import find_re, decode_html
 
 
 def getId(url):
@@ -54,7 +54,7 @@ def getMovieData(wikipediaUrl):
     if not wikipediaUrl.startswith('http'):
         wikipediaUrl = getUrl(wikipediaUrl)
     data = getWikiData(wikipediaUrl)
-    filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
+    filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
     filmbox = {}
     _box = filmbox_data.strip().split('|')
     for row in _box:
@@ -72,12 +72,12 @@ def getMovieData(wikipediaUrl):
     if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
         del filmbox['amg_id']
     if 'Allmovie movie' in data:
-        filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)')
+        filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
     elif 'Allmovie title' in data:
-        filmbox['amg_id'] = findRe(data, 'Allmovie title\|.*?(\d+)')
+        filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
 
     if 'Official website' in data:
-        filmbox['website'] = findRe(data, 'Official website\|(.*?)}').strip()
+        filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
 
     r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
     if r:
@@ -99,17 +99,17 @@ def getMovieData(wikipediaUrl):
     if r:
         filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
     if 'google video' in data:
-        filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)[\|}]')
+        filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
     if 'DEFAULTSORT' in data:
-        filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
+        filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
     return filmbox
 
 def getImageUrl(name):
     url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
     data = read_url(url, unicode=True)
-    url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
+    url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
     if not url:
-        url = findRe(data, 'href="(//upload.wikimedia.org/.*?)"')
+        url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
         if url:
             url = 'http:' + url
     return url
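With 31 files touched, stray call sites under the old names are the main risk after a rename like this. A small scan for leftovers, assuming it is run from the repository root (the name list below is illustrative, not the commit's full rename table):

    import os
    import re

    # a few of the old camelCase names this commit renames; extend as needed
    OLD_NAMES = re.compile(r'\b(findRe|findString|decodeHtml|normalizeNewlines|normalizeImdbId)\s*\(')

    for root, dirs, files in os.walk('ox'):
        for name in files:
            if not name.endswith('.py'):
                continue
            path = os.path.join(root, name)
            for lineno, line in enumerate(open(path), 1):
                if OLD_NAMES.search(line):
                    print('%s:%d: %s' % (path, lineno, line.strip()))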