replace all CamelCase with under_score in ox
parent 2de989e188
commit bb35daa95c

31 changed files with 242 additions and 244 deletions
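The rename is almost entirely mechanical: every public camelCase name becomes snake_case, with a handful of manual exceptions visible below (normalizeImdbId becomes normalize_imdbid, not normalize_imdb_id). A minimal sketch of the rule, as a hypothetical helper rather than anything in ox itself:

    import re

    def camel_to_snake(name):
        # insert '_' before an upper-case letter that follows a lower-case
        # letter or digit, then lower-case everything:
        # 'stripTags' -> 'strip_tags', 'findRe' -> 'find_re'
        return re.sub(r'(?<=[a-z0-9])([A-Z])', r'_\1', name).lower()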
README (10 lines changed)

@@ -10,12 +10,12 @@ Depends:
 Usage:
     import ox

-    data = ox.cache.readUrl('http:/...')
-    text = ox.stripTags(data)
-    ox.normalizeNewlines(text)
-    ox.formatBytes(len(data))
+    data = ox.cache.read_url('http:/...')
+    text = ox.strip_tags(data)
+    ox.normalize_newlines(text)
+    ox.format_bytes(len(data))

-    ox.formatBytes(1234567890)
+    ox.format_bytes(1234567890)
     '1.15 GB'

     import ox.web.imdb
ox/html.py (14 lines changed)

@@ -56,15 +56,15 @@ def strip_tags(value):

 stripTags = strip_tags

-def stripSpacesBetweenTags(value):
+def strip_spaces_between_tags(value):
     "Returns the given HTML with spaces between tags normalized to a single space"
     return re.sub(r'>\s+<', '> <', value)

-def stripEntities(value):
+def strip_entities(value):
     "Returns the given HTML with all entities (&something;) stripped"
     return re.sub(r'&(?:\w+|#\d);', '', value)

-def fixAmpersands(value):
+def fix_ampersands(value):
     "Returns the given HTML with all unencoded ampersands encoded correctly"
     return unencoded_ampersands_re.sub('&amp;', value)

@@ -113,11 +113,11 @@ def clean_html(text):
     * Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
       bottom of the text.
     """
-    from text import normalizeNewlines
-    text = normalizeNewlines(text)
+    from text import normalize_newlines
+    text = normalize_newlines(text)
     text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
     text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
-    text = fixAmpersands(text)
+    text = fix_ampersands(text)
     # Remove all target="" attributes from <a> tags.
     text = link_target_attribute_re.sub('\\1', text)
     # Trim stupid HTML such as <br clear="all">.

@@ -168,8 +168,6 @@ def decode_html(html):
         return match.group(0)
     return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')

-decodeHtml = decode_html
-
 def highlight(text, query, hlClass="hl"):
     """
     >>> highlight('me & you and $&%', 'and')
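Note the asymmetry in ox/html.py: the stripTags = strip_tags alias survives for backwards compatibility, but the decodeHtml = decode_html alias is deleted, so external callers must switch to the new name. A hypothetical call site would change like this (assuming decode_html is re-exported from the ox package, as the imports later in this commit suggest):

    import ox

    # text = ox.decodeHtml('&amp;')   # old alias, removed by this commit
    text = ox.decode_html('&amp;')    # decodes the entity to '&'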
@@ -18,7 +18,7 @@ def latlngspan2latlng(lat, lng, latSpan, lngSpan):
         lat_ne = lat + latSpan, lng_ne = lng + latSpan
     )

-def parseLocationString(location_string):
+def parse_location_string(location_string):
     l = location_string.split('+')
     if len(l) == 1:
         l = location_string.split(';')
ox/movie.py (18 lines changed)

@@ -8,8 +8,8 @@ import hashlib
 import os
 import re

-from normalize import normalizeName
-from text import get_sort_name, findRe
+from normalize import normalize_name
+from text import get_sort_name, find_re

 __all__ = ['parse_movie_path', 'create_movie_path', 'get_oxid']

@@ -308,14 +308,14 @@ def parse_movie_path(path):
     if title.endswith('_'):
         title = title[:-1] + '.'

-    year = findRe(title, '(\(\d{4}\))')
+    year = find_re(title, '(\(\d{4}\))')
     if not year:
-        year = findRe(title, '(\(\d{4}-\d*\))')
+        year = find_re(title, '(\(\d{4}-\d*\))')
     if year and title.endswith(year):
         title = title[:-len(year)].strip()
         year = year[1:-1]
         if '-' in year:
-            year = findRe(year, '\d{4}')
+            year = find_re(year, '\d{4}')

     #director
     if len(parts) == 4:

@@ -323,7 +323,7 @@ def parse_movie_path(path):
         if director.endswith('_'):
             director = "%s." % director[:-1]
         director = director.split('; ')
-        director = [normalizeName(d).strip() for d in director]
+        director = [normalize_name(d).strip() for d in director]
         director = filter(lambda d: d not in ('Unknown Director', 'Various Directors'), director)
     else:
         director = []

@@ -338,13 +338,13 @@ def parse_movie_path(path):
         language = ''

     #season/episode/episodeTitle
-    season = findRe(parts[-1], '\.Season (\d+)\.')
+    season = find_re(parts[-1], '\.Season (\d+)\.')
     if season:
         season = int(season)
     else:
         season = None

-    episode = findRe(parts[-1], '\.Episode (\d+)\.')
+    episode = find_re(parts[-1], '\.Episode (\d+)\.')
     if episode:
         episode = int(episode)
     else:

@@ -373,7 +373,7 @@ def parse_movie_path(path):
         title = u'%s %s' % (title, episodeTitle)

     #part
-    part = findRe(parts[-1], '\.Part (\d+)\.')
+    part = find_re(parts[-1], '\.Part (\d+)\.')
     if part:
         part = int(part)
     else:
@@ -37,13 +37,13 @@ _noarticles = (
     'i was',
 )

-def canonicalTitle(title):
+def canonical_title(title):
     """Return the title in the canonic format 'Movie Title, The'.

-    >>> canonicalTitle('The Movie Title')
+    >>> canonical_title('The Movie Title')
     'Movie Title, The'

-    >>> canonicalTitle('Los Angeles Plays Itself')
+    >>> canonical_title('Los Angeles Plays Itself')
     'Los Angeles Plays Itself'
     """
     try:

@@ -72,10 +72,10 @@ def canonicalTitle(title):
 ##        break
     return title

-def normalizeTitle(title):
+def normalize_title(title):
     """Return the title in the normal "The Title" format.

-    >>> normalizeTitle('Movie Title, The')
+    >>> normalize_title('Movie Title, The')
     'The Movie Title'
     """
     stitle = title.split(', ')

@@ -85,14 +85,14 @@ def normalizeTitle(title):
         title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
     return title

-def normalizeImdbId(imdbId):
+def normalize_imdbid(imdbId):
     """Return 7 digit imdbId.

-    >>> normalizeImdbId('http://www.imdb.com/title/tt0159206/')
+    >>> normalize_imdbid('http://www.imdb.com/title/tt0159206/')
     '0159206'
-    >>> normalizeImdbId(159206)
+    >>> normalize_imdbid(159206)
     '0159206'
-    >>> normalizeImdbId('tt0159206')
+    >>> normalize_imdbid('tt0159206')
     '0159206'
     """
     if isinstance(imdbId, basestring):

@@ -106,20 +106,20 @@ def normalizeImdbId(imdbId):
 _sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
                    'e', 'von', 'vom', 'the', 'di', 'du', 'el', 'al')

-def canonicalName(name):
+def canonical_name(name):
     """Return the given name in canonical "Surname, Name" format.
     It assumes that name is in the 'Name Surname' format.

-    >>> canonicalName('Jean Luc Godard')
+    >>> canonical_name('Jean Luc Godard')
     'Godard, Jean Luc'

-    >>> canonicalName('Ivan Ivanov-Vano')
+    >>> canonical_name('Ivan Ivanov-Vano')
     'Ivanov-Vano, Ivan'

-    >>> canonicalName('Gus Van Sant')
+    >>> canonical_name('Gus Van Sant')
     'Van Sant, Gus'

-    >>> canonicalName('Brian De Palma')
+    >>> canonical_name('Brian De Palma')
     'De Palma, Brian'
     """

@@ -167,19 +167,19 @@ def canonicalName(name):
         name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
     return name

-def normalizeName(name):
+def normalize_name(name):
     """Return a name in the normal "Name Surname" format.

-    >>> normalizeName('Godard, Jean Luc')
+    >>> normalize_name('Godard, Jean Luc')
     'Jean Luc Godard'

-    >>> normalizeName('Ivanov-Vano, Ivan')
+    >>> normalize_name('Ivanov-Vano, Ivan')
     'Ivan Ivanov-Vano'

-    >>> normalizeName('Van Sant, Gus')
+    >>> normalize_name('Van Sant, Gus')
     'Gus Van Sant'

-    >>> normalizeName('De Palma, Brian')
+    >>> normalize_name('De Palma, Brian')
     'Brian De Palma'
     """
     sname = name.split(', ')

@@ -187,12 +187,12 @@ def normalizeName(name):
         name = '%s %s' % (sname[1], sname[0])
     return name

-def normalizePath(path):
+def normalize_path(path):
     path = path.replace(':', '_').replace('/', '_')
     if path.endswith('.'): path = path[:-1] + '_'
     return path

-def stripAccents(s):
+def strip_accents(s):
     if isinstance(s, str):
         s = unicode(s)
     return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
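The renamed normalize helpers keep their doctested behavior; a minimal sketch of the new call sites, with the expected values taken from the doctests above:

    from ox.normalize import canonical_title, normalize_title, \
        canonical_name, normalize_name, normalize_imdbid

    assert canonical_title('The Movie Title') == 'Movie Title, The'
    assert normalize_title('Movie Title, The') == 'The Movie Title'
    assert canonical_name('Gus Van Sant') == 'Van Sant, Gus'
    assert normalize_name('De Palma, Brian') == 'Brian De Palma'
    assert normalize_imdbid('tt0159206') == '0159206'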
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # ci:si:et:sw=4:sts=4:ts=4
 import re
-from text import findRe
+from text import find_re
 import cache
 from utils import json, ET

@@ -13,14 +13,14 @@ def get_embed_code(url, maxwidth=None, maxheight=None):
     json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('<link.*?>').findall(html))
     xml_oembed = filter(lambda l: 'xml+oembed' in l, re.compile('<link.*?>').findall(html))
     if json_oembed:
-        oembed_url = findRe(json_oembed[0], 'href="(.*?)"')
+        oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
         if maxwidth:
             oembed_url += '&maxwidth=%d' % maxwidth
         if maxheight:
             oembed_url += '&maxheight=%d' % maxheight
         embed = json.loads(cache.readUrl(oembed_url))
     elif xml_oembed:
-        oembed_url = findRe(json_oembed[0], 'href="(.*?)"')
+        oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
         if maxwidth:
             oembed_url += '&maxwidth=%d' % maxwidth
         if maxheight:
@@ -11,7 +11,7 @@ import ox
 __all__ = []


-def _detectEncoding(fp):
+def _detect_encoding(fp):
     bomDict={ # bytepattern : name
               (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
               (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",

@@ -63,7 +63,7 @@ def load(filename, offset=0):
         return offset + ox.time2ms(t.replace(',', '.')) / 1000

     with open(filename) as f:
-        encoding = _detectEncoding(f)
+        encoding = _detect_encoding(f)
         data = f.read()
         try:
             data = unicode(data, encoding)
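The renamed _detect_encoding helper sniffs a byte-order mark before decoding subtitle files. A standalone sketch of the same idea, assuming the BOM table shown in the hunk above (utf_32_be, utf_32_le, ...); the latin-1 fallback is an assumption, not the module's actual default:

    import codecs

    def detect_bom_encoding(fp):
        # compare the first four bytes against known BOMs, longest first,
        # so utf_32_le is not mistaken for utf_16_le
        head = fp.read(4)
        fp.seek(0)
        for bom, name in [(codecs.BOM_UTF32_BE, 'utf_32_be'),
                          (codecs.BOM_UTF32_LE, 'utf_32_le'),
                          (codecs.BOM_UTF8, 'utf_8'),
                          (codecs.BOM_UTF16_BE, 'utf_16_be'),
                          (codecs.BOM_UTF16_LE, 'utf_16_le')]:
            if head.startswith(bom):
                return name
        return 'latin-1'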
ox/text.py (80 lines changed)

@@ -257,24 +257,24 @@ def get_sort_title(title):
         return title[length + spaces:] + ', ' + title[:length]
     return title

-def findRe(string, regexp):
+def find_re(string, regexp):
     result = re.compile(regexp, re.DOTALL).findall(string)
     if result:
         return result[0].strip()
     return ''

-def findString(string, string0='', string1 = ''):
+def find_string(string, string0='', string1 = ''):
     """Return the string between string0 and string1.

     If string0 or string1 is left out, begining or end of string is used.

-    >>> findString('i am not there', string1=' not there')
+    >>> find_string('i am not there', string1=' not there')
     'i am'

-    >>> findString('i am not there', 'i am ', ' there')
+    >>> find_string('i am not there', 'i am ', ' there')
     'not'

-    >>> findString('i am not there', 'i am not t')
+    >>> find_string('i am not there', 'i am not t')
     'here'

     """

@@ -286,7 +286,7 @@ def findString(string, string0='', string1 = ''):
         string1 = re.escape(string1)
     else:
         string1 = '$'
-    return findRe(string, string0 + '(.*?)' + string1)
+    return find_re(string, string0 + '(.*?)' + string1)

 def parse_useragent(useragent):
     data = {}

@@ -319,7 +319,7 @@ def parse_useragent(useragent):
             break;
     return data

-def removeSpecialCharacters(text):
+def remove_special_characters(text):
     """
     Removes special characters inserted by Word.
     """

@@ -346,22 +346,22 @@ def wrap(text, width):
         text.split(' ')
     )

-def wrapString(string, length=80, separator='\n', balance=False):
+def wrap_string(string, length=80, separator='\n', balance=False):
     '''
-    >>> wrapString(u"Anticonstitutionellement, Paris s'eveille", 16)
+    >>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16)
     u"Anticonstitution\\nellement, Paris \\ns'eveille"
-    >>> wrapString(u'All you can eat', 12, '\\n', True)
+    >>> wrap_string(u'All you can eat', 12, '\\n', True)
     u'All you \\ncan eat'
     '''
     words = string.split(' ')
     if balance:
         # balance lines: test if same number of lines
         # can be achieved with a shorter line length
-        lines = wrapString(string, length, separator, False).split(separator)
+        lines = wrap_string(string, length, separator, False).split(separator)
         if len(lines) > 1:
             while length > max(map(lambda x : len(x), words)):
                 length -= 1
-                if len(wrapString(string, length, separator, False).split(separator)) > len(lines):
+                if len(wrap_string(string, length, separator, False).split(separator)) > len(lines):
                     length += 1
                     break
     lines = ['']

@@ -382,12 +382,12 @@ def wrapString(string, length=80, separator='\n', balance=False):
             lines[len(lines) - 1] += u' '
     return separator.join(lines).strip()

-def truncateString(string, length, padding='...', position='right'):
-    # >>> truncateString('anticonstitutionellement', 16, '...', 'left')
+def truncate_string(string, length, padding='...', position='right'):
+    # >>> truncate_string('anticonstitutionellement', 16, '...', 'left')
     # '...utionellement'
-    # >>> truncateString('anticonstitutionellement', 16, '...', 'center')
+    # >>> truncate_string('anticonstitutionellement', 16, '...', 'center')
     # 'anticon...lement'
-    # >>> truncateString('anticonstitutionellement', 16, '...', 'right')
+    # >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
     # 'anticonstitut...'
     stringLength = len(string);
     paddingLength = len(padding)

@@ -402,12 +402,12 @@ def truncateString(string, length, padding='...', position='right'):
         string = '%s%s' % (string[:length - paddingLength], padding)
     return string;

-def truncateWords(s, num):
+def truncate_words(s, num):
     """Truncates a string after a certain number of chacters, but ends with a word

-    >>> truncateString('Truncates a string after a certain number of chacters, but ends with a word', 23)
+    >>> truncate_string('Truncates a string after a certain number of chacters, but ends with a word', 23)
     'Truncates a string...'
-    >>> truncateString('Truncates a string', 23)
+    >>> truncate_string('Truncates a string', 23)
     'Truncates a string'

     """

@@ -422,25 +422,25 @@ def truncateWords(s, num):
         ts += "..."
     return ts.strip()

-def trimString(string, num):
+def trim_string(string, num):
     """Truncates a string after a certain number of chacters, adding ... at -10 characters

-    >>> trimString('Truncates a string after a certain number of chacters', 23)
+    >>> trim_string('Truncates a string after a certain number of chacters', 23)
     'Truncates ...f chacters'
-    >>> trimString('Truncates a string', 23)
+    >>> trim_string('Truncates a string', 23)
     'Truncates a string'
     """
     if len(string) > num:
         string = string[:num - 13] + '...' + string[-10:]
     return string

-def getValidFilename(s):
+def get_valid_filename(s):
     """
     Returns the given string converted to a string that can be used for a clean
     filename. Specifically, leading and trailing spaces are removed;
     all non-filename-safe characters are removed.

-    >>> getValidFilename("john's portrait in 2004.jpg")
+    >>> get_valid_filename("john's portrait in 2004.jpg")
     'john_s_portrait_in_2004.jpg'
     """
     s = s.strip()

@@ -449,34 +449,34 @@ def getValidFilename(s):
     s = s.replace('__', '_').replace('__', '_')
     return s

-def getTextList(list_, last_word='or'):
+def get_text_list(list_, last_word='or'):
     """
-    >>> getTextList([u'a', u'b', u'c', u'd'])
+    >>> get_text_list([u'a', u'b', u'c', u'd'])
     u'a, b, c or d'
-    >>> getTextList([u'a', u'b', u'c'], 'and')
+    >>> get_text_list([u'a', u'b', u'c'], 'and')
     u'a, b and c'
-    >>> getTextList([u'a', u'b'], 'and')
+    >>> get_text_list([u'a', u'b'], 'and')
     u'a and b'
-    >>> getTextList([u'a'])
+    >>> get_text_list([u'a'])
     u'a'
-    >>> getTextList([])
+    >>> get_text_list([])
     ''
     """
     if len(list_) == 0: return ''
     if len(list_) == 1: return list_[0]
     return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1])

-def getListText(text, last_word='or'):
+def get_list_text(text, last_word='or'):
     """
-    >>> getListText(u'a, b, c or d')
+    >>> get_list_text(u'a, b, c or d')
     [u'a', u'b', u'c', u'd']
-    >>> getListText(u'a, b and c', u'and')
+    >>> get_list_text(u'a, b and c', u'and')
     [u'a', u'b', u'c']
-    >>> getListText(u'a and b', u'and')
+    >>> get_list_text(u'a and b', u'and')
     [u'a', u'b']
-    >>> getListText(u'a')
+    >>> get_list_text(u'a')
     [u'a']
-    >>> getListText(u'')
+    >>> get_list_text(u'')
     []
     """
     list_ = []

@@ -490,7 +490,7 @@ def getListText(text, last_word='or'):
             list_.append(last[1].strip())
     return list_

-def normalizeNewlines(text):
+def normalize_newlines(text):
     return re.sub(r'\r\n|\r|\n', '\n', text)

 def recapitalize(text):

@@ -514,7 +514,7 @@ def phone2numeric(phone):
              'y': '9', 'x': '9'}.get(m.group(0).lower())
     return letters.sub(char2number, phone)

-def compressString(s):
+def compress_string(s):
     import cStringIO, gzip
     zbuf = cStringIO.StringIO()
     zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)

@@ -523,13 +523,13 @@ def compressString(s):
     return zbuf.getvalue()

 smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
-def smartSplit(text):
+def smart_split(text):
     """
     Generator that splits a string by spaces, leaving quoted phrases together.
     Supports both single and double quotes, and supports escaping quotes with
     backslashes. In the output, strings will keep their initial and trailing
     quote marks.
-    >>> list(smartSplit('This is "a person\\'s" test.'))
+    >>> list(smart_split('This is "a person\\'s" test.'))
     ['This', 'is', '"a person\\'s"', 'test.']
     """
     for bit in smart_split_re.finditer(text):
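find_re is the most widely renamed helper in this commit; every ox.web scraper below depends on it. Its contract, inferred from the implementation shown above: return the first match of the pattern's group, stripped, or '' when nothing matches:

    from ox.text import find_re, find_string

    assert find_re('Spine #123', '#(\d+)') == '123'
    assert find_re('no digits', '(\d+)') == ''
    # find_string builds on find_re; value from the doctests above:
    assert find_string('i am not there', 'i am ', ' there') == 'not'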
@@ -3,7 +3,7 @@
 import re
 import time

-from ox import strip_tags, findRe
+from ox import strip_tags, find_re
 from ox.cache import read_url


@@ -28,22 +28,22 @@ def getData(id):
     }
     html = read_url(data["url"], unicode=True)
     data['aka'] = parseList(html, 'AKA')
-    data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
+    data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
     data['countries'] = parseList(html, 'countries')
     data['director'] = parseEntry(html, 'directed by')
     data['genres'] = parseList(html, 'genres')
     data['keywords'] = parseList(html, 'keywords')
-    data['posters'] = [findRe(html, '<img src="(http://cps-.*?)"')]
+    data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
     data['produced'] = parseList(html, 'produced by')
-    data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"')
+    data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
     data['released'] = parseEntry(html, 'released by')
     data['releasedate'] = parseList(html, 'release date')
     data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
     data['set'] = parseEntry(html, 'set in')
-    data['synopsis'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
+    data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
     data['themes'] = parseList(html, 'themes')
     data['types'] = parseList(html, 'types')
-    data['year'] = findRe(html, '<span class="year">.*?(\d+)')
+    data['year'] = find_re(html, '<span class="year">.*?(\d+)')
     #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
     data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
     #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)

@@ -51,18 +51,18 @@ def getData(id):
     #html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
     #data['credits'] = parseTable(html)
     html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
-    data['review'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
+    data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
     return data

 def getUrl(id):
     return "http://allmovie.com/work/%s" % id

 def parseEntry(html, title):
-    html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
+    html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
     return strip_tags(html).strip()

 def parseList(html, title):
-    html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
+    html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
     r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
     if not r and html:
         r = [strip_tags(html)]

@@ -74,11 +74,11 @@ def parseTable(html):
             lambda x: strip_tags(x).strip().replace('&nbsp;', ''),
             x.split('<td width="305">-')
         ),
-        findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
+        find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
     )

 def parseText(html, title):
-    return strip_tags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
+    return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()

 if __name__ == '__main__':
     print getData('129689')
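The allmovie module above illustrates the pattern all the ox.web scrapers in this commit follow: fetch cached HTML with read_url, then cut fields out with find_re and strip_tags. A hypothetical minimal scraper in the same style (get_title and its pattern are illustrations, not part of ox):

    from ox import find_re, strip_tags
    from ox.cache import read_url

    def get_title(url):
        # read_url caches the fetch; unicode=True decodes the response
        html = read_url(url, unicode=True)
        return strip_tags(find_re(html, '<h1>(.*?)</h1>'))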
@@ -3,7 +3,7 @@
 import re
 from urllib import quote

-from ox import findRe, strip_tags, decodeHtml
+from ox import find_re, strip_tags, decode_html
 from ox.cache import read_url


@@ -12,7 +12,7 @@ def findISBN(title, author):
     url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
     data = read_url(url, unicode=True)
     links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
-    id = findRe(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
+    id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
     data = getData(id)
     if author in data['authors']:
         return data

@@ -24,13 +24,13 @@ def getData(id):

     def findData(key):
-        return findRe(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
+        return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()

     r = {}
     r['amazon'] = url
-    r['title'] = findRe(data, '<span id="btAsinTitle" style="">(.*?)<span')
+    r['title'] = find_re(data, '<span id="btAsinTitle" style="">(.*?)<span')
     r['authors'] = re.compile('<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL).findall(data)
-    r['authors'] = filter(lambda x: len(x)>1, [decodeHtml(a) for a in r['authors']])
+    r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']])
     t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
     if t:
         r['translator'] = t

@@ -38,15 +38,15 @@ def getData(id):
     r['language'] = findData('Language')
     r['isbn-10'] = findData('ISBN-10')
     r['isbn-13'] = findData('ISBN-13').replace('-', '')
-    r['dimensions'] = findRe(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
+    r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')

     r['pages'] = findData('Paperback')
     if not r['pages']:
         r['pages'] = findData('Hardcover')

-    r['review'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
+    r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()

-    r['description'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
+    r['description'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()

     r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
     if r['cover']:
@@ -5,7 +5,7 @@ import re
 import ox.cache
 from ox.cache import read_url
 from ox.html import strip_tags
-from ox.text import findRe, removeSpecialCharacters
+from ox.text import find_re, remove_special_characters

 import imdb

@@ -33,40 +33,40 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
         html = read_url(data["url"], timeout=timeout, unicode=True)
     except:
         html = ox.cache.read_url(data["url"], timeout=timeout)
-    data["number"] = findRe(html, "<li>Spine #(\d+)")
+    data["number"] = find_re(html, "<li>Spine #(\d+)")

-    data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
+    data["title"] = find_re(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
     data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
-    data["director"] = strip_tags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
-    results = findRe(html, '<div class="left_column">(.*?)</div>')
+    data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
+    results = find_re(html, '<div class="left_column">(.*?)</div>')
     results = re.compile("<li>(.*?)</li>").findall(results)
     data["country"] = results[0]
     data["year"] = results[1]
-    data["synopsis"] = strip_tags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
+    data["synopsis"] = strip_tags(find_re(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))

-    result = findRe(html, "<div class=\"purchase\">(.*?)</div>")
+    result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
     if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
         r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
         if r:
             result = r[0]
-    result = findRe(result, "<a href=\"(.*?)\"")
+    result = find_re(result, "<a href=\"(.*?)\"")
     if not "/boxsets/" in result:
         data["posters"] = [result]
     else:
         html_ = read_url(result, unicode=True)
-        result = findRe(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
-        result = findRe(result, "src=\"(.*?)\"")
+        result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
+        result = find_re(result, "src=\"(.*?)\"")
         if result:
             data["posters"] = [result.replace("_w100", "")]
         else:
             data["posters"] = []
-    result = findRe(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
+    result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
     if result:
         data["stills"] = [result]
         data["trailers"] = []
     else:
-        data["stills"] = filter(lambda x: x, [findRe(html, "\"thumbnailURL\", \"(.*?)\"")])
-        data["trailers"] = filter(lambda x: x, [findRe(html, "\"videoURL\", \"(.*?)\"")])
+        data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
+        data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])

     if timeout == ox.cache.cache_timeout:
         timeout = -1
@@ -3,7 +3,7 @@
 import re
 import urllib
 import ox
-from ox import strip_tags, decodeHtml
+from ox import strip_tags, decode_html
 from ox.utils import json
 from ox.cache import read_url

@@ -17,6 +17,6 @@ def find(query, timeout=ox.cache.cache_timeout):
     results = []
     regex = '<a .*?class="l le" href="(.+?)">(.*?)</a>.*?<div class="cra">(.*?)</div>'
     for r in re.compile(regex, re.DOTALL).findall(data):
-        results.append((strip_tags(decodeHtml(r[1])), r[0], strip_tags(decodeHtml(r[2]))))
+        results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
     return results

@@ -3,7 +3,7 @@
 import re
 import time

-from ox import strip_tags, findRe
+from ox import strip_tags, find_re
 from ox.cache import read_url

 import google

@@ -23,8 +23,8 @@ def getShowUrl(title):
 def getShowData(url):
     data = read_url(url, unicode=True)
     r = {}
-    r['title'] = strip_tags(findRe(data, '<h1>(.*?)</h1>'))
-    r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
+    r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
+    r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
     r['episodes'] = {}
     #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
     for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
@@ -5,7 +5,7 @@ import re
 from lxml.html import document_fromstring

 from ox.cache import read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 from ox.web.imdb import ImdbCombined

@@ -3,7 +3,7 @@
 import json

 from ox.cache import read_url
-from ox import findRe
+from ox import find_re

 class Imdb(dict):
     def __init__(self, id, timeout=-1):

@@ -36,7 +36,7 @@ class Imdb(dict):

         if 'nytimes' in self:
             self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
-            self['amgId'] = findRe(self['nytimes'], 'movie/(\d+)/')
+            self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')

@@ -4,7 +4,7 @@ import re
 import urllib

 import ox
-from ox import strip_tags, decodeHtml
+from ox import strip_tags, decode_html

 DEFAULT_MAX_RESULTS = 10
 DEFAULT_TIMEOUT = 24*60*60

@@ -34,7 +34,7 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
     for a in re.compile(
         '<a href="(\S+?)" class=l .*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>'
     ).findall(data):
-        results.append((strip_tags(decodeHtml(a[1])), a[0], strip_tags(decodeHtml(a[2]))))
+        results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
         if len(results) >= max_results:
             break
     return results
@@ -8,8 +8,8 @@ import time
 import unicodedata

 import ox
-from ox import findRe, strip_tags
-from ox.normalize import normalizeTitle, normalizeImdbId
+from ox import find_re, strip_tags
+from ox.normalize import normalize_title, normalize_imdbid
 import ox.cache

 from siteparser import SiteParser

@@ -50,7 +50,7 @@ class Imdb(SiteParser):
         'page': 'business',
         're': [
             '<h5>Budget</h5>\s*?\$(.*?)<br',
-            lambda data: findRe(ox.decodeHtml(data).replace(',', ''), '\d+')
+            lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
         ],
         'type': 'int'
     },

@@ -141,7 +141,7 @@ class Imdb(SiteParser):
         'page': 'business',
         're': [
             '<h5>Gross</h5>\s*?\$(.*?)<br',
-            lambda data: findRe(data.replace(',', ''), '\d+')
+            lambda data: find_re(data.replace(',', ''), '\d+')
         ],
         'type': 'int'
     },

@@ -314,7 +314,7 @@ class Imdb(SiteParser):
         if 'runtime' in self and self['runtime']:
             if 'min' in self['runtime']: base=60
             else: base=1
-            self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
+            self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
         if 'runtime' in self and not self['runtime']:
             del self['runtime']
         if 'votes' in self: self['votes'] = self['votes'].replace(',', '')

@@ -551,7 +551,7 @@ def getMovieId(title, director='', year='', timeout=-1):
     #print google_query
     results = google.find(google_query, timeout=timeout)
     if results:
-        return findRe(results[0][1], 'title/tt(\d{7})')
+        return find_re(results[0][1], 'title/tt(\d{7})')
     #or nothing
     return ''

@@ -567,7 +567,7 @@ def getMoviePoster(imdbId):
     if 'posterId' in info:
         url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId)
         data = read_url(url)
-        poster = findRe(data, 'img id="primary-img".*?src="(.*?)"')
+        poster = find_re(data, 'img id="primary-img".*?src="(.*?)"')
         return poster
     elif 'series' in info:
         return getMoviePoster(info['series'])
@@ -4,7 +4,7 @@ import re

 from ox.cache import read_url
 from ox.html import strip_tags
-from ox.text import findRe
+from ox.text import find_re


 def getData(id):

@@ -22,13 +22,13 @@ def getData(id):
         'url': getUrl(id)
     }
     html = read_url(data['url'], unicode=True)
-    data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
+    data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
     if not data['imdbId']:
         data['imdbId'] = _id_map.get(id, '')
-    data['title'] = strip_tags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
-    data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
+    data['title'] = strip_tags(find_re(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
+    data['year'] = find_re(html, '\(<a href="alpha1.html">(.*?)</a>\)')
     data['posters'] = []
-    poster = findRe(html, '<img src="(posters.*?)"')
+    poster = find_re(html, '<img src="(posters.*?)"')
     if poster:
         poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
         data['posters'].append(poster)

@@ -37,13 +37,13 @@ def getData(id):
         result = result.replace('_xlg.html', '.html')
         url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
         html = read_url(url, unicode=True)
-        result = findRe(html, '<a href = (\w*?_xlg.html)')
+        result = find_re(html, '<a href = (\w*?_xlg.html)')
         if result:
             url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
             html = read_url(url, unicode=True)
-            poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
+            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
         else:
-            poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)"'))
+            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
         data['posters'].append(poster)

     return data

@@ -54,7 +54,7 @@ def getId(url):
     split = split[4][:-5].split('_')
     if split[-1] == 'xlg':
         split.pop()
-    if findRe(split[-1], 'ver\d+$'):
+    if find_re(split[-1], 'ver\d+$'):
         split.pop()
     id = '%s/%s' % (year, '_'.join(split))
     return id

@@ -62,7 +62,7 @@ def getId(url):
 def getIds():
     ids = []
     html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
-    pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
+    pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
     for page in range(pages, 0, -1):
         for id in getIdsByPage(page):
             if not id in ids:

@@ -81,7 +81,7 @@ def getIdsByPage(page):
 def getUrl(id):
     url = u"http://www.impawards.com/%s.html" % id
     html = read_url(url, unicode=True)
-    if findRe(html, "No Movie Posters on This Page"):
+    if find_re(html, "No Movie Posters on This Page"):
         url = u"http://www.impawards.com/%s_ver1.html" % id
     return url

@ -4,9 +4,9 @@ import re
|
||||||
import urllib
|
import urllib
|
||||||
|
|
||||||
from ox.cache import read_url
|
from ox.cache import read_url
|
||||||
from ox.html import decodeHtml, strip_tags
|
from ox.html import decode_html, strip_tags
|
||||||
from ox.text import findRe
|
from ox.text import find_re
|
||||||
from ox.text import findString
|
from ox.text import find_string
|
||||||
|
|
||||||
|
|
||||||
# to sniff itunes traffic, use something like
|
# to sniff itunes traffic, use something like
|
||||||
|
@ -65,26 +65,26 @@ def parseXmlDict(xml):
|
||||||
strings = xml.split('<key>')
|
strings = xml.split('<key>')
|
||||||
for string in strings:
|
for string in strings:
|
||||||
if string.find('</key>') != -1:
|
if string.find('</key>') != -1:
|
||||||
key = findRe(string, '(.*?)</key>')
|
key = find_re(string, '(.*?)</key>')
|
||||||
type = findRe(string, '</key><(.*?)>')
|
type = find_re(string, '</key><(.*?)>')
|
||||||
if type == 'true/':
|
if type == 'true/':
|
||||||
value = True
|
value = True
|
||||||
else:
|
else:
|
||||||
value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
|
value = find_re(string, '<%s>(.*?)</%s>' % (type, type))
|
||||||
if type == 'integer':
|
if type == 'integer':
|
||||||
value = int(value)
|
value = int(value)
|
||||||
elif type == 'string':
|
elif type == 'string':
|
||||||
value = decodeHtml(value)
|
value = decode_html(value)
|
||||||
values[key] = value
|
values[key] = value
|
||||||
return values
|
return values
|
||||||
|
|
||||||
def parseCast(xml, title):
|
def parseCast(xml, title):
|
||||||
list = []
|
list = []
|
||||||
try:
|
try:
|
||||||
strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
|
strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
|
||||||
strings.pop()
|
strings.pop()
|
||||||
for string in strings:
|
for string in strings:
|
||||||
list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
list.append(find_re(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||||
return list
|
return list
|
||||||
except:
|
except:
|
||||||
return list
|
return list
|
||||||
|
@ -92,12 +92,12 @@ def parseCast(xml, title):
|
||||||
def parseMovies(xml, title):
|
def parseMovies(xml, title):
|
||||||
list = []
|
list = []
|
||||||
try:
|
try:
|
||||||
strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
|
strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
|
||||||
strings.pop()
|
strings.pop()
|
||||||
for string in strings:
|
for string in strings:
|
||||||
list.append({
|
list.append({
|
||||||
'id': findRe(string, 'viewMovie\?id=(.*?)&'),
|
'id': find_re(string, 'viewMovie\?id=(.*?)&'),
|
||||||
'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
|
'title': find_re(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
|
||||||
})
|
})
|
||||||
return list
|
return list
|
||||||
except:
|
except:
|
||||||
|
@@ -114,24 +114,24 @@ class ItunesAlbum:
     def getId(self):
         url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
         xml = read_url(url, headers = ITUNES_HEADERS)
-        id = findRe(xml, 'viewAlbum\?id=(.*?)&')
+        id = find_re(xml, 'viewAlbum\?id=(.*?)&')
         return id

     def getData(self):
         data = {'id': self.id}
         url = composeUrl('viewAlbum', {'id': self.id})
         xml = read_url(url, None, ITUNES_HEADERS)
-        data['albumName'] = findRe(xml, '<B>(.*?)</B>')
-        data['artistName'] = findRe(xml, '<b>(.*?)</b>')
-        data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
-        data['genre'] = findRe(xml, 'Genre:(.*?)<')
-        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
-        data['review'] = strip_tags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
+        data['albumName'] = find_re(xml, '<B>(.*?)</B>')
+        data['artistName'] = find_re(xml, '<b>(.*?)</b>')
+        data['coverUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
+        data['genre'] = find_re(xml, 'Genre:(.*?)<')
+        data['releaseDate'] = find_re(xml, 'Released(.*?)<')
+        data['review'] = strip_tags(find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
         data['tracks'] = []
-        strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
+        strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
         for string in strings:
             data['tracks'].append(parseXmlDict(string))
-        data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
+        data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
         return data

 class ItunesMovie:
@@ -145,7 +145,7 @@ class ItunesMovie:
     def getId(self):
         url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
         xml = read_url(url, headers = ITUNES_HEADERS)
-        id = findRe(xml, 'viewMovie\?id=(.*?)&')
+        id = find_re(xml, 'viewMovie\?id=(.*?)&')
         return id

     def getData(self):
@@ -156,21 +156,21 @@ class ItunesMovie:
             f.write(xml)
             f.close()
         data['actors'] = parseCast(xml, 'actors')
-        string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
+        string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
         data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
         data['directors'] = parseCast(xml, 'directors')
-        data['format'] = findRe(xml, 'Format:(.*?)<')
-        data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
-        data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
-        data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
+        data['format'] = find_re(xml, 'Format:(.*?)<')
+        data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
+        data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
+        data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
         data['producers'] = parseCast(xml, 'producers')
-        data['rated'] = findRe(xml, 'Rated(.*?)<')
+        data['rated'] = find_re(xml, 'Rated(.*?)<')
         data['relatedMovies'] = parseMovies(xml, 'related movies')
-        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
-        data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
+        data['releaseDate'] = find_re(xml, 'Released(.*?)<')
+        data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
         data['screenwriters'] = parseCast(xml, 'screenwriters')
-        data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
-        data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
+        data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
+        data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
         return data

 if __name__ == '__main__':
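The averageRating line above counts full-star images plus half-star glyphs in the rating markup; a toy run on a made-up snippet:

string = 'rating_star_000033.png rating_star_000033.png rating_star_000033.png ½'
print(string.count('rating_star_000033.png') + string.count('½') * 0.5)  # 3.5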
@@ -1,20 +1,20 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 from ox.cache import read_url
-from ox.html import decodeHtml
-from ox.text import findRe
+from ox.html import decode_html
+from ox.text import find_re


 def getLyrics(title, artist):
     html = read_url('http://lyricsfly.com/api/')
-    key = findRe(html, '<font color=green><b>(.*?)</b></font>')
+    key = find_re(html, '<font color=green><b>(.*?)</b></font>')
     url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
     xml = read_url(url)
-    lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
+    lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
     lyrics = lyrics.replace('\n', '').replace('\r', '')
     lyrics = lyrics.replace('[br]', '\n').strip()
     lyrics.replace('\n\n\n', '\n\n')
-    lyrics = decodeHtml(lyrics.replace('&amp;', '&'))
+    lyrics = decode_html(lyrics.replace('&amp;', '&'))
     return lyrics

 if __name__ == '__main__':
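Worth noting in passing: the bare `lyrics.replace('\n\n\n', '\n\n')` call discards its result, since Python strings are immutable, so runs of blank lines survive; the commit keeps that line as-is. A toy run of the [br] clean-up on a hypothetical API response fragment:

lyrics = 'first line[br]second line[br][br]chorus\r\n'
lyrics = lyrics.replace('\n', '').replace('\r', '')
lyrics = lyrics.replace('[br]', '\n').strip()
print(lyrics)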
@@ -5,7 +5,7 @@ from urllib import quote
 from lxml.html import document_fromstring

 from ox.cache import read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags

 def getUrl(id):
     return 'http://www.metacritic.com/movie/%s' % id
@@ -16,14 +16,14 @@ def getId(url):
 def getUrlByImdb(imdb):
     url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
     data = read_url(url)
-    metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"')
+    metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
     return metacritic_url or None

 def getMetacriticShowUrl(title):
     title = quote(title)
     url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
     data = read_url(url)
-    return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
+    return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')

 def getData(url):
     data = read_url(url, unicode=True)
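One property the `or None` in getUrlByImdb relies on: find_re returns an empty string when the pattern does not match. A quick check, assuming python-ox is importable (the markup is made up):

from ox import find_re

page = '<a href="http://www.metacritic.com/movie/example-movie">reviews</a>'
print(find_re(page, '"(http://www.metacritic.com/movie/.*?)"'))  # the URL
print(find_re('no link here', '"(http://www.metacritic.com/movie/.*?)"') or None)  # None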
@@ -6,8 +6,8 @@ import socket
 from urllib import quote

 from ox.cache import read_url
-from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, int_value, normalizeNewlines
-from ox.normalize import normalizeImdbId
+from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, int_value, normalize_newlines
+from ox.normalize import normalize_imdbid
 import ox

 from torrent import Torrent
@@ -20,7 +20,7 @@ def _parseResultsPage(data, max_results=10):
         torrentDate = row[0]
         torrentExtra = row[1]
         torrentId = row[2]
-        torrentTitle = decodeHtml(row[3]).strip()
+        torrentTitle = decode_html(row[3]).strip()
         torrentLink = "http://www.mininova.org/tor/" + torrentId
         privateTracker = 'priv.gif' in torrentExtra
         if not privateTracker:
@@ -38,13 +38,13 @@ def findMovieByImdb(imdbId):
     '''find torrents on mininova for a given imdb id
     '''
     results = []
-    imdbId = normalizeImdbId(imdbId)
+    imdbId = normalize_imdbid(imdbId)
     data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
     return _parseResultsPage(data)

 def getId(mininovaId):
     mininovaId = unicode(mininovaId)
-    d = findRe(mininovaId, "/(\d+)")
+    d = find_re(mininovaId, "/(\d+)")
     if d:
         return d
     mininovaId = mininovaId.split('/')
@@ -81,14 +81,14 @@ def getData(mininovaId):
     for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
         key = d[0].lower().strip()
         key = _key_map.get(key, key)
-        value = decodeHtml(strip_tags(d[1].strip()))
+        value = decode_html(strip_tags(d[1].strip()))
         torrent[key] = value

-    torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
-    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
-    torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
+    torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
+    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
+    torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
     if torrent['description']:
-        torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
+        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
     t = read_url(torrent[u'torrent_link'])
     torrent[u'torrent_info'] = getTorrentInfo(t)
     return torrent
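The description clean-up chains three of the renamed helpers; with the new names the pipeline reads like this, assuming python-ox is importable (the sample HTML is made up):

from ox import strip_tags, decode_html, normalize_newlines

html = '<div>first &amp; second\r\nthird</div>'
print(normalize_newlines(decode_html(strip_tags(html))).strip())
# first & second
# third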
@@ -4,7 +4,7 @@
 import re

 from ox.cache import read_url
-from ox import findRe
+from ox import find_re

 def getData(id):
     '''
@@ -33,7 +33,7 @@ def getPostersByUrl(url, group=True, timeout=-1):
         results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
         for result in results:
             html = read_url(result, timeout=timeout, unicode=True)
-            posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
+            posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
     return posters

 def getUrl(id):
@@ -4,7 +4,7 @@ import re

 import feedparser
 from ox.cache import read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 from ox import langCode2To3, langTo3Code

 def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
@@ -26,7 +26,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
     if opensubtitleId:
         opensubtitleId = opensubtitleId[0]
     else:
-        opensubtitleId = findRe(data, '/en/subtitles/(.*?)/')
+        opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
     return opensubtitleId

 def downloadSubtitleById(opensubtitle_id):
@@ -3,7 +3,7 @@
 import re

 from ox.cache import getHeaders, read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags


 def getUrlByImdb(imdb):
@@ -22,16 +22,16 @@ def getUrlByImdb(imdb):
     return None

 def get_og(data, key):
-    return findRe(data, '<meta property="og:%s".*?content="(.*?)"' % key)
+    return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)

 def getData(url):
     data = read_url(url)
     r = {}
-    r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
+    r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
     if '(' in r['title']:
-        r['year'] = findRe(r['title'], '\((\d*?)\)')
+        r['year'] = find_re(r['title'], '\((\d*?)\)')
         r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
-    r['summary'] = strip_tags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
+    r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
     r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
     if not r['summary']:
         r['summary'] = get_og(data, 'description')
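get_og simply greps the Open Graph <meta> tags; a quick check of the pattern, assuming python-ox is importable (the markup is made up):

from ox import find_re

data = '<meta property="og:title" content="Example Movie">'
print(find_re(data, '<meta property="og:%s".*?content="(.*?)"' % 'title'))
# Example Movie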
@@ -40,9 +40,9 @@ def getData(url):
     meter = filter(lambda m: m[1].isdigit(), meter)
     if meter:
         r['tomatometer'] = meter[0][1]
-    r['rating'] = findRe(data, 'Average Rating: <span>([\d.]+)/10</span>')
-    r['user_score'] = findRe(data, '<span class="meter popcorn numeric ">(\d+)</span>')
-    r['user_rating'] = findRe(data, 'Average Rating: ([\d.]+)/5')
+    r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
+    r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
+    r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
     poster = get_og(data, 'image')
     if poster and not 'poster_default.gif' in poster:
         r['posters'] = [poster]
@@ -3,7 +3,7 @@
 import re

 from ..cache import read_url
-from .. import strip_tags, decodeHtml
+from .. import strip_tags, decode_html
 from ..utils import datetime


@@ -11,8 +11,8 @@ def cleanup(key, data, data_type):
     if data:
         if isinstance(data[0], basestring):
             #FIXME: some types need strip_tags
-            #data = [strip_tags(decodeHtml(p)).strip() for p in data]
-            data = [decodeHtml(p).strip() for p in data]
+            #data = [strip_tags(decode_html(p)).strip() for p in data]
+            data = [decode_html(p).strip() for p in data]
         elif isinstance(data[0], list) or isinstance(data[0], tuple):
             data = [cleanup(key, p, data_type) for p in data]
     while len(data) == 1 and not isinstance(data, basestring):
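The trailing while loop in cleanup() unwraps single-element nesting produced by the regex parser; a toy run of just that step (basestring is the Python 2 string base class this module targets):

data = [['only value']]
while len(data) == 1 and not isinstance(data, basestring):
    data = data[0]
print(data)  # only value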
@@ -5,7 +5,7 @@ import re
 import time

 import ox.cache
-from ox.html import decodeHtml, strip_tags
+from ox.html import decode_html, strip_tags
 import ox.net


@@ -44,8 +44,8 @@ def getNews(year, month, day):
             new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
         else:
             new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
-        # fix decodeHtml
-        # new['description'] = formatString(decodeHtml(description))
+        # fix decode_html
+        # new['description'] = formatString(decode_html(description))
         new['description'] = formatString(description)
         new['imageUrl'] = imageUrl
         new['section'] = formatSection(section)
@@ -6,8 +6,8 @@ import socket
 from urllib import quote, urlencode
 from urllib2 import URLError

-from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, normalizeNewlines
-from ox.normalize import normalizeImdbId
+from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, normalize_newlines
+from ox.normalize import normalize_imdbid
 import ox

 from torrent import Torrent
@@ -38,7 +38,7 @@ def findMovies(query, max_results=10):
     for row in re.compile(regexp, re.DOTALL).findall(data):
         torrentType = row[0]
         torrentLink = "http://thepiratebay.org" + row[1]
-        torrentTitle = decodeHtml(row[2])
+        torrentTitle = decode_html(row[2])
         # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
         if torrentType in ['201']:
             results.append((torrentTitle, torrentLink, ''))
@@ -48,15 +48,15 @@ def findMovies(query, max_results=10):
     return results

 def findMovieByImdb(imdb):
-    return findMovies("tt" + normalizeImdbId(imdb))
+    return findMovies("tt" + normalize_imdbid(imdb))

 def getId(piratebayId):
     if piratebayId.startswith('http://torrents.thepiratebay.org/'):
         piratebayId = piratebayId.split('org/')[1]
-    d = findRe(piratebayId, "tor/(\d+)")
+    d = find_re(piratebayId, "tor/(\d+)")
     if d:
         piratebayId = d
-    d = findRe(piratebayId, "torrent/(\d+)")
+    d = find_re(piratebayId, "torrent/(\d+)")
     if d:
         piratebayId = d
     return piratebayId
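getId accepts several id shapes; a compressed illustration of the same normalization, assuming python-ox is importable (the ids are hypothetical):

from ox import find_re

for piratebayId in ('http://thepiratebay.org/torrent/123456', 'tor/123456', '123456'):
    print(find_re(piratebayId, "torrent/(\d+)") or find_re(piratebayId, "tor/(\d+)") or piratebayId)
# 123456 in all three cases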
@@ -80,21 +80,21 @@ def getData(piratebayId):
     torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId

     data = read_url(torrent['comment_link'], unicode=True)
-    torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
+    torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
     if not torrent[u'title']:
         return None
-    torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
-    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
+    torrent[u'title'] = decode_html(torrent[u'title']).strip()
+    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
     title = quote(torrent['title'].encode('utf-8'))
     torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
     for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
         key = d[0].lower().strip()
         key = _key_map.get(key, key)
-        value = decodeHtml(strip_tags(d[1].strip()))
+        value = decode_html(strip_tags(d[1].strip()))
         torrent[key] = value
-    torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
+    torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
     if torrent[u'description']:
-        torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
+        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
     t = _read_url(torrent[u'torrent_link'])
     torrent[u'torrent_info'] = getTorrentInfo(t)
     return torrent
ox/web/tv.py (10 changes)
@@ -3,7 +3,7 @@
 import re
 import time

-from ox import strip_tags, findRe
+from ox import strip_tags, find_re
 from ox.cache import read_url


@@ -16,11 +16,11 @@ def getEpisodeData(url):
     '''
     data = read_url(url, unicode=True)
     r = {}
-    r['description'] = strip_tags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
-    r['show'] = findRe(data, '<h1>(.*?)</h1>')
-    r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
+    r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
+    r['show'] = find_re(data, '<h1>(.*?)</h1>')
+    r['title'] = find_re(data, '<title>.*?: (.*?) - TV.com </title>')
     #episode score
-    r['episode score'] = findRe(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
+    r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')

     match = re.compile('Episode Number: (\d*?) Season Num: (\d*?) First Aired: (.*?)  ').findall(data)
     if match:
@@ -5,7 +5,7 @@ from StringIO import StringIO
 import xml.etree.ElementTree as ET

 from ox.cache import read_url
-from ox import findString, findRe
+from ox import find_string, find_re


 def getData(id):
@@ -5,7 +5,7 @@ from urllib import urlencode

 from ox.utils import json
 from ox.cache import read_url
-from ox import findRe, decodeHtml
+from ox import find_re, decode_html


 def getId(url):
@@ -54,7 +54,7 @@ def getMovieData(wikipediaUrl):
     if not wikipediaUrl.startswith('http'):
         wikipediaUrl = getUrl(wikipediaUrl)
     data = getWikiData(wikipediaUrl)
-    filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
+    filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
     filmbox = {}
     _box = filmbox_data.strip().split('|')
     for row in _box:
@@ -72,12 +72,12 @@ def getMovieData(wikipediaUrl):
     if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
         del filmbox['amg_id']
     if 'Allmovie movie' in data:
-        filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)')
+        filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
     elif 'Allmovie title' in data:
-        filmbox['amg_id'] = findRe(data, 'Allmovie title\|.*?(\d+)')
+        filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')

     if 'Official website' in data:
-        filmbox['website'] = findRe(data, 'Official website\|(.*?)}').strip()
+        filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()

     r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
     if r:
@@ -99,17 +99,17 @@ def getMovieData(wikipediaUrl):
     if r:
         filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
     if 'google video' in data:
-        filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)[\|}]')
+        filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
     if 'DEFAULTSORT' in data:
-        filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
+        filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
     return filmbox

 def getImageUrl(name):
     url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
     data = read_url(url, unicode=True)
-    url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
+    url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
     if not url:
-        url = findRe(data, 'href="(//upload.wikimedia.org/.*?)"')
+        url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
         if url:
             url = 'http:' + url
     return url
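getImageUrl's fallback handles protocol-relative links by prefixing "http:"; a toy run of that branch, assuming python-ox is importable (the markup and file name are made up):

from ox import find_re

data = '<a href="//upload.wikimedia.org/wikipedia/en/x/xx/Poster.jpg">'
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
if not url:
    url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
    if url:
        url = 'http:' + url
print(url)  # http://upload.wikimedia.org/wikipedia/en/x/xx/Poster.jpg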