replace all CamelCase with under_score in ox

This commit is contained in:
j 2012-08-14 16:12:43 +02:00
commit bb35daa95c
31 changed files with 242 additions and 244 deletions

View file

@ -3,7 +3,7 @@
import re
import time
from ox import strip_tags, findRe
from ox import strip_tags, find_re
from ox.cache import read_url
@ -28,22 +28,22 @@ def getData(id):
}
html = read_url(data["url"], unicode=True)
data['aka'] = parseList(html, 'AKA')
data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
data['countries'] = parseList(html, 'countries')
data['director'] = parseEntry(html, 'directed by')
data['genres'] = parseList(html, 'genres')
data['keywords'] = parseList(html, 'keywords')
data['posters'] = [findRe(html, '<img src="(http://cps-.*?)"')]
data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
data['produced'] = parseList(html, 'produced by')
data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"')
data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
data['released'] = parseEntry(html, 'released by')
data['releasedate'] = parseList(html, 'release date')
data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
data['set'] = parseEntry(html, 'set in')
data['synopsis'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['themes'] = parseList(html, 'themes')
data['types'] = parseList(html, 'types')
data['year'] = findRe(html, '<span class="year">.*?(\d+)')
data['year'] = find_re(html, '<span class="year">.*?(\d+)')
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
@ -51,18 +51,18 @@ def getData(id):
#html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
#data['credits'] = parseTable(html)
html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
data['review'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
return data
def getUrl(id):
return "http://allmovie.com/work/%s" % id
def parseEntry(html, title):
html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
return strip_tags(html).strip()
def parseList(html, title):
html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
if not r and html:
r = [strip_tags(html)]
@ -74,11 +74,11 @@ def parseTable(html):
lambda x: strip_tags(x).strip().replace('&nbsp;', ''),
x.split('<td width="305">-')
),
findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
)
def parseText(html, title):
return strip_tags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
if __name__ == '__main__':
print getData('129689')

View file

@ -3,7 +3,7 @@
import re
from urllib import quote
from ox import findRe, strip_tags, decodeHtml
from ox import find_re, strip_tags, decode_html
from ox.cache import read_url
@ -12,7 +12,7 @@ def findISBN(title, author):
url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
data = read_url(url, unicode=True)
links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
id = findRe(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
data = getData(id)
if author in data['authors']:
return data
@ -24,13 +24,13 @@ def getData(id):
def findData(key):
return findRe(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
r = {}
r['amazon'] = url
r['title'] = findRe(data, '<span id="btAsinTitle" style="">(.*?)<span')
r['title'] = find_re(data, '<span id="btAsinTitle" style="">(.*?)<span')
r['authors'] = re.compile('<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL).findall(data)
r['authors'] = filter(lambda x: len(x)>1, [decodeHtml(a) for a in r['authors']])
r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']])
t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
if t:
r['translator'] = t
@ -38,15 +38,15 @@ def getData(id):
r['language'] = findData('Language')
r['isbn-10'] = findData('ISBN-10')
r['isbn-13'] = findData('ISBN-13').replace('-', '')
r['dimensions'] = findRe(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
r['pages'] = findData('Paperback')
if not r['pages']:
r['pages'] = findData('Hardcover')
r['review'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['description'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['description'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
if r['cover']:

View file

@ -5,7 +5,7 @@ import re
import ox.cache
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import findRe, removeSpecialCharacters
from ox.text import find_re, remove_special_characters
import imdb
@ -33,40 +33,40 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
html = read_url(data["url"], timeout=timeout, unicode=True)
except:
html = ox.cache.read_url(data["url"], timeout=timeout)
data["number"] = findRe(html, "<li>Spine #(\d+)")
data["number"] = find_re(html, "<li>Spine #(\d+)")
data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
data["title"] = find_re(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
data["director"] = strip_tags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
results = findRe(html, '<div class="left_column">(.*?)</div>')
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
results = find_re(html, '<div class="left_column">(.*?)</div>')
results = re.compile("<li>(.*?)</li>").findall(results)
data["country"] = results[0]
data["year"] = results[1]
data["synopsis"] = strip_tags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
data["synopsis"] = strip_tags(find_re(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
result = findRe(html, "<div class=\"purchase\">(.*?)</div>")
result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
if r:
result = r[0]
result = findRe(result, "<a href=\"(.*?)\"")
result = find_re(result, "<a href=\"(.*?)\"")
if not "/boxsets/" in result:
data["posters"] = [result]
else:
html_ = read_url(result, unicode=True)
result = findRe(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
result = findRe(result, "src=\"(.*?)\"")
result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
result = find_re(result, "src=\"(.*?)\"")
if result:
data["posters"] = [result.replace("_w100", "")]
else:
data["posters"] = []
result = findRe(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
if result:
data["stills"] = [result]
data["trailers"] = []
else:
data["stills"] = filter(lambda x: x, [findRe(html, "\"thumbnailURL\", \"(.*?)\"")])
data["trailers"] = filter(lambda x: x, [findRe(html, "\"videoURL\", \"(.*?)\"")])
data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
if timeout == ox.cache.cache_timeout:
timeout = -1

View file

@ -3,7 +3,7 @@
import re
import urllib
import ox
from ox import strip_tags, decodeHtml
from ox import strip_tags, decode_html
from ox.utils import json
from ox.cache import read_url
@ -17,6 +17,6 @@ def find(query, timeout=ox.cache.cache_timeout):
results = []
regex = '<a .*?class="l le" href="(.+?)">(.*?)</a>.*?<div class="cra">(.*?)</div>'
for r in re.compile(regex, re.DOTALL).findall(data):
results.append((strip_tags(decodeHtml(r[1])), r[0], strip_tags(decodeHtml(r[2]))))
results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
return results

View file

@ -3,7 +3,7 @@
import re
import time
from ox import strip_tags, findRe
from ox import strip_tags, find_re
from ox.cache import read_url
import google
@ -23,8 +23,8 @@ def getShowUrl(title):
def getShowData(url):
data = read_url(url, unicode=True)
r = {}
r['title'] = strip_tags(findRe(data, '<h1>(.*?)</h1>'))
r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
r['episodes'] = {}
#1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):

View file

@ -5,7 +5,7 @@ import re
from lxml.html import document_fromstring
from ox.cache import read_url
from ox import findRe, strip_tags
from ox import find_re, strip_tags
from ox.web.imdb import ImdbCombined

View file

@ -3,7 +3,7 @@
import json
from ox.cache import read_url
from ox import findRe
from ox import find_re
class Imdb(dict):
def __init__(self, id, timeout=-1):
@ -36,7 +36,7 @@ class Imdb(dict):
if 'nytimes' in self:
self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
self['amgId'] = findRe(self['nytimes'], 'movie/(\d+)/')
self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')

View file

@ -4,7 +4,7 @@ import re
import urllib
import ox
from ox import strip_tags, decodeHtml
from ox import strip_tags, decode_html
DEFAULT_MAX_RESULTS = 10
DEFAULT_TIMEOUT = 24*60*60
@ -34,7 +34,7 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
for a in re.compile(
'<a href="(\S+?)" class=l .*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>'
).findall(data):
results.append((strip_tags(decodeHtml(a[1])), a[0], strip_tags(decodeHtml(a[2]))))
results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
if len(results) >= max_results:
break
return results

View file

@ -8,8 +8,8 @@ import time
import unicodedata
import ox
from ox import findRe, strip_tags
from ox.normalize import normalizeTitle, normalizeImdbId
from ox import find_re, strip_tags
from ox.normalize import normalize_title, normalize_imdbid
import ox.cache
from siteparser import SiteParser
@ -50,7 +50,7 @@ class Imdb(SiteParser):
'page': 'business',
're': [
'<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: findRe(ox.decodeHtml(data).replace(',', ''), '\d+')
lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
],
'type': 'int'
},
@ -141,7 +141,7 @@ class Imdb(SiteParser):
'page': 'business',
're': [
'<h5>Gross</h5>\s*?\$(.*?)<br',
lambda data: findRe(data.replace(',', ''), '\d+')
lambda data: find_re(data.replace(',', ''), '\d+')
],
'type': 'int'
},
@ -314,7 +314,7 @@ class Imdb(SiteParser):
if 'runtime' in self and self['runtime']:
if 'min' in self['runtime']: base=60
else: base=1
self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
if 'runtime' in self and not self['runtime']:
del self['runtime']
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
@ -551,7 +551,7 @@ def getMovieId(title, director='', year='', timeout=-1):
#print google_query
results = google.find(google_query, timeout=timeout)
if results:
return findRe(results[0][1], 'title/tt(\d{7})')
return find_re(results[0][1], 'title/tt(\d{7})')
#or nothing
return ''
@ -567,7 +567,7 @@ def getMoviePoster(imdbId):
if 'posterId' in info:
url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = read_url(url)
poster = findRe(data, 'img id="primary-img".*?src="(.*?)"')
poster = find_re(data, 'img id="primary-img".*?src="(.*?)"')
return poster
elif 'series' in info:
return getMoviePoster(info['series'])

View file

@ -4,7 +4,7 @@ import re
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import findRe
from ox.text import find_re
def getData(id):
@ -22,13 +22,13 @@ def getData(id):
'url': getUrl(id)
}
html = read_url(data['url'], unicode=True)
data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
if not data['imdbId']:
data['imdbId'] = _id_map.get(id, '')
data['title'] = strip_tags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
data['title'] = strip_tags(find_re(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = find_re(html, '\(<a href="alpha1.html">(.*?)</a>\)')
data['posters'] = []
poster = findRe(html, '<img src="(posters.*?)"')
poster = find_re(html, '<img src="(posters.*?)"')
if poster:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
data['posters'].append(poster)
@ -37,13 +37,13 @@ def getData(id):
result = result.replace('_xlg.html', '.html')
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = read_url(url, unicode=True)
result = findRe(html, '<a href = (\w*?_xlg.html)')
result = find_re(html, '<a href = (\w*?_xlg.html)')
if result:
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = read_url(url, unicode=True)
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
else:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)"'))
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
data['posters'].append(poster)
return data
@ -54,7 +54,7 @@ def getId(url):
split = split[4][:-5].split('_')
if split[-1] == 'xlg':
split.pop()
if findRe(split[-1], 'ver\d+$'):
if find_re(split[-1], 'ver\d+$'):
split.pop()
id = '%s/%s' % (year, '_'.join(split))
return id
@ -62,7 +62,7 @@ def getId(url):
def getIds():
ids = []
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
for page in range(pages, 0, -1):
for id in getIdsByPage(page):
if not id in ids:
@ -81,7 +81,7 @@ def getIdsByPage(page):
def getUrl(id):
url = u"http://www.impawards.com/%s.html" % id
html = read_url(url, unicode=True)
if findRe(html, "No Movie Posters on This Page"):
if find_re(html, "No Movie Posters on This Page"):
url = u"http://www.impawards.com/%s_ver1.html" % id
return url

View file

@ -4,9 +4,9 @@ import re
import urllib
from ox.cache import read_url
from ox.html import decodeHtml, strip_tags
from ox.text import findRe
from ox.text import findString
from ox.html import decode_html, strip_tags
from ox.text import find_re
from ox.text import find_string
# to sniff itunes traffic, use something like
@ -65,26 +65,26 @@ def parseXmlDict(xml):
strings = xml.split('<key>')
for string in strings:
if string.find('</key>') != -1:
key = findRe(string, '(.*?)</key>')
type = findRe(string, '</key><(.*?)>')
key = find_re(string, '(.*?)</key>')
type = find_re(string, '</key><(.*?)>')
if type == 'true/':
value = True
else:
value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
value = find_re(string, '<%s>(.*?)</%s>' % (type, type))
if type == 'integer':
value = int(value)
elif type == 'string':
value = decodeHtml(value)
value = decode_html(value)
values[key] = value
return values
def parseCast(xml, title):
list = []
try:
strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
strings.pop()
for string in strings:
list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
list.append(find_re(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
return list
except:
return list
@ -92,12 +92,12 @@ def parseCast(xml, title):
def parseMovies(xml, title):
list = []
try:
strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
strings.pop()
for string in strings:
list.append({
'id': findRe(string, 'viewMovie\?id=(.*?)&'),
'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
'id': find_re(string, 'viewMovie\?id=(.*?)&'),
'title': find_re(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
})
return list
except:
@ -114,24 +114,24 @@ class ItunesAlbum:
def getId(self):
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
xml = read_url(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewAlbum\?id=(.*?)&')
id = find_re(xml, 'viewAlbum\?id=(.*?)&')
return id
def getData(self):
data = {'id': self.id}
url = composeUrl('viewAlbum', {'id': self.id})
xml = read_url(url, None, ITUNES_HEADERS)
data['albumName'] = findRe(xml, '<B>(.*?)</B>')
data['artistName'] = findRe(xml, '<b>(.*?)</b>')
data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
data['genre'] = findRe(xml, 'Genre:(.*?)<')
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
data['review'] = strip_tags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['albumName'] = find_re(xml, '<B>(.*?)</B>')
data['artistName'] = find_re(xml, '<b>(.*?)</b>')
data['coverUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
data['genre'] = find_re(xml, 'Genre:(.*?)<')
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
data['review'] = strip_tags(find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['tracks'] = []
strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
for string in strings:
data['tracks'].append(parseXmlDict(string))
data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
return data
class ItunesMovie:
@ -145,7 +145,7 @@ class ItunesMovie:
def getId(self):
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
xml = read_url(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewMovie\?id=(.*?)&')
id = find_re(xml, 'viewMovie\?id=(.*?)&')
return id
def getData(self):
@ -156,21 +156,21 @@ class ItunesMovie:
f.write(xml)
f.close()
data['actors'] = parseCast(xml, 'actors')
string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
data['averageRating'] = string.count('rating_star_000033.png') + string.count('&#189;') * 0.5
data['directors'] = parseCast(xml, 'directors')
data['format'] = findRe(xml, 'Format:(.*?)<')
data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
data['format'] = find_re(xml, 'Format:(.*?)<')
data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
data['producers'] = parseCast(xml, 'producers')
data['rated'] = findRe(xml, 'Rated(.*?)<')
data['rated'] = find_re(xml, 'Rated(.*?)<')
data['relatedMovies'] = parseMovies(xml, 'related movies')
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
data['screenwriters'] = parseCast(xml, 'screenwriters')
data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
return data
if __name__ == '__main__':

View file

@ -1,20 +1,20 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox.cache import read_url
from ox.html import decodeHtml
from ox.text import findRe
from ox.html import decode_html
from ox.text import find_re
def getLyrics(title, artist):
html = read_url('http://lyricsfly.com/api/')
key = findRe(html, '<font color=green><b>(.*?)</b></font>')
key = find_re(html, '<font color=green><b>(.*?)</b></font>')
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
xml = read_url(url)
lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = lyrics.replace('\n', '').replace('\r', '')
lyrics = lyrics.replace('[br]', '\n').strip()
lyrics.replace('\n\n\n', '\n\n')
lyrics = decodeHtml(lyrics.replace('&amp;', '&'))
lyrics = decode_html(lyrics.replace('&amp;', '&'))
return lyrics
if __name__ == '__main__':

View file

@ -5,7 +5,7 @@ from urllib import quote
from lxml.html import document_fromstring
from ox.cache import read_url
from ox import findRe, strip_tags
from ox import find_re, strip_tags
def getUrl(id):
return 'http://www.metacritic.com/movie/%s' % id
@ -16,14 +16,14 @@ def getId(url):
def getUrlByImdb(imdb):
url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
data = read_url(url)
metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"')
metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
return metacritic_url or None
def getMetacriticShowUrl(title):
title = quote(title)
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
data = read_url(url)
return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
def getData(url):
data = read_url(url, unicode=True)

View file

@ -6,8 +6,8 @@ import socket
from urllib import quote
from ox.cache import read_url
from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, int_value, normalizeNewlines
from ox.normalize import normalizeImdbId
from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, int_value, normalize_newlines
from ox.normalize import normalize_imdbid
import ox
from torrent import Torrent
@ -20,7 +20,7 @@ def _parseResultsPage(data, max_results=10):
torrentDate = row[0]
torrentExtra = row[1]
torrentId = row[2]
torrentTitle = decodeHtml(row[3]).strip()
torrentTitle = decode_html(row[3]).strip()
torrentLink = "http://www.mininova.org/tor/" + torrentId
privateTracker = 'priv.gif' in torrentExtra
if not privateTracker:
@ -38,13 +38,13 @@ def findMovieByImdb(imdbId):
'''find torrents on mininova for a given imdb id
'''
results = []
imdbId = normalizeImdbId(imdbId)
imdbId = normalize_imdbid(imdbId)
data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
return _parseResultsPage(data)
def getId(mininovaId):
mininovaId = unicode(mininovaId)
d = findRe(mininovaId, "/(\d+)")
d = find_re(mininovaId, "/(\d+)")
if d:
return d
mininovaId = mininovaId.split('/')
@ -81,14 +81,14 @@ def getData(mininovaId):
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(strip_tags(d[1].strip()))
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent

View file

@ -4,7 +4,7 @@
import re
from ox.cache import read_url
from ox import findRe
from ox import find_re
def getData(id):
'''
@ -33,7 +33,7 @@ def getPostersByUrl(url, group=True, timeout=-1):
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
for result in results:
html = read_url(result, timeout=timeout, unicode=True)
posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
return posters
def getUrl(id):

View file

@ -4,7 +4,7 @@ import re
import feedparser
from ox.cache import read_url
from ox import findRe, strip_tags
from ox import find_re, strip_tags
from ox import langCode2To3, langTo3Code
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
@ -26,7 +26,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
if opensubtitleId:
opensubtitleId = opensubtitleId[0]
else:
opensubtitleId = findRe(data, '/en/subtitles/(.*?)/')
opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
return opensubtitleId
def downloadSubtitleById(opensubtitle_id):

View file

@ -3,7 +3,7 @@
import re
from ox.cache import getHeaders, read_url
from ox import findRe, strip_tags
from ox import find_re, strip_tags
def getUrlByImdb(imdb):
@ -22,16 +22,16 @@ def getUrlByImdb(imdb):
return None
def get_og(data, key):
return findRe(data, '<meta property="og:%s".*?content="(.*?)"' % key)
return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
def getData(url):
data = read_url(url)
r = {}
r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
if '(' in r['title']:
r['year'] = findRe(r['title'], '\((\d*?)\)')
r['year'] = find_re(r['title'], '\((\d*?)\)')
r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
r['summary'] = strip_tags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')
if not r['summary']:
r['summary'] = get_og(data, 'description')
@ -40,9 +40,9 @@ def getData(url):
meter = filter(lambda m: m[1].isdigit(), meter)
if meter:
r['tomatometer'] = meter[0][1]
r['rating'] = findRe(data, 'Average Rating: <span>([\d.]+)/10</span>')
r['user_score'] = findRe(data, '<span class="meter popcorn numeric ">(\d+)</span>')
r['user_rating'] = findRe(data, 'Average Rating: ([\d.]+)/5')
r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
poster = get_og(data, 'image')
if poster and not 'poster_default.gif' in poster:
r['posters'] = [poster]

View file

@ -3,7 +3,7 @@
import re
from ..cache import read_url
from .. import strip_tags, decodeHtml
from .. import strip_tags, decode_html
from ..utils import datetime
@ -11,8 +11,8 @@ def cleanup(key, data, data_type):
if data:
if isinstance(data[0], basestring):
#FIXME: some types need strip_tags
#data = [strip_tags(decodeHtml(p)).strip() for p in data]
data = [decodeHtml(p).strip() for p in data]
#data = [strip_tags(decode_html(p)).strip() for p in data]
data = [decode_html(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data]
while len(data) == 1 and not isinstance(data, basestring):

View file

@ -5,7 +5,7 @@ import re
import time
import ox.cache
from ox.html import decodeHtml, strip_tags
from ox.html import decode_html, strip_tags
import ox.net
@ -44,8 +44,8 @@ def getNews(year, month, day):
new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
else:
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
# fix decodeHtml
# new['description'] = formatString(decodeHtml(description))
# fix decode_html
# new['description'] = formatString(decode_html(description))
new['description'] = formatString(description)
new['imageUrl'] = imageUrl
new['section'] = formatSection(section)

View file

@ -6,8 +6,8 @@ import socket
from urllib import quote, urlencode
from urllib2 import URLError
from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, normalizeNewlines
from ox.normalize import normalizeImdbId
from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, normalize_newlines
from ox.normalize import normalize_imdbid
import ox
from torrent import Torrent
@ -38,7 +38,7 @@ def findMovies(query, max_results=10):
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
torrentLink = "http://thepiratebay.org" + row[1]
torrentTitle = decodeHtml(row[2])
torrentTitle = decode_html(row[2])
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
if torrentType in ['201']:
results.append((torrentTitle, torrentLink, ''))
@ -48,15 +48,15 @@ def findMovies(query, max_results=10):
return results
def findMovieByImdb(imdb):
return findMovies("tt" + normalizeImdbId(imdb))
return findMovies("tt" + normalize_imdbid(imdb))
def getId(piratebayId):
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
piratebayId = piratebayId.split('org/')[1]
d = findRe(piratebayId, "tor/(\d+)")
d = find_re(piratebayId, "tor/(\d+)")
if d:
piratebayId = d
d = findRe(piratebayId, "torrent/(\d+)")
d = find_re(piratebayId, "torrent/(\d+)")
if d:
piratebayId = d
return piratebayId
@ -80,21 +80,21 @@ def getData(piratebayId):
torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
data = read_url(torrent['comment_link'], unicode=True)
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'title'] = decode_html(torrent[u'title']).strip()
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
title = quote(torrent['title'].encode('utf-8'))
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(strip_tags(d[1].strip()))
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = _read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent

View file

@ -3,7 +3,7 @@
import re
import time
from ox import strip_tags, findRe
from ox import strip_tags, find_re
from ox.cache import read_url
@ -16,11 +16,11 @@ def getEpisodeData(url):
'''
data = read_url(url, unicode=True)
r = {}
r['description'] = strip_tags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['show'] = findRe(data, '<h1>(.*?)</h1>')
r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['show'] = find_re(data, '<h1>(.*?)</h1>')
r['title'] = find_re(data, '<title>.*?: (.*?) - TV.com </title>')
#episode score
r['episode score'] = findRe(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
match = re.compile('Episode Number: (\d*?) &nbsp;&nbsp; Season Num: (\d*?) &nbsp;&nbsp; First Aired: (.*?) &nbsp').findall(data)
if match:

View file

@ -5,7 +5,7 @@ from StringIO import StringIO
import xml.etree.ElementTree as ET
from ox.cache import read_url
from ox import findString, findRe
from ox import find_string, find_re
def getData(id):

View file

@ -5,7 +5,7 @@ from urllib import urlencode
from ox.utils import json
from ox.cache import read_url
from ox import findRe, decodeHtml
from ox import find_re, decode_html
def getId(url):
@ -54,7 +54,7 @@ def getMovieData(wikipediaUrl):
if not wikipediaUrl.startswith('http'):
wikipediaUrl = getUrl(wikipediaUrl)
data = getWikiData(wikipediaUrl)
filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
filmbox = {}
_box = filmbox_data.strip().split('|')
for row in _box:
@ -72,12 +72,12 @@ def getMovieData(wikipediaUrl):
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
del filmbox['amg_id']
if 'Allmovie movie' in data:
filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)')
filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
elif 'Allmovie title' in data:
filmbox['amg_id'] = findRe(data, 'Allmovie title\|.*?(\d+)')
filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
if 'Official website' in data:
filmbox['website'] = findRe(data, 'Official website\|(.*?)}').strip()
filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
if r:
@ -99,17 +99,17 @@ def getMovieData(wikipediaUrl):
if r:
filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
if 'google video' in data:
filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)[\|}]')
filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
if 'DEFAULTSORT' in data:
filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
return filmbox
def getImageUrl(name):
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
data = read_url(url, unicode=True)
url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
if not url:
url = findRe(data, 'href="(//upload.wikimedia.org/.*?)"')
url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
if url:
url = 'http:' + url
return url