replace all camelCase with under_score in ox

parent 2de989e188
commit bb35daa95c

31 changed files with 242 additions and 244 deletions

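This is a mechanical rename of ox's camelCase helper functions to snake_case; every hunk below swaps a name and leaves the call site's arguments untouched. A minimal sketch of how a rename of this shape can be scripted over a checkout (the mapping lists only the helpers that actually appear in the hunks below; the word-boundary anchors keep a short name like findRe from matching inside a longer identifier):

import re

# old name -> new name, as seen in the hunks below
RENAMES = {
    'findRe': 'find_re',
    'findString': 'find_string',
    'decodeHtml': 'decode_html',
    'normalizeTitle': 'normalize_title',
    'normalizeImdbId': 'normalize_imdbid',
    'normalizeNewlines': 'normalize_newlines',
    'removeSpecialCharacters': 'remove_special_characters',
}

_pattern = re.compile(r'\b(%s)\b' % '|'.join(map(re.escape, RENAMES)))

def rename_identifiers(source):
    # substitute each whole-word old name with its snake_case replacement
    return _pattern.sub(lambda m: RENAMES[m.group(1)], source)
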
@@ -3,7 +3,7 @@
 import re
 import time
 
-from ox import strip_tags, findRe
+from ox import strip_tags, find_re
 from ox.cache import read_url
 
 
@@ -28,22 +28,22 @@ def getData(id):
     }
     html = read_url(data["url"], unicode=True)
     data['aka'] = parseList(html, 'AKA')
-    data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
+    data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
     data['countries'] = parseList(html, 'countries')
     data['director'] = parseEntry(html, 'directed by')
     data['genres'] = parseList(html, 'genres')
     data['keywords'] = parseList(html, 'keywords')
-    data['posters'] = [findRe(html, '<img src="(http://cps-.*?)"')]
+    data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
     data['produced'] = parseList(html, 'produced by')
-    data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"')
+    data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
     data['released'] = parseEntry(html, 'released by')
     data['releasedate'] = parseList(html, 'release date')
     data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
     data['set'] = parseEntry(html, 'set in')
-    data['synopsis'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
+    data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
     data['themes'] = parseList(html, 'themes')
     data['types'] = parseList(html, 'types')
-    data['year'] = findRe(html, '<span class="year">.*?(\d+)')
+    data['year'] = find_re(html, '<span class="year">.*?(\d+)')
     #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
     data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
     #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
@@ -51,18 +51,18 @@ def getData(id):
     #html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
     #data['credits'] = parseTable(html)
     html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
-    data['review'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
+    data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
     return data
 
 def getUrl(id):
     return "http://allmovie.com/work/%s" % id
 
 def parseEntry(html, title):
-    html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
+    html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
     return strip_tags(html).strip()
 
 def parseList(html, title):
-    html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
+    html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
     r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
     if not r and html:
         r = [strip_tags(html)]
@@ -74,11 +74,11 @@ def parseTable(html):
             lambda x: strip_tags(x).strip().replace('&nbsp;', ''),
             x.split('<td width="305">-')
         ),
-        findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
+        find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
     )
 
 def parseText(html, title):
-    return strip_tags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
+    return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
 
 if __name__ == '__main__':
     print getData('129689')

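Every file in the commit follows the pattern above: findRe becomes find_re with identical arguments, so behavior is unchanged. For reference, a sketch of the semantics these call sites rely on — an assumption about ox.text, not a verbatim copy of it: the helper returns the first captured group of the first match, or an empty string, which is why callers can chain .strip() and .replace() without guarding against None:

import re

def find_re(string, regexp):
    # DOTALL so '.' also spans newlines in scraped HTML; return the first
    # captured group, stripped, or '' when nothing matches
    result = re.compile(regexp, re.DOTALL).findall(string)
    if result:
        return result[0].strip()
    return ''
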
@@ -3,7 +3,7 @@
 import re
 from urllib import quote
 
-from ox import findRe, strip_tags, decodeHtml
+from ox import find_re, strip_tags, decode_html
 from ox.cache import read_url
 
 
@@ -12,7 +12,7 @@ def findISBN(title, author):
     url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
     data = read_url(url, unicode=True)
     links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
-    id = findRe(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
+    id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
     data = getData(id)
     if author in data['authors']:
         return data
@@ -24,13 +24,13 @@ def getData(id):
 
 
     def findData(key):
-        return findRe(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
+        return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
 
     r = {}
     r['amazon'] = url
-    r['title'] = findRe(data, '<span id="btAsinTitle" style="">(.*?)<span')
+    r['title'] = find_re(data, '<span id="btAsinTitle" style="">(.*?)<span')
     r['authors'] = re.compile('<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL).findall(data)
-    r['authors'] = filter(lambda x: len(x)>1, [decodeHtml(a) for a in r['authors']])
+    r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']])
     t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
     if t:
         r['translator'] = t
@@ -38,15 +38,15 @@ def getData(id):
     r['language'] = findData('Language')
     r['isbn-10'] = findData('ISBN-10')
     r['isbn-13'] = findData('ISBN-13').replace('-', '')
-    r['dimensions'] = findRe(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
+    r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
 
     r['pages'] = findData('Paperback')
     if not r['pages']:
         r['pages'] = findData('Hardcover')
 
-    r['review'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
+    r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
 
-    r['description'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
+    r['description'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
 
     r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
     if r['cover']:

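decodeHtml becomes decode_html under the same rule; at call sites like r['authors'] above it only has to turn entities such as &amp; back into characters. A rough Python 2 stand-in, assuming entity unescaping is all these callers need:

from HTMLParser import HTMLParser

def decode_html(html):
    # unescape named and numeric entities, e.g. '&amp;' -> '&', '&#39;' -> "'"
    return HTMLParser().unescape(html)
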
@@ -5,7 +5,7 @@ import re
 import ox.cache
 from ox.cache import read_url
 from ox.html import strip_tags
-from ox.text import findRe, removeSpecialCharacters
+from ox.text import find_re, remove_special_characters
 
 import imdb
 
@@ -33,40 +33,40 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
         html = read_url(data["url"], timeout=timeout, unicode=True)
     except:
         html = ox.cache.read_url(data["url"], timeout=timeout)
-    data["number"] = findRe(html, "<li>Spine #(\d+)")
+    data["number"] = find_re(html, "<li>Spine #(\d+)")
 
-    data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
+    data["title"] = find_re(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
     data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
-    data["director"] = strip_tags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
-    results = findRe(html, '<div class="left_column">(.*?)</div>')
+    data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
+    results = find_re(html, '<div class="left_column">(.*?)</div>')
     results = re.compile("<li>(.*?)</li>").findall(results)
     data["country"] = results[0]
     data["year"] = results[1]
-    data["synopsis"] = strip_tags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
+    data["synopsis"] = strip_tags(find_re(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
 
-    result = findRe(html, "<div class=\"purchase\">(.*?)</div>")
+    result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
     if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
         r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
         if r:
             result = r[0]
-    result = findRe(result, "<a href=\"(.*?)\"")
+    result = find_re(result, "<a href=\"(.*?)\"")
     if not "/boxsets/" in result:
         data["posters"] = [result]
     else:
         html_ = read_url(result, unicode=True)
-        result = findRe(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
-        result = findRe(result, "src=\"(.*?)\"")
+        result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
+        result = find_re(result, "src=\"(.*?)\"")
         if result:
             data["posters"] = [result.replace("_w100", "")]
         else:
             data["posters"] = []
-    result = findRe(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
+    result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
     if result:
         data["stills"] = [result]
         data["trailers"] = []
     else:
-        data["stills"] = filter(lambda x: x, [findRe(html, "\"thumbnailURL\", \"(.*?)\"")])
-        data["trailers"] = filter(lambda x: x, [findRe(html, "\"videoURL\", \"(.*?)\"")])
+        data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
+        data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
 
     if timeout == ox.cache.cache_timeout:
         timeout = -1

@@ -3,7 +3,7 @@
 import re
 import urllib
 import ox
-from ox import strip_tags, decodeHtml
+from ox import strip_tags, decode_html
 from ox.utils import json
 from ox.cache import read_url
 
@@ -17,6 +17,6 @@ def find(query, timeout=ox.cache.cache_timeout):
     results = []
     regex = '<a .*?class="l le" href="(.+?)">(.*?)</a>.*?<div class="cra">(.*?)</div>'
     for r in re.compile(regex, re.DOTALL).findall(data):
-        results.append((strip_tags(decodeHtml(r[1])), r[0], strip_tags(decodeHtml(r[2]))))
+        results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
     return results

@@ -3,7 +3,7 @@
 import re
 import time
 
-from ox import strip_tags, findRe
+from ox import strip_tags, find_re
 from ox.cache import read_url
 
 import google
@@ -23,8 +23,8 @@ def getShowUrl(title):
 def getShowData(url):
     data = read_url(url, unicode=True)
     r = {}
-    r['title'] = strip_tags(findRe(data, '<h1>(.*?)</h1>'))
-    r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
+    r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
+    r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
     r['episodes'] = {}
     #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
     for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):

@@ -5,7 +5,7 @@ import re
 from lxml.html import document_fromstring
 
 from ox.cache import read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 from ox.web.imdb import ImdbCombined
 
 

@@ -3,7 +3,7 @@
 import json
 
 from ox.cache import read_url
-from ox import findRe
+from ox import find_re
 
 class Imdb(dict):
     def __init__(self, id, timeout=-1):
@@ -36,7 +36,7 @@ class Imdb(dict):
 
         if 'nytimes' in self:
             self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
-            self['amgId'] = findRe(self['nytimes'], 'movie/(\d+)/')
+            self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')
 
 
 

@@ -4,7 +4,7 @@ import re
 import urllib
 
 import ox
-from ox import strip_tags, decodeHtml
+from ox import strip_tags, decode_html
 
 DEFAULT_MAX_RESULTS = 10
 DEFAULT_TIMEOUT = 24*60*60
@@ -34,7 +34,7 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
     for a in re.compile(
         '<a href="(\S+?)" class=l .*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>'
         ).findall(data):
-        results.append((strip_tags(decodeHtml(a[1])), a[0], strip_tags(decodeHtml(a[2]))))
+        results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
         if len(results) >= max_results:
             break
     return results

@@ -8,8 +8,8 @@ import time
 import unicodedata
 
 import ox
-from ox import findRe, strip_tags
-from ox.normalize import normalizeTitle, normalizeImdbId
+from ox import find_re, strip_tags
+from ox.normalize import normalize_title, normalize_imdbid
 import ox.cache
 
 from siteparser import SiteParser
@@ -50,7 +50,7 @@ class Imdb(SiteParser):
             'page': 'business',
             're': [
                 '<h5>Budget</h5>\s*?\$(.*?)<br',
-                lambda data: findRe(ox.decodeHtml(data).replace(',', ''), '\d+')
+                lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
             ],
             'type': 'int'
         },
@@ -141,7 +141,7 @@ class Imdb(SiteParser):
             'page': 'business',
             're': [
                 '<h5>Gross</h5>\s*?\$(.*?)<br',
-                lambda data: findRe(data.replace(',', ''), '\d+')
+                lambda data: find_re(data.replace(',', ''), '\d+')
             ],
             'type': 'int'
         },
@@ -314,7 +314,7 @@ class Imdb(SiteParser):
         if 'runtime' in self and self['runtime']:
             if 'min' in self['runtime']: base=60
             else: base=1
-            self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base
+            self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
         if 'runtime' in self and not self['runtime']:
             del self['runtime']
         if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
@@ -551,7 +551,7 @@ def getMovieId(title, director='', year='', timeout=-1):
     #print google_query
     results = google.find(google_query, timeout=timeout)
     if results:
-        return findRe(results[0][1], 'title/tt(\d{7})')
+        return find_re(results[0][1], 'title/tt(\d{7})')
     #or nothing
     return ''
 
@@ -567,7 +567,7 @@ def getMoviePoster(imdbId):
     if 'posterId' in info:
         url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId)
         data = read_url(url)
-        poster = findRe(data, 'img id="primary-img".*?src="(.*?)"')
+        poster = find_re(data, 'img id="primary-img".*?src="(.*?)"')
         return poster
     elif 'series' in info:
         return getMoviePoster(info['series'])

@@ -4,7 +4,7 @@ import re
 
 from ox.cache import read_url
 from ox.html import strip_tags
-from ox.text import findRe
+from ox.text import find_re
 
 
 def getData(id):
@@ -22,13 +22,13 @@ def getData(id):
         'url': getUrl(id)
     }
     html = read_url(data['url'], unicode=True)
-    data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
+    data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
     if not data['imdbId']:
         data['imdbId'] = _id_map.get(id, '')
-    data['title'] = strip_tags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
-    data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
+    data['title'] = strip_tags(find_re(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
+    data['year'] = find_re(html, '\(<a href="alpha1.html">(.*?)</a>\)')
     data['posters'] = []
-    poster = findRe(html, '<img src="(posters.*?)"')
+    poster = find_re(html, '<img src="(posters.*?)"')
     if poster:
         poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
         data['posters'].append(poster)
@@ -37,13 +37,13 @@ def getData(id):
         result = result.replace('_xlg.html', '.html')
         url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
         html = read_url(url, unicode=True)
-        result = findRe(html, '<a href = (\w*?_xlg.html)')
+        result = find_re(html, '<a href = (\w*?_xlg.html)')
         if result:
             url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
             html = read_url(url, unicode=True)
-            poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
+            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
         else:
-            poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)"'))
+            poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
         data['posters'].append(poster)
 
     return data
@@ -54,7 +54,7 @@ def getId(url):
     split = split[4][:-5].split('_')
     if split[-1] == 'xlg':
         split.pop()
-    if findRe(split[-1], 'ver\d+$'):
+    if find_re(split[-1], 'ver\d+$'):
         split.pop()
     id = '%s/%s' % (year, '_'.join(split))
     return id
@@ -62,7 +62,7 @@ def getId(url):
 def getIds():
     ids = []
     html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
-    pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
+    pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
     for page in range(pages, 0, -1):
         for id in getIdsByPage(page):
             if not id in ids:
@@ -81,7 +81,7 @@ def getIdsByPage(page):
 def getUrl(id):
     url = u"http://www.impawards.com/%s.html" % id
     html = read_url(url, unicode=True)
-    if findRe(html, "No Movie Posters on This Page"):
+    if find_re(html, "No Movie Posters on This Page"):
         url = u"http://www.impawards.com/%s_ver1.html" % id
     return url

@@ -4,9 +4,9 @@ import re
 import urllib
 
 from ox.cache import read_url
-from ox.html import decodeHtml, strip_tags
-from ox.text import findRe
-from ox.text import findString
+from ox.html import decode_html, strip_tags
+from ox.text import find_re
+from ox.text import find_string
 
 
 # to sniff itunes traffic, use something like
@@ -65,26 +65,26 @@ def parseXmlDict(xml):
     strings = xml.split('<key>')
     for string in strings:
         if string.find('</key>') != -1:
-            key = findRe(string, '(.*?)</key>')
-            type = findRe(string, '</key><(.*?)>')
+            key = find_re(string, '(.*?)</key>')
+            type = find_re(string, '</key><(.*?)>')
             if type == 'true/':
                 value = True
             else:
-                value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
+                value = find_re(string, '<%s>(.*?)</%s>' % (type, type))
                 if type == 'integer':
                     value = int(value)
                 elif type == 'string':
-                    value = decodeHtml(value)
+                    value = decode_html(value)
             values[key] = value
     return values
 
 def parseCast(xml, title):
     list = []
     try:
-        strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
+        strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
         strings.pop()
         for string in strings:
-            list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
+            list.append(find_re(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
         return list
     except:
         return list
@@ -92,12 +92,12 @@ def parseCast(xml, title):
 def parseMovies(xml, title):
     list = []
     try:
-        strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
+        strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
         strings.pop()
        for string in strings:
            list.append({
-                'id': findRe(string, 'viewMovie\?id=(.*?)&'),
-                'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
+                'id': find_re(string, 'viewMovie\?id=(.*?)&'),
+                'title': find_re(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
             })
         return list
     except:
@@ -114,24 +114,24 @@ class ItunesAlbum:
     def getId(self):
         url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
         xml = read_url(url, headers = ITUNES_HEADERS)
-        id = findRe(xml, 'viewAlbum\?id=(.*?)&')
+        id = find_re(xml, 'viewAlbum\?id=(.*?)&')
         return id
 
     def getData(self):
         data = {'id': self.id}
         url = composeUrl('viewAlbum', {'id': self.id})
         xml = read_url(url, None, ITUNES_HEADERS)
-        data['albumName'] = findRe(xml, '<B>(.*?)</B>')
-        data['artistName'] = findRe(xml, '<b>(.*?)</b>')
-        data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
-        data['genre'] = findRe(xml, 'Genre:(.*?)<')
-        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
-        data['review'] = strip_tags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
+        data['albumName'] = find_re(xml, '<B>(.*?)</B>')
+        data['artistName'] = find_re(xml, '<b>(.*?)</b>')
+        data['coverUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
+        data['genre'] = find_re(xml, 'Genre:(.*?)<')
+        data['releaseDate'] = find_re(xml, 'Released(.*?)<')
+        data['review'] = strip_tags(find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
         data['tracks'] = []
-        strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
+        strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
         for string in strings:
             data['tracks'].append(parseXmlDict(string))
-        data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
+        data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
         return data
 
 class ItunesMovie:
@@ -145,7 +145,7 @@ class ItunesMovie:
     def getId(self):
         url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
         xml = read_url(url, headers = ITUNES_HEADERS)
-        id = findRe(xml, 'viewMovie\?id=(.*?)&')
+        id = find_re(xml, 'viewMovie\?id=(.*?)&')
         return id
 
     def getData(self):
@@ -156,21 +156,21 @@ class ItunesMovie:
             f.write(xml)
             f.close()
         data['actors'] = parseCast(xml, 'actors')
-        string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
+        string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
         data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
         data['directors'] = parseCast(xml, 'directors')
-        data['format'] = findRe(xml, 'Format:(.*?)<')
-        data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
-        data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
-        data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
+        data['format'] = find_re(xml, 'Format:(.*?)<')
+        data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
+        data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
+        data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
         data['producers'] = parseCast(xml, 'producers')
-        data['rated'] = findRe(xml, 'Rated(.*?)<')
+        data['rated'] = find_re(xml, 'Rated(.*?)<')
         data['relatedMovies'] = parseMovies(xml, 'related movies')
-        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
-        data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
+        data['releaseDate'] = find_re(xml, 'Released(.*?)<')
+        data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
         data['screenwriters'] = parseCast(xml, 'screenwriters')
-        data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
-        data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
+        data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
+        data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
         return data
 
 if __name__ == '__main__':

@@ -1,20 +1,20 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 from ox.cache import read_url
-from ox.html import decodeHtml
-from ox.text import findRe
+from ox.html import decode_html
+from ox.text import find_re
 
 
 def getLyrics(title, artist):
     html = read_url('http://lyricsfly.com/api/')
-    key = findRe(html, '<font color=green><b>(.*?)</b></font>')
+    key = find_re(html, '<font color=green><b>(.*?)</b></font>')
     url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
     xml = read_url(url)
-    lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
+    lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
     lyrics = lyrics.replace('\n', '').replace('\r', '')
     lyrics = lyrics.replace('[br]', '\n').strip()
     lyrics.replace('\n\n\n', '\n\n')
-    lyrics = decodeHtml(lyrics.replace('&', '&amp;'))
+    lyrics = decode_html(lyrics.replace('&', '&amp;'))
     return lyrics
 
 if __name__ == '__main__':

@@ -5,7 +5,7 @@ from urllib import quote
 from lxml.html import document_fromstring
 
 from ox.cache import read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 
 def getUrl(id):
     return 'http://www.metacritic.com/movie/%s' % id
@@ -16,14 +16,14 @@ def getId(url):
 def getUrlByImdb(imdb):
     url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
     data = read_url(url)
-    metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"')
+    metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
     return metacritic_url or None
 
 def getMetacriticShowUrl(title):
     title = quote(title)
     url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
     data = read_url(url)
-    return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
+    return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
 
 def getData(url):
     data = read_url(url, unicode=True)

@@ -6,8 +6,8 @@ import socket
 from urllib import quote
 
 from ox.cache import read_url
-from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, int_value, normalizeNewlines
-from ox.normalize import normalizeImdbId
+from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, int_value, normalize_newlines
+from ox.normalize import normalize_imdbid
 import ox
 
 from torrent import Torrent
@@ -20,7 +20,7 @@ def _parseResultsPage(data, max_results=10):
         torrentDate = row[0]
         torrentExtra = row[1]
         torrentId = row[2]
-        torrentTitle = decodeHtml(row[3]).strip()
+        torrentTitle = decode_html(row[3]).strip()
         torrentLink = "http://www.mininova.org/tor/" + torrentId
         privateTracker = 'priv.gif' in torrentExtra
         if not privateTracker:
@@ -38,13 +38,13 @@ def findMovieByImdb(imdbId):
     '''find torrents on mininova for a given imdb id
     '''
     results = []
-    imdbId = normalizeImdbId(imdbId)
+    imdbId = normalize_imdbid(imdbId)
     data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
     return _parseResultsPage(data)
 
 def getId(mininovaId):
     mininovaId = unicode(mininovaId)
-    d = findRe(mininovaId, "/(\d+)")
+    d = find_re(mininovaId, "/(\d+)")
     if d:
         return d
     mininovaId = mininovaId.split('/')
@@ -81,14 +81,14 @@ def getData(mininovaId):
     for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
         key = d[0].lower().strip()
         key = _key_map.get(key, key)
-        value = decodeHtml(strip_tags(d[1].strip()))
+        value = decode_html(strip_tags(d[1].strip()))
         torrent[key] = value
 
-    torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
-    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
-    torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
+    torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
+    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
+    torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
     if torrent['description']:
-        torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
+        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
     t = read_url(torrent[u'torrent_link'])
     torrent[u'torrent_info'] = getTorrentInfo(t)
     return torrent

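The normalize helpers are renamed the same way: normalizeImdbId to normalize_imdbid and normalizeNewlines to normalize_newlines. As used in findMovieByImdb above, normalize_imdbid only needs to reduce whatever it is handed to the bare 7-digit id; a sketch of that contract (an assumption, not ox.normalize itself):

import re

def normalize_imdbid(imdb_id):
    # accept 120338, 'tt0120338' or a full imdb URL and return '0120338'
    if isinstance(imdb_id, int):
        return '%07d' % imdb_id
    match = re.search(r'(\d{7})', str(imdb_id))
    return match.group(1) if match else ''
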
@@ -4,7 +4,7 @@
 import re
 
 from ox.cache import read_url
-from ox import findRe
+from ox import find_re
 
 def getData(id):
     '''
@@ -33,7 +33,7 @@ def getPostersByUrl(url, group=True, timeout=-1):
         results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
         for result in results:
             html = read_url(result, timeout=timeout, unicode=True)
-            posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
+            posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
     return posters
 
 def getUrl(id):

@@ -4,7 +4,7 @@ import re
 
 import feedparser
 from ox.cache import read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 from ox import langCode2To3, langTo3Code
 
 def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
@@ -26,7 +26,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
         if opensubtitleId:
             opensubtitleId = opensubtitleId[0]
         else:
-            opensubtitleId = findRe(data, '/en/subtitles/(.*?)/')
+            opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
     return opensubtitleId
 
 def downloadSubtitleById(opensubtitle_id):

@@ -3,7 +3,7 @@
 import re
 
 from ox.cache import getHeaders, read_url
-from ox import findRe, strip_tags
+from ox import find_re, strip_tags
 
 
 def getUrlByImdb(imdb):
@@ -22,16 +22,16 @@ def getUrlByImdb(imdb):
         return None
 
 def get_og(data, key):
-    return findRe(data, '<meta property="og:%s".*?content="(.*?)"' % key)
+    return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
 
 def getData(url):
     data = read_url(url)
     r = {}
-    r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
+    r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
     if '(' in r['title']:
-        r['year'] = findRe(r['title'], '\((\d*?)\)')
+        r['year'] = find_re(r['title'], '\((\d*?)\)')
         r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
-    r['summary'] = strip_tags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
+    r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
     r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('&nbsp;', ' ').replace('  ', ' ')
     if not r['summary']:
         r['summary'] = get_og(data, 'description')
@@ -40,9 +40,9 @@ def getData(url):
     meter = filter(lambda m: m[1].isdigit(), meter)
     if meter:
         r['tomatometer'] = meter[0][1]
-    r['rating'] = findRe(data, 'Average Rating: <span>([\d.]+)/10</span>')
-    r['user_score'] = findRe(data, '<span class="meter popcorn numeric ">(\d+)</span>')
-    r['user_rating'] = findRe(data, 'Average Rating: ([\d.]+)/5')
+    r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
+    r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
+    r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
     poster = get_og(data, 'image')
     if poster and not 'poster_default.gif' in poster:
         r['posters'] = [poster]

@@ -3,7 +3,7 @@
 import re
 
 from ..cache import read_url
-from .. import strip_tags, decodeHtml
+from .. import strip_tags, decode_html
 from ..utils import datetime
 
 
@@ -11,8 +11,8 @@ def cleanup(key, data, data_type):
     if data:
         if isinstance(data[0], basestring):
             #FIXME: some types need strip_tags
-            #data = [strip_tags(decodeHtml(p)).strip() for p in data]
-            data = [decodeHtml(p).strip() for p in data]
+            #data = [strip_tags(decode_html(p)).strip() for p in data]
+            data = [decode_html(p).strip() for p in data]
         elif isinstance(data[0], list) or isinstance(data[0], tuple):
             data = [cleanup(key, p, data_type) for p in data]
     while len(data) == 1 and not isinstance(data, basestring):

@@ -5,7 +5,7 @@ import re
 import time
 
 import ox.cache
-from ox.html import decodeHtml, strip_tags
+from ox.html import decode_html, strip_tags
 import ox.net
 
 
@@ -44,8 +44,8 @@ def getNews(year, month, day):
             new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
         else:
             new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
-        # fix decodeHtml
-        # new['description'] = formatString(decodeHtml(description))
+        # fix decode_html
+        # new['description'] = formatString(decode_html(description))
         new['description'] = formatString(description)
         new['imageUrl'] = imageUrl
         new['section'] = formatSection(section)

@@ -6,8 +6,8 @@ import socket
 from urllib import quote, urlencode
 from urllib2 import URLError
 
-from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, normalizeNewlines
-from ox.normalize import normalizeImdbId
+from ox import find_re, cache, strip_tags, decode_html, getTorrentInfo, normalize_newlines
+from ox.normalize import normalize_imdbid
 import ox
 
 from torrent import Torrent
@@ -38,7 +38,7 @@ def findMovies(query, max_results=10):
     for row in re.compile(regexp, re.DOTALL).findall(data):
         torrentType = row[0]
         torrentLink = "http://thepiratebay.org" + row[1]
-        torrentTitle = decodeHtml(row[2])
+        torrentTitle = decode_html(row[2])
         # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
         if torrentType in ['201']:
             results.append((torrentTitle, torrentLink, ''))
@@ -48,15 +48,15 @@ def findMovies(query, max_results=10):
     return results
 
 def findMovieByImdb(imdb):
-    return findMovies("tt" + normalizeImdbId(imdb))
+    return findMovies("tt" + normalize_imdbid(imdb))
 
 def getId(piratebayId):
     if piratebayId.startswith('http://torrents.thepiratebay.org/'):
         piratebayId = piratebayId.split('org/')[1]
-    d = findRe(piratebayId, "tor/(\d+)")
+    d = find_re(piratebayId, "tor/(\d+)")
     if d:
         piratebayId = d
-    d = findRe(piratebayId, "torrent/(\d+)")
+    d = find_re(piratebayId, "torrent/(\d+)")
     if d:
         piratebayId = d
     return piratebayId
@@ -80,21 +80,21 @@ def getData(piratebayId):
     torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
 
     data = read_url(torrent['comment_link'], unicode=True)
-    torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
+    torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
     if not torrent[u'title']:
         return None
-    torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
-    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
+    torrent[u'title'] = decode_html(torrent[u'title']).strip()
+    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
     title = quote(torrent['title'].encode('utf-8'))
     torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
     for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
         key = d[0].lower().strip()
         key = _key_map.get(key, key)
-        value = decodeHtml(strip_tags(d[1].strip()))
+        value = decode_html(strip_tags(d[1].strip()))
         torrent[key] = value
-    torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
+    torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
     if torrent[u'description']:
-        torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
+        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
     t = _read_url(torrent[u'torrent_link'])
     torrent[u'torrent_info'] = getTorrentInfo(t)
     return torrent

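Both torrent scrapers end by chaining three of the renamed helpers, normalize_newlines(decode_html(strip_tags(...))), to flatten a scraped description. A self-contained sketch of that pipeline with simplified stand-ins for the ox helpers (the real ones are more careful):

import re
from HTMLParser import HTMLParser

def strip_tags(html):
    # crude tag stripper, good enough to illustrate the pipeline
    return re.sub(r'<[^>]*>', '', html)

def decode_html(html):
    # unescape entities such as '&amp;' -> '&'
    return HTMLParser().unescape(html)

def normalize_newlines(text):
    # fold \r\n and bare \r into \n
    return text.replace('\r\n', '\n').replace('\r', '\n')

raw = '<b>Widescreen\r\n&amp; uncut</b>'
print normalize_newlines(decode_html(strip_tags(raw))).strip()
# prints: Widescreen
#         & uncut
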
ox/web/tv.py

@@ -3,7 +3,7 @@
 import re
 import time
 
-from ox import strip_tags, findRe
+from ox import strip_tags, find_re
 from ox.cache import read_url
 
 
@@ -16,11 +16,11 @@ def getEpisodeData(url):
     '''
     data = read_url(url, unicode=True)
     r = {}
-    r['description'] = strip_tags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
-    r['show'] = findRe(data, '<h1>(.*?)</h1>')
-    r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
+    r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
+    r['show'] = find_re(data, '<h1>(.*?)</h1>')
+    r['title'] = find_re(data, '<title>.*?: (.*?) - TV.com </title>')
     #episode score
-    r['episode score'] = findRe(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
+    r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
 
     match = re.compile('Episode Number: (\d*?) Season Num: (\d*?) First Aired: (.*?)  ').findall(data)
     if match:

@@ -5,7 +5,7 @@ from StringIO import StringIO
 import xml.etree.ElementTree as ET
 
 from ox.cache import read_url
-from ox import findString, findRe
+from ox import find_string, find_re
 
 
 def getData(id):

@@ -5,7 +5,7 @@ from urllib import urlencode
 
 from ox.utils import json
 from ox.cache import read_url
-from ox import findRe, decodeHtml
+from ox import find_re, decode_html
 
 
 def getId(url):
@@ -54,7 +54,7 @@ def getMovieData(wikipediaUrl):
     if not wikipediaUrl.startswith('http'):
         wikipediaUrl = getUrl(wikipediaUrl)
     data = getWikiData(wikipediaUrl)
-    filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
+    filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
     filmbox = {}
     _box = filmbox_data.strip().split('|')
     for row in _box:
@@ -72,12 +72,12 @@ def getMovieData(wikipediaUrl):
     if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
         del filmbox['amg_id']
     if 'Allmovie movie' in data:
-        filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)')
+        filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
     elif 'Allmovie title' in data:
-        filmbox['amg_id'] = findRe(data, 'Allmovie title\|.*?(\d+)')
+        filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
 
     if 'Official website' in data:
-        filmbox['website'] = findRe(data, 'Official website\|(.*?)}').strip()
+        filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
 
     r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
     if r:
@@ -99,17 +99,17 @@ def getMovieData(wikipediaUrl):
     if r:
         filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
     if 'google video' in data:
-        filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)[\|}]')
+        filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
     if 'DEFAULTSORT' in data:
-        filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
+        filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
     return filmbox
 
 def getImageUrl(name):
     url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
     data = read_url(url, unicode=True)
-    url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
+    url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
     if not url:
-        url = findRe(data, 'href="(//upload.wikimedia.org/.*?)"')
+        url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
     if url:
         url = 'http:' + url
     return url