ox.web under_score api rewrite
This commit is contained in:
parent
bb35daa95c
commit
a4fd3c930f
29 changed files with 268 additions and 285 deletions
|
@ -307,6 +307,8 @@ def parse_movie_path(path):
|
|||
title = title.replace('_ ', ': ')
|
||||
if title.endswith('_'):
|
||||
title = title[:-1] + '.'
|
||||
if title.startswith('_'):
|
||||
title = '.' + title[1:]
|
||||
|
||||
year = find_re(title, '(\(\d{4}\))')
|
||||
if not year:
|
||||
|
@ -344,8 +346,9 @@ def parse_movie_path(path):
|
|||
else:
|
||||
season = None
|
||||
|
||||
episode = find_re(parts[-1], '\.Episode (\d+)\.')
|
||||
episode = find_re(parts[-1], '\.Episode[s]* ([\d+]+)\.')
|
||||
if episode:
|
||||
episode = episode.split('+')[0]
|
||||
episode = int(episode)
|
||||
else:
|
||||
episode = None
|
||||
|
|
|
@ -7,7 +7,7 @@ from utils import json, ET
|
|||
|
||||
def get_embed_code(url, maxwidth=None, maxheight=None):
|
||||
embed = {}
|
||||
header = cache.getHeaders(url)
|
||||
header = cache.get_headers(url)
|
||||
if header.get('content-type', '').startswith('text/html'):
|
||||
html = cache.readUrl(url)
|
||||
json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('<link.*?>').findall(html))
|
||||
|
|
|
@ -7,68 +7,68 @@ from ox import strip_tags, find_re
|
|||
from ox.cache import read_url
|
||||
|
||||
|
||||
def getId(url):
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def getData(id):
|
||||
def get_data(id):
|
||||
'''
|
||||
>>> getData('129689')['cast'][1][1]
|
||||
>>> get_data('129689')['cast'][1][1]
|
||||
u'Marianne'
|
||||
>>> getData('129689')['credits'][0][0]
|
||||
>>> get_data('129689')['credits'][0][0]
|
||||
u'Jean-Luc Godard'
|
||||
>>> getData('129689')['posters'][0]
|
||||
>>> get_data('129689')['posters'][0]
|
||||
u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
|
||||
>>> getData('129689')['rating']
|
||||
>>> get_data('129689')['rating']
|
||||
u'4.5'
|
||||
'''
|
||||
if id.startswith('http'):
|
||||
id = getId(id)
|
||||
id = get_id(id)
|
||||
data = {
|
||||
"url": getUrl(id)
|
||||
"url": get_url(id)
|
||||
}
|
||||
html = read_url(data["url"], unicode=True)
|
||||
data['aka'] = parseList(html, 'AKA')
|
||||
data['aka'] = parse_list(html, 'AKA')
|
||||
data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
|
||||
data['countries'] = parseList(html, 'countries')
|
||||
data['director'] = parseEntry(html, 'directed by')
|
||||
data['genres'] = parseList(html, 'genres')
|
||||
data['keywords'] = parseList(html, 'keywords')
|
||||
data['countries'] = parse_list(html, 'countries')
|
||||
data['director'] = parse_entry(html, 'directed by')
|
||||
data['genres'] = parse_list(html, 'genres')
|
||||
data['keywords'] = parse_list(html, 'keywords')
|
||||
data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
|
||||
data['produced'] = parseList(html, 'produced by')
|
||||
data['produced'] = parse_list(html, 'produced by')
|
||||
data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
|
||||
data['released'] = parseEntry(html, 'released by')
|
||||
data['releasedate'] = parseList(html, 'release date')
|
||||
data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
|
||||
data['set'] = parseEntry(html, 'set in')
|
||||
data['released'] = parse_entry(html, 'released by')
|
||||
data['releasedate'] = parse_list(html, 'release date')
|
||||
data['runtime'] = parse_entry(html, 'run time').replace('min.', '').strip()
|
||||
data['set'] = parse_entry(html, 'set in')
|
||||
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||
data['themes'] = parseList(html, 'themes')
|
||||
data['types'] = parseList(html, 'types')
|
||||
data['themes'] = parse_list(html, 'themes')
|
||||
data['types'] = parse_list(html, 'types')
|
||||
data['year'] = find_re(html, '<span class="year">.*?(\d+)')
|
||||
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
|
||||
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
|
||||
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
|
||||
#data['cast'] = parseTable(html)
|
||||
#data['cast'] = parse_table(html)
|
||||
#html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
|
||||
#data['credits'] = parseTable(html)
|
||||
#data['credits'] = parse_table(html)
|
||||
html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
|
||||
data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||
return data
|
||||
|
||||
def getUrl(id):
|
||||
def get_url(id):
|
||||
return "http://allmovie.com/work/%s" % id
|
||||
|
||||
def parseEntry(html, title):
|
||||
def parse_entry(html, title):
|
||||
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
|
||||
return strip_tags(html).strip()
|
||||
|
||||
def parseList(html, title):
|
||||
def parse_list(html, title):
|
||||
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
|
||||
r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
|
||||
if not r and html:
|
||||
r = [strip_tags(html)]
|
||||
return r
|
||||
|
||||
def parseTable(html):
|
||||
def parse_table(html):
|
||||
return map(
|
||||
lambda x: map(
|
||||
lambda x: strip_tags(x).strip().replace(' ', ''),
|
||||
|
@ -77,10 +77,10 @@ def parseTable(html):
|
|||
find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
|
||||
)
|
||||
|
||||
def parseText(html, title):
|
||||
def parse_text(html, title):
|
||||
return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
|
||||
|
||||
if __name__ == '__main__':
|
||||
print getData('129689')
|
||||
# print getData('177524')
|
||||
print get_data('129689')
|
||||
# print get_data('177524')
|
||||
|
||||
|
|
|
@ -13,17 +13,17 @@ def findISBN(title, author):
|
|||
data = read_url(url, unicode=True)
|
||||
links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
|
||||
id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
|
||||
data = getData(id)
|
||||
data = get_data(id)
|
||||
if author in data['authors']:
|
||||
return data
|
||||
return {}
|
||||
|
||||
def getData(id):
|
||||
def get_data(id):
|
||||
url = "http://www.amazon.com/title/dp/%s/" % id
|
||||
data = read_url(url, unicode=True)
|
||||
|
||||
|
||||
def findData(key):
|
||||
def find_data(key):
|
||||
return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
|
||||
|
||||
r = {}
|
||||
|
@ -34,15 +34,15 @@ def getData(id):
|
|||
t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
|
||||
if t:
|
||||
r['translator'] = t
|
||||
r['publisher'] = findData('Publisher')
|
||||
r['language'] = findData('Language')
|
||||
r['isbn-10'] = findData('ISBN-10')
|
||||
r['isbn-13'] = findData('ISBN-13').replace('-', '')
|
||||
r['publisher'] = find_data('Publisher')
|
||||
r['language'] = find_data('Language')
|
||||
r['isbn-10'] = find_data('ISBN-10')
|
||||
r['isbn-13'] = find_data('ISBN-13').replace('-', '')
|
||||
r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
|
||||
|
||||
r['pages'] = findData('Paperback')
|
||||
r['pages'] = find_data('Paperback')
|
||||
if not r['pages']:
|
||||
r['pages'] = findData('Hardcover')
|
||||
r['pages'] = find_data('Hardcover')
|
||||
|
||||
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ HEADERS = {
|
|||
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
|
||||
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
|
||||
|
||||
def getMovieData(title, director):
|
||||
def get_movie_data(title, director):
|
||||
if isinstance(title, unicode):
|
||||
title = title.encode('utf-8')
|
||||
if isinstance(director, unicode):
|
||||
|
@ -60,8 +60,8 @@ def getMovieData(title, director):
|
|||
return data
|
||||
|
||||
if __name__ == '__main__':
|
||||
print getMovieData('Alphaville', 'Jean-Luc Godard')
|
||||
print getMovieData('Sin City', 'Roberto Rodriguez')
|
||||
print getMovieData('Breathless', 'Jean-Luc Godard')
|
||||
print getMovieData('Capitalism: A Love Story', 'Michael Moore')
|
||||
print getMovieData('Film Socialisme', 'Jean-Luc Godard')
|
||||
print get_movie_data('Alphaville', 'Jean-Luc Godard')
|
||||
print get_movie_data('Sin City', 'Roberto Rodriguez')
|
||||
print get_movie_data('Breathless', 'Jean-Luc Godard')
|
||||
print get_movie_data('Capitalism: A Love Story', 'Michael Moore')
|
||||
print get_movie_data('Film Socialisme', 'Jean-Luc Godard')
|
||||
|
|
|
@ -3,15 +3,15 @@
|
|||
from .. import cache
|
||||
from ..utils import json
|
||||
|
||||
def getId(url):
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def getUrl(id):
|
||||
def get_url(id):
|
||||
return "http://www.archive.org/details/%s" % id
|
||||
|
||||
def getData(id):
|
||||
def get_data(id):
|
||||
data = {}
|
||||
url = getUrl(id)
|
||||
url = get_url(id)
|
||||
details = cache.read_url('%s?output=json' % url)
|
||||
details = json.loads(details)
|
||||
for key in ('title', 'description', 'runtime'):
|
||||
|
|
|
@ -9,25 +9,25 @@ from ox.text import find_re, remove_special_characters
|
|||
|
||||
import imdb
|
||||
|
||||
def getId(url):
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def getUrl(id):
|
||||
def get_url(id):
|
||||
return "http://www.criterion.com/films/%s" % id
|
||||
|
||||
def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
||||
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
||||
'''
|
||||
>>> getData('1333')['imdbId']
|
||||
>>> get_data('1333')['imdbId']
|
||||
u'0060304'
|
||||
|
||||
>>> getData('236')['posters'][0]
|
||||
>>> get_data('236')['posters'][0]
|
||||
u'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg'
|
||||
|
||||
>>> getData('786')['posters'][0]
|
||||
>>> get_data('786')['posters'][0]
|
||||
u'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg'
|
||||
'''
|
||||
data = {
|
||||
"url": getUrl(id)
|
||||
"url": get_url(id)
|
||||
}
|
||||
try:
|
||||
html = read_url(data["url"], timeout=timeout, unicode=True)
|
||||
|
@ -71,21 +71,21 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
|||
if timeout == ox.cache.cache_timeout:
|
||||
timeout = -1
|
||||
if get_imdb:
|
||||
data['imdbId'] = imdb.getMovieId(data['title'],
|
||||
data['imdbId'] = imdb.get_movie_id(data['title'],
|
||||
data['director'], data['year'], timeout=timeout)
|
||||
return data
|
||||
|
||||
def getIds():
|
||||
def get_ids():
|
||||
ids = []
|
||||
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
|
||||
results = re.compile("\&p=(\d+)\&").findall(html)
|
||||
pages = max(map(int, results))
|
||||
for page in range(1, pages):
|
||||
for id in getIdsByPage(page):
|
||||
for id in get_idsByPage(page):
|
||||
ids.append(id)
|
||||
return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids))))
|
||||
|
||||
def getIdsByPage(page):
|
||||
def get_idsByPage(page):
|
||||
ids = []
|
||||
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
|
||||
html = read_url(url, unicode=True)
|
||||
|
@ -101,4 +101,4 @@ def getIdsByPage(page):
|
|||
return set(ids)
|
||||
|
||||
if __name__ == '__main__':
|
||||
print getIds()
|
||||
print get_ids()
|
||||
|
|
|
@ -5,7 +5,7 @@ from urllib import unquote
|
|||
from ox.cache import read_url
|
||||
|
||||
|
||||
def getVideoUrl(url):
|
||||
def get_video_url(url):
|
||||
'''
|
||||
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
|
||||
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'
|
||||
|
|
|
@ -9,7 +9,7 @@ from ox.cache import read_url
|
|||
import google
|
||||
|
||||
|
||||
def getShowUrl(title):
|
||||
def get_show_url(title):
|
||||
'''
|
||||
Search Epguide Url for Show via Show Title.
|
||||
Use Google to search the url, this is also done on Epguide.
|
||||
|
@ -20,7 +20,7 @@ def getShowUrl(title):
|
|||
return url
|
||||
return None
|
||||
|
||||
def getShowData(url):
|
||||
def get_show_data(url):
|
||||
data = read_url(url, unicode=True)
|
||||
r = {}
|
||||
r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
|
||||
|
|
|
@ -9,28 +9,28 @@ from ox import find_re, strip_tags
|
|||
from ox.web.imdb import ImdbCombined
|
||||
|
||||
|
||||
def getData(id, timeout=-1):
|
||||
def get_data(id, timeout=-1):
|
||||
'''
|
||||
>>> getData('the-matrix')['poster']
|
||||
>>> get_data('the-matrix')['poster']
|
||||
'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
|
||||
|
||||
>>> getData('0133093')['poster']
|
||||
>>> get_data('0133093')['poster']
|
||||
'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
|
||||
|
||||
>>> getData('2-or-3-things-i-know-about-her')['poster']
|
||||
>>> get_data('2-or-3-things-i-know-about-her')['poster']
|
||||
'http://content6.flixster.com/movie/10/95/43/10954392_gal.jpg'
|
||||
|
||||
>>> getData('0078875')['rottentomatoes_id']
|
||||
>>> get_data('0078875')['rottentomatoes_id']
|
||||
'http://www.rottentomatoes.com/m/the-tin-drum/'
|
||||
'''
|
||||
if len(id) == 7:
|
||||
try:
|
||||
int(id)
|
||||
id = getIdByImdb(id)
|
||||
id = get_id(imdb=id)
|
||||
except:
|
||||
pass
|
||||
data = {
|
||||
"url": getUrl(id),
|
||||
"url": get_url(id),
|
||||
}
|
||||
html = read_url(data['url'], timeout=timeout, timeout=True)
|
||||
doc = document_fromstring(html)
|
||||
|
@ -55,21 +55,20 @@ def getData(id, timeout=-1):
|
|||
return None
|
||||
return data
|
||||
|
||||
def getIdByImdb(imdbId):
|
||||
def get_id(url=None, imdb=None):
|
||||
'''
|
||||
>>> getIdByImdb('0133093')
|
||||
>>> get_id(imdb='0133093')
|
||||
u'the-matrix'
|
||||
|
||||
#>>> getIdByImdb('0060304')
|
||||
#>>> get_id(imdb='0060304')
|
||||
#u'2-or-3-things-i-know-about-her'
|
||||
'''
|
||||
i = ImdbCombined(imdbId)
|
||||
if imdb:
|
||||
i = ImdbCombined(imdb)
|
||||
title = i['title']
|
||||
return title.replace(' ', '-').lower().replace("'", '')
|
||||
|
||||
def getId(url):
|
||||
return url.split('/')[-1]
|
||||
|
||||
def getUrl(id):
|
||||
def get_url(id):
|
||||
return "http://www.flixster.com/movie/%s"%id
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ import json
|
|||
from ox.cache import read_url
|
||||
from ox import find_re
|
||||
|
||||
class Imdb(dict):
|
||||
class Freebase(dict):
|
||||
def __init__(self, id, timeout=-1):
|
||||
url = "http://ids.freebaseapps.com/get_ids?id=/authority/imdb/title/tt%s" % id
|
||||
'''
|
||||
|
|
|
@ -20,7 +20,7 @@ def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.
|
|||
headers = headers.copy()
|
||||
return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||
|
||||
def getUrl(id):
|
||||
def get_url(id):
|
||||
return "http://www.imdb.com/title/tt%s/" % id
|
||||
|
||||
class Imdb(SiteParser):
|
||||
|
@ -420,7 +420,7 @@ class ImdbCombined(Imdb):
|
|||
self.regex = _regex
|
||||
super(ImdbCombined, self).__init__(id, timeout)
|
||||
|
||||
def getMovieIdByTitle(title, timeout=-1):
|
||||
def get_movie_by_title(title, timeout=-1):
|
||||
'''
|
||||
This only works for exact title matches from the data dump
|
||||
Usually in the format
|
||||
|
@ -431,22 +431,22 @@ def getMovieIdByTitle(title, timeout=-1):
|
|||
If there is more than one film with that title for the year
|
||||
Title (Year/I)
|
||||
|
||||
>>> getMovieIdByTitle(u'"Father Knows Best" (1954) {(#5.34)}')
|
||||
>>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}')
|
||||
u'1602860'
|
||||
|
||||
>>> getMovieIdByTitle(u'The Matrix (1999)')
|
||||
>>> get_movie_by_title(u'The Matrix (1999)')
|
||||
u'0133093'
|
||||
|
||||
>>> getMovieIdByTitle(u'Little Egypt (1951)')
|
||||
>>> get_movie_by_title(u'Little Egypt (1951)')
|
||||
u'0043748'
|
||||
|
||||
>>> getMovieIdByTitle(u'Little Egypt (1897/I)')
|
||||
>>> get_movie_by_title(u'Little Egypt (1897/I)')
|
||||
u'0214882'
|
||||
|
||||
>>> getMovieIdByTitle(u'Little Egypt')
|
||||
>>> get_movie_by_title(u'Little Egypt')
|
||||
None
|
||||
|
||||
>>> getMovieIdByTitle(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
|
||||
>>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
|
||||
u'0866567'
|
||||
'''
|
||||
params = {'s':'tt','q': title}
|
||||
|
@ -465,21 +465,21 @@ def getMovieIdByTitle(title, timeout=-1):
|
|||
return results[0]
|
||||
return None
|
||||
|
||||
def getMovieId(title, director='', year='', timeout=-1):
|
||||
def get_movie_id(title, director='', year='', timeout=-1):
|
||||
'''
|
||||
>>> getMovieId('The Matrix')
|
||||
>>> get_movie_id('The Matrix')
|
||||
u'0133093'
|
||||
|
||||
>>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
|
||||
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
|
||||
u'0060304'
|
||||
|
||||
>>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
|
||||
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
|
||||
u'0060304'
|
||||
|
||||
>>> getMovieId(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
|
||||
>>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
|
||||
u'0179214'
|
||||
|
||||
>>> getMovieId(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
|
||||
>>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
|
||||
u'0179214'
|
||||
'''
|
||||
imdbId = {
|
||||
|
@ -555,12 +555,12 @@ def getMovieId(title, director='', year='', timeout=-1):
|
|||
#or nothing
|
||||
return ''
|
||||
|
||||
def getMoviePoster(imdbId):
|
||||
def get_movie_poster(imdbId):
|
||||
'''
|
||||
>>> getMoviePoster('0133093')
|
||||
>>> get_movie_poster('0133093')
|
||||
'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
|
||||
|
||||
>>> getMoviePoster('0994352')
|
||||
>>> get_movie_poster('0994352')
|
||||
'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
|
||||
'''
|
||||
info = ImdbCombined(imdbId)
|
||||
|
@ -570,10 +570,10 @@ def getMoviePoster(imdbId):
|
|||
poster = find_re(data, 'img id="primary-img".*?src="(.*?)"')
|
||||
return poster
|
||||
elif 'series' in info:
|
||||
return getMoviePoster(info['series'])
|
||||
return get_movie_poster(info['series'])
|
||||
return ''
|
||||
|
||||
def maxVotes():
|
||||
def max_votes():
|
||||
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
|
||||
data = ox.cache.read_url(url)
|
||||
votes = max([int(v.replace(',', ''))
|
||||
|
@ -581,7 +581,7 @@ def maxVotes():
|
|||
return votes
|
||||
|
||||
def guess(title, director='', timeout=-1):
|
||||
return getMovieId(title, director, timeout=timeout)
|
||||
return get_movie_id(title, director, timeout=timeout)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import json
|
||||
|
|
|
@ -7,19 +7,19 @@ from ox.html import strip_tags
|
|||
from ox.text import find_re
|
||||
|
||||
|
||||
def getData(id):
|
||||
def get_data(id):
|
||||
'''
|
||||
>>> getData('1991/silence_of_the_lambs')['imdbId']
|
||||
>>> get_data('1991/silence_of_the_lambs')['imdbId']
|
||||
u'0102926'
|
||||
|
||||
>>> getData('1991/silence_of_the_lambs')['posters'][0]
|
||||
>>> get_data('1991/silence_of_the_lambs')['posters'][0]
|
||||
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
|
||||
|
||||
>>> getData('1991/silence_of_the_lambs')['url']
|
||||
>>> get_data('1991/silence_of_the_lambs')['url']
|
||||
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
|
||||
'''
|
||||
data = {
|
||||
'url': getUrl(id)
|
||||
'url': get_url(id)
|
||||
}
|
||||
html = read_url(data['url'], unicode=True)
|
||||
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
|
||||
|
@ -48,7 +48,7 @@ def getData(id):
|
|||
|
||||
return data
|
||||
|
||||
def getId(url):
|
||||
def get_id(url):
|
||||
split = url.split('/')
|
||||
year = split[3]
|
||||
split = split[4][:-5].split('_')
|
||||
|
@ -59,26 +59,25 @@ def getId(url):
|
|||
id = '%s/%s' % (year, '_'.join(split))
|
||||
return id
|
||||
|
||||
def getIds():
|
||||
ids = []
|
||||
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
|
||||
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
|
||||
for page in range(pages, 0, -1):
|
||||
for id in getIdsByPage(page):
|
||||
if not id in ids:
|
||||
ids.append(id)
|
||||
return ids
|
||||
|
||||
def getIdsByPage(page):
|
||||
def get_ids(page=None):
|
||||
ids = []
|
||||
if page:
|
||||
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
|
||||
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
url = 'http://impawards.com/%s' % result
|
||||
ids.append(getId(url))
|
||||
ids.append(get_id(url))
|
||||
return set(ids)
|
||||
#get all
|
||||
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
|
||||
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
|
||||
for page in range(pages, 0, -1):
|
||||
for id in get_ids(page):
|
||||
if not id in ids:
|
||||
ids.append(id)
|
||||
return ids
|
||||
|
||||
def getUrl(id):
|
||||
def get_url(id):
|
||||
url = u"http://www.impawards.com/%s.html" % id
|
||||
html = read_url(url, unicode=True)
|
||||
if find_re(html, "No Movie Posters on This Page"):
|
||||
|
@ -297,5 +296,5 @@ _id_map = {
|
|||
}
|
||||
|
||||
if __name__ == '__main__':
|
||||
ids = getIds()
|
||||
ids = get_ids()
|
||||
print sorted(ids), len(ids)
|
||||
|
|
|
@ -24,7 +24,7 @@ ITUNES_HEADERS = {
|
|||
'Connection': 'close',
|
||||
}
|
||||
|
||||
def composeUrl(request, parameters):
|
||||
def compose_url(request, parameters):
|
||||
if request == 'advancedSearch':
|
||||
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
|
||||
if parameters['media'] == 'music':
|
||||
|
@ -60,7 +60,7 @@ def composeUrl(request, parameters):
|
|||
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
|
||||
return url
|
||||
|
||||
def parseXmlDict(xml):
|
||||
def parse_xml_dict(xml):
|
||||
values = {}
|
||||
strings = xml.split('<key>')
|
||||
for string in strings:
|
||||
|
@ -78,7 +78,7 @@ def parseXmlDict(xml):
|
|||
values[key] = value
|
||||
return values
|
||||
|
||||
def parseCast(xml, title):
|
||||
def parse_cast(xml, title):
|
||||
list = []
|
||||
try:
|
||||
strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
|
||||
|
@ -89,7 +89,7 @@ def parseCast(xml, title):
|
|||
except:
|
||||
return list
|
||||
|
||||
def parseMovies(xml, title):
|
||||
def parse_movies(xml, title):
|
||||
list = []
|
||||
try:
|
||||
strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
|
||||
|
@ -109,17 +109,17 @@ class ItunesAlbum:
|
|||
self.title = title
|
||||
self.artist = artist
|
||||
if not id:
|
||||
self.id = self.getId()
|
||||
self.id = self.get_id()
|
||||
|
||||
def getId(self):
|
||||
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
|
||||
def get_id(self):
|
||||
url = compose_url('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
|
||||
xml = read_url(url, headers = ITUNES_HEADERS)
|
||||
id = find_re(xml, 'viewAlbum\?id=(.*?)&')
|
||||
return id
|
||||
|
||||
def getData(self):
|
||||
def get_data(self):
|
||||
data = {'id': self.id}
|
||||
url = composeUrl('viewAlbum', {'id': self.id})
|
||||
url = compose_url('viewAlbum', {'id': self.id})
|
||||
xml = read_url(url, None, ITUNES_HEADERS)
|
||||
data['albumName'] = find_re(xml, '<B>(.*?)</B>')
|
||||
data['artistName'] = find_re(xml, '<b>(.*?)</b>')
|
||||
|
@ -130,7 +130,7 @@ class ItunesAlbum:
|
|||
data['tracks'] = []
|
||||
strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
|
||||
for string in strings:
|
||||
data['tracks'].append(parseXmlDict(string))
|
||||
data['tracks'].append(parse_xml_dict(string))
|
||||
data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
|
||||
return data
|
||||
|
||||
|
@ -140,48 +140,48 @@ class ItunesMovie:
|
|||
self.title = title
|
||||
self.director = director
|
||||
if not id:
|
||||
self.id = self.getId()
|
||||
self.id = self.get_id()
|
||||
|
||||
def getId(self):
|
||||
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
|
||||
def get_id(self):
|
||||
url = compose_url('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
|
||||
xml = read_url(url, headers = ITUNES_HEADERS)
|
||||
id = find_re(xml, 'viewMovie\?id=(.*?)&')
|
||||
return id
|
||||
|
||||
def getData(self):
|
||||
def get_data(self):
|
||||
data = {'id': self.id}
|
||||
url = composeUrl('viewMovie', {'id': self.id})
|
||||
url = compose_url('viewMovie', {'id': self.id})
|
||||
xml = read_url(url, None, ITUNES_HEADERS)
|
||||
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
|
||||
f.write(xml)
|
||||
f.close()
|
||||
data['actors'] = parseCast(xml, 'actors')
|
||||
data['actors'] = parse_cast(xml, 'actors')
|
||||
string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
|
||||
data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
|
||||
data['directors'] = parseCast(xml, 'directors')
|
||||
data['directors'] = parse_cast(xml, 'directors')
|
||||
data['format'] = find_re(xml, 'Format:(.*?)<')
|
||||
data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
|
||||
data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||
data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
|
||||
data['producers'] = parseCast(xml, 'producers')
|
||||
data['producers'] = parse_cast(xml, 'producers')
|
||||
data['rated'] = find_re(xml, 'Rated(.*?)<')
|
||||
data['relatedMovies'] = parseMovies(xml, 'related movies')
|
||||
data['relatedMovies'] = parse_movies(xml, 'related movies')
|
||||
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
|
||||
data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
|
||||
data['screenwriters'] = parseCast(xml, 'screenwriters')
|
||||
data['screenwriters'] = parse_cast(xml, 'screenwriters')
|
||||
data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
|
||||
data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
|
||||
return data
|
||||
|
||||
if __name__ == '__main__':
|
||||
from ox.utils import json
|
||||
data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
|
||||
data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').get_data()
|
||||
print json.dumps(data, sort_keys = True, indent = 4)
|
||||
data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
|
||||
data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').get_data()
|
||||
print json.dumps(data, sort_keys = True, indent = 4)
|
||||
for v in data['relatedMovies']:
|
||||
data = ItunesMovie(id = v['id']).getData()
|
||||
data = ItunesMovie(id = v['id']).get_data()
|
||||
print json.dumps(data, sort_keys = True, indent = 4)
|
||||
data = ItunesMovie(id='272960052').getData()
|
||||
data = ItunesMovie(id='272960052').get_data()
|
||||
print json.dumps(data, sort_keys = True, indent = 4)
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ from ox.html import decode_html
|
|||
from ox.text import find_re
|
||||
|
||||
|
||||
def getLyrics(title, artist):
|
||||
def get_lyrics(title, artist):
|
||||
html = read_url('http://lyricsfly.com/api/')
|
||||
key = find_re(html, '<font color=green><b>(.*?)</b></font>')
|
||||
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
|
||||
|
|
|
@ -7,25 +7,24 @@ from lxml.html import document_fromstring
|
|||
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
|
||||
def getUrl(id):
|
||||
return 'http://www.metacritic.com/movie/%s' % id
|
||||
|
||||
def getId(url):
|
||||
return url.split('/')[-1]
|
||||
|
||||
def getUrlByImdb(imdb):
|
||||
def get_url(id=None, imdb=None):
|
||||
if imdb:
|
||||
url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
|
||||
data = read_url(url)
|
||||
metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
|
||||
return metacritic_url or None
|
||||
return 'http://www.metacritic.com/movie/%s' % id
|
||||
|
||||
def getMetacriticShowUrl(title):
|
||||
def get_id(url):
|
||||
return url.split('/')[-1]
|
||||
|
||||
def get_show_url(title):
|
||||
title = quote(title)
|
||||
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
|
||||
data = read_url(url)
|
||||
return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
|
||||
|
||||
def getData(url):
|
||||
def get_data(url):
|
||||
data = read_url(url, unicode=True)
|
||||
doc = document_fromstring(data)
|
||||
score = filter(lambda s: s.attrib.get('property') == 'v:average',
|
||||
|
@ -57,7 +56,7 @@ def getData(url):
|
|||
|
||||
return {
|
||||
'critics': metacritics,
|
||||
'id': getId(url),
|
||||
'id': get_id(url),
|
||||
'score': score,
|
||||
'url': url,
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@ import ox
|
|||
from torrent import Torrent
|
||||
|
||||
|
||||
def _parseResultsPage(data, max_results=10):
|
||||
def _parse_results_page(data, max_results=10):
|
||||
results=[]
|
||||
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
|
||||
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||
|
@ -27,22 +27,17 @@ def _parseResultsPage(data, max_results=10):
|
|||
results.append((torrentTitle, torrentLink, ''))
|
||||
return results
|
||||
|
||||
def findMovie(query, max_results=10):
|
||||
def find_movie(query=None, imdb=None, max_results=10):
|
||||
'''search for torrents on mininova
|
||||
'''
|
||||
if imdb:
|
||||
url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
|
||||
else:
|
||||
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
|
||||
data = read_url(url, unicode=True)
|
||||
return _parseResultsPage(data, max_results)
|
||||
return _parse_results_page(data, max_results)
|
||||
|
||||
def findMovieByImdb(imdbId):
|
||||
'''find torrents on mininova for a given imdb id
|
||||
'''
|
||||
results = []
|
||||
imdbId = normalize_imdbid(imdbId)
|
||||
data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
|
||||
return _parseResultsPage(data)
|
||||
|
||||
def getId(mininovaId):
|
||||
def get_id(mininovaId):
|
||||
mininovaId = unicode(mininovaId)
|
||||
d = find_re(mininovaId, "/(\d+)")
|
||||
if d:
|
||||
|
@ -54,7 +49,7 @@ def getId(mininovaId):
|
|||
return mininovaId[-1]
|
||||
|
||||
def exists(mininovaId):
|
||||
mininovaId = getId(mininovaId)
|
||||
mininovaId = get_id(mininovaId)
|
||||
data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
|
||||
if not data or 'Torrent not found...' in data:
|
||||
return False
|
||||
|
@ -62,11 +57,11 @@ def exists(mininovaId):
|
|||
return False
|
||||
return True
|
||||
|
||||
def getData(mininovaId):
|
||||
def get_data(mininovaId):
|
||||
_key_map = {
|
||||
'by': u'uploader',
|
||||
}
|
||||
mininovaId = getId(mininovaId)
|
||||
mininovaId = get_id(mininovaId)
|
||||
torrent = dict()
|
||||
torrent[u'id'] = mininovaId
|
||||
torrent[u'domain'] = 'mininova.org'
|
||||
|
@ -101,7 +96,7 @@ class Mininova(Torrent):
|
|||
'72dfa59d2338e4a48c78cec9de25964cddb64104'
|
||||
'''
|
||||
def __init__(self, mininovaId):
|
||||
self.data = getData(mininovaId)
|
||||
self.data = get_data(mininovaId)
|
||||
if not self.data:
|
||||
return
|
||||
Torrent.__init__(self)
|
||||
|
|
|
@ -6,39 +6,39 @@ import re
|
|||
from ox.cache import read_url
|
||||
from ox import find_re
|
||||
|
||||
def getData(id):
|
||||
def get_data(id):
|
||||
'''
|
||||
>>> getData('0060304')['posters'][0]
|
||||
>>> get_data('0060304')['posters'][0]
|
||||
u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg'
|
||||
>>> getData('0123456')['posters']
|
||||
>>> get_data('0123456')['posters']
|
||||
[]
|
||||
'''
|
||||
data = {
|
||||
"url": getUrl(id)
|
||||
"url": get_url(id)
|
||||
}
|
||||
data["posters"] = getPostersByUrl(data["url"])
|
||||
data["posters"] = get_posters(data["url"])
|
||||
return data
|
||||
|
||||
def getId(url):
|
||||
def get_id(url):
|
||||
return url.split("/")[-2]
|
||||
|
||||
def getPostersByUrl(url, group=True, timeout=-1):
|
||||
def get_posters(url, group=True, timeout=-1):
|
||||
posters = []
|
||||
html = read_url(url, timeout=timeout, unicode=True)
|
||||
if url in html:
|
||||
if group:
|
||||
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
posters += getPostersByUrl(result, False)
|
||||
posters += get_posters(result, False)
|
||||
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
html = read_url(result, timeout=timeout, unicode=True)
|
||||
posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
|
||||
return posters
|
||||
|
||||
def getUrl(id):
|
||||
def get_url(id):
|
||||
return "http://www.movieposterdb.com/movie/%s/" % id
|
||||
|
||||
if __name__ == '__main__':
|
||||
print getData('0060304')
|
||||
print getData('0133093')
|
||||
print get_data('0060304')
|
||||
print get_data('0133093')
|
||||
|
|
|
@ -7,7 +7,7 @@ from ox.cache import read_url
|
|||
from ox import find_re, strip_tags
|
||||
from ox import langCode2To3, langTo3Code
|
||||
|
||||
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
|
||||
def find_subtitles(imdb, parts = 1, language = "eng"):
|
||||
if len(language) == 2:
|
||||
language = langCode2To3(language)
|
||||
elif len(language) != 3:
|
||||
|
@ -29,7 +29,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
|
|||
opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
|
||||
return opensubtitleId
|
||||
|
||||
def downloadSubtitleById(opensubtitle_id):
|
||||
def download_subtitle(opensubtitle_id):
|
||||
srts = {}
|
||||
data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
|
||||
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import ox.cache
|
||||
|
||||
def getPosterUrl(id):
|
||||
def get_poster_url(id):
|
||||
url = "http://0xdb.org/%s/poster.0xdb.jpg" % id
|
||||
if ox.cache.exists(url):
|
||||
return url
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
import re
|
||||
from ox.net import read_url
|
||||
|
||||
def getPosterUrl(id):
|
||||
def get_poster_url(id):
|
||||
url = 'http://piratecinema.org/posters/'
|
||||
html = read_url(url, unicode=True)
|
||||
results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
|
||||
|
@ -13,5 +13,5 @@ def getPosterUrl(id):
|
|||
return ''
|
||||
|
||||
if __name__ == '__main__':
|
||||
print getPosterUrl('0749451')
|
||||
print get_poster_url('0749451')
|
||||
|
||||
|
|
|
@ -2,17 +2,18 @@
|
|||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from ox.cache import getHeaders, read_url
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
|
||||
|
||||
def getUrlByImdb(imdb):
|
||||
def get_url(id=None, imdb=None):
|
||||
# this would also work but does not cache:
|
||||
'''
|
||||
from urllib2 import urlopen
|
||||
u = urlopen(url)
|
||||
return u.url
|
||||
'''
|
||||
if imdb:
|
||||
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
|
||||
data = read_url(url)
|
||||
if "movie_title" in data:
|
||||
|
@ -24,7 +25,7 @@ def getUrlByImdb(imdb):
|
|||
def get_og(data, key):
|
||||
return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
|
||||
|
||||
def getData(url):
|
||||
def get_data(url):
|
||||
data = read_url(url)
|
||||
r = {}
|
||||
r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
|
||||
|
|
|
@ -27,7 +27,7 @@ class SiteParser(dict):
|
|||
baseUrl = ''
|
||||
regex = {}
|
||||
|
||||
def getUrl(self, page):
|
||||
def get_url(self, page):
|
||||
return "%s%s" % (self.baseUrl, page)
|
||||
|
||||
def read_url(self, url, timeout):
|
||||
|
@ -35,7 +35,7 @@ class SiteParser(dict):
|
|||
|
||||
def __init__(self, timeout=-1):
|
||||
for key in self.regex:
|
||||
url = self.getUrl(self.regex[key]['page'])
|
||||
url = self.get_url(self.regex[key]['page'])
|
||||
data = self.read_url(url, timeout)
|
||||
if isinstance(self.regex[key]['re'], basestring):
|
||||
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
|
||||
|
|
|
@ -9,7 +9,7 @@ from ox.html import decode_html, strip_tags
|
|||
import ox.net
|
||||
|
||||
|
||||
def getNews(year, month, day):
|
||||
def get_news(year, month, day):
|
||||
sections = [
|
||||
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
|
||||
'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
|
||||
|
@ -27,7 +27,7 @@ def getNews(year, month, day):
|
|||
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
|
||||
dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
|
||||
try:
|
||||
description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
|
||||
description = format_string(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
|
||||
except:
|
||||
description = ''
|
||||
try:
|
||||
|
@ -35,7 +35,7 @@ def getNews(year, month, day):
|
|||
except:
|
||||
imageUrl = ''
|
||||
try:
|
||||
title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
|
||||
title = format_string(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
|
||||
except:
|
||||
title = ''
|
||||
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
|
||||
|
@ -45,12 +45,12 @@ def getNews(year, month, day):
|
|||
else:
|
||||
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
|
||||
# fix decode_html
|
||||
# new['description'] = formatString(decode_html(description))
|
||||
new['description'] = formatString(description)
|
||||
# new['description'] = format_string(decode_html(description))
|
||||
new['description'] = format_string(description)
|
||||
new['imageUrl'] = imageUrl
|
||||
new['section'] = formatSection(section)
|
||||
new['title'] = formatString(title)
|
||||
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
|
||||
new['section'] = format_section(section)
|
||||
new['title'] = format_string(title)
|
||||
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(format_string(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
|
||||
if new['title1'][-1:] == ':':
|
||||
new['title1'] = new['title1'][0:-1]
|
||||
new['title2'] = new['title'][len(new['title1']) + 2:]
|
||||
|
@ -67,21 +67,21 @@ def getNews(year, month, day):
|
|||
'''
|
||||
return news
|
||||
|
||||
def splitTitle(title):
|
||||
def split_title(title):
|
||||
title1 = re.compile('(.*?): ').findall(title)[0]
|
||||
title2 = re.compile(': (.*?)$').findall(title)[0]
|
||||
return [title1, title2]
|
||||
|
||||
def formatString(string):
|
||||
def format_string(string):
|
||||
string = string.replace('<span class="spOptiBreak"> </span>', '')
|
||||
string = string.replace('\n', ' ').replace(' ', ' ').strip()
|
||||
string = string.replace('&', '&').replace(''', '\'').replace('"', '"')
|
||||
return string
|
||||
|
||||
def formatSection(string):
|
||||
def format_section(string):
|
||||
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
|
||||
|
||||
def formatSubsection(string):
|
||||
def format_subsection(string):
|
||||
# SPIEGEL, SPIEGEL special
|
||||
subsection = {
|
||||
'abi': 'Abi - und dann?',
|
||||
|
@ -98,7 +98,7 @@ def formatSubsection(string):
|
|||
return subsection[string].replace(u'\xc3', 'ae')
|
||||
return string[:1].upper() + string[1:]
|
||||
|
||||
def getIssue(year, week):
|
||||
def get_issue(year, week):
|
||||
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
|
||||
if not ox.net.exists(coverUrl):
|
||||
return None
|
||||
|
@ -122,7 +122,7 @@ def getIssue(year, week):
|
|||
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
|
||||
|
||||
|
||||
def archiveIssues():
|
||||
def archive_issues():
|
||||
'''
|
||||
this is just an example of an archiving application
|
||||
'''
|
||||
|
@ -140,8 +140,8 @@ def archiveIssues():
|
|||
else:
|
||||
wMax = 53
|
||||
for w in range(wMax, 0, -1):
|
||||
print 'getIssue(%d, %d)' % (y, w)
|
||||
issue = getIssue(y, w)
|
||||
print 'get_issue(%d, %d)' % (y, w)
|
||||
issue = get_issue(y, w)
|
||||
if issue:
|
||||
dirname = '%s/%d/%02d' % (archivePath, y, w)
|
||||
if not os.path.exists(dirname):
|
||||
|
@ -188,7 +188,7 @@ def archiveIssues():
|
|||
print p['min'], p['sum'] / p['num'], p['max']
|
||||
|
||||
|
||||
def archiveNews():
|
||||
def archive_news():
|
||||
'''
|
||||
this is just an example of an archiving application
|
||||
'''
|
||||
|
@ -235,7 +235,7 @@ def archiveNews():
|
|||
f.close()
|
||||
filename = filename[:-5] + '.txt'
|
||||
if not os.path.exists(filename) or True:
|
||||
data = splitTitle(new['title'])
|
||||
data = split_title(new['title'])
|
||||
data.append(new['description'])
|
||||
data = '\n'.join(data)
|
||||
f = open(filename, 'w')
|
||||
|
@ -256,19 +256,14 @@ def archiveNews():
|
|||
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
|
||||
else:
|
||||
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
|
||||
strings = splitTitle(new['title'])
|
||||
strings = split_title(new['title'])
|
||||
if strings[0] != new['title1'] or strings[1] != new['title2']:
|
||||
colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
|
||||
for key in sortDictByKey(count):
|
||||
for key in sorted(count):
|
||||
print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
|
||||
for value in colon:
|
||||
print value
|
||||
|
||||
def sortDictByKey(d):
|
||||
keys = d.keys()
|
||||
keys.sort()
|
||||
return keys
|
||||
|
||||
if __name__ == '__main__':
|
||||
# spiegel = Spiegel(2008, 8)
|
||||
# print spiegel.getContents()
|
||||
|
@ -281,12 +276,12 @@ if __name__ == '__main__':
|
|||
news = getNews(2008, 2, d)
|
||||
for new in news:
|
||||
strings = new['url'].split('/')
|
||||
string = formatSection(strings[3])
|
||||
string = format_section(strings[3])
|
||||
if len(strings) == 6:
|
||||
string += '/' + formatSubsection(strings[4])
|
||||
string += '/' + format_subsection(strings[4])
|
||||
if not string in x:
|
||||
x.append(string)
|
||||
print x
|
||||
'''
|
||||
# archiveIssues()
|
||||
archiveNews()
|
||||
# archive_issues()
|
||||
archive_news()
|
||||
|
|
|
@ -22,7 +22,9 @@ def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_
|
|||
headers['Cookie'] = 'language=en_EN'
|
||||
return cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||
|
||||
def findMovies(query, max_results=10):
|
||||
def find_movies(query=None, imdb=None, max_results=10):
|
||||
if imdb:
|
||||
query = "tt" + normalize_imdbid(imdb)
|
||||
results = []
|
||||
next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
|
||||
page_count = 1
|
||||
|
@ -47,10 +49,7 @@ def findMovies(query, max_results=10):
|
|||
next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
|
||||
return results
|
||||
|
||||
def findMovieByImdb(imdb):
|
||||
return findMovies("tt" + normalize_imdbid(imdb))
|
||||
|
||||
def getId(piratebayId):
|
||||
def get_id(piratebayId):
|
||||
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
|
||||
piratebayId = piratebayId.split('org/')[1]
|
||||
d = find_re(piratebayId, "tor/(\d+)")
|
||||
|
@ -62,10 +61,10 @@ def getId(piratebayId):
|
|||
return piratebayId
|
||||
|
||||
def exists(piratebayId):
|
||||
piratebayId = getId(piratebayId)
|
||||
piratebayId = get_id(piratebayId)
|
||||
return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
|
||||
|
||||
def getData(piratebayId):
|
||||
def get_data(piratebayId):
|
||||
_key_map = {
|
||||
'spoken language(s)': u'language',
|
||||
'texted language(s)': u'subtitle language',
|
||||
|
@ -73,7 +72,7 @@ def getData(piratebayId):
|
|||
'leechers': 'leecher',
|
||||
'seeders': 'seeder',
|
||||
}
|
||||
piratebayId = getId(piratebayId)
|
||||
piratebayId = get_id(piratebayId)
|
||||
torrent = dict()
|
||||
torrent[u'id'] = piratebayId
|
||||
torrent[u'domain'] = 'thepiratebay.org'
|
||||
|
@ -108,7 +107,7 @@ class Thepiratebay(Torrent):
|
|||
'4e84415d36ed7b54066160c05a0b0f061898d12b'
|
||||
'''
|
||||
def __init__(self, piratebayId):
|
||||
self.data = getData(piratebayId)
|
||||
self.data = get_data(piratebayId)
|
||||
if not self.data:
|
||||
return
|
||||
Torrent.__init__(self)
|
||||
|
|
|
@ -7,12 +7,12 @@ from ox import strip_tags, find_re
|
|||
from ox.cache import read_url
|
||||
|
||||
|
||||
def getEpisodeData(url):
|
||||
def get_episode_data(url):
|
||||
'''
|
||||
parses information on tv.com episode pages
|
||||
returns dict with title, show, description, score
|
||||
example:
|
||||
getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
|
||||
get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
|
||||
'''
|
||||
data = read_url(url, unicode=True)
|
||||
r = {}
|
||||
|
|
|
@ -8,7 +8,7 @@ from ox.cache import read_url
|
|||
from ox import find_string, find_re
|
||||
|
||||
|
||||
def getData(id):
|
||||
def get_data(id):
|
||||
url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id
|
||||
xml = read_url(url)
|
||||
tree = ET.parse(StringIO(xml))
|
||||
|
|
|
@ -8,52 +8,45 @@ from ox.cache import read_url
|
|||
from ox import find_re, decode_html
|
||||
|
||||
|
||||
def getId(url):
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def get_url(id=None, imdb=None, allmovie=None):
    '''
    Return the English Wikipedia url for a movie.

    Exactly one lookup is performed, checked in this order:

    imdb     - search Wikipedia for the quoted imdb id and return the url of
               the first result, but only if the infobox data parsed from
               that page contains an 'imdb_id' key; otherwise return ''
    allmovie - search Wikipedia for '"amg_id = 1:<allmovie>"' and return the
               url of the first result, or '' if there is none
    id       - no search at all: the page name is appended to
               http://en.wikipedia.org/wiki/
    '''
    if imdb:
        # The parameter is called ``imdb``; the previous body still used the
        # name ``imdbId`` from the removed getUrlByImdbId() and raised
        # NameError on every imdb lookup.
        query = '"%s"' % imdb
        result = find(query)
        if result:
            # each search result is indexed as result[n][1] == url
            url = result[0][1]
            data = get_movie_data(url)
            # only trust the hit if the page really carries an imdb id
            if 'imdb_id' in data:
                return url
        return ""
    if allmovie:
        query = '"amg_id = 1:%s"' % allmovie
        result = find(query)
        if result:
            url = result[0][1]
            return url
        return ''
    return "http://en.wikipedia.org/wiki/%s" % id
|
||||
|
||||
|
||||
def getMovieId(title, director='', year=''):
|
||||
def get_movie_id(title, director='', year=''):
|
||||
query = '"%s" film %s %s' % (title, director, year)
|
||||
result = find(query, 1)
|
||||
if result:
|
||||
return result[0][1]
|
||||
return ''
|
||||
|
||||
def getUrlByImdbId(imdbId):
|
||||
query = '"%s"'% imdbId
|
||||
result = find(query)
|
||||
if result:
|
||||
url = result[0][1]
|
||||
data = getMovieData(url)
|
||||
if 'imdb_id' in data:
|
||||
return url
|
||||
return ""
|
||||
|
||||
def getUrlByImdb(imdbId):
|
||||
# deprecated, use getUrlByImdbId()
|
||||
return getUrlByImdbId(imdbId)
|
||||
|
||||
def getUrlByAllmovieId(allmovieId):
|
||||
query = '"amg_id = 1:%s"'% allmovieId
|
||||
result = find(query)
|
||||
if result:
|
||||
url = result[0][1]
|
||||
return url
|
||||
return ''
|
||||
|
||||
def getWikiData(wikipediaUrl):
|
||||
url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
|
||||
def get_wiki_data(wikipedia_url):
|
||||
url = wikipedia_url.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
|
||||
url = "%s&action=raw" % url
|
||||
data = read_url(url).decode('utf-8')
|
||||
return data
|
||||
|
||||
def getMovieData(wikipediaUrl):
|
||||
if not wikipediaUrl.startswith('http'):
|
||||
wikipediaUrl = getUrl(wikipediaUrl)
|
||||
data = getWikiData(wikipediaUrl)
|
||||
def get_movie_data(wikipedia_url):
|
||||
if not wikipedia_url.startswith('http'):
|
||||
wikipedia_url = get_url(wikipedia_url)
|
||||
data = get_wiki_data(wikipedia_url)
|
||||
filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
|
||||
filmbox = {}
|
||||
_box = filmbox_data.strip().split('|')
|
||||
|
@ -104,7 +97,7 @@ def getMovieData(wikipediaUrl):
|
|||
filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
|
||||
return filmbox
|
||||
|
||||
def getImageUrl(name):
|
||||
def get_image_url(name):
|
||||
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
|
||||
data = read_url(url, unicode=True)
|
||||
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
|
||||
|
@ -114,19 +107,19 @@ def getImageUrl(name):
|
|||
url = 'http:' + url
|
||||
return url
|
||||
|
||||
def getPosterUrl(wikipediaUrl):
|
||||
if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl)
|
||||
data = getMovieData(wikipediaUrl)
|
||||
def get_poster_url(wikipedia_url):
|
||||
if not wikipedia_url.startswith('http'): wikipedia_url = get_url(wikipedia_url)
|
||||
data = get_movie_data(wikipedia_url)
|
||||
if 'image' in data:
|
||||
return getImageUrl(data['image'])
|
||||
return get_image_url(data['image'])
|
||||
return ''
|
||||
|
||||
def getMoviePoster(wikipediaUrl):
|
||||
# deprecated, use getPosterUrl()
|
||||
return getPosterUrl(wikipediaUrl)
|
||||
def get_movie_poster(wikipedia_url):
|
||||
# deprecated, use get_poster_url()
|
||||
return get_poster_url(wikipedia_url)
|
||||
|
||||
def getAllmovieId(wikipediaUrl):
|
||||
data = getMovieData(wikipediaUrl)
|
||||
def get_allmovie_id(wikipedia_url):
|
||||
data = get_movie_data(wikipedia_url)
|
||||
return data.get('amg_id', '')
|
||||
|
||||
def find(query, max_results=10):
|
||||
|
|
|
@ -8,7 +8,7 @@ import feedparser
|
|||
from ox.cache import read_url, cache_timeout
|
||||
|
||||
|
||||
def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout):
|
||||
def video_url(youtubeId, format='mp4', timeout=cache_timeout):
|
||||
"""
|
||||
youtubeId - id of video
|
||||
format - video format, options: webm, 1080p, 720p, mp4, high
|
||||
|
|
Loading…
Reference in a new issue