back to oxlib, package_dir does not work with pip/python setup.py develop

This commit is contained in:
j 2009-10-12 17:18:59 +02:00
commit 1bd6615f16
27 changed files with 85 additions and 86 deletions

9
oxweb/__init__.py Normal file
View file

@ -0,0 +1,9 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
__version__ = '1.0.0'
import imdb
import wikipedia
import google
import piratecinema
import oxdb

78
oxweb/allmovie.py Normal file
View file

@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from oxlib import stripTags, findRe
from oxlib.cache import readUrlUnicode
def getId(url):
return url.split("/")[-2]
def getData(id):
'''
>>> getData('129689')['cast'][1][1]
u'Marianne'
>>> getData('129689')['credits'][0][0]
u'Jean-Luc Godard'
>>> getData('129689')['posters'][0]
u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
>>> getData('129689')['rating']
u'4.5'
'''
data = {
"url": getUrl(id)
}
html = readUrlUnicode(data["url"])
data['aka'] = parseList(html, 'AKA')
data['category'] = findRe(html, 'http://allmovie.com/explore/category/.*?">(.*?)</a>')
data['countries'] = parseList(html, 'Countries')
data['director'] = parseEntry(html, 'Director')
data['genres'] = parseList(html, 'Genres')
data['keywords'] = parseList(html, 'Keywords')
data['posters'] = [findRe(html, '<img src="(http://image\..*?)"')]
data['produced'] = parseList(html, 'Produced by')
data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"')
data['released'] = parseEntry(html, 'Released by')
data['releasedate'] = parseEntry(html, 'Release')[0:10].replace(' ', '-')
data['runtime'] = findRe(html, '<td class="formed-sub" style="width: 86px;">(\d+) min.</td>')
data['set'] = parseEntry(html, 'Set In')
data['synopsis'] = parseText(html, 'Plot Synopsis')
data['themes'] = parseList(html, 'Themes')
data['types'] = parseList(html, 'Types')
data['year'] = findRe(html, '"http://allmovie.com/explore/year/(.*?)"')
html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id)
data['cast'] = parseTable(html)
html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id)
data['credits'] = parseTable(html)
html = readUrlUnicode("http://allmovie.com/work/%s/review" % id)
data['review'] = parseText(html, 'Review')
return data
def getUrl(id):
return "http://allmovie.com/work/%s/" % id
def parseEntry(html, title):
return stripTags(findRe(html, '<span>%s</span>(.*?)</table>' % title)).strip()
def parseList(html, title):
html = findRe(html, '<span>%s</span>(.*?)</table>' % title)
return map(lambda x: stripTags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
def parseTable(html):
return map(
lambda x: map(
lambda x: stripTags(x).strip().replace('&nbsp;', ''),
x.split('<td width="305">-')
),
findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
)
def parseText(html, title):
return stripTags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
if __name__ == '__main__':
print getData('129689')
# print getData('177524')

20
oxweb/auth.py Normal file
View file

@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2009
import os
import simplejson
def get(key):
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
auth = {}
if os.path.exists(user_auth):
f = open(user_auth, "r")
data = f.read()
f.close()
auth = simplejson.loads(data)
if key in auth:
return auth[key]
print "please add key %s to json file '%s'" % (key, user_auth)
return ""

90
oxweb/criterion.py Normal file
View file

@ -0,0 +1,90 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import oxlib.cache
from oxlib.cache import readUrlUnicode
from oxlib.html import stripTags
from oxlib.text import findRe, removeSpecialCharacters
import imdb
def getId(url):
return url.split("/")[-1]
def getUrl(id):
return "http://www.criterion.com/films/%s" % id
def getData(id):
'''
>>> getData('1333')['imdbId']
'0060304'
>>> getData('236')['posters'][0]
'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg'
>>> getData('786')['posters'][0]
'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg'
'''
data = {
"url": getUrl(id)
}
try:
html = readUrlUnicode(data["url"])
except:
html = oxlib.cache.getUrl(data["url"])
data["number"] = findRe(html, "<p class=\"spinenumber\">(.*?)</p>")
data["title"] = findRe(html, "<h2 class=\"movietitle\">(.*?)</h2>")
data["director"] = findRe(html, "<h2 class=\"director\">(.*?)</h2>")
results = re.compile("<p><strong>(.*?)</strong></p>").findall(html)
data["country"] = results[0]
data["year"] = results[1]
result = findRe(html, "<div class=\"synopsis contentbox lightgray\">(.*?)</div>")
data["synopsis"] = findRe(result, "<p>(.*?)</p>")
result = findRe(html, "<div class=\"editioninfo\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
result = re.compile("<div class=\"editioninfo\">(.*?)</div>", re.DOTALL).findall(html)[1]
result = findRe(result, "<a href=\"(.*?)\">")
if not "/boxsets/" in result:
data["posters"] = [result]
else:
html_ = readUrlUnicode(result)
result = findRe(html_, "<a href=\"http://www.criterion.com/films/%s\">(.*?)</a>" % id)
result = findRe(result, "src=\"(.*?)\"")
data["posters"] = [result.replace("_w100", "")]
result = findRe(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
if result:
data["stills"] = [result]
data["trailers"] = []
else:
data["stills"] = [findRe(html, "\"thumbnailURL\", \"(.*?)\"")]
data["trailers"] = [findRe(html, "\"videoURL\", \"(.*?)\"")]
data['imdbId'] = imdb.getMovieId(data['title'], data['director'], data['year'])
return data
def getIds():
ids = []
html = readUrlUnicode("http://www.criterion.com/library/dvd")
results = re.compile("page=(.*?)\"").findall(html)
pages = int(results[len(results) - 2])
for page in range(pages, 0, -1):
for id in getIdsByPage(page):
ids.append(id)
return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids))))
def getIdsByPage(page):
ids = []
html = readUrlUnicode("http://www.criterion.com/library/dvd?page=%s" % page)
results = re.compile("films/(.*?)\"").findall(html)
for result in results:
ids.append(result)
results = re.compile("boxsets/(.*?)\"").findall(html)
for result in results:
html = readUrlUnicode("http://www.criterion.com/boxsets/" + result)
results = re.compile("films/(.*?)\"").findall(html)
for result in results:
ids.append(result)
return set(ids)
if __name__ == '__main__':
print getIds()

22
oxweb/dailymotion.py Normal file
View file

@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import unquote
from oxlib.cache import readUrl
def getVideoUrl(url):
'''
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?key')[0]
'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv'
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?key')[0]
'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv'
'''
data = readUrl(url)
video = re.compile('''video", "(.*?)"''').findall(data)
for v in video:
v = unquote(v).split('@@')[0]
return "http://www.dailymotion.com" + v
return ''

49
oxweb/epguides.py Normal file
View file

@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from oxlib import stripTags, findRe
from oxlib.cache import readUrlUnicode
import google
def getShowUrl(title):
'''
Search Epguide Url for Show via Show Title.
Use Google to search the url, this is also done on Epguide.
'''
for (name, url, desc) in google.find('allintitle: site:epguides.com %s' % title, 1):
if url.startswith('http://epguides.com'):
if re.search(title, name):
return url
return None
def getShowData(url):
data = readUrlUnicode(url)
r = {}
r['title'] = stripTags(findRe(data, '<h1>(.*?)</h1>'))
r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
r['episodes'] = {}
#1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
air_date = episode[3].strip()
#'22 Sep 04' -> 2004-09-22
try:
air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
except:
pass
s = episode[1].split('-')[0].strip()
e = episode[1].split('-')[-1].strip()
try:
r['episodes']['S%02dE%02d' % (int(s), int(e))] = {
'prod code': episode[2],
'air date': air_date,
'url': episode[4],
'title':episode[5],
}
except:
print "oxweb.epguides failed,", url
return r

57
oxweb/google.py Normal file
View file

@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
import urllib
import urllib2
import weakref
import threading
import Queue
import simplejson
import oxlib
from oxlib import stripTags
'''
usage:
import google
google.find(query)
for result in google.find(query): result
result is title, url, description
google.find(query, max_results)
FIXME: how search depper than first page?
'''
DEFAULT_MAX_RESULTS = 10
DEFAULT_TIMEOUT = 24*60*60
def readUrl(url, data=None, headers=oxlib.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
return oxlib.cache.readUrl(url, data, headers, timeout)
def quote_plus(s):
return urllib.quote_plus(s.encode('utf-8'))
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
url = "http://www.google.com/search?q=%s" % quote_plus(query)
data = readUrl(url, timeout=timeout)
link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
r'.*?(?:<br>|<table.*?>)' + \
r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
results = []
for match in re.compile(link_re, re.DOTALL).finditer(data):
(name, url, desc) = match.group('name', 'url', 'desc')
results.append((stripTags(name), url, stripTags(desc)))
if len(results) > max_results:
results = results[:max_results]
return results
def _find(query):
url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s' % quote_plus(query)
results = simplejson.loads(oxlib.cache.readUrlUnicode(url))['responseData']['results']
return results

756
oxweb/imdb.py Normal file
View file

@ -0,0 +1,756 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib2
from urllib import quote, unquote
import re
import os
import time
from BeautifulSoup import BeautifulSoup
import chardet
import oxlib
from oxlib import stripTags, decodeHtml, findRe, findString
import oxlib.cache
from oxlib.normalize import normalizeTitle, normalizeImdbId
from oxlib import *
import google
'''
never timeout imdb data, to update cache remove data from cache folder
'''
def readUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1):
return oxlib.cache.readUrlUnicode(url, data, headers, timeout)
'''
check if result is valid while updating
def validate(result, header):
return header['status'] == u'200'
try:
d = oxlib.cache.readUrlUnicode(url, data, headers, timeout=0, valid=validate)
except oxlib.cache.InvalidResult, e:
print e.headers
'''
def getMovieId(title, director='', year=''):
'''
>>> getMovieId('The Matrix')
'0133093'
'''
if year:
title = "%s (%s)" % (title, year)
if director:
query = 'site:imdb.com %s "%s"' % (director, title)
else:
query = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(query, 3, timeout=-1):
if url.startswith('http://www.imdb.com/title/tt'):
return url[28:35]
return ''
def getMovieData(imdbId):
return IMDb(imdbId).parse()
# internal functions below
def getUrlBase(imdbId):
return "http://www.imdb.com/title/tt%s/" % imdbId
def getRawMovieData(imdbId):
imdbId = normalizeImdbId(imdbId)
data = getMovieInfo(imdbId)
data['credits'] = getMovieCredits(imdbId)
data['poster'] = getMoviePoster(imdbId)
data['company credits'] = getMovieCompanyCredits(imdbId)
data['filming locations'] = getMovieLocations(imdbId)
data['movie connections'] = getMovieConnections(imdbId)
data['external reviews'] = getMovieExternalReviews(imdbId)
data['trivia'] = getMovieTrivia(imdbId)
data['keywords'] = getMovieKeywords(imdbId)
data['media'] = {}
data['media']['images'] = getMovieImages(imdbId)
data['media']['trailers'] = getMovieTrailers(imdbId)
data['plotsummary'] = getMoviePlot(imdbId)
data['release dates'] = getMovieReleaseDates(imdbId)
data['release date'] = getMovieReleaseDate(imdbId)
return data
def getMovieInfo(imdbId):
data = readUrlUnicode(getUrlBase(imdbId))
info = dict()
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
if info['poster'] and '_V' in info['poster']:
info['poster']= "%s.jpg" % info['poster'].split('._V')[0]
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
title = stripTags(i[0]).strip().lower()
txt= stripTags(i[1]).strip()
def cleanUp(k):
k = decodeHtml(k).replace(u'\xa0', ' ').strip()
if k.endswith('more'): k=k[:-len('more')].strip()
return k
txt = cleanUp(txt)
if title not in ('plot', 'trivia', 'filming locations', 'mpaa', 'tagline', 'original air date'):
if '|' in txt:
txt = [cleanUp(k) for k in txt.split('|')]
elif ', ' in txt:
txt = [cleanUp(k) for k in txt.split(', ')]
elif title in ('country', 'language', 'genre'):
txt = [cleanUp(txt), ]
if title == 'tv series':
info['series_imdb'] = findRe(i[1], 'tt(\d{7})')
if title == 'original air date':
info['series_episode_info'] = txt.split('\n')[-1].strip()
txt = txt.split('\n')[0].strip()
if not title.startswith('moviemeter'):
info[title] = txt
for key in ('user comments', 'writers (wga)', 'plot keywords'):
if key in info:
del info[key]
if 'release date' in info:
if isinstance(info['release date'], list):
info['release date'] = info['release date'][0]
info['release date'] = info['release date'].split('\n')[0]
if 'plot' in info:
info['plot'] = info['plot'].split('| add synopsis')[0].strip()
info['plot'] = info['plot'].split('| full synopsis')[0].strip()
if info['plot'] in ('add synopsis', 'full synopsis'):
info['plot'] = ''
#get Title
title = ''
year = ''
html_title = findRe(data, '<div id="tn15title">(.*?)</div>')
if not html_title:
html_title = findRe(data, '<title>(.*?)</title>')
else:
html_title = html_title.split('<span class="pro-link">')[0]
if html_title:
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = stripTags(html_title)
title = decodeHtml(title)
year = findRe(title, '\((\d{4})\)')
if not year:
year = findRe(title, '\((\d{4})')
_y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
if _y:
title = title.replace(_y, '')
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
if title.find(u'\xa0') > -1:
title = title[:title.find(u'\xa0')].strip()
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
info['title'] = normalizeTitle(title)
info['year'] = year
#Series
if title.startswith('"') and title.find('"',1) > 0 and \
title.find('"',1) == title.rfind('"'):
episode_title = title[title.rfind('"')+1:]
episode_title = re.sub("\?{4}", "", episode_title).strip()
episode_title = re.sub("\d{4}", "", episode_title).strip()
if episode_title == '-': episode_title=''
title = normalizeTitle(title[1:title.rfind('"')])
if episode_title:
info['episode title'] = episode_title
info['series title'] = title
info['title'] = "%s: %s" % (title, episode_title)
else:
info['title'] = title
se = re.compile("Season (\d*), Episode (\d*)\)").findall(info.get('series_episode_info', ''))
if se:
info['season'] = int(se[0][0])
info['episode'] = int(se[0][1])
info['title'] = "%s (S%02dE%02d) %s" % (
info['series title'], info['season'], info['episode'], info['episode title'])
info['title'] = info['title'].strip()
del info['series_episode_info']
#Rating
rating = findRe(data, '<b>([\d\.]*?)/10</b>')
if rating:
info['rating'] = float(rating)
else:
info['rating'] = -1
#Votes
info['votes'] = -1
if "user rating" in info:
votes = findRe(info['user rating'], '([\d,]*?) votes')
if votes:
info['votes'] = int(votes.replace(',', ''))
return info
def getMovieRuntimeSeconds(imdbId):
info = getMovieInfo(imdbId)
if 'runtime' in info:
value = info['runtime'][0]
parsed_value = findRe(value, '(.*?) min')
parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = findRe(value, '(.*?) sec')
parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = 0
else:
parsed_value = int(parsed_value)
else:
parsed_value = int(parsed_value) * 60
else:
parsed_value = -1
return parsed_value
def getMoviePoster(imdbId):
info = getMovieInfo(imdbId)
return info['poster']
def getMovieYear(imdbId):
'''
>>> getMovieYear('0315404')
u'1964'
>>> getMovieYear('0734840')
u'1990'
>>> getMovieYear('0815352')
u'1964'
'''
info = getMovieInfo(imdbId)
return info['year']
def getMovieTitle(imdbId):
'''
>>> getMovieTitle('0306414')
u'The Wire'
>>> getMovieTitle('0734840')
u'Twin Peaks (S01E02) Episode #1.2'
>>> getMovieTitle('0734840')
u'Twin Peaks (S01E02) Episode #1.2'
>>> getMovieTitle('0749451')
u'The Wire (S01E01) The Target'
'''
info = getMovieInfo(imdbId)
return info['title']
def getMovieAKATitles(imdbId):
'''
>>> getMovieAKATitle('0040980')
[(u'Frauen der Nacht', u'Germany'),
(u'Les femmes de la nuit', u'France'),
(u'Women of the Night', u'(undefined)')]
'''
url = "%sreleaseinfo" % getUrlBase(imdbId)
data = readUrlUnicode(url)
titles = findRe(data, 'name="akas".*?<table.*?>(.*?)</table>')
titles = re.compile("td>(.*?)</td>\n\n<td>(.*)</td>").findall(titles)
return titles
def creditList(data, section=None):
if section == 'cast':
credits_ = re.compile('''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>''').findall(data)
else:
credits_ = re.compile('''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>''').findall(data)
credits = []
for c_ in credits_:
c = [decodeHtml(c_[0]).strip(), decodeHtml(c_[1]).strip()]
if section=='writers':
c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(','')
if c[1].endswith(' and'): c[1] = c[1][:-4]
credits.append(c)
return credits
def getMovieCredits(imdbId):
credits = dict()
url = "%sfullcredits" % getUrlBase(imdbId)
data = readUrlUnicode(url)
groups = data.split('<h5>')
for g in groups:
section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
if section:
credits[section[0]] = creditList(g, section[0])
return credits
def getMovieTrailers(imdbId):
url = "%strailers" % getUrlBase(imdbId)
data = readUrlUnicode(url)
soup = BeautifulSoup(data)
videos = soup('div', {'class':"video-gallery"})
trailers = []
if videos:
for a in videos[0]('a'):
title = stripTags(unicode(a)).strip()
url = 'http://www.imdb.com' + a['href']
videoId = findRe(url, '/(vi\d*?)/')
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
iframe = readUrlUnicode(iframeUrl)
videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
return trailers
def getMovieQuotes(imdbId):
url = "%squotes" % getUrlBase(imdbId)
data = readUrlUnicode(url)
quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(findString(data, '<a name="q'))
quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
return quotes
def getMoviePlot(imdbId):
url = "%splotsummary" % getUrlBase(imdbId)
data = readUrlUnicode(url)
plot = findRe(data, '<p class="plotpar">(.*?)<i>').split('</p>')[0]
return plot.strip()
def getMovieTechnical(imdbId):
url = "%stechnical" % getUrlBase(imdbId)
data = readUrlUnicode(url)
results = {}
for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
results[t[0].strip()] = t[1].strip()
return results
def getMovieCompanyCredits(imdbId):
url = "%scompanycredits" % getUrlBase(imdbId)
data = readUrlUnicode(url)
results = {}
for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
results[field.strip()] = []
for company in re.compile('<li>(.*?)</li>').findall(c):
results[field.strip()].append(company)
return results
def getMovieLocations(imdbId):
url = "%slocations" % getUrlBase(imdbId)
data = readUrlUnicode(url)
locations = re.compile('<dt><a href="/List.*?>(.*?)</a></dt>').findall(data)
return locations
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
photos = {}
for key in keys:
url = "%smediaindex?refine=%s" % (getUrlBase(imdbId), key)
data = readUrlUnicode(url)
photos[key] = {}
for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
img= "%s.jpg" % s[1].split('._V')[0]
title = s[0]
if key=='still_frame':
if not "_CR0" in s[1]:
photos[key][img] = title
else:
photos[key][img] = title
return photos
def getMovieStills(imdbId):
return getMovieImages(imdbId, ['still_frame'])['still_frame']
def getMoviePosters(imdbId):
posters = getMovieImages(imdbId, ['poster'])['poster']
poster = getMoviePoster(imdbId)
if poster:
posters[poster] = 'main poster'
return posters
def getMovieTrivia(imdbId):
url = "%strivia" % getUrlBase(imdbId)
data = readUrlUnicode(url)
trivia = re.compile('<li>(.*?)</li>').findall(data)
def clean(t):
t = decodeHtml(t)
t = t.replace(u'”', '"')
if t.endswith('<br><br>'):
t = t[:-8]
return t.strip()
trivia = [clean(t) for t in trivia]
return trivia
def getMovieConnections(imdbId):
url = "%smovieconnections" % getUrlBase(imdbId)
data = readUrlUnicode(url)
connections={}
for c in re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL).findall(data):
connections[unicode(c[0])] = re.compile('''<a href="/title/tt(\d{7})/">''').findall(c[1])
return connections
def getMovieKeywords(imdbId):
url = "%skeywords" % getUrlBase(imdbId)
data = readUrlUnicode(url)
keywords = []
for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data):
keyword = decodeHtml(keyword)
keyword = keyword.replace(u'\xa0', ' ')
keywords.append(keyword)
return keywords
def getMovieExternalReviews(imdbId):
url = "%sexternalreviews" % getUrlBase(imdbId)
data = readUrlUnicode(url)
_reviews = re.compile('<li><a href="(.*?)">(.*?)</a></li>').findall(data)
reviews = {}
for r in _reviews:
reviews[r[0]] = r[1]
return reviews
def getMovieReleaseDate(imdbId):
releasedates = getMovieReleaseDates(imdbId)
first_release = None
for r in releasedates:
if not first_release or r[1] < first_release:
first_release = r[1]
return first_release
def _parseDate(d):
'''
>>>_parseDate('3 March 1972')
'1972-03-03'
'''
try:
parsed_date = time.strptime(d, "%d %B %Y")
parsed_date = '%s-%02d-%02d' % (parsed_date.tm_year, parsed_date.tm_mon, parsed_date.tm_mday)
return parsed_date
except:
try:
parsed_date = time.strptime(d, "%B %Y")
parsed_date = '%s-%02d-01' % (parsed_date.tm_year, parsed_date.tm_mon)
return parsed_date
except:
pass
try:
parsed_date = time.strptime(d, "%Y")
parsed_date = '%s-01-01' % (parsed_date.tm_year)
return parsed_date
except:
pass
return d
def getMovieReleaseDates(imdbId):
url = "%sreleaseinfo" % getUrlBase(imdbId)
data = readUrlUnicode(url)
releasedates = []
regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>'''
for r in re.compile(regexp, re.DOTALL).findall(data):
r_ = (stripTags(r[0]).strip(),
_parseDate(stripTags(r[1]).strip()),
decodeHtml(stripTags(r[2]).strip()))
releasedates.append(r_)
return releasedates
def getMovieBusinessSum(imdbId):
business = getMovieBusiness(imdbId)
b_ = {'budget': 0, 'gross': 0, 'profit': 0}
if 'budget' in business:
#b_['budget'] = sum([int(intValue(i.replace(',', ''))) for i in business['budget']])
budget = filter(lambda x: x.startswith('$'), business['budget'])
if not budget:
budget = business['budget']
b_['budget'] = int(intValue(budget[0].replace(',', '')))
if 'gross' in business:
b_['gross'] = int(intValue(business['gross'][0].replace(',', '')))
#b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']])
#if 'weekend gross' in business:
# b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']])
if b_['budget'] and b_['gross']:
b_['profit'] = b_['gross'] - b_['budget']
return b_
def getMovieFlimingDates(imdbId):
business = getMovieBusiness(imdbId)
if 'filming dates' in business and business['filming dates']:
return business['filming dates'][0]
return ''
def getMovieBusiness(imdbId):
url = "%sbusiness" % getUrlBase(imdbId)
data = readUrlUnicode(url)
business = {}
for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
key = stripTags(r[0]).strip().lower()
value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('<br/>')]
business[key] = value
return business
def getMovieEpisodes(imdbId):
url = "%sepisodes" % getUrlBase(imdbId)
data = readUrlUnicode(url)
episodes = {}
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
for r in re.compile(regexp, re.DOTALL).findall(data):
try:
episode = "S%02dE%02d" % (int(r[0]), int(r[1]))
episodes[episode] = {}
episodes[episode]['imdb'] = r[2]
episodes[episode]['title'] = r[3].strip()
if episodes[episode]['title'].startswith('Episode #%d'%int(r[0])):
episodes[episode]['title'] = u''
description = decodeHtml(r[5])
description = stripTags(description.split('Next US airings:')[0])
episodes[episode]['description'] = description.strip()
episodes[episode]['date'] = ''
try:
d = stripTags(r[4])
d = d.replace('Original Air Date: ', '')
d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
episodes[episode]['date'] = d
except:
pass
except:
import traceback
print traceback.print_exc()
pass
return episodes
'''the old code below'''
class IMDb:
def __init__(self, imdbId):
self.imdb = imdbId
self.pageUrl = getUrlBase(imdbId)
def getPage(self):
return readUrlUnicode(self.pageUrl)
def parse_raw_value(self, key, value):
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
value = stripTags(value).strip()
if key == 'runtime':
parsed_value = getMovieRuntimeSeconds(self.imdb)
elif key in ('country', 'language'):
parsed_value = value.split(' / ')
if len(parsed_value) == 1:
parsed_value = parsed_value[0].split(' | ')
parsed_value = [v.strip() for v in parsed_value]
elif key == 'genre':
parsed_value = value.replace('more', '').strip().split(' / ')
if len(parsed_value) == 1:
parsed_value = parsed_value[0].split(' | ')
parsed_value = [v.strip() for v in parsed_value]
elif key == 'tagline':
parsed_value = value.replace('more', '').strip()
elif key == 'plot_outline':
parsed_value = value.replace('(view trailer)', '').strip()
if parsed_value.endswith('more'):
parsed_value = parsed_value[:-4].strip()
elif key == 'tv_series':
m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
if m:
parsed_value = m[0][0]
else:
parsed_value = ''
elif key == 'also_known_as':
parsed_value = ''
m = re.compile('(.*) \(International: English title').findall(value)
if m:
parsed_value = m[0]
else:
m = re.compile('(.*) \(USA').findall(value)
if m:
parsed_value = m[0]
parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
director = self.getCredits().get('director', None)
if director:
director = director[0]
parsed_value = parsed_value.replace(director, '')
if parsed_value.startswith("'s"):
parsed_value = parsed_value[2:].strip()
parsed_value = decodeHtml(parsed_value.strip())
else:
print value
parsed_value = value
return parsed_value
def parseYear(self):
year = ''
data = self.getPage()
soup = BeautifulSoup(data)
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
if html_title:
html_title = unicode(html_title[0])
html_title = stripTags(html_title)
year = re.compile('\((\d{4})\)').findall(html_title)
if not year:
year = re.compile('\((\d{4})/').findall(html_title)
if year:
year = year[0]
else: year = ''
return year
def parse(self):
data = self.getPage()
IMDbDict ={}
info = getMovieInfo(self.imdb)
#Poster
IMDbDict['poster'] = getMoviePoster(self.imdb)
if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year
IMDbDict['year'] = self.parseYear()
IMDbDict['title'] = getMovieTitle(self.imdb)
#Rating
m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
if m:
IMDbDict['rating'] = int(float(m.group(1)) * 1000)
else:
IMDbDict['rating'] = -1
#Votes
IMDbDict['votes'] = info['votes']
data = data.replace('\n',' ')
#some values
keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
for key in keys:
IMDbDict[key] = ''
IMDbDict['runtime'] = 0
soup = BeautifulSoup(data)
for info in soup('div', {'class': 'info'}):
key = unicode(info).split('</h5>')[0].split('<h5>')
if len(key) > 1:
raw_value = unicode(info).split('</h5>')[1]
key = key[1][:-1].lower().replace(' ', '_')
if key in keys:
IMDbDict[key] = self.parse_raw_value(key, raw_value)
IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
#is episode
IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
if IMDbDict['episodes']:
IMDbDict['tvshow'] = True
else:
IMDbDict['tvshow'] = False
IMDbDict['credits'] = self.getCredits()
IMDbDict['plot'] = getMoviePlot(self.imdb)
IMDbDict['keywords'] = getMovieKeywords(self.imdb)
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
IMDbDict['connections'] = getMovieConnections(self.imdb)
IMDbDict['locations'] = getMovieLocations(self.imdb)
IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
IMDbDict['business'] = getMovieBusinessSum(self.imdb)
IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
IMDbDict['stills'] = getMovieStills(self.imdb)
#IMDbDict['trailer'] = getMovieTrailer(self.imdb)
self.IMDbDict = IMDbDict
if IMDbDict['episode_of']:
episode_of = getMovieInfo(IMDbDict['episode_of'])
for key in ('country', 'language'):
if not IMDbDict[key]:
IMDbDict[key] = episode_of[key]
return self.IMDbDict
def getCredits(self):
raw_credits = getMovieCredits(self.imdb)
credits = {}
def getNames(creditList):
return [stripTags(decodeHtml(c[0])) for c in creditList]
credits['director'] = getNames(raw_credits.get('directors', ''))
credits['writer'] = getNames(raw_credits.get('writers', ''))
credits['producer'] = getNames(raw_credits.get('producers', ''))
credits['cinematographer'] = getNames(raw_credits.get('cinematographers', ''))
credits['editor'] = getNames(raw_credits.get('editors', ''))
credits['cast'] = [(stripTags(decodeHtml(c[0])),stripTags(decodeHtml(c[1]))) for c in raw_credits.get('cast', [])]
self.credits = credits
return self.credits
def guess(title, director=''):
#FIXME: proper file -> title
title = title.split('-')[0]
title = title.split('(')[0]
title = title.split('.')[0]
title = title.strip()
imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
return_url = ''
#lest first try google
#i.e. site:imdb.com Michael Stevens Sin
if director:
search = 'site:imdb.com %s "%s"' % (director, title)
else:
search = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(search, 2):
if url.startswith('http://www.imdb.com/title/tt'):
return normalizeImdbId(int(oxlib.intValue(url)))
try:
req = urllib2.Request(imdb_url, None, oxlib.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
except:
return None
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id:
return imdb_id
imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
req = urllib2.Request(imdb_url, None, oxlib.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
return None
def getEpisodeData(title, episode, show_url = None):
'''
Collect information about an episode.
Returns dict with title, show, description and episode
'''
episodeData = {
'title': u'',
'show': title,
'description': u'',
'episode': episode,
}
description = u''
if not show_url:
imdbid = guess(title)
else:
imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(link)[0])
if imdbid:
i = IMDb(imdbid).parse()
episodeData['title'] = i['episodes'][episode]['title']
episodeData['description'] = i['episodes'][episode]['description']
episodeData['imdb'] = i['episodes'][episode]['imdb']
return episodeData
def getPersonData(imdbId):
imdbId = normalizeImdbId(imdbId)
url = u'http://www.imdb.com/name/nm%s/' % imdbId
data = readUrlUnicode(url)
info = dict()
info['name'] = findRe(data, u'<title>(.*?)</title>')
filmo = data.split(u'<h3>Additional Details</h3>')[0]
movies = {}
for part in filmo.split(u'<div class="filmo"')[1:]:
section = findRe(part, u'a name=".*?">(.*?):</a></h5>')
section = decodeHtml(section)
movies[section] = re.compile(u'href="/title/tt(\d{7})/"').findall(part)
info['movies'] = movies
return info
if __name__ == '__main__':
import sys
#print parse(sys.argv[1])
print "imdb:", guess(sys.argv[1])

84
oxweb/impawards.py Normal file
View file

@ -0,0 +1,84 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re
from oxlib.cache import readUrlUnicode
from oxlib.html import stripTags
from oxlib.text import findRe
import imdb
def getData(id):
'''
>>> getData('1991/silence_of_the_lambs')['imdbId']
u'0102926'
>>> getData('1991/silence_of_the_lambs')['posters'][0]
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1_xlg.jpg'
>>> getData('1991/silence_of_the_lambs')['url']
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
'''
data = {
'url': getUrl(id)
}
html = readUrlUnicode(data['url'])
data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ')
data['title'] = stripTags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
data['posters'] = []
results = re.compile('<a href = (%s.*?html)' % id[5:], re.DOTALL).findall(html)
for result in results:
result = result.replace('_xlg.html', '.html')
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = readUrlUnicode(url)
result = findRe(html, '<a href = (\w*?_xlg.html)')
if result:
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = readUrlUnicode(url)
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
else:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)" alt='))
data['posters'].append(poster)
return data
def getId(url):
split = url.split('/')
year = split[3]
split = split[4][:-5].split('_')
if split[-1] == 'xlg':
split.pop()
if findRe(split[-1], 'ver\d+$'):
split.pop()
id = '%s/%s' % (year, '_'.join(split))
return id
def getIds():
ids = []
html = readUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 60*60)
pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
for page in range(pages, 0, -1):
for id in getIdsByPage(page):
if not id in ids:
ids.append(id)
return ids
def getIdsByPage(page):
ids = []
html = readUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1)
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
for result in results:
url = 'http://impawards.com/%s' % result
ids.append(getId(url))
return set(ids)
def getUrl(id):
url = "http://www.impawards.com/%s.html" % id
html = readUrlUnicode(url)
if findRe(html, "No Movie Posters on This Page"):
url = "http://www.impawards.com/%s_ver1.html" % id
return url
if __name__ == '__main__':
ids = getIds()
print sorted(ids), len(ids)

187
oxweb/itunes.py Normal file
View file

@ -0,0 +1,187 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re
import urllib
from oxlib.cache import readUrl
from oxlib.html import decodeHtml, stripTags
from oxlib.text import findRe
from oxlib.text import findString
# to sniff itunes traffic, use something like
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=music&songTerm=&genreIndex=1&flavor=0&mediaType=2&composerTerm=&allArtistNames=Arcadia&ringtone=0&searchButton=submit
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=movie&movieTerm=The%20Matrix&descriptionTerm=&ratingIndex=1&mediaType=3&directorProducerName=Andy%20Wachowski&flavor=0&releaseYearTerm=1999&closedCaption=0&actorTerm=&searchButton=submit
ITUNES_HEADERS = {
'X-Apple-Tz': '0',
'X-Apple-Storefront': '143441-1',
'User-Agent': 'iTunes/7.6.2 (Macintosh; U; Intel Mac OS X 10.5.2)',
'Accept-Language': 'en-us, en;q=0.50',
'Accept-Encoding': 'gzip',
'Connection': 'close',
}
def composeUrl(request, parameters):
if request == 'advancedSearch':
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
if parameters['media'] == 'music':
url += urllib.urlencode({
'albumTerm': parameters['title'],
'allArtistNames': parameters['artist'],
'composerTerm': '',
'flavor': 0,
'genreIndex': 1,
'media': 'music',
'mediaType': 2,
'ringtone': 0,
'searchButton': 'submit',
'songTerm': ''
})
elif parameters['media'] == 'movie':
url += urllib.urlencode({
'actorTerm': '',
'closedCaption': 0,
'descriptionTerm': '',
'directorProducerName': parameters['director'],
'flavor': 0,
'media': 'movie',
'mediaType': 3,
'movieTerm': parameters['title'],
'ratingIndex': 1,
'releaseYearTerm': '',
'searchButton': 'submit'
})
elif request == 'viewAlbum':
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
elif request == 'viewMovie':
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
return url
def parseXmlDict(xml):
values = {}
strings = xml.split('<key>')
for string in strings:
if string.find('</key>') != -1:
key = findRe(string, '(.*?)</key>')
type = findRe(string, '</key><(.*?)>')
if type == 'true/':
value = True
else:
value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
if type == 'integer':
value = int(value)
elif type == 'string':
value = decodeHtml(value)
values[key] = value
return values
def parseCast(xml, title):
list = []
try:
strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
strings.pop()
for string in strings:
list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
return list
except:
return list
def parseMovies(xml, title):
list = []
try:
strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
strings.pop()
for string in strings:
list.append({
'id': findRe(string, 'viewMovie\?id=(.*?)&'),
'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
})
return list
except:
return list
class ItunesAlbum:
def __init__(self, id = '', title = '', artist = ''):
self.id = id
self.title = title
self.artist = artist
if not id:
self.id = self.getId()
def getId(self):
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
xml = readUrl(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewAlbum\?id=(.*?)&')
return id
def getData(self):
data = {'id': self.id}
url = composeUrl('viewAlbum', {'id': self.id})
xml = readUrl(url, None, ITUNES_HEADERS)
data['albumName'] = findRe(xml, '<B>(.*?)</B>')
data['artistName'] = findRe(xml, '<b>(.*?)</b>')
data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
data['genre'] = findRe(xml, 'Genre:(.*?)<')
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['tracks'] = []
strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
for string in strings:
data['tracks'].append(parseXmlDict(string))
data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
return data
class ItunesMovie:
def __init__(self, id = '', title = '', director = ''):
self.id = id
self.title = title
self.director = director
if not id:
self.id = self.getId()
def getId(self):
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
xml = readUrl(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewMovie\?id=(.*?)&')
return id
def getData(self):
data = {'id': self.id}
url = composeUrl('viewMovie', {'id': self.id})
xml = readUrl(url, None, ITUNES_HEADERS)
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
f.write(xml)
f.close()
data['actors'] = parseCast(xml, 'actors')
string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
data['averageRating'] = string.count('rating_star_000033.png') + string.count('&#189;') * 0.5
data['directors'] = parseCast(xml, 'directors')
data['format'] = findRe(xml, 'Format:(.*?)<')
data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
data['producers'] = parseCast(xml, 'producers')
data['rated'] = findRe(xml, 'Rated(.*?)<')
data['relatedMovies'] = parseMovies(xml, 'related movies')
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
data['screenwriters'] = parseCast(xml, 'screenwriters')
data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
return data
if __name__ == '__main__':
import simplejson
data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
for v in data['relatedMovies']:
data = ItunesMovie(id = v['id']).getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
data = ItunesMovie(id='272960052').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)

126
oxweb/karagarga.py Normal file
View file

@ -0,0 +1,126 @@
import re
from oxlib import cache
from oxlib.html import stripTags
from oxlib.text import findRe
import auth
def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
headers = headers.copy()
headers["Cookie"] = auth.get("karagarga.cookie")
return cache.readUrl(url, data, headers, timeout)
def readUrlUnicode(url, timeout=cache.cache_timeout):
return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
def getData(id):
data = {
"url": getUrl(id)
}
html = readUrlUnicode("%s%s" % (data["url"], "&filelist=1"))
if 'No torrent with ID' in html:
return False
data['added'] = stripTags(parseTable(html, 'Added'))
data['country'] = findRe(html, 'title="([\w ]*?)" border="0" width="32" height="20"')
# data['description'] = parseTable(html, 'Description')
data['director'] = stripTags(parseTable(html, 'Director / Artist'))
data['files'] = []
result = findRe(html, '<table class=main border="1" cellspacing=0 cellpadding="5">(.*?)</table>')
results = re.compile('<td>(.*?)</td><td align="right">(.*?)</td>', re.DOTALL).findall(result)
for name, size in results:
data['files'].append({
'name': name,
'size': '%s %s' % (size[:-2], size[-2:].strip().upper())
})
data['format'] = ''
if html.find('genreimages/dvdr.png') != -1:
data['format'] = 'DVD'
elif html.find('genreimages/hdrip.png') != -1:
data['format'] = 'HD'
data['genre'] = []
result = parseTable(html, 'Genres')
for string in result.split('\n'):
string = stripTags(findRe(string, '<a href="browse.php\?genre=.*?">(.*?)</a>'))
if string:
data['genre'].append(string)
data['id'] = id
data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
data['language'] = stripTags(parseTable(html, 'Language'))
data['leechers'] = int(findRe(html, 'seeder\(s\), (.*?) leecher\(s\)'))
data['link'] = stripTags(parseTable(html, 'Internet Link'))
data['links'] = []
results = re.compile('<a href="(.*?)">(.*?)</a>', re.DOTALL).findall(parseTable(html, 'Description'))
for (url, title) in results:
if url.find('javascript') == -1:
data['links'].append({
'title': title,
'url': url.replace('http://anonym.to/?', '')
})
data['people'] = 0
result = stripTags(findRe(html, '<a href="top10others.php.*?>(.*?) people')).strip()
if result:
data['people'] = int(result)
data['posters'] = []
results = re.compile('<img border=0 src="(http://.*?)"', re.DOTALL).findall(html)
for result in results:
data['posters'].append(result)
data['seeders'] = int(findRe(html, '#seeders" class="sublink".*?colspan=2>(.*?) seeder\(s\)'))
data['size'] = int(findRe(parseTable(html, 'Size'), '\((.*?) ').replace(',', ''))
data['snatched'] = int(findRe(html, '<a name="snatchers">.*?colspan=2>(.*?) '))
data['subtitle'] = findRe(parseTable(html, 'Subtitles'), '>(.*?)<hr>').replace('included: ', '')
data['subtitles'] = []
results = re.compile('<a href="(.*?)">(.*?)</a>', re.DOTALL).findall(parseTable(html, 'Subtitles'))
for (url, language) in results:
data['subtitles'].append({
'language': language.replace('click here for ', ''),
'url': url
})
data['torrent'] = 'http://karagarga.net/%s' % findRe(html, '(down.php/.*?)"')
data['year'] = stripTags(parseTable(html, 'Year'))
data['title'] = stripTags(findRe(html, '<h1>(.*?)</h1>')).strip()
data['title'] = re.sub('^%s - ' % re.escape(data['director']), '', data['title'])
data['title'] = re.sub(' \(%s\)$' % re.escape(data['year']), '', data['title'])
return data
def getId(url):
return url.split("=")[-1]
def getTorrent(id):
return readUrl(getData(id)['torrent'])
def getIds(lastId = 20):
lastId = '%s' % lastId
ids = []
page = 0
while True:
for id in getIdsByPage(page):
if not id in ids:
ids.append(id)
if lastId in ids:
break
page += 1
return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids))))
def getIdsByPage(page):
ids = []
url = 'http://karagarga.net/browse.php?page=%s&cat=1&sort=added&d=DESC' % page
html = readUrlUnicode(url, timeout = 23*60*60) #get new ids once per day
strings = html.split('<td width="42" style="padding:0px;">')
strings.pop(0)
for string in strings:
ids.append(findRe(string, '"details.php\?id=(.*?)"'))
return ids
def getUrl(id):
return "http://karagarga.net/details.php?id=%s" % id
def parseTable(html, title):
if title == 'Genres':
return findRe(html, '<td class="heading" [\w=" ]*?>%s</td>(.*?)</table>' % title)
else:
return findRe(html, '<td class="heading" [\w=" ]*?>%s</td>(.*?)</td>' % title)
if __name__ == "__main__":
print getIds("79317")
print getData("79317")

21
oxweb/lyricsfly.py Normal file
View file

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from oxlib.cache import readUrl
from oxlib.html import decodeHtml
from oxlib.text import findRe
def getLyrics(title, artist):
html = readUrl('http://lyricsfly.com/api/')
key = findRe(html, '<font color=green><b>(.*?)</b></font>')
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
xml = readUrl(url)
lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = lyrics.replace('\n', '').replace('\r', '')
lyrics = lyrics.replace('[br]', '\n').strip()
lyrics.replace('\n\n\n', '\n\n')
lyrics = decodeHtml(lyrics.replace('&amp;', '&'))
return lyrics
if __name__ == '__main__':
print getLyrics('Election Day', 'Arcadia')

45
oxweb/metacritic.py Normal file
View file

@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import quote
from oxlib.cache import readUrl, readUrlUnicode
from oxlib import findRe, decodeHtml, stripTags
def getMetacriticShowUrl(title):
title = quote(title)
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
data = readUrl(url)
return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
def getData(title, url=None):
if not url:
url = getMetacriticShowUrl(title)
if not url:
return None
data = readUrlUnicode(url)
score = findRe(data, 'ALT="Metascore: (.*?)"')
if score:
score = int(score)
else:
score = -1
reviews = re.compile(
'<div class="scoreandreview"><div class="criticscore">(.*?)</div>'
'.*?<span class="publication">(.*?)</span>'
'.*?<span class="criticname">(.*?)</span></div>'
'.*?<div class="quote">(.*?)<br>'
'.*?<a href="(.*?)" ', re.DOTALL).findall(data)
metacritics = []
for review in reviews:
metacritics.append({
'score': int(review[0]),
'publication':review[1],
'critic':decodeHtml(review[2]),
'quote': stripTags(review[3]).strip(),
'link': review[4],
})
return dict(score = score, critics = metacritics, url = url)

126
oxweb/mininova.py Normal file
View file

@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from urllib import quote
from oxlib.cache import readUrl, readUrlUnicode
from oxlib import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from oxlib.normalize import normalizeImdbId
import oxlib
from torrent import Torrent
def _parseResultsPage(data, max_results=10):
results=[]
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentDate = row[0]
torrentExtra = row[1]
torrentId = row[2]
torrentTitle = decodeHtml(row[3]).strip()
torrentLink = "http://www.mininova.org/tor/" + torrentId
privateTracker = 'priv.gif' in torrentExtra
if not privateTracker:
results.append((torrentTitle, torrentLink, ''))
return results
def findMovie(query, max_results=10):
'''search for torrents on mininova
'''
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
data = readUrlUnicode(url)
return _parseResultsPage(data, max_results)
def findMovieByImdb(imdbId):
'''find torrents on mininova for a given imdb id
'''
results = []
imdbId = normalizeImdbId(imdbId)
data = readUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
return _parseResultsPage(data)
def getId(mininovaId):
mininovaId = unicode(mininovaId)
d = findRe(mininovaId, "/(\d+)")
if d:
return d
mininovaId = mininovaId.split('/')
if len(mininovaId) == 1:
return mininovaId[0]
else:
return mininovaId[-1]
def exists(mininovaId):
mininovaId = getId(mininovaId)
data = oxlib.net.readUrl("http://www.mininova.org/tor/%s" % mininovaId)
if not data or 'Torrent not found...' in data:
return False
if 'tracker</a> of this torrent requires registration.' in data:
return False
return True
def getData(mininovaId):
_key_map = {
'by': u'uploader',
}
mininovaId = getId(mininovaId)
torrent = dict()
torrent[u'id'] = mininovaId
torrent[u'domain'] = 'mininova.org'
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
data = readUrlUnicode(torrent['comment_link']) + readUrlUnicode(torrent['details_link'])
if '<h1>Torrent not found...</h1>' in data:
return None
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = readUrl(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent
class Mininova(Torrent):
'''
>>> Mininova('123')
{}
>>> Mininova('1072195')['infohash']
'72dfa59d2338e4a48c78cec9de25964cddb64104'
'''
def __init__(self, mininovaId):
self.data = getData(mininovaId)
if not self.data:
return
Torrent.__init__(self)
ratio = self.data['share ratio'].split(',')
self['seeder'] = -1
self['leecher'] = -1
if len(ratio) == 2:
val = intValue(ratio[0].replace(',','').strip())
if val:
self['seeder'] = int(val)
val = intValue(ratio[1].replace(',','').strip())
if val:
self['leecher'] = int(val)
val = intValue(self.data['downloads'].replace(',','').strip())
if val:
self['downloaded'] = int(val)
else:
self['downloaded'] = -1
published = self.data['added on']
published = published.split(' +')[0]
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")

44
oxweb/movieposterdb.py Normal file
View file

@ -0,0 +1,44 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from oxlib.cache import readUrlUnicode
from oxlib import findRe
def getData(id):
'''
>>> getData('0060304')['posters'][0]
u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg'
>>> getData('0123456')['posters']
[]
'''
data = {
"url": getUrl(id)
}
data["posters"] = getPostersByUrl(data["url"])
return data
def getId(url):
return url.split("/")[-2]
def getPostersByUrl(url, group=True):
posters = []
html = readUrlUnicode(url)
if url in html:
if group:
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
for result in results:
posters += getPostersByUrl(result, False)
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
for result in results:
html = readUrlUnicode(result)
posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
return posters
def getUrl(id):
return "http://www.movieposterdb.com/movie/%s/" % id
if __name__ == '__main__':
print getData('0060304')
print getData('0133093')

41
oxweb/opensubtitles.py Normal file
View file

@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import feedparser
from oxlib.cache import readUrl, readUrlUnicode
from oxlib import findRe, stripTags
from oxlib import langCode2To3, langTo3Code
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
if len(language) == 2:
language = langCode2To3(language)
elif len(language) != 3:
language = langTo3Code(language)
url = "http://www.opensubtitles.org/en/search/"
if language:
url += "sublanguageid-%s/" % language
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
data = readUrl(url)
if "title>opensubtitles.com - search results</title" in data:
fd = feedparser.parse(data)
opensubtitleId = None
if fd.entries:
link = fd.entries[0]['links'][0]['href']
opensubtitleId = re.compile('subtitles/(.*?)/').findall(link)
if opensubtitleId:
opensubtitleId = opensubtitleId[0]
else:
opensubtitleId = findRe(data, '/en/subtitles/(.*?)/')
return opensubtitleId
def downloadSubtitleById(opensubtitle_id):
srts = {}
data = readUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
for f in re.compile(reg_exp, re.DOTALL).findall(data):
name = stripTags(f[1]).split('\n')[0]
url = "http://www.opensubtitles.com%s" % f[0]
srts[name] = readUrlUnicode(url)
return srts

10
oxweb/oxdb.py Normal file
View file

@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import oxlib.cache
def getPosterUrl(id):
url = "http://0xdb.org/%s/poster.0xdb.jpg" % id
if oxlib.cache.exists(url):
return url
return ''

12
oxweb/piratecinema.py Normal file
View file

@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import oxlib.cache
from oxlib.cache import exists
def getPosterUrl(id):
url = "http://piratecinema.org/posters/%s/%s.jpg" % (id[:4], id)
if oxlib.cache.exists(url):
return url
return ''

34
oxweb/rottentomatoes.py Normal file
View file

@ -0,0 +1,34 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from oxlib.cache import getHeaders, readUrl, readUrlUnicode
from oxlib import findRe, stripTags
def readUrlByImdb(imdb):
#this would also wor but does not cache:
'''
from urllib2 import urlopen
u = urlopen(url)
return u.url
'''
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
data = readUrl(url)
if "movie_title" in data:
movies = re.compile('(/m/.*?/)').findall(data)
if movies:
return "http://www.rottentomatoes.com" + movies[0]
return None
def getData(url):
data = readUrlUnicode(url)
r = {}
r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
if '(' in r['title']:
r['year'] = findRe(r['title'], '\((\d*?)\)')
r['title'] = re.sub('\((\d*?)\)', '', r['title']).strip()
r['synopsis'] = findRe(data, '<span id="movie_synopsis_all".*?>(.*?)</span>')
r['average rating'] = findRe(data, '<div id="bubble_allCritics".*?>(.*?)</div>').strip()
return r

293
oxweb/spiegel.py Normal file
View file

@ -0,0 +1,293 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import time
from BeautifulSoup import BeautifulSoup
import oxlib.cache
from oxlib.html import decodeHtml, stripTags
import oxlib.net
def getNews(year, month, day):
sections = [
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
]
dt = datetime(year, month, day)
day = int(dt.strftime('%j'))
date = dt.strftime('%d.%m.%Y')
news = []
for section in sections:
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
if date == time.strftime('%d.%m.%Y', time.localtime()):
html = oxlib.net.readUrl(url)
else:
html = oxlib.cache.readUrl(url)
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
try:
description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
except:
description = ''
try:
imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
except:
imageUrl = ''
try:
title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
except:
title = ''
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
new = {}
if len(dateString) == 10:
new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
else:
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
# fix decodeHtml
# new['description'] = formatString(decodeHtml(description))
new['description'] = formatString(description)
new['imageUrl'] = imageUrl
new['section'] = formatSection(section)
new['title'] = formatString(title)
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
if new['title1'][-1:] == ':':
new['title1'] = new['title1'][0:-1]
new['title2'] = new['title'][len(new['title1']) + 2:]
new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
if new['url'][:1] == '/':
new['url'] = 'http://www.spiegel.de' + new['url']
news.append(new)
# print '%s, %s' % (new['section'], dateString)
'''
elif dateString[:10] == date and not description:
print dateString + ' - no description'
elif dateString[:10] == date and not imageUrl:
print dateString + ' - no image'
'''
return news
def splitTitle(title):
title1 = re.compile('(.*?): ').findall(title)[0]
title2 = re.compile(': (.*?)$').findall(title)[0]
return [title1, title2]
def formatString(string):
string = string.replace('<span class="spOptiBreak"> </span>', '')
string = string.replace('\n', ' ').replace(' ', ' ').strip()
string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
return string
def formatSection(string):
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
def formatSubsection(string):
# SPIEGEL, SPIEGEL special
subsection = {
'abi': 'Abi - und dann?',
'formel1': 'Formel 1',
'jobundberuf': 'Job & Beruf',
'leben': 'Leben U21',
'mensch': 'Mensch & Technik',
'sonst': '',
'staedte': u'St\xc3dte',
'ussports': 'US-Sports',
'wunderbar': 'wunderBAR'
}
if subsection.has_key(string):
return subsection[string].replace(u'\xc3', 'ae')
return string[:1].upper() + string[1:]
def getIssue(year, week):
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
if not oxlib.net.exists(coverUrl):
return None
url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
contents = []
soup = BeautifulSoup(oxlib.cache.readUrl(url))
for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):
item = str(item)
page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
title = stripTags(item).strip()
contents.append({'title': title, 'page': page})
pageUrl = {}
pages = page + 2
for page in range(1, pages + 10):
url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
if oxlib.cache.exists(url):
pageUrl[page] = url
else:
pageUrl[page] = ''
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
def archiveIssues():
'''
this is just an example of an archiving application
'''
p = {}
import os
import simplejson
import time
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
localtime = time.localtime()
year = int(time.strftime('%Y', localtime))
week = int(time.strftime('%W', localtime))
for y in range(year, 1993, -1):
if y == year:
wMax = week + 1
else:
wMax = 53
for w in range(wMax, 0, -1):
print 'getIssue(%d, %d)' % (y, w)
issue = getIssue(y, w)
if issue:
dirname = '%s/%d/%02d' % (archivePath, y, w)
if not os.path.exists(dirname):
os.makedirs(dirname)
filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
if not os.path.exists(filename):
data = simplejson.dumps(issue, ensure_ascii = False)
f = open(filename, 'w')
f.write(data)
f.close()
filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
if not os.path.exists(filename):
data = []
for item in issue['contents']:
data.append('%3d %s' % (item['page'], item['title']))
data = '\n'.join(data)
f = open(filename, 'w')
f.write(data)
f.close()
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
if not os.path.exists(filename):
data = oxlib.cache.readUrl(issue['coverUrl'])
f = open(filename, 'w')
f.write(data)
f.close()
for page in issue['pageUrl']:
url = issue['pageUrl'][page]
if url:
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
if not os.path.exists(filename):
data = oxlib.cache.readUrl(url)
f = open(filename, 'w')
f.write(data)
f.close()
if not p:
p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
else:
p['num'] += 1
p['sum'] += issue['pages']
if issue['pages'] < p['min']:
p['min'] = issue['pages']
if issue['pages'] > p['max']:
p['max'] = issue['pages']
print p['min'], p['sum'] / p['num'], p['max']
def archiveNews():
'''
this is just an example of an archiving application
'''
import os
import simplejson
import time
count = {}
colon = []
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
localtime = time.localtime()
year = int(time.strftime('%Y', localtime))
month = int(time.strftime('%m', localtime))
day = int(time.strftime('%d', localtime)) - 1
for y in range(year, 1999, -1):
if y == year:
mMax = month
else:
mMax = 12
for m in range(mMax, 0, -1):
if y == year and m == month:
dMax = day
elif m == 2 and y % 4 == 0 and y % 400 != 0:
dMax = days[m] + 1
else:
dMax = days[m]
for d in range(dMax, 0, -1):
print 'getNews(%d, %d, %d)' % (y, m, d)
news = getNews(y, m ,d)
for new in news:
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
if not os.path.exists(dirname):
os.makedirs(dirname)
if new['url'][-5:] == '.html':
filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
else:
filename = dirname + '/' + new['url'] + '.json'
if not os.path.exists(filename) or True:
data = simplejson.dumps(new, ensure_ascii = False)
f = open(filename, 'w')
f.write(data)
f.close()
filename = filename[:-5] + '.txt'
if not os.path.exists(filename) or True:
data = splitTitle(new['title'])
data.append(new['description'])
data = '\n'.join(data)
f = open(filename, 'w')
f.write(data)
f.close()
filename = dirname + '/' + new['imageUrl'].split('/')[-1]
if not os.path.exists(filename):
data = oxlib.cache.readUrl(new['imageUrl'])
f = open(filename, 'w')
f.write(data)
f.close()
strings = new['url'].split('/')
string = strings[3]
if len(strings) == 6:
string += '/' + strings[4]
if not count.has_key(string):
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
else:
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
strings = splitTitle(new['title'])
if strings[0] != new['title1'] or strings[1] != new['title2']:
colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
for key in sortDictByKey(count):
print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
for value in colon:
print value
def sortDictByKey(d):
keys = d.keys()
keys.sort()
return keys
if __name__ == '__main__':
# spiegel = Spiegel(2008, 8)
# print spiegel.getContents()
# news = News(2001, 9, 10)
# output(news.getNews())
'''
x = []
for d in range(10, 30):
print '2/%d' % d
news = getNews(2008, 2, d)
for new in news:
strings = new['url'].split('/')
string = formatSection(strings[3])
if len(strings) == 6:
string += '/' + formatSubsection(strings[4])
if not string in x:
x.append(string)
print x
'''
# archiveIssues()
archiveNews()

122
oxweb/thepiratebay.py Normal file
View file

@ -0,0 +1,122 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from urllib import quote, urlencode
from urllib2 import URLError
from oxlib.cache import readUrl, readUrlUnicode
from oxlib import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxlib.normalize import normalizeImdbId
import oxlib
from torrent import Torrent
cache_timeout = 24*60*60 # cache search only for 24 hours
season_episode = re.compile("S..E..", re.IGNORECASE)
def _readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
headers = headers.copy()
headers['Cookie'] = 'language=en_EN'
return cache.readUrl(url, data, headers, timeout)
def _readUrlUnicode(url, timeout=cache.cache_timeout):
return cache.readUrlUnicode(url, _readUrl=_readUrl, timeout=timeout)
def findMovies(query, max_results=10):
results = []
next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
page_count = 1
while next and page_count < 4:
page_count += 1
url = next[0]
if not url.startswith('http'):
if not url.startswith('/'):
url = "/" + url
url = "http://thepiratebay.org" + url
data = _readUrlUnicode(url, timeout=cache_timeout)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
torrentLink = "http://thepiratebay.org" + row[1]
torrentTitle = decodeHtml(row[2])
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
if torrentType in ['201']:
results.append((torrentTitle, torrentLink, ''))
if len(results) >= max_results:
return results
next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
return results
def findMovieByImdb(imdb):
return findMovies("tt" + normalizeImdbId(imdb))
def getId(piratebayId):
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
piratebayId = piratebayId.split('org/')[1]
d = findRe(piratebayId, "tor/(\d+)")
if d:
piratebayId = d
d = findRe(piratebayId, "torrent/(\d+)")
if d:
piratebayId = d
return piratebayId
def exists(piratebayId):
piratebayId = getId(piratebayId)
return oxlib.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
def getData(piratebayId):
_key_map = {
'spoken language(s)': u'language',
'texted language(s)': u'subtitle language',
'by': u'uploader',
'leechers': 'leecher',
'seeders': 'seeder',
}
piratebayId = getId(piratebayId)
torrent = dict()
torrent[u'id'] = piratebayId
torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
data = _readUrlUnicode(torrent['comment_link'])
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
title = quote(torrent['title'].encode('utf-8'))
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _readUrl(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent
class Thepiratebay(Torrent):
'''
>>> Thepiratebay('123')
{}
>>> Thepiratebay('3951349')['infohash']
'4e84415d36ed7b54066160c05a0b0f061898d12b'
'''
def __init__(self, piratebayId):
self.data = getData(piratebayId)
if not self.data:
return
Torrent.__init__(self)
published = self.data['uploaded']
published = published.replace(' GMT', '').split(' +')[0]
self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")

37
oxweb/torrent.py Normal file
View file

@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from oxlib import intValue
class Torrent(dict):
'''
>>> Torrent()
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
'''
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
_dict_keys = ('torrent_info', )
_list_keys = ()
data = {'torrent_info': {}}
def __init__(self):
for key in self._string_keys:
self[key] = self.data.get(key, u'')
for key in self._dict_keys:
self[key] = self.data.get(key, {})
for key in self._list_keys:
self[key] = self.data.get(key, [])
for key in self._int_keys:
value = self.data.get(key, -1)
if not isinstance(value, int):
value = int(intValue(value))
self[key] = value
self['infohash'] = self.data['torrent_info'].get('hash', '')
self['size'] = self.data['torrent_info'].get('size', -1)
self['announce'] = self.data['torrent_info'].get('announce', '')
if 'files' in self.data['torrent_info']:
self['files'] = len(self.data['torrent_info']['files'])
else:
self['files'] = 1

32
oxweb/tv.py Normal file
View file

@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from oxlib import stripTags, findRe
from oxlib.cache import readUrlUnicode
def getEpisodeData(url):
'''
prases informatin on tvcom episode pages
returns dict with title, show, description, score
example:
getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
'''
data = readUrlUnicode(url)
r = {}
r['description'] = stripTags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['show'] = findRe(data, '<h1>(.*?)</h1>')
r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
#episode score
r['episode score'] = findRe(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
match = re.compile('Episode Number: (\d*?) &nbsp;&nbsp; Season Num: (\d*?) &nbsp;&nbsp; First Aired: (.*?) &nbsp').findall(data)
if match:
r['season'] = int(match[0][1])
r['episode'] = int(match[0][0])
#'Wednesday September 29, 2004' -> 2004-09-29
r['air date'] = time.strftime('%Y-%m-%d', time.strptime(match[0][2], '%A %B %d, %Y'))
return r

120
oxweb/wikipedia.py Normal file
View file

@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from urllib import urlencode
import simplejson
from oxlib.cache import readUrl, readUrlUnicode
from oxlib import findRe, decodeHtml
def getId(url):
return url.split("/")[-1]
def getUrl(id):
return "http://en.wikipedia.org/wiki/%s" % id
def getMovieId(title, director='', year=''):
query = '"%s" film %s %s' % (title, director, year)
result = find(query, 1)
if result:
return result[0][1]
return ''
def getUrlByImdbId(imdbId):
query = '"%s"'% imdbId
result = find(query)
if result:
url = result[0][1]
return url
return ""
def getUrlByImdb(imdbId):
# deprecated, use getUrlByImdbId()
return getUrlByImdbId(imdbId)
def getUrlByAllmovieId(allmovieId):
query = '"amg_id = 1:%s"'% allmovieId
result = find(query)
if result:
url = result[0][1]
return url
return ''
def getWikiData(wikipediaUrl):
url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
url = "%s&action=raw" % url
data = readUrlUnicode(url)
return data
def getMovieData(wikipediaUrl):
if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl)
data = getWikiData(wikipediaUrl)
filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''')
filmbox = {}
_box = filmbox_data.strip().split('\n|')
if len(_box) == 1:
_box = _box[0].split('|\n')
for row in _box:
d = row.split('=')
if len(d) == 2:
key = d[0].strip()
if key[0] == '|':
key = key[1:]
value = d[1].strip()
filmbox[key] = value
if 'imdb title' in data:
filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|')
elif 'imdb episode' in data:
filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|')
if 'Amg movie' in data:
filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|')
if 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'):
filmbox['amg_id'] = filmbox['amg_id'][2:]
if 'rotten-tomatoes' in data:
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|')
if not filmbox['rottentomatoes_id']:
filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|')
if 'google video' in data:
filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)\|')
if 'DEFAULTSORT' in data:
filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
return filmbox
def getImageUrl(name):
data = readUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name)
url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
return url
def getPosterUrl(wikipediaUrl):
if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl)
data = getMovieData(wikipediaUrl)
if 'image' in data:
return getImageUrl(data['image'])
return ''
def getMoviePoster(wikipediaUrl):
# deprecated, use getPosterUrl()
return getPosterUrl(wikipediaUrl)
def getAllmovieId(wikipediaUrl):
data = getMovieData(wikipediaUrl)
return data.get('amg_id', '')
def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
data = readUrl(url)
if not data:
data = readUrl(url, timeout=0)
result = simplejson.loads(data)
results = []
if result and 'query' in result:
for r in result['query']['search']:
title = r['title']
url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
results.append((title, url, ''))
return results

99
oxweb/youtube.py Normal file
View file

@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from urllib import quote, unquote
import httplib
import xml.etree.ElementTree as ET
import re
import feedparser
from oxlib.cache import readUrl, readUrlUnicode
from oxlib import findString, findRe
def getVideoKey(youtubeId):
data = readUrl("http://www.youtube.com/get_video_info?&video_id=%s" % youtubeId)
match = re.compile("token=(.+)&thumbnail").findall(data)
if match:
return unquote(match[0])
return False
def getVideoUrl(youtubeId, format='mp4'):
youtubeKey = getVideoKey(youtubeId)
if format == '720p':
fmt=22
url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt)
elif format == 'mp4':
fmt=18
url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt)
else:
url = "http://youtube.com/get_video.php?video_id=%s&t=%s" % (youtubeId, youtubeKey)
return url
def getMovieInfo(youtubeId, video_url_base=None):
url = "http://gdata.youtube.com/feeds/api/videos/%s" % youtubeId
data = readUrl(url)
fd = feedparser.parse(data)
return getInfoFromAtom(fd.entries[0], video_url_base)
def getInfoFromAtom(entry, video_url_base=None):
info = dict()
info['title'] = entry['title']
info['description'] = entry['description']
info['author'] = entry['author']
#info['published'] = entry['published_parsed']
if 'media_keywords' in entry:
info['keywords'] = entry['media_keywords'].split(', ')
info['url'] = entry['links'][0]['href']
info['id'] = findString(info['url'], "/watch?v=")
info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id']
if video_url_base:
info['flv'] = "%s/%s.%s" % (video_url_base, info['id'], 'flv')
info['mp4'] = "%s/%s.%s" % (video_url_base, info['id'], 'mp4')
else:
info['flv'] = getVideoUrl(info['id'], 'flv')
info['mp4'] = getVideoUrl(info['id'], 'mp4')
info['720p'] = getVideoUrl(info['id'], '720p')
info['embed'] = '<object width="425" height="355"><param name="movie" value="http://www.youtube.com/v/%s&hl=en"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/%s&hl=en" type="application/x-shockwave-flash" wmode="transparent" width="425" height="355"></embed></object>' % (info['id'], info['id'])
return info
def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None):
query = quote(query)
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
data = readUrlUnicode(url)
fd = feedparser.parse(data)
videos = []
for entry in fd.entries:
v = getInfoFromAtom(entry, video_url_base)
videos.append(v)
if len(videos) >= max_results:
return videos
return videos
'''
def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None):
url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query)
data = readUrlUnicode(url)
regx = re.compile(' <a href="/watch.v=(.*?)" title="(.*?)" ')
regx = re.compile('<a href="/watch\?v=(\w*?)" ><img src="(.*?)" class="vimg120" title="(.*?)" alt="video">')
id_title = regx.findall(data)
data_flat = data.replace('\n', ' ')
videos = {}
for video in id_title:
vid = video[0]
if vid not in videos:
v = dict()
v['id'] = vid
v['link'] = "http//youtube.com/watch.v=%s" % v['id']
v['title'] = video[2].strip()
if video_url_base:
v['video_link'] = "%s/%s" % (video_url_base, v['id'])
else:
v['video_url'] = getVideoUrl(v['id'])
v['description'] = findRe(data, 'BeginvidDesc%s">(.*?)</span>' % v['id']).strip().replace('<b>', ' ').replace('</b>', '')
v['thumbnail'] = video[1]
videos[vid] = v
if len(videos) >= max_results:
return videos.values()
return videos.values()
'''