rename ox -> oxweb
parent c466c35253
commit 6a16a0af30
17 changed files with 7 additions and 5 deletions
8 oxweb/__init__.py Normal file
@@ -0,0 +1,8 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
__version__ = '0.1.0'

import imdb
import wikipedia
import google
64 oxweb/criterion.py Normal file
@@ -0,0 +1,64 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re

from oxutils.cache import getUrlUnicode
from oxutils.html import stripTags
from oxutils.text import findRe, removeSpecialCharacters

import imdb


def getData(criterionId):
    '''
    >>> getData(348)['imdbId']
    '0068205'
    '''
    data = {}
    html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % criterionId)
    data['criterionId'] = criterionId
    data['posterUrl'] = getPosterUrl(criterionId)
    data['synopsis'] = stripTags(findRe(html, '<h3>Synopsis</h3>(.*?)</div>'))
    result = re.compile("<title>The Criterion Collection: (.*?) by (.*?)</title>").findall(html)
    data['title'] = stripTags(result[0][0])
    data['director'] = stripTags(result[0][1])
    data['imdbId'] = imdb.getMovieId(data['title'], data['director'])
    return data

def getCriterionIds():
    html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine')
    return re.compile('release.asp\?id=(.*?)"').findall(html)

def getPosterUrl(criterionId):
    return 'http://criterion.com/content/images/full_boxshot/%s_box_348x490.jpg' % criterionId

def getMovieId(title = '', director = '', imdbId = ''):
    if not imdbId:
        imdbId = imdb.getMovieId(title, director)
    html = getUrlUnicode('http://criterion.com/asp/list.asp?sort=spine', timeout = -1)
    strings = findRe(html, '<table cellspacing="0" id="browse-all-table">(.*?)</table>').split('<tr>')
    strings.pop(0)
    for string in strings:
        id = findRe(string, '"release.asp\?id=(.*?)"')
        criterionTitle = findRe(string, 'class="title">(.*?)</a>')
        criterionTitle = re.sub('(?<=\\w)<br>(?=\\w)', ' / ', criterionTitle)
        criterionTitle = criterionTitle.replace('<br>', '')
        criterionDirector = stripTags(findRe(string, '</a>.*?</td>(.*?)</td>')).strip()
        if imdb.getMovieId(criterionTitle, criterionDirector) == imdbId:
            return id
    return ''

def getMovieData(title = '', director = '', imdbId = ''):
    data = {}
    if not imdbId:
        imdbId = imdb.getMovieId(title, director)
    id = getMovieId(imdbId = imdbId)
    if id:
        html = getUrlUnicode('http://criterion.com/asp/release.asp?id=%s' % id)
        data['id'] = id
        data['posterUrl'] = getPosterUrl(id)
        data['synopsis'] = stripTags(findRe(html, '<h3>Synopsis</h3>(.*?)</div>'))
    return data

if __name__ == '__main__':
    print getMovieData('Le mepris', 'Jean-Luc Godard')
22 oxweb/dailymotion.py Normal file
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import unquote
from oxutils.cache import getUrl


def getVideoUrl(url):
    '''
    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
    'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv?key=0a710ad6ffbfe980b1252569d16f957313399d0'

    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
    'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv?key=08a18365ca6962c5ff7526f69872c36813399d4'
    '''
    data = getUrl(url)
    video = re.compile('''video", "(.*?)"''').findall(data)
    for v in video:
        v = unquote(v).split('@@')[0]
        return "http://www.dailymotion.com" + v
    return ''
50 oxweb/google.py Normal file
@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
import urllib
import urllib2
import weakref
import threading
import Queue

import oxutils
from oxutils import stripTags


'''
usage:
import google
google.find(query)

for result in google.find(query): result

result is (title, url, description)

google.find(query, max_results)

FIXME: how to search deeper than the first page?
'''
DEFAULT_MAX_RESULTS = 10

def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
    google_timeout=24*60*60
    return oxutils.cache.getUrl(url, data, headers, google_timeout)

def quote_plus(s):
    return urllib.quote_plus(s.encode('utf-8'))

def find(query, max_results=DEFAULT_MAX_RESULTS):
    url = "http://www.google.com/search?q=%s" % quote_plus(query)
    data = getUrl(url)
    link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
              r'.*?(?:<br>|<table.*?>)' + \
              r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
    results = []
    for match in re.compile(link_re, re.DOTALL).finditer(data):
        (name, url, desc) = match.group('name', 'url', 'desc')
        results.append((stripTags(name), url, stripTags(desc)))
    if len(results) > max_results:
        results = results[:max_results]
    return results
670 oxweb/imdb.py Normal file
@@ -0,0 +1,670 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import urllib2
|
||||
from urllib import quote, unquote
|
||||
import re
|
||||
import os
|
||||
import time
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
import chardet
|
||||
import oxutils
|
||||
from oxutils import stripTags, decodeHtml, findRe, findString
|
||||
from oxutils.cache import getUrl, getUrlUnicode
|
||||
from oxutils.normalize import normalizeTitle, normalizeImdbId
|
||||
from oxutils import *
|
||||
|
||||
import google
|
||||
|
||||
|
||||
def getMovieId(title, director='', year=''):
|
||||
'''
|
||||
>>> getMovieId('The Matrix')
|
||||
'0133093'
|
||||
'''
|
||||
if year:
|
||||
title = "%s (%s)" % (title, year)
|
||||
if director:
|
||||
query = 'site:imdb.com %s "%s"' % (director, title)
|
||||
else:
|
||||
query = 'site:imdb.com "%s"' % title
|
||||
for (name, url, desc) in google.find(query, 3):
|
||||
if url.startswith('http://www.imdb.com/title/tt'):
|
||||
return url[28:35]
|
||||
return ''
|
||||
|
||||
def getMovieData(imdbId):
|
||||
return IMDb(imdbId).parse()
|
||||
|
||||
# internal functions below
|
||||
def getUrlBase(imdbId):
|
||||
return "http://www.imdb.com/title/tt%s" % imdbId
|
||||
|
||||
def getRawMovieData(imdbId):
|
||||
imdbId = normalizeImdbId(imdbId)
|
||||
data = getMovieInfo(imdbId)
|
||||
data['credits'] = getMovieCredits(imdbId)
|
||||
data['poster'] = getMoviePoster(imdbId)
|
||||
data['company credits'] = getMovieCompanyCredits(imdbId)
|
||||
data['filming locations'] = getMovieLocations(imdbId)
|
||||
data['movie connections'] = getMovieConnections(imdbId)
|
||||
data['external reviews'] = getMovieExternalReviews(imdbId)
|
||||
data['trivia'] = getMovieTrivia(imdbId)
|
||||
data['keywords'] = getMovieKeywords(imdbId)
|
||||
data['media'] = {}
|
||||
data['media']['images'] = getMovieImages(imdbId)
|
||||
data['media']['trailers'] = getMovieTrailers(imdbId)
|
||||
data['plotsummary'] = getMoviePlot(imdbId)
|
||||
data['release dates'] = getMovieReleaseDates(imdbId)
|
||||
data['release date'] = getMovieReleaseDate(imdbId)
|
||||
return data
|
||||
|
||||
def getMovieInfo(imdbId):
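    # scrape the main title page: poster URL, the <h5> label/value info blocks,
    # then title, year, rating and votes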
|
||||
data = getUrlUnicode(getUrlBase(imdbId))
|
||||
info = dict()
|
||||
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
|
||||
if info['poster'] and '_V' in info['poster']:
|
||||
info['poster']= "%s.jpg" % info['poster'].split('._V')[0]
|
||||
|
||||
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
|
||||
title = stripTags(i[0]).strip().lower()
|
||||
txt= stripTags(i[1]).strip()
|
||||
def cleanUp(k):
|
||||
k = decodeHtml(k).replace(u'\xa0', ' ').strip()
|
||||
if k.endswith('more'): k=k[:-len('more')].strip()
|
||||
return k
|
||||
txt = cleanUp(txt)
|
||||
if title not in ('plot', 'trivia', 'filming locations', 'mpaa'):
|
||||
if '|' in txt:
|
||||
txt = [cleanUp(k) for k in txt.split('|')]
|
||||
elif ', ' in txt:
|
||||
txt = [cleanUp(k) for k in txt.split(', ')]
|
||||
if not title.startswith('moviemeter'):
|
||||
info[title] = txt
|
||||
for key in ('user comments', 'writers (wga)'):
|
||||
if key in info:
|
||||
del info[key]
|
||||
if 'release date' in info:
|
||||
info['release date'] = info['release date'].split('\n')[0]
|
||||
if 'plot' in info:
|
||||
info['plot'] = info['plot'].split('| add synopsis')[0].strip()
|
||||
info['plot'] = info['plot'].split('| full synopsis')[0].strip()
|
||||
if info['plot'] in ('add synopsis', 'full synopsis'):
|
||||
info['plot'] = ''
|
||||
|
||||
#get Title
|
||||
title = ''
|
||||
year = ''
|
||||
html_title = findRe(data, '<div id="tn15title">(.*?)</div>')
|
||||
if not html_title:
|
||||
html_title = findRe(data, '<title>(.*?)</title>')
|
||||
if html_title:
|
||||
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
|
||||
title = decodeHtml(html_title)
|
||||
title = stripTags(title)
|
||||
year = findRe(title, '\((\d{4})\)')
|
||||
if not year:
|
||||
year = findRe(title, '\((\d{4})')
|
||||
_y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
|
||||
if _y:
|
||||
title = title.replace(_y, '')
|
||||
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
|
||||
title = title.replace(t, '')
|
||||
title = title.strip()
|
||||
if title.find(u'\xa0') > -1:
|
||||
title = title[:title.find(u'\xa0')].strip()
|
||||
if title.startswith('"') and title.endswith('"'):
|
||||
title = title[1:-1]
|
||||
info['title'] = title
|
||||
info['year'] = year
|
||||
|
||||
#Rating
|
||||
rating = findRe(data, '<b>([\d\.]*?)/10</b>')
|
||||
if rating:
|
||||
info['rating'] = float(rating)
|
||||
else:
|
||||
info['rating'] = -1
|
||||
|
||||
#Votes
|
||||
votes = findRe(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
|
||||
if votes:
|
||||
info['votes'] = int(votes.replace(',', ''))
|
||||
else:
|
||||
info['votes'] = -1
|
||||
return info
|
||||
|
||||
def getMoviePoster(imdbId):
|
||||
info = getMovieInfo(imdbId)
|
||||
return info['poster']
|
||||
|
||||
def getMovieYear(imdbId):
|
||||
info = getMovieInfo(imdbId)
|
||||
return info['year']
|
||||
|
||||
def getMovieTitle(imdbId):
|
||||
info = getMovieInfo(imdbId)
|
||||
return info['title']
|
||||
|
||||
def creditList(data, section=None):
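    # parse one fullcredits section: cast rows yield (name, character),
    # other sections yield (name, role); writer roles get their parentheses stripped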
|
||||
if section == 'cast':
|
||||
credits_ = re.compile('''<tr .*?<td class="nm">(.*?)</td><td class="ddd">.*?</td><td class="char">(.*?)</td></tr>''').findall(data)
|
||||
else:
|
||||
credits_ = re.compile('''<tr>.*?<td valign="top">(.*?)</td><td.*?</td><td valign="top">(.*?)</td></tr>''').findall(data)
|
||||
credits = []
|
||||
for c_ in credits_:
|
||||
c = [decodeHtml(c_[0]).strip(), decodeHtml(c_[1]).strip()]
|
||||
if section=='writers':
|
||||
c[1] = c[1].replace('<br>', '').strip().replace(')', '').replace('(','')
|
||||
if c[1].endswith(' and'): c[1] = c[1][:-4]
|
||||
credits.append(c)
|
||||
return credits
|
||||
|
||||
def getMovieCredits(imdbId):
|
||||
credits = dict()
|
||||
url = "%s/fullcredits" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
groups = data.split('<h5>')
|
||||
for g in groups:
|
||||
section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
|
||||
if section:
|
||||
credits[section[0]] = creditList(g, section[0])
|
||||
return credits
|
||||
|
||||
def getMovieTrailers(imdbId):
|
||||
url = "%s/trailers" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
soup = BeautifulSoup(data)
|
||||
videos = soup('div', {'class':"video-gallery"})
|
||||
trailers = []
|
||||
if videos:
|
||||
for a in videos[0]('a'):
|
||||
title = stripTags(unicode(a)).strip()
|
||||
url = 'http://www.imdb.com' + a['href']
|
||||
videoId = findRe(url, '/(vi\d*?)/')
|
||||
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
|
||||
iframe = getUrlUnicode(iframeUrl)
|
||||
videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
|
||||
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
|
||||
return trailers
|
||||
|
||||
def getMovieQuotes(imdbId):
|
||||
url = "%s/quotes" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(findString(data, '<a name="q'))
|
||||
quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
|
||||
return quotes
|
||||
|
||||
def getMoviePlot(imdbId):
|
||||
url = "%s/plotsummary" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
plot = findRe(data, '<p class="plotpar">(.*?)<i>')
|
||||
return plot
|
||||
|
||||
def getMovieTechnical(imdbId):
|
||||
url = "%s/technical" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
results = {}
|
||||
for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
|
||||
results[t[0].strip()] = t[1].strip()
|
||||
return results
|
||||
|
||||
def getMovieCompanyCredits(imdbId):
|
||||
url = "%s/companycredits" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
results = {}
|
||||
for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
|
||||
results[field.strip()] = []
|
||||
for company in re.compile('<li>(.*?)</li>').findall(c):
|
||||
results[field.strip()].append(company)
|
||||
return results
|
||||
|
||||
def getMovieLocations(imdbId):
|
||||
url = "%s/locations" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
soup = BeautifulSoup(data)
|
||||
locations = []
|
||||
for key in soup('a', {'href': re.compile('^/List')}):
|
||||
locations.append(decodeHtml(key.string))
|
||||
return locations
|
||||
|
||||
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
|
||||
photos = {}
|
||||
for key in keys:
|
||||
url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
|
||||
data = getUrlUnicode(url)
|
||||
photos[key] = {}
|
||||
for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
|
||||
img= "%s.jpg" % s[1].split('._V')[0]
|
||||
title = s[0]
|
||||
if key=='still_frame':
|
||||
if not "_CR0" in s[1]:
|
||||
photos[key][img] = title
|
||||
else:
|
||||
photos[key][img] = title
|
||||
return photos
|
||||
|
||||
def getMovieStills(imdbId):
|
||||
return getMovieImages(imdbId, ['still_frame'])['still_frame']
|
||||
|
||||
def getMoviePosters(imdbId):
|
||||
posters = getMovieImages(imdbId, ['poster'])['poster']
|
||||
poster = getMoviePoster(imdbId)
|
||||
if poster:
|
||||
posters[poster] = 'main poster'
|
||||
return posters
|
||||
|
||||
def getMovieTrivia(imdbId):
|
||||
url = "%s/trivia" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
soup = BeautifulSoup(data)
|
||||
trivia = []
|
||||
triviaList = []
|
||||
for i in soup('ul', {'class': "trivia"}):
|
||||
for t in i('li'):
|
||||
t = unicode(t).replace('<br />', '').strip()
|
||||
if t.startswith('<li>') and t.endswith('</li>'):
|
||||
t = t[4:-5].strip()
|
||||
t=decodeHtml(t)
|
||||
trivia.append(t)
|
||||
return trivia
|
||||
|
||||
def getMovieConnections(imdbId):
|
||||
url = "%s/movieconnections" % getUrlBase(imdbId)
|
||||
data = getUrl(url)
|
||||
connections={}
|
||||
for c in re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL).findall(data):
|
||||
connections[unicode(c[0])] = re.compile('''<a href="/title/tt(\d{7})/">''').findall(c[1])
|
||||
return connections
|
||||
|
||||
def getMovieKeywords(imdbId):
|
||||
url = "%s/keywords" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
keywords = []
|
||||
for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data):
|
||||
keyword = decodeHtml(keyword)
|
||||
keyword = keyword.replace(u'\xa0', ' ')
|
||||
keywords.append(keyword)
|
||||
return keywords
|
||||
|
||||
def getMovieExternalReviews(imdbId):
|
||||
url = "%s/externalreviews" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
soup = BeautifulSoup(data)
|
||||
ol = soup('ol')
|
||||
if ol:
|
||||
ol = ol[0]
|
||||
ret = {}
|
||||
for li in ol('li'):
|
||||
try:
|
||||
a = li('a')[0]
|
||||
href = a.get('href')
|
||||
txt = a.contents[0]
|
||||
ret[href] = txt
|
||||
except:
|
||||
pass
|
||||
return ret
|
||||
return {}
|
||||
|
||||
def getMovieReleaseDate(imdbId):
|
||||
releasedates = getMovieReleaseDates(imdbId)
|
||||
first_release = None
|
||||
for r in releasedates:
|
||||
if not first_release or r[1] < first_release:
|
||||
first_release = r[1]
|
||||
return first_release
|
||||
|
||||
def _parseDate(d):
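    # try '13 March 2008', then 'March 2008', then '2008';
    # return the input string unchanged if nothing parses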
|
||||
try:
|
||||
parsed_date = time.strptime(d, "%d %B %Y")
|
||||
parsed_date = time.strftime('%Y-%m-%d', parsed_date)
|
||||
return parsed_date
|
||||
except:
|
||||
try:
|
||||
parsed_date = time.strptime(d, "%B %Y")
|
||||
parsed_date = time.strftime('%Y-%m-01', parsed_date)
|
||||
return parsed_date
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
parsed_date = time.strptime(d, "%Y")
|
||||
parsed_date = time.strftime('%Y-01-01', parsed_date)
|
||||
return parsed_date
|
||||
except:
|
||||
pass
|
||||
return d
|
||||
|
||||
def getMovieReleaseDates(imdbId):
|
||||
url = "%s/releaseinfo" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
releasedates = []
|
||||
regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>'''
|
||||
|
||||
for r in re.compile(regexp, re.DOTALL).findall(data):
|
||||
r_ = (stripTags(r[0]).strip(),
|
||||
_parseDate(stripTags(r[1]).strip()),
|
||||
decodeHtml(stripTags(r[2]).strip()))
|
||||
releasedates.append(r_)
|
||||
return releasedates
|
||||
|
||||
def getMovieBusinessSum(imdbId):
|
||||
business = getMovieBusiness(imdbId)
|
||||
b_ = {'budget': 0, 'gross': 0, 'profit': 0}
|
||||
if 'budget' in business:
|
||||
b_['budget'] = sum([int(intValue(i.replace(',', ''))) for i in business['budget']])
|
||||
if 'gross' in business:
|
||||
b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']])
|
||||
if 'weekend gross' in business:
|
||||
b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']])
|
||||
if b_['budget'] and b_['gross']:
|
||||
b_['profit'] = b_['gross'] - b_['budget']
|
||||
return b_
|
||||
|
||||
def getMovieFlimingDates(imdbId):
|
||||
business = getMovieBusiness(imdbId)
|
||||
if 'filming dates' in business and business['filming dates']:
|
||||
return business['filming dates'][0]
|
||||
return ''
|
||||
|
||||
def getMovieBusiness(imdbId):
|
||||
url = "%s/business" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
business = {}
|
||||
for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
|
||||
key = stripTags(r[0]).strip().lower()
|
||||
value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('<br/>')]
|
||||
business[key] = value
|
||||
return business
|
||||
|
||||
def getMovieEpisodes(imdbId):
|
||||
url = "%s/episodes" % getUrlBase(imdbId)
|
||||
data = getUrlUnicode(url)
|
||||
episodes = {}
|
||||
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
|
||||
for r in re.compile(regexp, re.DOTALL).findall(data):
|
||||
try:
|
||||
episode = "S%02dE%02d" % (int(r[0]), int(r[1]))
|
||||
episodes[episode] = {}
|
||||
episodes[episode]['imdb'] = r[2]
|
||||
episodes[episode]['title'] = r[3].strip()
|
||||
if episodes[episode]['title'].startswith('Episode #%d'%int(r[0])):
|
||||
episodes[episode]['title'] = u''
|
||||
description = decodeHtml(r[5])
|
||||
description = stripTags(description.split('Next US airings:')[0])
|
||||
episodes[episode]['description'] = description.strip()
|
||||
episodes[episode]['date'] = ''
|
||||
try:
|
||||
d = stripTags(r[4])
|
||||
d = d.replace('Original Air Date: ', '')
|
||||
d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
|
||||
episodes[episode]['date'] = d
|
||||
except:
|
||||
pass
|
||||
except:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
pass
|
||||
return episodes
|
||||
|
||||
'''the old code below'''
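# the IMDb class below wraps the standalone helpers above;
# IMDb(imdbId).parse() returns one aggregated dict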
|
||||
|
||||
class IMDb:
|
||||
def __init__(self, imdbId):
|
||||
self.imdb = imdbId
|
||||
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
|
||||
|
||||
def getPage(self):
|
||||
return getUrlUnicode(self.pageUrl)
|
||||
|
||||
def parse_raw_value(self, key, value):
|
||||
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
|
||||
value = stripTags(value).strip()
|
||||
if key == 'runtime':
|
||||
parsed_value = findRe(value, '(.*?) min')
|
||||
parsed_value = findRe(parsed_value, '([0-9]+)')
|
||||
if not parsed_value:
|
||||
parsed_value = findRe(value, '(.*?) sec')
|
||||
parsed_value = findRe(parsed_value, '([0-9]+)')
|
||||
if not parsed_value:
|
||||
parsed_value = 0
|
||||
else:
|
||||
parsed_value = int(parsed_value)
|
||||
else:
|
||||
parsed_value = int(parsed_value) * 60
|
||||
elif key in ('country', 'language'):
|
||||
parsed_value = value.split(' / ')
|
||||
if len(parsed_value) == 1:
|
||||
parsed_value = parsed_value[0].split(' | ')
|
||||
parsed_value = [v.strip() for v in parsed_value]
|
||||
elif key == 'genre':
|
||||
parsed_value = value.replace('more', '').strip().split(' / ')
|
||||
if len(parsed_value) == 1:
|
||||
parsed_value = parsed_value[0].split(' | ')
|
||||
parsed_value = [v.strip() for v in parsed_value]
|
||||
elif key == 'tagline':
|
||||
parsed_value = value.replace('more', '').strip()
|
||||
elif key == 'plot_outline':
|
||||
parsed_value = value.replace('(view trailer)', '').strip()
|
||||
if parsed_value.endswith('more'):
|
||||
parsed_value = parsed_value[:-4].strip()
|
||||
elif key == 'tv_series':
|
||||
m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
|
||||
if m:
|
||||
parsed_value = m[0][0]
|
||||
else:
|
||||
parsed_value = ''
|
||||
elif key == 'also_known_as':
|
||||
parsed_value = ''
|
||||
m = re.compile('(.*) \(International: English title').findall(value)
|
||||
if m:
|
||||
parsed_value = m[0]
|
||||
else:
|
||||
m = re.compile('(.*) \(USA').findall(value)
|
||||
if m:
|
||||
parsed_value = m[0]
|
||||
parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
|
||||
director = self.getCredits().get('director', None)
|
||||
if director:
|
||||
director = director[0]
|
||||
parsed_value = parsed_value.replace(director, '')
|
||||
if parsed_value.startswith("'s"):
|
||||
parsed_value = parsed_value[2:].strip()
|
||||
parsed_value = decodeHtml(parsed_value.strip())
|
||||
else:
|
||||
print value
|
||||
parsed_value = value
|
||||
return parsed_value
|
||||
|
||||
def parseTitle(self):
|
||||
title = getMovieTitle(self.imdb)
|
||||
title = normalizeTitle(title)
|
||||
if title.startswith('"') and title.find('"',1) > 0 and \
|
||||
title.find('"',1) == title.rfind('"'):
|
||||
data = self.getPage()
|
||||
se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
|
||||
if se:
|
||||
se = se[0]
|
||||
se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1]))
|
||||
title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip()
|
||||
else:
|
||||
part2 = title[title.rfind('"')+1:]
|
||||
part2 = re.sub("[\d\?-]", "", part2).strip()
|
||||
title = normalizeTitle(title[1:title.rfind('"')])
|
||||
if part2:
|
||||
title += ':' + part2
|
||||
return normalizeTitle(title)
|
||||
|
||||
def parseYear(self):
|
||||
year = ''
|
||||
data = self.getPage()
|
||||
soup = BeautifulSoup(data)
|
||||
html_title = soup('div', {'id': 'tn15title'})
|
||||
if not html_title:
|
||||
html_title = soup('title')
|
||||
if html_title:
|
||||
html_title = unicode(html_title[0])
|
||||
html_title = stripTags(html_title)
|
||||
year = re.compile('\((\d{4})\)').findall(html_title)
|
||||
if not year:
|
||||
year = re.compile('\((\d{4})/').findall(html_title)
|
||||
if year:
|
||||
year = year[0]
|
||||
else: year = ''
|
||||
return year
|
||||
|
||||
def parse(self):
|
||||
data = self.getPage()
|
||||
IMDbDict ={}
|
||||
#Poster
|
||||
IMDbDict['poster'] = getMoviePoster(self.imdb)
|
||||
if not IMDbDict['poster']:
|
||||
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
|
||||
#Title, Year
|
||||
IMDbDict['year'] = self.parseYear()
|
||||
IMDbDict['title'] = self.parseTitle()
|
||||
|
||||
#Rating
|
||||
m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
|
||||
if m:
|
||||
IMDbDict['rating'] = int(float(m.group(1)) * 1000)
|
||||
else:
|
||||
IMDbDict['rating'] = -1
|
||||
#Votes
|
||||
m = re.compile('<small>\(<a href="ratings">(.*?) votes</a>\)</small>', re.IGNORECASE).findall(data)
|
||||
if m:
|
||||
IMDbDict['votes'] = int(m[0].replace(',', ''))
|
||||
else:
|
||||
IMDbDict['votes'] = -1
|
||||
|
||||
data = data.replace('\n',' ')
|
||||
#some values
|
||||
keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
|
||||
for key in keys:
|
||||
IMDbDict[key] = ''
|
||||
IMDbDict['runtime'] = 0
|
||||
soup = BeautifulSoup(data)
|
||||
for info in soup('div', {'class': 'info'}):
|
||||
key = unicode(info).split('</h5>')[0].split('<h5>')
|
||||
if len(key) > 1:
|
||||
raw_value = unicode(info).split('</h5>')[1]
|
||||
key = key[1][:-1].lower().replace(' ', '_')
|
||||
if key in keys:
|
||||
IMDbDict[key] = self.parse_raw_value(key, raw_value)
|
||||
IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
|
||||
#is episode
|
||||
IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
|
||||
|
||||
IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
|
||||
if IMDbDict['episodes']:
|
||||
IMDbDict['tvshow'] = True
|
||||
else:
|
||||
IMDbDict['tvshow'] = False
|
||||
IMDbDict['credits'] = self.getCredits()
|
||||
IMDbDict['plot'] = getMoviePlot(self.imdb)
|
||||
IMDbDict['keywords'] = getMovieKeywords(self.imdb)
|
||||
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
|
||||
IMDbDict['connections'] = getMovieConnections(self.imdb)
|
||||
IMDbDict['locations'] = getMovieLocations(self.imdb)
|
||||
IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
|
||||
IMDbDict['business'] = getMovieBusinessSum(self.imdb)
|
||||
IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
|
||||
IMDbDict['stills'] = getMovieStills(self.imdb)
|
||||
#IMDbDict['trailer'] = getMovieTrailer(self.imdb)
|
||||
self.IMDbDict = IMDbDict
|
||||
|
||||
if IMDbDict['episode_of']:
|
||||
episode_of =IMDb(IMDbDict['episode_of']).parse()
|
||||
for key in ('country', 'language'):
|
||||
if not IMDbDict[key]:
|
||||
IMDbDict[key] = episode_of[key]
|
||||
return self.IMDbDict
|
||||
|
||||
def getCredits(self):
|
||||
raw_credits = getMovieCredits(self.imdb)
|
||||
credits = {}
|
||||
|
||||
def getNames(creditList):
|
||||
return [stripTags(decodeHtml(c[0])) for c in creditList]
|
||||
|
||||
credits['director'] = getNames(raw_credits.get('directors', ''))
|
||||
credits['writer'] = getNames(raw_credits.get('writers', ''))
|
||||
credits['producer'] = getNames(raw_credits.get('producers', ''))
|
||||
credits['cast'] = [(stripTags(decodeHtml(c[0])),stripTags(decodeHtml(c[1]))) for c in raw_credits.get('cast', [])]
|
||||
|
||||
self.credits = credits
|
||||
return self.credits
|
||||
|
||||
|
||||
def guess(title, director=''):
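    # guess an IMDb id from a filename-like title: try a Google site:imdb.com
    # search first, then fall back to IMDb's own find page (including aka search)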
|
||||
#FIXME: proper file -> title
|
||||
title = title.split('-')[0]
|
||||
title = title.split('(')[0]
|
||||
title = title.split('.')[0]
|
||||
title = title.strip()
|
||||
imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
|
||||
return_url = ''
|
||||
|
||||
#let's try google first
|
||||
#i.e. site:imdb.com Michael Stevens Sin
|
||||
if director:
|
||||
search = 'site:imdb.com %s "%s"' % (director, title)
|
||||
else:
|
||||
search = 'site:imdb.com "%s"' % title
|
||||
for (name, url, desc) in google.find(search, 2):
|
||||
if url.startswith('http://www.imdb.com/title/tt'):
|
||||
return url[28:35]
|
||||
|
||||
try:
|
||||
req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
|
||||
u = urllib2.urlopen(req)
|
||||
data = u.read()
|
||||
return_url = u.url
|
||||
u.close()
|
||||
except:
|
||||
return None
|
||||
if return_url.startswith('http://www.imdb.com/title/tt'):
|
||||
return return_url[28:35]
|
||||
if data:
|
||||
imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
|
||||
if imdb_id:
|
||||
return imdb_id
|
||||
|
||||
imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
|
||||
req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
|
||||
u = urllib2.urlopen(req)
|
||||
data = u.read()
|
||||
return_url = u.url
|
||||
u.close()
|
||||
if return_url.startswith('http://www.imdb.com/title/tt'):
|
||||
return return_url[28:35]
|
||||
|
||||
return None
|
||||
|
||||
def getEpisodeData(title, episode, show_url = None):
|
||||
'''
|
||||
Collect information about an episode.
|
||||
|
||||
Returns dict with title, show, description and episode
|
||||
'''
|
||||
episodeData = {
|
||||
'title': u'',
|
||||
'show': title,
|
||||
'description': u'',
|
||||
'episode': episode,
|
||||
}
|
||||
description = u''
|
||||
if not show_url:
|
||||
imdbid = guess(title)
|
||||
else:
|
||||
imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(show_url)[0])
|
||||
if imdbid:
|
||||
i = IMDb(imdbid).parse()
|
||||
episodeData['title'] = i['episodes'][episode]['title']
|
||||
episodeData['description'] = i['episodes'][episode]['description']
|
||||
episodeData['imdb'] = i['episodes'][episode]['imdb']
|
||||
return episodeData
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
#print parse(sys.argv[1])
|
||||
print "imdb:", guess(sys.argv[1])
|
||||
|
||||
89 oxweb/impawards.py Normal file
@@ -0,0 +1,89 @@
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# encoding: utf-8
|
||||
import re
|
||||
|
||||
from oxutils.cache import getUrlUnicode
|
||||
from oxutils.html import stripTags
|
||||
from oxutils.text import findRe
|
||||
|
||||
import imdb
|
||||
|
||||
|
||||
def getMovieData(title = '', director = '', imdbId = ''):
|
||||
data = {'posterUrls': []}
|
||||
if not imdbId:
|
||||
imdbId = imdb.getMovieId(title, director)
|
||||
print imdbId
|
||||
html = getUrlUnicode('http://impawards.com/archives/latest.html', timeout = 0)
|
||||
pages = int(findRe(html, '<a href = page(.*?).html>'))
|
||||
for page in range(pages + 1, 0, -1):
|
||||
print page
|
||||
if page <= pages:
|
||||
html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1)
|
||||
urls = parseArchivePage(html)
|
||||
print urls
|
||||
for url in urls:
|
||||
html = getUrlUnicode(url)
|
||||
d = parseMoviePage(html)
|
||||
print d
|
||||
if d['imdbId'] == imdbId:
|
||||
data['posterUrls'].append(d['posterUrl'])
|
||||
print d['posterUrl']
|
||||
data['posterUrls'].sort()
|
||||
return data
|
||||
|
||||
def parseArchivePage(html):
|
||||
urls = []
|
||||
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
urls.append('http://impawards.com/%s' % result)
|
||||
return urls
|
||||
|
||||
def parseMoviePage(html):
|
||||
data = {}
|
||||
data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ')
|
||||
data['title'] = stripTags(findRe(html, '<table WIDTH="400" BGCOLOR="#222222">(.*?) \(<a href="eligible.html">'))
|
||||
data['year'] = findRe(html, '\(<a href="eligible.html">(.*?)</a>\)')
|
||||
result = findRe(html, '<a href = (\w*?_xlg.html) target= _blank>')
|
||||
if result:
|
||||
url = 'http://impawards.com/%s/%s' % (data['year'], result)
|
||||
html = getUrlUnicode(url, timeout = -1)
|
||||
d = parsePosterPage(html, data['year'])
|
||||
data['posterUrl'] = d['posterUrl']
|
||||
else:
|
||||
data['posterUrl'] = 'http://impawards.com/%s/%s' % (data['year'], findRe(html, '<td align=center><br><img SRC="(.*?)"'))
|
||||
return data
|
||||
|
||||
def parsePosterPage(html, year):
|
||||
data = {}
|
||||
data['posterUrl'] = 'http://impawards.com/%s/%s' % (year, findRe(html, '<img SRC="(.*?)"'))
|
||||
return data
|
||||
|
||||
def archivePosters():
|
||||
import os
|
||||
from oxutils.net import getUrl
|
||||
pathname = '/Volumes/Rolux Home/Desktop/Data/impawards.com'
|
||||
html = getUrlUnicode('http://impawards.com/archives/latest.html', timeout = 0)
|
||||
pages = int(findRe(html, '<a href = page(.*?).html>'))
|
||||
for page in range(pages + 1, 0, -1):
|
||||
if page <= pages:
|
||||
html = getUrlUnicode('http://impawards.com/archives/page%s.html' % page, timeout = -1)
|
||||
urls = parseArchivePage(html)
|
||||
print urls
|
||||
for url in urls:
|
||||
html = getUrlUnicode(url)
|
||||
data = parseMoviePage(html)
|
||||
dirname = '%s/%s/%s' % (pathname, data['imdbId'][:4], data['imdbId'])
|
||||
filename = '%s/%s' % (dirname, os.path.split(data['posterUrl'])[1])
|
||||
if not os.path.exists(filename):
|
||||
jpg = getUrl(data['posterUrl'])
|
||||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
f = open(filename, 'w')
|
||||
f.write(jpg)
|
||||
f.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
archivePosters()
|
||||
getMovieData('Brick', 'Rian Johnson')
|
||||
187 oxweb/itunes.py Normal file
@@ -0,0 +1,187 @@
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# encoding: utf-8
|
||||
import re
|
||||
import urllib
|
||||
|
||||
from oxutils.cache import getUrl
|
||||
from oxutils.html import decodeHtml, stripTags
|
||||
from oxutils.text import findRe
|
||||
from oxutils.text import findString
|
||||
|
||||
|
||||
# to sniff itunes traffic, use something like
|
||||
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
|
||||
|
||||
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=music&songTerm=&genreIndex=1&flavor=0&mediaType=2&composerTerm=&allArtistNames=Arcadia&ringtone=0&searchButton=submit
|
||||
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=movie&movieTerm=The%20Matrix&descriptionTerm=&ratingIndex=1&mediaType=3&directorProducerName=Andy%20Wachowski&flavor=0&releaseYearTerm=1999&closedCaption=0&actorTerm=&searchButton=submit
|
||||
|
||||
ITUNES_HEADERS = {
|
||||
'X-Apple-Tz': '0',
|
||||
'X-Apple-Storefront': '143441-1',
|
||||
'User-Agent': 'iTunes/7.6.2 (Macintosh; U; Intel Mac OS X 10.5.2)',
|
||||
'Accept-Language': 'en-us, en;q=0.50',
|
||||
'Accept-Encoding': 'gzip',
|
||||
'Connection': 'close',
|
||||
}
|
||||
|
||||
def composeUrl(request, parameters):
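    # build the iTunes Store URLs: 'advancedSearch' for music/movie queries,
    # 'viewAlbum'/'viewMovie' for detail pages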
|
||||
if request == 'advancedSearch':
|
||||
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
|
||||
if parameters['media'] == 'music':
|
||||
url += urllib.urlencode({
|
||||
'albumTerm': parameters['title'],
|
||||
'allArtistNames': parameters['artist'],
|
||||
'composerTerm': '',
|
||||
'flavor': 0,
|
||||
'genreIndex': 1,
|
||||
'media': 'music',
|
||||
'mediaType': 2,
|
||||
'ringtone': 0,
|
||||
'searchButton': 'submit',
|
||||
'songTerm': ''
|
||||
})
|
||||
elif parameters['media'] == 'movie':
|
||||
url += urllib.urlencode({
|
||||
'actorTerm': '',
|
||||
'closedCaption': 0,
|
||||
'descriptionTerm': '',
|
||||
'directorProducerName': parameters['director'],
|
||||
'flavor': 0,
|
||||
'media': 'movie',
|
||||
'mediaType': 3,
|
||||
'movieTerm': parameters['title'],
|
||||
'ratingIndex': 1,
|
||||
'releaseYearTerm': '',
|
||||
'searchButton': 'submit'
|
||||
})
|
||||
elif request == 'viewAlbum':
|
||||
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
|
||||
elif request == 'viewMovie':
|
||||
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
|
||||
return url
|
||||
|
||||
def parseXmlDict(xml):
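    # turn the store's plist-style <key><type>value</type> pairs into a plain dict
    # (integers and booleans converted, strings HTML-decoded)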
|
||||
values = {}
|
||||
strings = xml.split('<key>')
|
||||
for string in strings:
|
||||
if string.find('</key>') != -1:
|
||||
key = findRe(string, '(.*?)</key>')
|
||||
type = findRe(string, '</key><(.*?)>')
|
||||
if type == 'true/':
|
||||
value = True
|
||||
else:
|
||||
value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
|
||||
if type == 'integer':
|
||||
value = int(value)
|
||||
elif type == 'string':
|
||||
value = decodeHtml(value)
|
||||
values[key] = value
|
||||
return values
|
||||
|
||||
def parseCast(xml, title):
|
||||
list = []
|
||||
try:
|
||||
strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
|
||||
strings.pop()
|
||||
for string in strings:
|
||||
list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||
return list
|
||||
except:
|
||||
return list
|
||||
|
||||
def parseMovies(xml, title):
|
||||
list = []
|
||||
try:
|
||||
strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
|
||||
strings.pop()
|
||||
for string in strings:
|
||||
list.append({
|
||||
'id': findRe(string, 'viewMovie\?id=(.*?)&'),
|
||||
'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
|
||||
})
|
||||
return list
|
||||
except:
|
||||
return list
|
||||
|
||||
class ItunesAlbum:
|
||||
def __init__(self, id = '', title = '', artist = ''):
|
||||
self.id = id
|
||||
self.title = title
|
||||
self.artist = artist
|
||||
if not id:
|
||||
self.id = self.getId()
|
||||
|
||||
def getId(self):
|
||||
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
|
||||
xml = getUrl(url, headers = ITUNES_HEADERS)
|
||||
id = findRe(xml, 'viewAlbum\?id=(.*?)&')
|
||||
return id
|
||||
|
||||
def getData(self):
|
||||
data = {'id': self.id}
|
||||
url = composeUrl('viewAlbum', {'id': self.id})
|
||||
xml = getUrl(url, None, ITUNES_HEADERS)
|
||||
data['albumName'] = findRe(xml, '<B>(.*?)</B>')
|
||||
data['artistName'] = findRe(xml, '<b>(.*?)</b>')
|
||||
data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
|
||||
data['genre'] = findRe(xml, 'Genre:(.*?)<')
|
||||
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
|
||||
data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||
data['tracks'] = []
|
||||
strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
|
||||
for string in strings:
|
||||
data['tracks'].append(parseXmlDict(string))
|
||||
data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
|
||||
return data
|
||||
|
||||
class ItunesMovie:
|
||||
def __init__(self, id = '', title = '', director = ''):
|
||||
self.id = id
|
||||
self.title = title
|
||||
self.director = director
|
||||
if not id:
|
||||
self.id = self.getId()
|
||||
|
||||
def getId(self):
|
||||
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
|
||||
xml = getUrl(url, headers = ITUNES_HEADERS)
|
||||
id = findRe(xml, 'viewMovie\?id=(.*?)&')
|
||||
return id
|
||||
|
||||
def getData(self):
|
||||
data = {'id': self.id}
|
||||
url = composeUrl('viewMovie', {'id': self.id})
|
||||
xml = getUrl(url, None, ITUNES_HEADERS)
|
||||
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
|
||||
f.write(xml)
|
||||
f.close()
|
||||
data['actors'] = parseCast(xml, 'actors')
|
||||
string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
|
||||
data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
|
||||
data['directors'] = parseCast(xml, 'directors')
|
||||
data['format'] = findRe(xml, 'Format:(.*?)<')
|
||||
data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
|
||||
data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||
data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
|
||||
data['producers'] = parseCast(xml, 'producers')
|
||||
data['rated'] = findRe(xml, 'Rated(.*?)<')
|
||||
data['relatedMovies'] = parseMovies(xml, 'related movies')
|
||||
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
|
||||
data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
|
||||
data['screenwriters'] = parseCast(xml, 'screenwriters')
|
||||
data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
|
||||
data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
|
||||
return data
|
||||
|
||||
if __name__ == '__main__':
|
||||
import simplejson
|
||||
data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
|
||||
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
||||
data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
|
||||
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
||||
for v in data['relatedMovies']:
|
||||
data = ItunesMovie(id = v['id']).getData()
|
||||
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
||||
data = ItunesMovie(id='272960052').getData()
|
||||
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
||||
|
||||
21 oxweb/lyricsfly.py Normal file
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from oxutils.cache import getUrl
from oxutils.html import decodeHtml
from oxutils.text import findRe


def getLyrics(title, artist):
    html = getUrl('http://lyricsfly.com/api/')
    key = findRe(html, '<font color=green><b>(.*?)</b></font>')
    url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
    xml = getUrl(url)
    lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
    lyrics = lyrics.replace('\n', '').replace('\r', '')
    lyrics = lyrics.replace('[br]', '\n').strip()
    lyrics = lyrics.replace('\n\n\n', '\n\n')
    lyrics = decodeHtml(lyrics.replace('&amp;', '&'))
    return lyrics

if __name__ == '__main__':
    print getLyrics('Election Day', 'Arcadia')
126 oxweb/mininova.py Normal file
@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from datetime import datetime
|
||||
import re
|
||||
import socket
|
||||
from urllib import quote
|
||||
|
||||
from oxutils.cache import getUrl, getUrlUnicode
|
||||
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
|
||||
from oxutils.normalize import normalizeImdbId
|
||||
import oxutils
|
||||
|
||||
from torrent import Torrent
|
||||
|
||||
|
||||
def _parseResultsPage(data, max_results=10):
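    # each result row carries date, extra markup, torrent id and title;
    # rows pointing at private trackers (priv.gif) are skipped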
|
||||
results=[]
|
||||
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
|
||||
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||
torrentDate = row[0]
|
||||
torrentExtra = row[1]
|
||||
torrentId = row[2]
|
||||
torrentTitle = decodeHtml(row[3]).strip()
|
||||
torrentLink = "http://www.mininova.org/tor/" + torrentId
|
||||
privateTracker = 'priv.gif' in torrentExtra
|
||||
if not privateTracker:
|
||||
results.append((torrentTitle, torrentLink, ''))
|
||||
return results
|
||||
|
||||
def findMovie(query, max_results=10):
|
||||
'''search for torrents on mininova
|
||||
'''
|
||||
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
|
||||
data = getUrlUnicode(url)
|
||||
return _parseResultsPage(data, max_results)
|
||||
|
||||
def findMovieByImdb(imdbId):
|
||||
'''find torrents on mininova for a given imdb id
|
||||
'''
|
||||
results = []
|
||||
imdbId = normalizeImdbId(imdbId)
|
||||
data = getUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
|
||||
return _parseResultsPage(data)
|
||||
|
||||
def getId(mininovaId):
|
||||
mininovaId = unicode(mininovaId)
|
||||
d = findRe(mininovaId, "/(\d+)")
|
||||
if d:
|
||||
return d
|
||||
mininovaId = mininovaId.split('/')
|
||||
if len(mininovaId) == 1:
|
||||
return mininovaId[0]
|
||||
else:
|
||||
return mininovaId[-1]
|
||||
|
||||
def exists(mininovaId):
|
||||
mininovaId = getId(mininovaId)
|
||||
data = oxutils.net.getUrl("http://www.mininova.org/tor/%s" % mininovaId)
|
||||
if not data or 'Torrent not found...' in data:
|
||||
return False
|
||||
if 'tracker</a> of this torrent requires registration.' in data:
|
||||
return False
|
||||
return True
|
||||
|
||||
def getData(mininovaId):
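    # scrape the torrent comment and details pages into a dict,
    # then fetch the .torrent file and attach its parsed metadata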
|
||||
_key_map = {
|
||||
'by': u'uploader',
|
||||
}
|
||||
mininovaId = getId(mininovaId)
|
||||
torrent = dict()
|
||||
torrent[u'id'] = mininovaId
|
||||
torrent[u'domain'] = 'mininova.org'
|
||||
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
|
||||
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
|
||||
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
|
||||
|
||||
data = getUrlUnicode(torrent['comment_link']) + getUrlUnicode(torrent['details_link'])
|
||||
if '<h1>Torrent not found...</h1>' in data:
|
||||
return None
|
||||
|
||||
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
|
||||
key = d[0].lower().strip()
|
||||
key = _key_map.get(key, key)
|
||||
value = decodeHtml(stripTags(d[1].strip()))
|
||||
torrent[key] = value
|
||||
|
||||
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
|
||||
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
|
||||
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
|
||||
if torrent['description']:
|
||||
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
|
||||
t = getUrl(torrent[u'torrent_link'])
|
||||
torrent[u'torrent_info'] = getTorrentInfo(t)
|
||||
return torrent
|
||||
|
||||
class Mininova(Torrent):
|
||||
'''
|
||||
>>> Mininova('123')
|
||||
{}
|
||||
>>> Mininova('1072195')['infohash']
|
||||
'72dfa59d2338e4a48c78cec9de25964cddb64104'
|
||||
'''
|
||||
def __init__(self, mininovaId):
|
||||
self.data = getData(mininovaId)
|
||||
if not self.data:
|
||||
return
|
||||
Torrent.__init__(self)
|
||||
ratio = self.data['share ratio'].split(',')
|
||||
self['seeder'] = -1
|
||||
self['leecher'] = -1
|
||||
if len(ratio) == 2:
|
||||
val = intValue(ratio[0].replace(',','').strip())
|
||||
if val:
|
||||
self['seeder'] = int(val)
|
||||
val = intValue(ratio[1].replace(',','').strip())
|
||||
if val:
|
||||
self['leecher'] = int(val)
|
||||
val = intValue(self.data['downloads'].replace(',','').strip())
|
||||
if val:
|
||||
self['downloaded'] = int(val)
|
||||
else:
|
||||
self['downloaded'] = -1
|
||||
published = self.data['added on']
|
||||
published = published.split(' +')[0]
|
||||
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")
|
||||
|
||||
41 oxweb/opensubtitles.py Normal file
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re

import feedparser
from oxutils.cache import getUrl, getUrlUnicode
import oxutils
from oxutils.lang import langCode2To3, langTo3Code

def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
    if len(language) == 2:
        language = langCode2To3(language)
    elif len(language) != 3:
        language = langTo3Code(language)
    url = "http://www.opensubtitles.org/en/search/"
    if language:
        url += "sublanguageid-%s/" % language
    url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
    data = getUrl(url)
    if "title>opensubtitles.com - search results</title" in data:
        fd = feedparser.parse(data)
        opensubtitleId = None
        if fd.entries:
            link = fd.entries[0]['links'][0]['href']
            opensubtitleId = re.compile('subtitles/(.*?)/').findall(link)
            if opensubtitleId:
                opensubtitleId = opensubtitleId[0]
    else:
        opensubtitleId = oxutils.findRe(data, '/en/subtitles/(.*?)/')
    return opensubtitleId

def downloadSubtitleById(opensubtitle_id):
    srts = {}
    data = getUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
    reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
    for f in re.compile(reg_exp, re.DOTALL).findall(data):
        name = oxutils.stripTags(f[1]).split('\n')[0]
        url = "http://www.opensubtitles.com%s" % f[0]
        srts[name] = getUrlUnicode(url)
    return srts
293 oxweb/spiegel.py Normal file
@@ -0,0 +1,293 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from datetime import datetime
|
||||
import re
|
||||
import time
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
import oxutils.cache
|
||||
from oxutils.html import decodeHtml, stripTags
|
||||
import oxutils.net
|
||||
|
||||
|
||||
def getNews(year, month, day):
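    # scrape the per-section archive pages for the given date and return a list of
    # article dicts (date, title, title1, title2, description, imageUrl, section, url)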
|
||||
sections = [
|
||||
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
|
||||
'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
|
||||
]
|
||||
dt = datetime(year, month, day)
|
||||
day = int(dt.strftime('%j'))
|
||||
date = dt.strftime('%d.%m.%Y')
|
||||
news = []
|
||||
for section in sections:
|
||||
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
|
||||
if date == time.strftime('%d.%m.%Y', time.localtime()):
|
||||
html = oxutils.net.getUrl(url)
|
||||
else:
|
||||
html = oxutils.cache.getUrl(url)
|
||||
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
|
||||
dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
|
||||
try:
|
||||
description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
|
||||
except:
|
||||
description = ''
|
||||
try:
|
||||
imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
|
||||
except:
|
||||
imageUrl = ''
|
||||
try:
|
||||
title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
|
||||
except:
|
||||
title = ''
|
||||
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
|
||||
new = {}
|
||||
if len(dateString) == 10:
|
||||
new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
|
||||
else:
|
||||
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
|
||||
# fix decodeHtml
|
||||
# new['description'] = formatString(decodeHtml(description))
|
||||
new['description'] = formatString(description)
|
||||
new['imageUrl'] = imageUrl
|
||||
new['section'] = formatSection(section)
|
||||
new['title'] = formatString(title)
|
||||
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
|
||||
if new['title1'][-1:] == ':':
|
||||
new['title1'] = new['title1'][0:-1]
|
||||
new['title2'] = new['title'][len(new['title1']) + 2:]
|
||||
new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
|
||||
if new['url'][:1] == '/':
|
||||
new['url'] = 'http://www.spiegel.de' + new['url']
|
||||
news.append(new)
|
||||
# print '%s, %s' % (new['section'], dateString)
|
||||
'''
|
||||
elif dateString[:10] == date and not description:
|
||||
print dateString + ' - no description'
|
||||
elif dateString[:10] == date and not imageUrl:
|
||||
print dateString + ' - no image'
|
||||
'''
|
||||
return news
|
||||
|
||||
def splitTitle(title):
|
||||
title1 = re.compile('(.*?): ').findall(title)[0]
|
||||
title2 = re.compile(': (.*?)$').findall(title)[0]
|
||||
return [title1, title2]
|
||||
|
||||
def formatString(string):
|
||||
string = string.replace('<span class="spOptiBreak"> </span>', '')
|
||||
string = string.replace('\n', ' ').replace(' ', ' ').strip()
|
||||
string = string.replace('&', '&').replace(''', '\'').replace('"', '"')
|
||||
return string
|
||||
|
||||
def formatSection(string):
|
||||
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
|
||||
|
||||
def formatSubsection(string):
|
||||
# SPIEGEL, SPIEGEL special
|
||||
subsection = {
|
||||
'abi': 'Abi - und dann?',
|
||||
'formel1': 'Formel 1',
|
||||
'jobundberuf': 'Job & Beruf',
|
||||
'leben': 'Leben U21',
|
||||
'mensch': 'Mensch & Technik',
|
||||
'sonst': '',
|
||||
'staedte': u'St\xc3dte',
|
||||
'ussports': 'US-Sports',
|
||||
'wunderbar': 'wunderBAR'
|
||||
}
|
||||
if subsection.has_key(string):
|
||||
return subsection[string].replace(u'\xc3', 'ae')
|
||||
return string[:1].upper() + string[1:]
|
||||
|
||||
def getIssue(year, week):
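    # return cover URL, table of contents and page image URLs for one print issue,
    # or None if no cover image exists for that year/week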
|
||||
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
|
||||
if not oxutils.net.exists(coverUrl):
|
||||
return None
|
||||
url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
|
||||
contents = []
|
||||
soup = BeautifulSoup(oxutils.cache.getUrl(url))
|
||||
for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):
|
||||
item = str(item)
|
||||
page = int(re.compile('&SE=(.*?)"').findall(item)[0])
|
||||
title = stripTags(item).strip()
|
||||
contents.append({'title': title, 'page': page})
|
||||
pageUrl = {}
|
||||
pages = page + 2
|
||||
for page in range(1, pages + 10):
|
||||
url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
|
||||
if oxutils.cache.exists(url):
|
||||
pageUrl[page] = url
|
||||
else:
|
||||
pageUrl[page] = ''
|
||||
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
|
||||
|
||||
|
||||
def archiveIssues():
|
||||
'''
|
||||
this is just an example of an archiving application
|
||||
'''
|
||||
p = {}
|
||||
import os
|
||||
import simplejson
|
||||
import time
|
||||
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
|
||||
localtime = time.localtime()
|
||||
year = int(time.strftime('%Y', localtime))
|
||||
week = int(time.strftime('%W', localtime))
|
||||
for y in range(year, 1993, -1):
|
||||
if y == year:
|
||||
wMax = week + 1
|
||||
else:
|
||||
wMax = 53
|
||||
for w in range(wMax, 0, -1):
|
||||
print 'getIssue(%d, %d)' % (y, w)
|
||||
issue = getIssue(y, w)
|
||||
if issue:
|
||||
dirname = '%s/%d/%02d' % (archivePath, y, w)
|
||||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
|
||||
if not os.path.exists(filename):
|
||||
data = simplejson.dumps(issue, ensure_ascii = False)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
|
||||
if not os.path.exists(filename):
|
||||
data = []
|
||||
for item in issue['contents']:
|
||||
data.append('%3d %s' % (item['page'], item['title']))
|
||||
data = '\n'.join(data)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
|
||||
if not os.path.exists(filename):
|
||||
data = oxutils.cache.getUrl(issue['coverUrl'])
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
for page in issue['pageUrl']:
|
||||
url = issue['pageUrl'][page]
|
||||
if url:
|
||||
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
|
||||
if not os.path.exists(filename):
|
||||
data = oxutils.cache.getUrl(url)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
if not p:
|
||||
p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
|
||||
else:
|
||||
p['num'] += 1
|
||||
p['sum'] += issue['pages']
|
||||
if issue['pages'] < p['min']:
|
||||
p['min'] = issue['pages']
|
||||
if issue['pages'] > p['max']:
|
||||
p['max'] = issue['pages']
|
||||
print p['min'], p['sum'] / p['num'], p['max']
|
||||
|
||||
|
||||
def archiveNews():
    '''
    this is just an example of an archiving application
    '''
    import os
    import simplejson
    import time

    count = {}
    colon = []

    archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
    days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    month = int(time.strftime('%m', localtime))
    day = int(time.strftime('%d', localtime)) - 1
    for y in range(year, 1999, -1):
        if y == year:
            mMax = month
        else:
            mMax = 12
        for m in range(mMax, 0, -1):
            if y == year and m == month:
                dMax = day
            elif m == 2 and y % 4 == 0 and (y % 100 != 0 or y % 400 == 0):
                # leap year: February has 29 days
                dMax = days[m] + 1
            else:
                dMax = days[m]
            for d in range(dMax, 0, -1):
                print 'getNews(%d, %d, %d)' % (y, m, d)
                news = getNews(y, m, d)
                for new in news:
                    dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    if new['url'][-5:] == '.html':
                        filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
                    else:
                        filename = dirname + '/' + new['url'] + '.json'
                    # 'or True' forces the json and txt files to be rewritten on every run
                    if not os.path.exists(filename) or True:
                        data = simplejson.dumps(new, ensure_ascii = False)
                        f = open(filename, 'w')
                        f.write(data)
                        f.close()
                    filename = filename[:-5] + '.txt'
                    if not os.path.exists(filename) or True:
                        data = splitTitle(new['title'])
                        data.append(new['description'])
                        data = '\n'.join(data)
                        f = open(filename, 'w')
                        f.write(data)
                        f.close()
                    filename = dirname + '/' + new['imageUrl'].split('/')[-1]
                    if not os.path.exists(filename):
                        data = oxutils.cache.getUrl(new['imageUrl'])
                        f = open(filename, 'w')
                        f.write(data)
                        f.close()

                    strings = new['url'].split('/')
                    string = strings[3]
                    if len(strings) == 6:
                        string += '/' + strings[4]
                    if string not in count:
                        count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
                    else:
                        count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
                    strings = splitTitle(new['title'])
                    if strings[0] != new['title1'] or strings[1] != new['title2']:
                        colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
    # per-section statistics, plus titles whose colon split differs from title1/title2
    for key in sortDictByKey(count):
        print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
    for value in colon:
        print value

def sortDictByKey(d):
    '''
    returns the keys of d, sorted alphabetically
    >>> sortDictByKey({'politik': 1, 'kultur': 2})
    ['kultur', 'politik']
    '''
    keys = d.keys()
    keys.sort()
    return keys

if __name__ == '__main__':
    # spiegel = Spiegel(2008, 8)
    # print spiegel.getContents()
    # news = News(2001, 9, 10)
    # output(news.getNews())
    '''
    x = []
    for d in range(10, 30):
        print '2/%d' % d
        news = getNews(2008, 2, d)
        for new in news:
            strings = new['url'].split('/')
            string = formatSection(strings[3])
            if len(strings) == 6:
                string += '/' + formatSubsection(strings[4])
            if not string in x:
                x.append(string)
    print x
    '''
    # archiveIssues()
    archiveNews()

118
oxweb/thepiratebay.py
Normal file
@ -0,0 +1,118 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from urllib import quote, urlencode
from urllib2 import URLError

from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils.normalize import normalizeImdbId
import oxutils

from torrent import Torrent


season_episode = re.compile("S..E..", re.IGNORECASE)


def _getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout):
    # always request the English version of the site; copy the default headers
    # instead of mutating the shared module-level dict
    headers = cache.DEFAULT_HEADERS.copy()
    headers['Cookie'] = 'language=en_EN'
    return cache.getUrl(url, data, headers, timeout)

def _getUrlUnicode(url):
    return cache.getUrlUnicode(url, _getUrl=_getUrl)

def findMovies(query, max_results=10):
    results = []
    next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
    page_count = 1
    while next and page_count < 4:
        page_count += 1
        url = next[0]
        if not url.startswith('http'):
            if not url.startswith('/'):
                url = "/" + url
            url = "http://thepiratebay.org" + url
        data = _getUrlUnicode(url)
        regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/tor/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
        for row in re.compile(regexp, re.DOTALL).findall(data):
            torrentType = row[0]
            torrentLink = "http://thepiratebay.org" + row[1]
            torrentTitle = decodeHtml(row[2])
            # 201 = Movies, 202 = Movie DVDR, 205 = TV Shows
            if torrentType in ['201']:
                results.append((torrentTitle, torrentLink, ''))
            if len(results) >= max_results:
                return results
        next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
    return results

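# Example (a sketch, not part of the module): findMovies() returns
# (title, link, '') tuples for category 201 (Movies) results, e.g.:
#
#     for title, link, _ in findMovies('Le Mepris', max_results=5):
#         print title, link
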
def findMovieByImdb(imdb):
    return findMovies("tt" + normalizeImdbId(imdb))

def getId(piratebayId):
    # accept a full torrent or comment url as well as a plain numeric id
    if piratebayId.startswith('http://torrents.thepiratebay.org/'):
        piratebayId = piratebayId.split('org/')[1]
    d = findRe(piratebayId, "tor/(\d+)")
    if d:
        piratebayId = d
    return piratebayId

def exists(piratebayId):
    piratebayId = getId(piratebayId)
    return oxutils.net.exists("http://thepiratebay.org/tor/%s" % piratebayId)

def getData(piratebayId):
    _key_map = {
        'spoken language(s)': u'language',
        'texted language(s)': u'subtitle language',
        'by': u'uploader',
        'leechers': 'leecher',
        'seeders': 'seeder',
    }
    piratebayId = getId(piratebayId)
    torrent = dict()
    torrent[u'id'] = piratebayId
    torrent[u'domain'] = 'thepiratebay.org'
    torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId

    data = _getUrlUnicode(torrent['comment_link'])
    torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
    if not torrent[u'title']:
        return None
    torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
    title = quote(torrent['title'].encode('utf-8'))
    torrent[u'torrent_link'] = "http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
    for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decodeHtml(stripTags(d[1].strip()))
        torrent[key] = value
    torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
    if torrent[u'description']:
        torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
    t = _getUrl(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = getTorrentInfo(t)
    return torrent

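# Example (a sketch, not part of the module): getData() returns a plain dict
# with the keys built above, or None if the page has no usable title, e.g.:
#
#     data = getData('3951349')
#     if data:
#         print data['title'], data['torrent_link'], data.get('seeder')
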
class Thepiratebay(Torrent):
    '''
    >>> Thepiratebay('123')
    {}

    >>> Thepiratebay('3951349')['infohash']
    '4e84415d36ed7b54066160c05a0b0f061898d12b'
    '''
    def __init__(self, piratebayId):
        self.data = getData(piratebayId)
        if not self.data:
            return
        Torrent.__init__(self)
        published = self.data['uploaded']
        published = published.replace(' GMT', '').split(' +')[0]
        self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")

37
oxweb/torrent.py
Normal file
@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from oxutils import intValue


class Torrent(dict):
    '''
    Base class for torrent site wrappers; subclasses set self.data (as returned
    by their getData()) before calling Torrent.__init__().

    >>> Torrent()
    {'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
    '''
    _string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
                    'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
    _int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
    _dict_keys = ('torrent_info', )
    _list_keys = ()
    data = {'torrent_info': {}}

    def __init__(self):
        for key in self._string_keys:
            self[key] = self.data.get(key, u'')
        for key in self._dict_keys:
            self[key] = self.data.get(key, {})
        for key in self._list_keys:
            self[key] = self.data.get(key, [])
        for key in self._int_keys:
            value = self.data.get(key, -1)
            if not isinstance(value, int):
                value = int(intValue(value))
            self[key] = value
        self['infohash'] = self.data['torrent_info'].get('hash', '')
        self['size'] = self.data['torrent_info'].get('size', -1)
        self['announce'] = self.data['torrent_info'].get('announce', '')
        if 'files' in self.data['torrent_info']:
            self['files'] = len(self.data['torrent_info']['files'])
        else:
            self['files'] = 1

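# Example (a sketch, not part of the module): a site-specific wrapper fills in
# self.data and then lets Torrent.__init__() normalize the keys, the way
# Thepiratebay does above:
#
#     class Example(Torrent):
#         def __init__(self, data):
#             self.data = data          # dict in the shape returned by a getData()
#             Torrent.__init__(self)
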
72
oxweb/wikipedia.py
Normal file
@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from urllib import urlencode

import simplejson
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRe, decodeHtml


def getMovieId(title, director='', year=''):
    query = '"%s" film %s %s' % (title, director, year)
    result = find(query, 1)
    if result:
        return result[0][1]
    return ''

def getUrlByImdb(imdbId):
    query = '"imdb_id = %s"' % imdbId
    result = find(query)
    if result:
        url = result[0][1]
        return url
    # not found: retry without the leading zero of the imdb id
    if str(imdbId).startswith('0'):
        imdbId = imdbId[1:]
        return getUrlByImdb(imdbId)

def getUrlByAmbId(amg_id):
    query = '"amg_id = %s"' % amg_id
    result = find(query)
    if result:
        url = result[0][1]
        return url
    return ''

def getWikiData(wikipediaUrl):
    # fetch the raw wikitext of the article via its edit page
    title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
    url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title
    html = getUrlUnicode(url)
    data = decodeHtml(findRe(html, "<textarea.*?>(.*?)</textarea>"))
    return data

def getMovieData(wikipediaUrl):
    # parse the {{Infobox Film}} template into a flat key/value dict
    data = getWikiData(wikipediaUrl)
    filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''')
    filmbox = {}
    for row in filmbox_data.strip().split('|'):
        d = row.split('=')
        if len(d) == 2:
            key = d[0].strip()
            value = d[1].strip()
            filmbox[key] = value
    return filmbox

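# Example (a sketch, not part of the module; the exact keys depend on the
# article's infobox): getMovieData() returns whatever key = value pairs the
# film infobox contains, e.g.:
#
#     url = getUrlByImdb('0133093')
#     if url:
#         filmbox = getMovieData(url)
#         print filmbox.get('name'), filmbox.get('director')
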
def getAmgId(wikipediaUrl):
    data = getMovieData(wikipediaUrl)
    return data.get('amg_id', '')

def find(query, max_results=10):
    query = {'action': 'query', 'list': 'search', 'format': 'json',
             'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
    url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
    data = getUrl(url)
    if not data:
        # retry with cache timeout 0 to force a fresh request
        data = getUrl(url, timeout=0)
    result = simplejson.loads(data)
    results = []
    for r in result['query']['search']:
        title = r['title']
        url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
        results.append((title, url, ''))
    return results

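# Example (a sketch, not part of the module): find() returns (title, url, '')
# tuples from the MediaWiki search API, e.g.:
#
#     for title, url, _ in find('Der Spiegel', max_results=3):
#         print title, url
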
56
oxweb/youtube.py
Normal file
@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from urllib import quote
import xml.etree.ElementTree as ET

import feedparser
from oxutils.cache import getUrl
from oxutils import findString


def getVideoUrl(youtubeId, format='mp4'):
    # request a video token, then build the download url (fmt=18 is the mp4 variant)
    url = 'http://www.youtube.com/api2_rest?method=youtube.videos.get_video_token&video_id=' + youtubeId
    data = getUrl(url)
    xml = ET.fromstring(data)
    youtubeKey = xml.find('t').text
    if format == 'mp4':
        fmt = 18
        url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt)
    else:
        url = "http://youtube.com/get_video.php?video_id=%s&t=%s" % (youtubeId, youtubeKey)
    return url

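# Example (a sketch, not part of the module; the token urls only work as long
# as YouTube keeps serving this API):
#
#     print getVideoUrl('oHg5SJYRHA0')          # mp4 (fmt=18)
#     print getVideoUrl('oHg5SJYRHA0', 'flv')   # default flv
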
def getMovieInfo(youtubeId):
    url = "http://gdata.youtube.com/feeds/api/videos/%s" % youtubeId
    data = getUrl(url)
    fd = feedparser.parse(data)
    return getInfoFromAtom(fd.entries[0])

def getInfoFromAtom(entry):
    info = dict()
    info['title'] = entry['title']
    info['description'] = entry['description']
    info['author'] = entry['author']
    info['published'] = entry['published_parsed']
    info['keywords'] = entry['media_keywords'].split(', ')
    info['url'] = entry['links'][0]['href']
    info['id'] = findString(info['url'], "/watch?v=")
    info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id']
    info['flv'] = getVideoUrl(info['id'], 'flv')
    info['mp4'] = getVideoUrl(info['id'], 'mp4')
    info['embed'] = '''<object width="425" height="355"><param name="movie" value="http://www.youtube.com/v/%s&hl=en"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/%s&hl=en" type="application/x-shockwave-flash" wmode="transparent" width="425" height="355"></embed></object>''' % (info['id'], info['id'])
    return info

def find(query, max_results=10, offset=1, orderBy='relevance'):
    query = quote(query)
    url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
    data = getUrl(url)
    fd = feedparser.parse(data)
    videos = []
    for entry in fd.entries:
        v = getInfoFromAtom(entry)
        videos.append(v)
        if len(videos) >= max_results:
            return videos
    return videos

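# Example (a sketch, not part of the module; relies on the GData feed that was
# current when this was written):
#
#     for video in find('jean-luc godard', max_results=3):
#         print video['title'], video['url']
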