add ox.web to this repos
This commit is contained in:
parent
0d354d2574
commit
06d61943ac
29 changed files with 2123 additions and 9 deletions
31
README
31
README
|
@ -1,22 +1,37 @@
|
||||||
python-oxlib some tools to build tools
|
python-ox some tools to build tools
|
||||||
|
|
||||||
Depends:
|
Depends:
|
||||||
python2.5
|
python2.5
|
||||||
python-chardet (http://chardet.feedparser.org/)
|
python-chardet (http://chardet.feedparser.org/)
|
||||||
|
python-feedparser (http://www.feedparser.org/)
|
||||||
|
python-beautifulsoup (http://www.crummy.com/software/BeautifulSoup/)
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
import oxlib
|
import ox
|
||||||
|
|
||||||
data = oxlib.cache.readUrl('http:/...')
|
data = ox.cache.readUrl('http:/...')
|
||||||
text = oxlib.stripTags(data)
|
text = ox.stripTags(data)
|
||||||
oxlib.normalizeNewlines(text)
|
ox.normalizeNewlines(text)
|
||||||
oxlib.formatBytes(len(data))
|
ox.formatBytes(len(data))
|
||||||
|
|
||||||
oxlib.formatBytes(1234567890)
|
ox.formatBytes(1234567890)
|
||||||
'1.15 GB'
|
'1.15 GB'
|
||||||
|
|
||||||
|
import ox.web.imdb
|
||||||
|
imdbId = ox.web.imdb.guess('The Matrix')
|
||||||
|
info = ox.web.imdb.Imdb(imdbId)
|
||||||
|
info['year']
|
||||||
|
1999
|
||||||
|
|
||||||
Install:
|
Install:
|
||||||
python setup.py install
|
python setup.py install
|
||||||
|
|
||||||
|
Cookies:
|
||||||
|
some ox.web modules require user account information or cookies to work,
|
||||||
|
those are saved in ~/.ox/auth.json, most basic form looks like this:
|
||||||
|
{
|
||||||
|
"key": "value"
|
||||||
|
}
|
||||||
|
|
||||||
Tests:
|
Tests:
|
||||||
nosetests --with-doctest oxlib
|
nosetests --with-doctest ox
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# vi:si:et:sw=4:sts=4:ts=4
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
# GPL 2008
|
# GPL 2008
|
||||||
__version__ = '1.0.0'
|
__version__ = '2.0.0'
|
||||||
|
|
||||||
from file import *
|
from file import *
|
||||||
from format import *
|
from format import *
|
||||||
|
|
9
ox/web/__init__.py
Normal file
9
ox/web/__init__.py
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
# encoding: utf-8
|
||||||
|
__version__ = '1.0.0'
|
||||||
|
|
||||||
|
import imdb
|
||||||
|
import wikipedia
|
||||||
|
import google
|
||||||
|
import piratecinema
|
||||||
|
import oxdb
|
61
ox/web/aaaarg.py
Normal file
61
ox/web/aaaarg.py
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import string
|
||||||
|
|
||||||
|
from ox import cache
|
||||||
|
from ox.html import stripTags, decodeHtml
|
||||||
|
from ox.text import findRe
|
||||||
|
from ox.normalize import canonicalName
|
||||||
|
import auth
|
||||||
|
|
||||||
|
|
||||||
|
def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
    '''
    Read url via cache.readUrl, sending the "aaaarg.cookie" value from auth
    as the Cookie header.

    The headers mapping passed in is copied before the cookie is added, so
    the caller's mapping is never modified. The valid argument is accepted
    but not used here.
    '''
    cookie_headers = headers.copy()
    cookie_headers["Cookie"] = auth.get("aaaarg.cookie")
    return cache.readUrl(url, data, cookie_headers, timeout)
|
||||||
|
|
||||||
|
def readUrlUnicode(url, timeout=cache.cache_timeout):
    '''
    Call cache.readUrlUnicode with this module's readUrl as _readUrl, so the
    request goes out with the aaaarg cookie header.
    '''
    fetch = readUrl
    return cache.readUrlUnicode(url, timeout=timeout, _readUrl=fetch)
|
||||||
|
|
||||||
|
def downloadText(id, filename=None):
    '''
    Download the file attached to aaaarg node id.

    id may be an int or a numeric string (getTextByLetter takes its ids from
    a regex match, presumably as strings). If filename is given the data is
    written to that file and None is returned, otherwise the downloaded data
    is returned.
    '''
    #FIXME, what about the cache, this keeps all pdfs in oxcache...
    # int() so that numeric string ids work with the %d placeholder too
    url = 'http://a.aaaarg.org/node/%d/download' % int(id)
    data = readUrl(url, timeout=-1)
    # NOTE(review): the headers are fetched but never used; the call is kept
    # in case it has side effects on the cache - confirm and remove if not.
    headers = cache.getHeaders(url, timeout=-1)
    if filename:
        # binary mode: the download is a file (the FIXME above mentions
        # pdfs), text mode would mangle line endings on some platforms
        with open(filename, "wb") as f:
            f.write(data)
        return
    return data
|
||||||
|
|
||||||
|
def getTextByLetter(letter):
    '''
    List the texts found on the aaaarg library page for letter.

    Returns a list of dicts with the keys 'author', 'title', 'id' and
    'filename'. An entry whose author cell is ' ' keeps the author of the
    entry before it ('Unknown Author' if there was none yet).
    '''
    page = readUrlUnicode('http://a.aaaarg.org/library/%s' % letter)
    entry_re = re.compile('<li class="author">(.*?)</li><li class="title"><a href="(.*?)">(.*?)</a></li>')
    found = []
    author = 'Unknown Author'
    for author_cell, link, raw_title in entry_re.findall(page):
        # NOTE(review): the literal compared against shows up as a plain
        # space here; it may have been '&nbsp;' upstream - confirm.
        if author_cell != ' ':
            author = author_cell
        text_id = findRe(link, r'/(\d+)')
        title = decodeHtml(raw_title)
        # files are grouped as <first character>/<canonical author name>/
        canonical = canonicalName(author)
        folder = os.path.join(canonical[0], canonical)
        pdf_name = '%s (aaarg %s).pdf' % (title.replace('/', '_'), text_id)
        found.append({
            'author': author,
            'title': title,
            'id': text_id,
            'filename': os.path.join(folder, pdf_name),
        })
    return found
|
||||||
|
|
||||||
|
def getTexts():
    '''
    Collect the texts of the library pages for all letters 'a' to 'z'.

    Returns a single list with the entries getTextByLetter returns for each
    letter, in alphabetical order of the letters.
    '''
    texts = []
    # string.letters is locale-dependent, so its first 26 characters are not
    # guaranteed to be a-z; ascii_lowercase always is
    for letter in string.ascii_lowercase:
        texts += getTextByLetter(letter)
    return texts
|
||||||
|
|
78
ox/web/allmovie.py
Normal file
78
ox/web/allmovie.py
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
|
from ox import stripTags, findRe
|
||||||
|
from ox.cache import readUrlUnicode
|
||||||
|
|
||||||
|
|
||||||
|
def getId(url):
    '''
    Return the second to last "/" separated part of url. For work urls with
    a trailing slash, as built by getUrl, that is the id.
    '''
    parts = url.split("/")
    return parts[-2]
|
||||||
|
|
||||||
|
def getData(id):
    '''
    Scrape the allmovie work page for id, plus its cast, credits and review
    pages, into a dict.

    >>> getData('129689')['cast'][1][1]
    u'Marianne'
    >>> getData('129689')['credits'][0][0]
    u'Jean-Luc Godard'
    >>> getData('129689')['posters'][0]
    u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
    >>> getData('129689')['rating']
    u'4.5'
    '''
    url = getUrl(id)
    data = {"url": url}
    html = readUrlUnicode(url)

    # sections made up of <li> items
    list_sections = (
        ('aka', 'AKA'),
        ('countries', 'Countries'),
        ('genres', 'Genres'),
        ('keywords', 'Keywords'),
        ('produced', 'Produced by'),
        ('themes', 'Themes'),
        ('types', 'Types'),
    )
    for key, heading in list_sections:
        data[key] = parseList(html, heading)

    # sections holding a single value
    entry_sections = (
        ('director', 'Director'),
        ('released', 'Released by'),
        ('set', 'Set In'),
    )
    for key, heading in entry_sections:
        data[key] = parseEntry(html, heading)
    # only the first 10 characters are kept, spaces become dashes
    data['releasedate'] = parseEntry(html, 'Release')[0:10].replace(' ', '-')
    data['synopsis'] = parseText(html, 'Plot Synopsis')

    # values picked directly out of the page markup
    data['category'] = findRe(html, 'http://allmovie.com/explore/category/.*?">(.*?)</a>')
    data['posters'] = [findRe(html, r'<img src="(http://image\..*?)"')]
    data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"')
    data['runtime'] = findRe(html, r'<td class="formed-sub" style="width: 86px;">(\d+) min.</td>')
    data['year'] = findRe(html, '"http://allmovie.com/explore/year/(.*?)"')

    # cast, credits and review each live on their own sub page
    cast_html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id)
    data['cast'] = parseTable(cast_html)
    credits_html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id)
    data['credits'] = parseTable(credits_html)
    review_html = readUrlUnicode("http://allmovie.com/work/%s/review" % id)
    data['review'] = parseText(review_html, 'Review')
    return data
|
||||||
|
|
||||||
|
def getUrl(id):
    '''
    Return the allmovie work url for id.
    '''
    work_url = "http://allmovie.com/work/%s/" % id
    return work_url
|
||||||
|
|
||||||
|
def parseEntry(html, title):
    '''
    Return the text found between <span>title</span> and the following
    </table> in html, with tags stripped and surrounding whitespace removed.
    '''
    section = findRe(html, '<span>%s</span>(.*?)</table>' % title)
    return stripTags(section).strip()
|
||||||
|
|
||||||
|
def parseList(html, title):
    '''
    Return the tag-stripped <li> items found between <span>title</span> and
    the following </table> in html, as a list.
    '''
    section = findRe(html, '<span>%s</span>(.*?)</table>' % title)
    item_re = re.compile('<li>(.*?)</li>', re.DOTALL)
    return [stripTags(item) for item in item_re.findall(section)]
|
||||||
|
|
||||||
|
def parseTable(html):
    '''
    Parse the results table in html into a list of rows, each row a list of
    tag-stripped cell texts.

    Rows are the '</tr>' separated parts of the table (the part after the
    last '</tr>' is dropped), cells are split at '<td width="305">-'.
    '''
    table = findRe(html, '<div id="results-table">(.*?)</table>')
    rows = table.split('</tr>')[:-1]
    # NOTE(review): the replaced literal shows up as a plain space here, but
    # the doctest in getData expects u'Jean-Luc Godard' with its space, so
    # it was presumably '&nbsp;' upstream - confirm.
    return [
        [
            stripTags(cell).strip().replace(' ', '')
            for cell in row.split('<td width="305">-')
        ]
        for row in rows
    ]
|
||||||
|
|
||||||
|
def parseText(html, title):
    '''
    Return the text of the paragraph cell following the "title</td>" cell in
    html, with tags stripped and surrounding whitespace removed.
    '''
    cell = findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)
    return stripTags(cell).strip()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # manual check: print everything scraped for one work
    print(getData('129689'))
    # print getData('177524')
|
||||||
|
|
20
ox/web/auth.py
Normal file
20
ox/web/auth.py
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
# GPL 2009
|
||||||
|
import os
|
||||||
|
import simplejson
|
||||||
|
|
||||||
|
|
||||||
|
def get(key):
    '''
    Return the value stored under key in the user's auth json file.

    The file is the path in the environment variable oxAUTH or, if that is
    not set, ~/.ox/auth.json. If the file does not exist or has no such key,
    a hint naming the key and the file is printed and "" is returned.
    '''
    user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
    auth = {}
    if os.path.exists(user_auth):
        f = open(user_auth, "r")
        # make sure the file is closed even if reading fails
        try:
            data = f.read()
        finally:
            f.close()
        auth = simplejson.loads(data)
    if key in auth:
        return auth[key]
    print("please add key %s to json file '%s'" % (key, user_auth))
    return ""
|
||||||
|
|
90
ox/web/criterion.py
Normal file
90
ox/web/criterion.py
Normal file
|
@ -0,0 +1,90 @@
|
||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
|
||||||
|
import ox.cache
|
||||||
|
from ox.cache import readUrlUnicode
|
||||||
|
from ox.html import stripTags
|
||||||
|
from ox.text import findRe, removeSpecialCharacters
|
||||||
|
|
||||||
|
import imdb
|
||||||
|
|
||||||
|
def getId(url):
    '''
    Return the part of url after its last "/", for film urls as built by
    getUrl that is the id.
    '''
    return url.rsplit("/", 1)[-1]
|
||||||
|
|
||||||
|
def getUrl(id):
    '''
    Return the criterion.com film url for id.
    '''
    film_url = "http://www.criterion.com/films/%s" % id
    return film_url
|
||||||
|
|
||||||
|
def getData(id):
    '''
    Scrape the criterion.com film page for id into a dict with the keys
    'url', 'number', 'title', 'director', 'country', 'year', 'synopsis',
    'posters', 'stills', 'trailers' and 'imdbId'.

    Raises IndexError if the page has fewer than two
    <p><strong>..</strong></p> entries (country and year).

    >>> getData('1333')['imdbId']
    '0060304'

    >>> getData('236')['posters'][0]
    'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg'

    >>> getData('786')['posters'][0]
    'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg'
    '''
    data = {
        "url": getUrl(id)
    }
    try:
        html = readUrlUnicode(data["url"])
    except Exception:
        # fall back to ox.cache.getUrl on any error, but no longer swallow
        # KeyboardInterrupt / SystemExit like the bare except did
        html = ox.cache.getUrl(data["url"])
    data["number"] = findRe(html, '<p class="spinenumber">(.*?)</p>')
    data["title"] = findRe(html, '<h2 class="movietitle">(.*?)</h2>')
    data["director"] = findRe(html, '<h2 class="director">(.*?)</h2>')
    # the first two bold paragraphs are taken as country and year
    results = re.compile('<p><strong>(.*?)</strong></p>').findall(html)
    data["country"] = results[0]
    data["year"] = results[1]
    result = findRe(html, '<div class="synopsis contentbox lightgray">(.*?)</div>')
    data["synopsis"] = findRe(result, '<p>(.*?)</p>')

    # poster: linked from the edition info box; for Blu-Ray and Essential
    # Art House DVD editions the second edition info box is used instead
    result = findRe(html, '<div class="editioninfo">(.*?)</div>')
    if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
        result = re.compile('<div class="editioninfo">(.*?)</div>', re.DOTALL).findall(html)[1]
    result = findRe(result, '<a href="(.*?)">')
    if "/boxsets/" not in result:
        data["posters"] = [result]
    else:
        # the link points to a box set page: look up this film's entry
        # there and use its image without the "_w100" suffix
        html_ = readUrlUnicode(result)
        result = findRe(html_, '<a href="http://www.criterion.com/films/%s">(.*?)</a>' % id)
        result = findRe(result, 'src="(.*?)"')
        data["posters"] = [result.replace("_w100", "")]

    # either a film still image or a video with thumbnail is expected
    result = findRe(html, '<img alt="Film Still" height="252" src="(.*?)"')
    if result:
        data["stills"] = [result]
        data["trailers"] = []
    else:
        data["stills"] = [findRe(html, '"thumbnailURL", "(.*?)"')]
        data["trailers"] = [findRe(html, '"videoURL", "(.*?)"')]
    data['imdbId'] = imdb.getMovieId(data['title'], data['director'], data['year'])
    return data
|
||||||
|
|
||||||
|
def getIds():
    '''
    Return the ids of all films in the criterion dvd library as a list of
    strings, without duplicates, sorted by their integer value.
    '''
    html = readUrlUnicode("http://www.criterion.com/library/dvd")
    page_numbers = re.compile('page=(.*?)"').findall(html)
    # the second to last page link is taken as the number of pages
    pages = int(page_numbers[len(page_numbers) - 2])
    ids = []
    for page in range(pages, 0, -1):
        ids.extend(getIdsByPage(page))
    numeric_ids = sorted([int(film_id) for film_id in set(ids)])
    return [str(film_id) for film_id in numeric_ids]
|
||||||
|
|
||||||
|
def getIdsByPage(page):
    '''
    Return the set of film ids linked from the given page of the criterion
    dvd library, including the films of every box set linked from it.
    '''
    film_re = re.compile('films/(.*?)"')
    html = readUrlUnicode("http://www.criterion.com/library/dvd?page=%s" % page)
    ids = film_re.findall(html)
    # box sets have their own pages listing the films they contain
    for boxset in re.compile('boxsets/(.*?)"').findall(html):
        boxset_html = readUrlUnicode("http://www.criterion.com/boxsets/" + boxset)
        ids.extend(film_re.findall(boxset_html))
    return set(ids)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # manual check: print all ids found in the library
    print(getIds())
|
22
ox/web/dailymotion.py
Normal file
22
ox/web/dailymotion.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
from urllib import unquote
|
||||||
|
from ox.cache import readUrl
|
||||||
|
|
||||||
|
|
||||||
|
def getVideoUrl(url):
    '''
    Return the video url for the dailymotion page at url, or '' if the page
    has no 'video", "..."' entry. Only the first entry is used.

    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?key')[0]
    'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv'

    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?key')[0]
    'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv'
    '''
    page = readUrl(url)
    matches = re.compile('''video", "(.*?)"''').findall(page)
    if not matches:
        return ''
    # the entry is url-quoted, only the part before the first '@@' is used
    path = unquote(matches[0]).split('@@')[0]
    return "http://www.dailymotion.com" + path
|
||||||
|
|
49
ox/web/epguides.py
Normal file
49
ox/web/epguides.py
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
|
from ox import stripTags, findRe
|
||||||
|
from ox.cache import readUrlUnicode
|
||||||
|
|
||||||
|
import google
|
||||||
|
|
||||||
|
|
||||||
|
def getShowUrl(title):
    '''
    Search Epguide Url for Show via Show Title.
    Use Google to search the url, this is also done on Epguide.

    Returns the url of the first result on epguides.com whose name contains
    title, or None if there is none.
    '''
    # the title is literal text, not a pattern: escape it so characters
    # like '(', '?' or '.' neither break the regex nor match too much
    title_re = re.compile(re.escape(title))
    for (name, url, desc) in google.find('allintitle: site:epguides.com %s' % title, 1):
        if url.startswith('http://epguides.com') and title_re.search(name):
            return url
    return None
|
||||||
|
|
||||||
|
def getShowData(url):
    '''
    Parse the epguides show page at url.

    Returns a dict with the keys 'title', 'imdb' and 'episodes'; episodes
    maps keys like 'S01E01' to dicts with the keys 'prod code', 'air date',
    'url' and 'title'. Air dates that parse as e.g. '22 Sep 04' are turned
    into '2004-09-22', others are kept as found. Episodes whose season or
    episode number is not an integer are reported on stdout and skipped.
    '''
    data = readUrlUnicode(url)
    r = {}
    r['title'] = stripTags(findRe(data, '<h1>(.*?)</h1>'))
    r['imdb'] = findRe(data, r'<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
    r['episodes'] = {}
    #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
    episode_re = re.compile(r'(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>')
    for episode in episode_re.findall(data):
        air_date = episode[3].strip()
        #'22 Sep 04' -> 2004-09-22
        try:
            air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
        except ValueError:
            # not in the expected format: keep the text as found
            pass
        # episode[1] looks like '<season>-<episode>'
        s = episode[1].split('-')[0].strip()
        e = episode[1].split('-')[-1].strip()
        try:
            key = 'S%02dE%02d' % (int(s), int(e))
        except ValueError:
            # only a failed number conversion is expected here, anything
            # else (including KeyboardInterrupt) is no longer swallowed
            print("oxweb.epguides failed, %s" % url)
            continue
        r['episodes'][key] = {
            'prod code': episode[2],
            'air date': air_date,
            'url': episode[4],
            'title': episode[5],
        }
    return r
|
||||||
|
|
57
ox/web/google.py
Normal file
57
ox/web/google.py
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import urllib
|
||||||
|
import urllib2
|
||||||
|
import weakref
|
||||||
|
import threading
|
||||||
|
import Queue
|
||||||
|
import simplejson
|
||||||
|
|
||||||
|
|
||||||
|
import ox
|
||||||
|
from ox import stripTags
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
usage:
|
||||||
|
import google
|
||||||
|
google.find(query)
|
||||||
|
|
||||||
|
for result in google.find(query): result
|
||||||
|
|
||||||
|
result is title, url, description
|
||||||
|
|
||||||
|
google.find(query, max_results)
|
||||||
|
|
||||||
|
FIXME: how to search deeper than the first page?
|
||||||
|
'''
|
||||||
|
# default for the max_results argument of find()
DEFAULT_MAX_RESULTS = 10
# default for the timeout arguments of readUrl() and find(),
# presumably seconds (that would be one day) - confirm against ox.cache
DEFAULT_TIMEOUT = 24 * 60 * 60
|
||||||
|
|
||||||
|
def readUrl(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
    '''
    Read url via ox.cache.readUrl; the only difference is that timeout
    defaults to this module's DEFAULT_TIMEOUT.
    '''
    response = ox.cache.readUrl(url, data, headers, timeout)
    return response
|
||||||
|
|
||||||
|
def quote_plus(s):
|
||||||
|
return urllib.quote_plus(s.encode('utf-8'))
|
||||||
|
|
||||||
|
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
    '''
    Search google for query and parse the result page.

    Returns a list of at most max_results (title, url, description) tuples,
    title and description with tags stripped.
    '''
    search_url = "http://www.google.com/search?q=%s" % quote_plus(query)
    page = readUrl(search_url, timeout=timeout)
    # result link, then whatever follows up to a <br> or <table>, then the
    # description up to the green url line or the next link
    link_re = (
        r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>'
        r'.*?(?:<br>|<table.*?>)'
        r'(?P<desc>.*?)' '(?:<font color=#008000>|<a)'
    )
    results = [
        (stripTags(match.group('name')), match.group('url'), stripTags(match.group('desc')))
        for match in re.compile(link_re, re.DOTALL).finditer(page)
    ]
    return results[:max_results]
|
||||||
|
|
||||||
|
def _find(query):
    '''
    Search via the google ajax web search api and return the
    ['responseData']['results'] part of the decoded json response.
    '''
    api_url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s' % quote_plus(query)
    response = simplejson.loads(ox.cache.readUrlUnicode(api_url))
    return response['responseData']['results']
|
||||||
|
|
210
ox/web/imdb.py
Normal file
210
ox/web/imdb.py
Normal file
|
@ -0,0 +1,210 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import urllib2
|
||||||
|
from urllib import quote, unquote
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
|
import ox
|
||||||
|
from ox import findRe
|
||||||
|
from ox.normalize import normalizeTitle, normalizeImdbId
|
||||||
|
|
||||||
|
from siteparser import SiteParser
|
||||||
|
import google
|
||||||
|
|
||||||
|
def _creditsRegex(heading):
    '''
    Regex entry for a credits table of the combined page: the first pattern
    cuts out the table following heading, the second one lists the linked
    names inside of it.
    '''
    return {
        'page': 'combined',
        're': [
            '%s</a>(.*?)</table>' % heading,
            '<a href="/name/.*?/">(.*?)</a>'
        ],
        'type': 'list'
    }


class Imdb(SiteParser):
    '''
    imdb title data for one imdb id.

    regex lists, per resulting key, the sub page ('page'), the pattern or
    list of patterns ('re') and the value type ('type'); how those are
    applied is up to SiteParser. After SiteParser.__init__ ran, 'runtime' is
    converted to an int and 'connections' to a dict, if present.
    '''
    regex = {
        'cast': {
            'page': 'combined',
            're': '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
            'type': 'list'
        },
        'cinematographers': _creditsRegex('Cinematography by'),
        'connections': {
            'page': 'movieconnections',
            're': '<h5>(.*?)</h5>(.*?)\n\n',
            'type': 'list'
        },
        'countries': {
            'page': 'combined',
            're': '<a href="/Sections/Countries/.*?/">(.*?)</a>',
            'type': 'list'
        },
        'directors': _creditsRegex('Directed by'),
        'editors': _creditsRegex('Film Editing by'),
        'filming_locations': {
            'page': 'locations',
            're': r'<a href="/search/title\?locations=.*?">(.*?)</a>',
            'type': 'list'
        },
        'genres': {
            'page': 'combined',
            're': '<a href="/Sections/Genres/.*?/">(.*?)</a>',
            'type': 'list'
        },
        'keywords': {
            'page': 'keywords',
            're': '<a href="/keyword/.*?/">(.*?)</a>',
            'type': 'list'
        },
        'languages': {
            'page': 'combined',
            're': '<a href="/Sections/Languages/.*?/">(.*?)</a>',
            'type': 'list'
        },
        'plot': {
            'page': 'plotsummary',
            're': '<p class="plotpar">(.*?)<i>',
            'type': 'string'
        },
        'poster_id': {
            'page': 'combined',
            're': '/primary-photo/media/rm(.*?)/tt',
            'type': 'list'
        },
        'poster_ids': {
            'page': 'posters',
            're': '/unknown-thumbnail/media/rm(.*?)/tt',
            'type': 'list'
        },
        'producers': _creditsRegex('Produced by'),
        'rating': {
            'page': 'combined',
            're': '<div class="starbar-meta">.*?<b>(.*?)/10</b>',
            'type': 'float'
        },
        'release_date': {
            'page': 'releaseinfo',
            're': r'<a href="/date/(\d{2})-(\d{2})/">.*?</a> <a href="/year/(\d{4})/">',
            'type': 'date'
        },
        'runtime': {
            'page': 'combined',
            're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
            'type': 'string'
        },
        'title': {
            'page': 'combined',
            're': '<h1>(.*?) <span>',
            'type': 'string'
        },
        'trivia': {
            'page': 'trivia',
            're': '<div class="sodatext">(.*?)<br>',
            'type': 'list',
        },
        'votes': {
            'page': 'combined',
            're': '<a href="ratings" class="tn15more">(.*?) votes</a>',
            'type': 'string'
        },
        'writers': _creditsRegex('Writing credits'),
        'year': {
            'page': 'combined',
            're': r'<a href="/year/(\d{4})/">',
            'type': 'int'
        }
    }

    def __init__(self, id):
        '''
        Set baseUrl to the imdb title page for id, run SiteParser.__init__
        and post-process 'runtime' and 'connections'.
        '''
        self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
        super(Imdb, self).__init__()

        if 'runtime' in self:
            # '<n> min' or '<n> sec' -> number of seconds as int
            runtime = self['runtime']
            factor = 60 if 'min' in runtime else 1
            self['runtime'] = int(findRe(runtime, '([0-9]+)')) * factor

        if 'connections' in self:
            # list of (relation, html) pairs -> {relation: [imdb ids]}
            title_re = re.compile(r'<a href="/title/tt(\d{7})/">')
            self['connections'] = dict(
                (unicode(relation), title_re.findall(links))
                for relation, links in self['connections']
            )
|
||||||
|
|
||||||
|
def _readImdbSearch(url):
    '''
    Request url with the default ox headers; return the response body and
    the url of the response (u.url). The response is always closed.
    '''
    req = urllib2.Request(url, None, ox.net.DEFAULT_HEADERS)
    u = urllib2.urlopen(req)
    try:
        return u.read(), u.url
    finally:
        u.close()


def guess(title, director='', timeout=google.DEFAULT_TIMEOUT):
    '''
    Guess the imdb id for title, optionally narrowed down by director.

    Only the part of title before the first '-', '(' or '.' is used. Tried
    in this order: a google search restricted to imdb.com, the imdb search
    (a response url that is a title page, or the first popular result) and
    the imdb aka search. Returns the imdb id or None if nothing was found or
    the first imdb request failed; errors of the aka request are raised.
    '''
    #FIXME: proper file -> title
    title = title.split('-')[0]
    title = title.split('(')[0]
    title = title.split('.')[0]
    title = title.strip()
    imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))

    # let's first try google
    # i.e. site:imdb.com Michael Stevens Sin
    if director:
        search = 'site:imdb.com %s "%s"' % (director, title)
    else:
        search = 'site:imdb.com "%s"' % title
    for (name, url, desc) in google.find(search, 2, timeout=timeout):
        if url.startswith('http://www.imdb.com/title/tt'):
            return normalizeImdbId(int(ox.intValue(url)))

    try:
        data, return_url = _readImdbSearch(imdb_url)
    except Exception:
        # any failed request counts as "not found", but KeyboardInterrupt
        # and SystemExit are no longer swallowed
        return None
    # [28:35] are the 7 characters right after 'http://www.imdb.com/title/tt'
    if return_url.startswith('http://www.imdb.com/title/tt'):
        return return_url[28:35]
    if data:
        imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
        if imdb_id:
            return imdb_id

    imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
    data, return_url = _readImdbSearch(imdb_url)
    if return_url.startswith('http://www.imdb.com/title/tt'):
        return return_url[28:35]

    return None
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # manual check: dump everything parsed for one title as json
    import json
    print(json.dumps(Imdb('0306414'), indent=2))
    #print json.dumps(Imdb('0133093'), indent=2)
|
||||||
|
|
84
ox/web/impawards.py
Normal file
84
ox/web/impawards.py
Normal file
|
@ -0,0 +1,84 @@
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
# encoding: utf-8
|
||||||
|
import re
|
||||||
|
|
||||||
|
from ox.cache import readUrlUnicode
|
||||||
|
from ox.html import stripTags
|
||||||
|
from ox.text import findRe
|
||||||
|
|
||||||
|
import imdb
|
||||||
|
|
||||||
|
def getData(id):
    '''
    Scrape the impawards page for id into a dict with the keys 'url',
    'imdbId', 'title', 'year' and 'posters' (a list of poster image urls,
    one per poster version linked from the page).

    >>> getData('1991/silence_of_the_lambs')['imdbId']
    u'0102926'

    >>> getData('1991/silence_of_the_lambs')['posters'][0]
    u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1_xlg.jpg'

    >>> getData('1991/silence_of_the_lambs')['url']
    u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
    '''
    url = getUrl(id)
    html = readUrlUnicode(url)
    data = {
        'url': url,
        'imdbId': findRe(html, 'imdb.com/title/tt(.*?) '),
        'title': stripTags(findRe(html, r'<p class="name white">(.*?) \(<a href="alpha1.html">')),
        'year': findRe(html, r'\(<a href="alpha1.html">(.*?)</a>\)'),
        'posters': [],
    }

    def yearUrl(name):
        # name relative to the folder of the film's year
        return 'http://www.impawards.com/%s/%s' % (data['year'], name)

    # links to the pages of the poster versions; id[5:] is the id without
    # its first 5 characters, presumably the 'YYYY/' prefix
    version_re = re.compile('<a href = (%s.*?html)' % id[5:], re.DOTALL)
    for version in version_re.findall(html):
        version = version.replace('_xlg.html', '.html')
        version_html = readUrlUnicode(yearUrl(version))
        xlg = findRe(version_html, r'<a href = (\w*?_xlg.html)')
        if xlg:
            # there is an extra large version, take the image from its page
            xlg_html = readUrlUnicode(yearUrl(xlg))
            image = findRe(xlg_html, '<img SRC="(.*?)"')
        else:
            image = findRe(version_html, '<img src="(posters.*?)" alt=')
        data['posters'].append(yearUrl(image))
    return data
|
||||||
|
|
||||||
|
def getId(url):
    '''
    Return the impawards id, '<year>/<name>', for a poster page url.

    The name is the file name without its last 5 characters (presumably the
    '.html' extension), without a trailing '_xlg' and without a trailing
    '_ver<number>' part.
    '''
    parts = url.split('/')
    year = parts[3]
    words = parts[4][:-5].split('_')
    if words[-1] == 'xlg':
        del words[-1]
    if findRe(words[-1], r'ver\d+$'):
        del words[-1]
    return '%s/%s' % (year, '_'.join(words))
|
||||||
|
|
||||||
|
def getIds():
    '''
    Return the ids found on all impawards archive pages as a list without
    duplicates, going from the highest page number down to page 1.
    '''
    latest = readUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 60*60)
    # the page number linked from the latest page, plus one
    last_page = int(findRe(latest, '<a href= page(.*?).html>')) + 1
    ids = []
    for page in range(last_page, 0, -1):
        for poster_id in getIdsByPage(page):
            if poster_id in ids:
                continue
            ids.append(poster_id)
    return ids
|
||||||
|
|
||||||
|
def getIdsByPage(page):
    '''
    Return the set of ids of all posters linked from archive page number
    page.
    '''
    html = readUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1)
    links = re.compile(r'<a href = \.\./(.*?)>', re.DOTALL).findall(html)
    # the links are relative to the archive folder, getId wants full urls
    return set([getId('http://impawards.com/%s' % link) for link in links])
|
||||||
|
|
||||||
|
def getUrl(id):
    '''
    Return the url of the impawards page for id: '<id>.html', or
    '<id>_ver1.html' if the former says that it has no posters.
    '''
    plain_url = "http://www.impawards.com/%s.html" % id
    html = readUrlUnicode(plain_url)
    if not findRe(html, "No Movie Posters on This Page"):
        return plain_url
    return "http://www.impawards.com/%s_ver1.html" % id
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # manual check: print all ids, sorted, followed by their number
    ids = getIds()
    print("%s %s" % (sorted(ids), len(ids)))
|
187
ox/web/itunes.py
Normal file
187
ox/web/itunes.py
Normal file
|
@ -0,0 +1,187 @@
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
# encoding: utf-8
|
||||||
|
import re
|
||||||
|
import urllib
|
||||||
|
|
||||||
|
from ox.cache import readUrl
|
||||||
|
from ox.html import decodeHtml, stripTags
|
||||||
|
from ox.text import findRe
|
||||||
|
from ox.text import findString
|
||||||
|
|
||||||
|
|
||||||
|
# to sniff itunes traffic, use something like
|
||||||
|
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
|
||||||
|
|
||||||
|
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=music&songTerm=&genreIndex=1&flavor=0&mediaType=2&composerTerm=&allArtistNames=Arcadia&ringtone=0&searchButton=submit
|
||||||
|
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=movie&movieTerm=The%20Matrix&descriptionTerm=&ratingIndex=1&mediaType=3&directorProducerName=Andy%20Wachowski&flavor=0&releaseYearTerm=1999&closedCaption=0&actorTerm=&searchButton=submit
|
||||||
|
|
||||||
|
# request headers sent with the itunes store requests in this module,
# identifying the client as iTunes 7.6.2 on Mac OS X
ITUNES_HEADERS = {
    'X-Apple-Tz': '0',
    'X-Apple-Storefront': '143441-1',
    'User-Agent': 'iTunes/7.6.2 (Macintosh; U; Intel Mac OS X 10.5.2)',
    'Accept-Language': 'en-us, en;q=0.50',
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
}
|
||||||
|
|
||||||
|
def composeUrl(request, parameters):
    '''
    Build the itunes store url for request from parameters.

    request is one of:
    'advancedSearch' - parameters needs 'media'; for media 'music' also
                       'title' and 'artist', for media 'movie' also 'title'
                       and 'director'. For any other media the search url
                       without a query is returned.
    'viewAlbum'      - parameters needs 'id'
    'viewMovie'      - parameters needs 'id'
    '''
    if request == 'advancedSearch':
        url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
        media = parameters['media']
        query = None
        if media == 'music':
            query = {
                'albumTerm': parameters['title'],
                'allArtistNames': parameters['artist'],
                'composerTerm': '',
                'flavor': 0,
                'genreIndex': 1,
                'media': 'music',
                'mediaType': 2,
                'ringtone': 0,
                'searchButton': 'submit',
                'songTerm': ''
            }
        elif media == 'movie':
            query = {
                'actorTerm': '',
                'closedCaption': 0,
                'descriptionTerm': '',
                'directorProducerName': parameters['director'],
                'flavor': 0,
                'media': 'movie',
                'mediaType': 3,
                'movieTerm': parameters['title'],
                'ratingIndex': 1,
                'releaseYearTerm': '',
                'searchButton': 'submit'
            }
        if query is not None:
            url += urllib.urlencode(query)
    elif request == 'viewAlbum':
        url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
    elif request == 'viewMovie':
        url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
    return url
|
||||||
|
|
||||||
|
def parseXmlDict(xml):
    '''
    Parse '<key>name</key><type>value</type>' pairs in xml into a dict.

    The value of a key followed by <true/> is True, <integer> values are
    converted with int(), <string> values are passed through decodeHtml;
    values of other types are kept as found.
    '''
    values = {}
    for chunk in xml.split('<key>'):
        # the part before the first <key> has no </key>
        if '</key>' not in chunk:
            continue
        key = findRe(chunk, '(.*?)</key>')
        tag = findRe(chunk, '</key><(.*?)>')
        if tag == 'true/':
            values[key] = True
            continue
        value = findRe(chunk, '<%s>(.*?)</%s>' % (tag, tag))
        if tag == 'integer':
            value = int(value)
        elif tag == 'string':
            value = decodeHtml(value)
        values[key] = value
    return values
|
||||||
|
|
||||||
|
def parseCast(xml, title):
    '''
    Collect the names listed under one credits heading of a movie page.

    title is the heading (e.g. 'actors'); it is matched upper-cased with its
    last character removed.  Returns the list of names found.  Any error
    while parsing is swallowed and the names collected so far are returned.
    '''
    names = []
    try:
        heading = title[:-1].upper()
        section = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % heading)
        entries = section.split('</GotoURL>')
        # whatever follows the last </GotoURL> is not an entry
        del entries[-1]
        for entry in entries:
            names.append(findRe(entry, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
    except:
        # best effort: fall through and return what has been collected
        pass
    return names
|
||||||
|
|
||||||
|
def parseMovies(xml, title):
    '''
    Collect the movies listed under one heading of a movie page.

    title is the heading (e.g. 'related movies'); it is matched upper-cased
    with its last character removed.  Returns a list of dicts with the keys
    'id' and 'title'.  Any error while parsing is swallowed and the movies
    collected so far are returned.
    '''
    movies = []
    try:
        heading = title[:-1].upper()
        section = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % heading)
        entries = section.split('</GotoURL>')
        # whatever follows the last </GotoURL> is not an entry
        del entries[-1]
        for entry in entries:
            movies.append({
                'id': findRe(entry, r'viewMovie\?id=(.*?)&'),
                'title': findRe(entry, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
            })
    except:
        # best effort: fall through and return what has been collected
        pass
    return movies
|
||||||
|
|
||||||
|
class ItunesAlbum:
    '''
    An album in the iTunes Store, addressed by id or by title and artist.
    '''

    def __init__(self, id = '', title = '', artist = ''):
        '''Remember the search terms; look the id up when none is given.'''
        self.title = title
        self.artist = artist
        self.id = id
        if not self.id:
            self.id = self.getId()

    def getId(self):
        '''Search for title and artist and return the first album id found.'''
        query = {'media': 'music', 'title': self.title, 'artist': self.artist}
        xml = readUrl(composeUrl('advancedSearch', query), headers = ITUNES_HEADERS)
        return findRe(xml, r'viewAlbum\?id=(.*?)&')

    def getData(self):
        '''
        Fetch the album page and return its details as a dict with the keys
        id, albumName, artistName, coverUrl, genre, releaseDate, review,
        tracks (one dict per track) and type.
        '''
        xml = readUrl(composeUrl('viewAlbum', {'id': self.id}), None, ITUNES_HEADERS)
        data = {'id': self.id}
        data['albumName'] = findRe(xml, '<B>(.*?)</B>')
        data['artistName'] = findRe(xml, '<b>(.*?)</b>')
        data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
        data['genre'] = findRe(xml, 'Genre:(.*?)<')
        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
        review = findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>')
        data['review'] = stripTags(review)
        # everything after the first <dict> following <key>items</key>,
        # cut into one chunk per track
        items = findRe(xml, '<key>items</key>.*?<dict>(.*?)$')
        data['tracks'] = [parseXmlDict(chunk) for chunk in items.split('<dict>')]
        data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
        return data
|
||||||
|
|
||||||
|
class ItunesMovie:
    '''
    A movie in the iTunes Store, addressed by id or by title and director.
    '''

    def __init__(self, id = '', title = '', director = ''):
        '''Remember the search terms; look the id up when none is given.'''
        self.id = id
        self.title = title
        self.director = director
        if not id:
            self.id = self.getId()

    def getId(self):
        '''Search for title and director and return the first movie id found.'''
        url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
        xml = readUrl(url, headers = ITUNES_HEADERS)
        return findRe(xml, r'viewMovie\?id=(.*?)&')

    def getData(self):
        '''
        Fetch the movie page and return its details as a dict.

        Keys: id, actors, averageRating, directors, format, genre,
        plotSummary, posterUrl, producers, rated, relatedMovies,
        releaseDate, runTime, screenwriters, soundtrackId and trailerUrl.

        The page used to be dumped to a hard-coded file on the author's
        desktop as a debugging aid; that made the method fail on every
        other machine and has been removed.
        '''
        data = {'id': self.id}
        url = composeUrl('viewMovie', {'id': self.id})
        xml = readUrl(url, None, ITUNES_HEADERS)
        data['actors'] = parseCast(xml, 'actors')
        # every full star image counts 1, every half marker 0.5
        # NOTE(review): the half marker may originally have been an HTML
        # entity rather than the literal character; confirm against a page.
        string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
        data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
        data['directors'] = parseCast(xml, 'directors')
        data['format'] = findRe(xml, 'Format:(.*?)<')
        data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
        data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
        data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
        data['producers'] = parseCast(xml, 'producers')
        data['rated'] = findRe(xml, 'Rated(.*?)<')
        data['relatedMovies'] = parseMovies(xml, 'related movies')
        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
        data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
        data['screenwriters'] = parseCast(xml, 'screenwriters')
        data['soundtrackId'] = findRe(xml, r'viewAlbum\?id=(.*?)&')
        data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
        return data
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # manual smoke test: fetch one album, one movie and the movies related
    # to it, printing each record as sorted, indented JSON
    import simplejson

    def _dump(record):
        print(simplejson.dumps(record, sort_keys = True, indent = 4))

    _dump(ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData())
    data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
    _dump(data)
    for v in data['relatedMovies']:
        _dump(ItunesMovie(id = v['id']).getData())
    _dump(ItunesMovie(id='272960052').getData())
|
||||||
|
|
128
ox/web/karagarga.py
Normal file
128
ox/web/karagarga.py
Normal file
|
@ -0,0 +1,128 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
from ox import cache
|
||||||
|
from ox.html import stripTags
|
||||||
|
from ox.text import findRe
|
||||||
|
|
||||||
|
import auth
|
||||||
|
|
||||||
|
|
||||||
|
def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
    '''
    cache.readUrl() with the karagarga cookie from the auth store attached.

    The headers are copied before the Cookie entry is set, so the dict
    passed in (by default the shared cache.DEFAULT_HEADERS) is not modified.
    valid is accepted but not passed on.
    '''
    cookieHeaders = headers.copy()
    cookieHeaders["Cookie"] = auth.get("karagarga.cookie")
    return cache.readUrl(url, data, cookieHeaders, timeout)
|
||||||
|
|
||||||
|
def readUrlUnicode(url, timeout=cache.cache_timeout):
    '''cache.readUrlUnicode() that fetches through this module's readUrl().'''
    fetch = readUrl
    return cache.readUrlUnicode(url, _readUrl=fetch, timeout=timeout)
|
||||||
|
|
||||||
|
def getData(id):
    '''
    Scrape the karagarga.net details page of torrent id into a dict.

    Returns False when the page says that no torrent with that id exists.
    Otherwise the dict has the keys url, added, country, director, files,
    format, genre, id, imdbId, language, leechers, link, links, people,
    posters, seeders, size, snatched, subtitle, subtitles, torrent, year
    and title.
    '''
    anchorRe = re.compile('<a href="(.*?)">(.*?)</a>', re.DOTALL)
    detailsUrl = getUrl(id)
    html = readUrlUnicode("%s%s" % (detailsUrl, "&filelist=1"))
    if 'No torrent with ID' in html:
        return False
    data = {"url": detailsUrl}
    data['added'] = stripTags(parseTable(html, 'Added'))
    data['country'] = findRe(html, r'title="([\w ]*?)" border="0" width="32" height="20"')
    # data['description'] = parseTable(html, 'Description')
    data['director'] = stripTags(parseTable(html, 'Director / Artist'))

    # file list: one row per file, the size column ends in a two letter unit
    fileTable = findRe(html, '<table class=main border="1" cellspacing=0 cellpadding="5">(.*?)</table>')
    fileRows = re.compile('<td>(.*?)</td><td align="right">(.*?)</td>', re.DOTALL).findall(fileTable)
    data['files'] = [
        {'name': name, 'size': '%s %s' % (size[:-2], size[-2:].strip().upper())}
        for name, size in fileRows
    ]

    # the format is derived from the genre icon shown on the page
    if 'genreimages/dvdr.png' in html:
        data['format'] = 'DVD'
    elif 'genreimages/hdrip.png' in html:
        data['format'] = 'HD'
    else:
        data['format'] = ''

    genres = []
    for line in parseTable(html, 'Genres').split('\n'):
        genre = stripTags(findRe(line, r'<a href="browse.php\?genre=.*?">(.*?)</a>'))
        if genre:
            genres.append(genre)
    data['genre'] = genres

    data['id'] = id
    data['imdbId'] = findRe(html, r'imdb.com/title/tt(\d{7})')
    data['language'] = stripTags(parseTable(html, 'Language'))
    data['leechers'] = int(findRe(html, r'seeder\(s\), (.*?) leecher\(s\)'))
    data['link'] = stripTags(parseTable(html, 'Internet Link'))

    # links found in the description, minus javascript links and with the
    # anonym.to redirect prefix removed
    data['links'] = [
        {'title': title, 'url': url.replace('http://anonym.to/?', '')}
        for (url, title) in anchorRe.findall(parseTable(html, 'Description'))
        if 'javascript' not in url
    ]

    people = stripTags(findRe(html, '<a href="top10others.php.*?>(.*?) people')).strip()
    if people:
        data['people'] = int(people)
    else:
        data['people'] = 0

    data['posters'] = list(re.compile('<img border=0 src="(http://.*?)"', re.DOTALL).findall(html))
    data['seeders'] = int(findRe(html, r'#seeders" class="sublink".*?colspan=2>(.*?) seeder\(s\)'))
    data['size'] = int(findRe(parseTable(html, 'Size'), r'\((.*?) ').replace(',', ''))
    data['snatched'] = int(findRe(html, '<a name="snatchers">.*?colspan=2>(.*?) '))
    data['subtitle'] = findRe(parseTable(html, 'Subtitles'), '>(.*?)<hr>').replace('included: ', '')
    data['subtitles'] = [
        {'language': language.replace('click here for ', ''), 'url': url}
        for (url, language) in anchorRe.findall(parseTable(html, 'Subtitles'))
    ]
    data['torrent'] = 'http://karagarga.net/%s' % findRe(html, '(down.php/.*?)"')
    data['year'] = stripTags(parseTable(html, 'Year'))

    # the page heading reads '<director> - <title> (<year>)'; strip both ends
    title = stripTags(findRe(html, '<h1>(.*?)</h1>')).strip()
    title = re.sub('^%s - ' % re.escape(data['director']), '', title)
    title = re.sub(r' \(%s\)$' % re.escape(data['year']), '', title)
    data['title'] = title
    return data
|
||||||
|
|
||||||
|
def getId(url):
    '''Return what follows the last "=" of url (the whole url if there is none).'''
    tail = url.rsplit("=", 1)
    return tail[-1]
|
||||||
|
|
||||||
|
def getTorrent(id):
    '''
    Download the .torrent file of torrent id.

    Returns the content fetched from the torrent link, or None when
    getData() reports that the torrent does not exist (it returns False in
    that case, which used to be indexed and raise a TypeError).
    '''
    data = getData(id)
    if not data:
        return None
    return readUrl(data['torrent'])
|
||||||
|
|
||||||
|
def getIds(lastId = 20):
    '''
    Walk the browse pages, newest first, until lastId shows up.

    Returns all ids seen up to and including the page that contains lastId,
    as a list of strings sorted numerically.

    Paging also stops as soon as a page contributes no new id, so a lastId
    that never appears no longer makes the loop run forever.  Empty ids
    (rows without a details link) are ignored instead of breaking the final
    int() conversion.
    '''
    lastId = '%s' % lastId
    ids = set()
    page = 0
    while True:
        pageIds = set([id for id in getIdsByPage(page) if id])
        if pageIds.issubset(ids):
            # empty or repeated page: there is nothing further to find
            break
        ids.update(pageIds)
        if lastId in ids:
            break
        page += 1
    return [str(number) for number in sorted([int(id) for id in ids])]
|
||||||
|
|
||||||
|
def getIdsByPage(page):
    '''Return the torrent ids listed on one browse page, in page order.'''
    url = 'http://karagarga.net/browse.php?page=%s&cat=1&sort=added&d=DESC' % page
    html = readUrlUnicode(url, timeout = 23*60*60) #get new ids once per day
    # each listing row starts with this cell; the text before the first one
    # is page header and carries no id
    rows = html.split('<td width="42" style="padding:0px;">')[1:]
    return [findRe(row, r'"details.php\?id=(.*?)"') for row in rows]
|
||||||
|
|
||||||
|
def getUrl(id):
    '''Return the karagarga.net details page URL for torrent id.'''
    template = "http://karagarga.net/details.php?id=%s"
    return template % id
|
||||||
|
|
||||||
|
def parseTable(html, title):
    '''
    Return the markup that follows the heading cell labelled title.

    The match runs up to the next </td>, except for 'Genres', where it runs
    up to the next </table>.
    '''
    if title == 'Genres':
        closing = 'table'
    else:
        closing = 'td'
    pattern = r'<td class="heading" [\w=" ]*?>%s</td>(.*?)</%s>' % (title, closing)
    return findRe(html, pattern)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # manual smoke test against one known torrent id
    for fetch in (getIds, getData):
        print(fetch("79317"))
|
21
ox/web/lyricsfly.py
Normal file
21
ox/web/lyricsfly.py
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
from ox.cache import readUrl
|
||||||
|
from ox.html import decodeHtml
|
||||||
|
from ox.text import findRe
|
||||||
|
|
||||||
|
|
||||||
|
def getLyrics(title, artist):
    '''
    Look up the lyrics of a song on lyricsfly.com.

    The API key is scraped from the API page, then the lyrics are requested
    for artist and title.  Returns the lyrics as text with '[br]' markers
    turned into newlines.

    Two defects are fixed here: the result of collapsing triple newlines
    was thrown away, and the '&amp;' entity literal had been reduced to a
    plain '&', which made that replace a no-op.
    '''
    html = readUrl('http://lyricsfly.com/api/')
    key = findRe(html, '<font color=green><b>(.*?)</b></font>')
    url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
    xml = readUrl(url)
    lyrics = findRe(xml, r'<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
    # real line breaks are noise; the [br] markers carry the line structure
    lyrics = lyrics.replace('\n', '').replace('\r', '')
    lyrics = lyrics.replace('[br]', '\n').strip()
    lyrics = lyrics.replace('\n\n\n', '\n\n')
    # undo one level of escaping so decodeHtml() sees the real entities
    lyrics = decodeHtml(lyrics.replace('&amp;', '&'))
    return lyrics
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # manual smoke test
    lyrics = getLyrics('Election Day', 'Arcadia')
    print(lyrics)
|
45
ox/web/metacritic.py
Normal file
45
ox/web/metacritic.py
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
from urllib import quote
|
||||||
|
|
||||||
|
from ox.cache import readUrl, readUrlUnicode
|
||||||
|
from ox import findRe, decodeHtml, stripTags
|
||||||
|
|
||||||
|
|
||||||
|
def getMetacriticShowUrl(title):
    '''
    Search metacritic.com for a TV show and return the URL of the first
    show page linked from the result, as found by findRe().
    '''
    searchUrl = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % quote(title)
    page = readUrl(searchUrl)
    return findRe(page, r'(http://www.metacritic.com/tv/shows/.*?)\?')
|
||||||
|
|
||||||
|
def getData(title, url=None):
    '''
    Fetch metascore and critic reviews of a TV show.

    When url is not given it is looked up by title; if that finds nothing,
    None is returned.  Otherwise returns a dict with 'score' (int, -1 when
    the page shows none), 'critics' (list of dicts with score, publication,
    critic, quote and link) and 'url'.
    '''
    url = url or getMetacriticShowUrl(title)
    if not url:
        return None
    page = readUrlUnicode(url)

    metascore = findRe(page, 'ALT="Metascore: (.*?)"')
    if metascore:
        score = int(metascore)
    else:
        score = -1

    reviewRe = re.compile(
        '<div class="scoreandreview"><div class="criticscore">(.*?)</div>'
        '.*?<span class="publication">(.*?)</span>'
        '.*?<span class="criticname">(.*?)</span></div>'
        '.*?<div class="quote">(.*?)<br>'
        '.*?<a href="(.*?)" ', re.DOTALL)
    critics = [
        {
            'score': int(criticScore),
            'publication': publication,
            'critic': decodeHtml(critic),
            'quote': stripTags(text).strip(),
            'link': link,
        }
        for (criticScore, publication, critic, text, link) in reviewRe.findall(page)
    ]
    return {'score': score, 'critics': critics, 'url': url}
|
||||||
|
|
126
ox/web/mininova.py
Normal file
126
ox/web/mininova.py
Normal file
|
@ -0,0 +1,126 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
from datetime import datetime
|
||||||
|
import re
|
||||||
|
import socket
|
||||||
|
from urllib import quote
|
||||||
|
|
||||||
|
from ox.cache import readUrl, readUrlUnicode
|
||||||
|
from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
|
||||||
|
from ox.normalize import normalizeImdbId
|
||||||
|
import ox
|
||||||
|
|
||||||
|
from torrent import Torrent
|
||||||
|
|
||||||
|
|
||||||
|
def _parseResultsPage(data, max_results=10):
|
||||||
|
results=[]
|
||||||
|
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
|
||||||
|
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||||
|
torrentDate = row[0]
|
||||||
|
torrentExtra = row[1]
|
||||||
|
torrentId = row[2]
|
||||||
|
torrentTitle = decodeHtml(row[3]).strip()
|
||||||
|
torrentLink = "http://www.mininova.org/tor/" + torrentId
|
||||||
|
privateTracker = 'priv.gif' in torrentExtra
|
||||||
|
if not privateTracker:
|
||||||
|
results.append((torrentTitle, torrentLink, ''))
|
||||||
|
return results
|
||||||
|
|
||||||
|
def findMovie(query, max_results=10):
    '''
    search for torrents on mininova, ordered by seeds

    returns what _parseResultsPage() extracts from the result page
    '''
    searchUrl = "http://www.mininova.org/search/%s/seeds" % quote(query)
    return _parseResultsPage(readUrlUnicode(searchUrl), max_results)
|
||||||
|
|
||||||
|
def findMovieByImdb(imdbId):
    '''
    find torrents on mininova for a given imdb id

    the id is normalized first; returns what _parseResultsPage() extracts
    from the listing page
    '''
    listingUrl = "http://www.mininova.org/imdb/?imdb=%s" % normalizeImdbId(imdbId)
    page = readUrlUnicode(listingUrl)
    return _parseResultsPage(page)
|
||||||
|
|
||||||
|
def getId(mininovaId):
    '''
    Reduce a mininova id or URL to the bare id.

    The first run of digits that follows a slash wins; without one, the
    text after the last slash (or the whole value) is returned.
    '''
    text = unicode(mininovaId)
    digits = findRe(text, r"/(\d+)")
    if digits:
        return digits
    # the last path segment; for a value without slashes that is the value
    return text.split('/')[-1]
|
||||||
|
|
||||||
|
def exists(mininovaId):
    '''
    Tell whether a torrent page exists and is usable without registration.

    The page is read with ox.net.readUrl().  Returns False when nothing
    comes back, when the page reports the torrent as not found, or when its
    tracker requires registration; True otherwise.
    '''
    page = ox.net.readUrl("http://www.mininova.org/tor/%s" % getId(mininovaId))
    if not page:
        return False
    blockers = (
        'Torrent not found...',
        'tracker</a> of this torrent requires registration.',
    )
    for marker in blockers:
        if marker in page:
            return False
    return True
|
||||||
|
|
||||||
|
def getData(mininovaId):
    '''
    Collect everything known about a mininova torrent.

    Reads the comment and details pages and the torrent file.  Returns None
    when the torrent is reported as not found, otherwise a dict with id,
    domain, the three links, every 'label: value' paragraph of the pages
    (lower-cased label as key, 'by' stored as 'uploader'), title, imdbId,
    description and torrent_info.
    '''
    renamedKeys = {
        'by': u'uploader',
    }
    mininovaId = getId(mininovaId)
    torrent = {
        u'id': mininovaId,
        u'domain': 'mininova.org',
        u'comment_link': "http://www.mininova.org/tor/%s" % mininovaId,
        u'torrent_link': "http://www.mininova.org/get/%s" % mininovaId,
        u'details_link': "http://www.mininova.org/det/%s" % mininovaId,
    }

    pages = readUrlUnicode(torrent['comment_link']) + readUrlUnicode(torrent['details_link'])
    if '<h1>Torrent not found...</h1>' in pages:
        return None

    paragraphRe = re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL)
    for label, content in paragraphRe.findall(pages):
        label = label.lower().strip()
        torrent[renamedKeys.get(label, label)] = decodeHtml(stripTags(content.strip()))

    torrent[u'title'] = findRe(pages, '<title>(.*?):.*?</title>')
    torrent[u'imdbId'] = findRe(pages, r'title/tt(\d{7})')
    description = findRe(pages, '<div id="description">(.*?)</div>')
    if description:
        description = normalizeNewlines(decodeHtml(stripTags(description))).strip()
    torrent[u'description'] = description
    torrent[u'torrent_info'] = getTorrentInfo(readUrl(torrent[u'torrent_link']))
    return torrent
|
||||||
|
|
||||||
|
class Mininova(Torrent):
    '''
    Torrent built from the data scraped for a mininova id.

    Nothing is filled in when getData() finds no such torrent.

    >>> Mininova('123')
    {}
    >>> Mininova('1072195')['infohash']
    '72dfa59d2338e4a48c78cec9de25964cddb64104'
    '''
    def __init__(self, mininovaId):
        self.data = getData(mininovaId)
        if not self.data:
            return
        Torrent.__init__(self)

        def count(text):
            # numeric value of a scraped counter, -1 when there is none
            value = intValue(text.replace(',', '').strip())
            if value:
                return int(value)
            return -1

        # 'share ratio' is expected as '<seeders>, <leechers>'
        ratio = self.data['share ratio'].split(',')
        self['seeder'] = -1
        self['leecher'] = -1
        if len(ratio) == 2:
            self['seeder'] = count(ratio[0])
            self['leecher'] = count(ratio[1])
        self['downloaded'] = count(self.data['downloads'])
        # drop the ' +zzzz' offset suffix, strptime is given no pattern for it
        added = self.data['added on'].split(' +')[0]
        self['published'] = datetime.strptime(added, "%a, %d %b %Y %H:%M:%S")
|
||||||
|
|
44
ox/web/movieposterdb.py
Normal file
44
ox/web/movieposterdb.py
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from ox.cache import readUrlUnicode
|
||||||
|
from ox import findRe
|
||||||
|
|
||||||
|
def getData(id):
    '''
    Return the movieposterdb.com page URL and poster URLs for a movie id.

    >>> getData('0060304')['posters'][0]
    u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg'
    >>> getData('0123456')['posters']
    []
    '''
    url = getUrl(id)
    return {
        "url": url,
        "posters": getPostersByUrl(url),
    }
|
||||||
|
|
||||||
|
def getId(url):
    '''Return the second to last "/"-separated segment of url.'''
    segments = url.split("/")
    return segments[-2]
|
||||||
|
|
||||||
|
def getPostersByUrl(url, group=True):
    '''
    Return the poster image URLs reachable from a movie or group page.

    Nothing is collected unless the page fetched for url mentions url
    itself.  With group set, the group pages linked from the page are
    visited first (without following groups again); then every poster page
    linked from the page is fetched for its image URL.
    '''
    html = readUrlUnicode(url)
    if url not in html:
        return []
    posters = []
    if group:
        groupRe = re.compile(r'<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL)
        for groupUrl in groupRe.findall(html):
            posters += getPostersByUrl(groupUrl, False)
    posterRe = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL)
    for posterUrl in posterRe.findall(html):
        posterPage = readUrlUnicode(posterUrl)
        posters.append(findRe(posterPage, r'"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
    return posters
|
||||||
|
|
||||||
|
def getUrl(id):
    '''Return the movieposterdb.com movie page URL for id.'''
    template = "http://www.movieposterdb.com/movie/%s/"
    return template % id
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # manual smoke test against two known ids
    for movieId in ('0060304', '0133093'):
        print(getData(movieId))
|
41
ox/web/opensubtitles.py
Normal file
41
ox/web/opensubtitles.py
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
|
||||||
|
import feedparser
|
||||||
|
from ox.cache import readUrl, readUrlUnicode
|
||||||
|
from ox import findRe, stripTags
|
||||||
|
from ox import langCode2To3, langTo3Code
|
||||||
|
|
||||||
|
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
    '''
    Find the opensubtitles.org id of an srt subtitle for an IMDb id.

    language may be a two letter code, a three letter code or anything
    langTo3Code() understands; parts is the number of files the subtitle is
    split into.

    Returns the id of the first search result, or None when the result feed
    has no entries or its link carries no id (that last case used to return
    an empty list).  When the response is not a result feed, whatever
    findRe() extracts from it is returned.
    '''
    if len(language) == 2:
        language = langCode2To3(language)
    elif len(language) != 3:
        language = langTo3Code(language)
    url = "http://www.opensubtitles.org/en/search/"
    if language:
        url += "sublanguageid-%s/" % language
    url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
    data = readUrl(url)
    if "title>opensubtitles.com - search results</title" not in data:
        # not a result feed; presumably the page of a single match
        return findRe(data, '/en/subtitles/(.*?)/')
    fd = feedparser.parse(data)
    if not fd.entries:
        return None
    link = fd.entries[0]['links'][0]['href']
    ids = re.compile('subtitles/(.*?)/').findall(link)
    if ids:
        return ids[0]
    return None
|
||||||
|
|
||||||
|
def downloadSubtitleById(opensubtitle_id):
    '''
    Download all files of one subtitle.

    Returns a dict that maps each file name (first line of the link text)
    to the content read from its download link.
    '''
    page = readUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
    linkRe = re.compile('href="(/en/download/file/.*?)">(.*?)</a>', re.DOTALL)
    srts = {}
    for path, label in linkRe.findall(page):
        name = stripTags(label).split('\n')[0]
        srts[name] = readUrlUnicode("http://www.opensubtitles.com%s" % path)
    return srts
|
||||||
|
|
10
ox/web/oxdb.py
Normal file
10
ox/web/oxdb.py
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import ox.cache
|
||||||
|
|
||||||
|
def getPosterUrl(id):
    '''Return the 0xdb.org poster URL for id, or '' if ox.cache.exists() denies it.'''
    posterUrl = "http://0xdb.org/%s/poster.0xdb.jpg" % id
    if not ox.cache.exists(posterUrl):
        return ''
    return posterUrl
|
||||||
|
|
12
ox/web/piratecinema.py
Normal file
12
ox/web/piratecinema.py
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import ox.cache
|
||||||
|
from ox.cache import exists
|
||||||
|
|
||||||
|
|
||||||
|
def getPosterUrl(id):
    '''
    Return the piratecinema.org poster URL for id, or '' if
    ox.cache.exists() denies it.  Posters are filed under the first four
    characters of the id.
    '''
    posterUrl = "http://piratecinema.org/posters/%s/%s.jpg" % (id[:4], id)
    if not ox.cache.exists(posterUrl):
        return ''
    return posterUrl
|
||||||
|
|
34
ox/web/rottentomatoes.py
Normal file
34
ox/web/rottentomatoes.py
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
# -*- coding: UTF-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
|
||||||
|
from ox.cache import getHeaders, readUrl, readUrlUnicode
|
||||||
|
from ox import findRe, stripTags
|
||||||
|
|
||||||
|
|
||||||
|
def readUrlByImdb(imdb):
    '''
    Return the rottentomatoes.com movie URL for an IMDb id, or None when
    the alias page does not lead to a movie.
    '''
    # this would also work but does not cache:
    #     from urllib2 import urlopen
    #     u = urlopen(url)
    #     return u.url
    aliasUrl = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
    page = readUrl(aliasUrl)
    if "movie_title" not in page:
        return None
    paths = re.compile('(/m/.*?/)').findall(page)
    if not paths:
        return None
    return "http://www.rottentomatoes.com" + paths[0]
|
||||||
|
|
||||||
|
def getData(url):
    '''
    Scrape a rottentomatoes.com movie page.

    Returns a dict with 'title', 'synopsis' and 'average rating'; 'year' is
    added, and removed from the title, when the title contains a
    parenthesis.
    '''
    yearPattern = r'\((\d*?)\)'
    page = readUrlUnicode(url)
    movie = {}
    title = findRe(page, '<h1 class="movie_title">(.*?)</h1>')
    if '(' in title:
        movie['year'] = findRe(title, yearPattern)
        title = re.sub(yearPattern, '', title).strip()
    movie['title'] = title
    movie['synopsis'] = findRe(page, '<span id="movie_synopsis_all".*?>(.*?)</span>')
    movie['average rating'] = findRe(page, '<div id="bubble_allCritics".*?>(.*?)</div>').strip()
    return movie
|
||||||
|
|
61
ox/web/siteparser.py
Normal file
61
ox/web/siteparser.py
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from ox.cache import readUrlUnicode
|
||||||
|
from ox import stripTags, decodeHtml
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup(key, data, data_type):
    '''
    Normalize the result of one or more regex findall() calls.

    Strings are passed through decodeHtml() and stripped, nested lists and
    tuples are cleaned recursively, and containers holding a single element
    are unwrapped.  For data_type 'list' a remaining single string is
    wrapped in a list again.  Empty data becomes '' unless data_type is
    'list', in which case it is returned unchanged.

    key is only passed along to the recursive calls.
    '''
    if not data:
        if data_type != 'list':
            data = ''
        return data
    if isinstance(data[0], basestring):
        #FIXME: some types need stripTags
        #data = [stripTags(decodeHtml(p)).strip() for p in data]
        data = [decodeHtml(p).strip() for p in data]
    elif isinstance(data[0], (list, tuple)):
        data = [cleanup(key, p, data_type) for p in data]
    # Unwrap single-element containers.  Stop at strings: a one-character
    # string is its own first element, so unwrapping it never ended.
    while not isinstance(data, basestring) and len(data) == 1:
        data = data[0]
    if data_type == 'list' and isinstance(data, basestring):
        data = [data, ]
    return data
|
||||||
|
|
||||||
|
class SiteParser(dict):
    '''
    Dict that fills itself by applying regular expressions to web pages.

    Subclasses set baseUrl and regex.  regex maps each key to a dict with
    'page' (appended to baseUrl), 're' (one pattern, or a sequence of
    patterns applied one after the other to the previous matches) and
    'type' ('float', 'int' and 'date' are converted, 'list' keeps a list,
    anything else is left as cleanup() returns it).
    '''
    baseUrl = ''
    regex = {}

    def getUrl(self, page):
        '''Return the full URL of page.'''
        return "%s%s" % (self.baseUrl, page)

    def __init__(self):
        def apply_f(f, data):
            # convert each row of a list of lists, or the value as a whole
            if data and isinstance(data[0], list):
                return [f(d) for d in data]
            return f(data)

        # dates are matched as (month, day, year) and stored as YYYY-MM-DD
        parse_date = lambda d: datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
        converters = {'float': float, 'int': int, 'date': parse_date}

        for key in self.regex:
            spec = self.regex[key]
            data = readUrlUnicode(self.getUrl(spec['page']))
            if isinstance(spec['re'], basestring):
                data = re.compile(spec['re'], re.DOTALL).findall(data)
                data = cleanup(key, data, spec['type'])
            else:
                for pattern in spec['re']:
                    find = re.compile(pattern, re.DOTALL).findall
                    if isinstance(data, basestring):
                        data = find(data)
                    else:
                        data = [find(d) for d in data]
                    data = cleanup(key, data, spec['type'])
            if spec['type'] in converters:
                data = apply_f(converters[spec['type']], data)
            self[key] = data
|
||||||
|
|
292
ox/web/spiegel.py
Normal file
292
ox/web/spiegel.py
Normal file
|
@ -0,0 +1,292 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
from datetime import datetime
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
|
import ox.cache
|
||||||
|
from ox.html import decodeHtml, stripTags
|
||||||
|
import ox.net
|
||||||
|
|
||||||
|
|
||||||
|
def getNews(year, month, day):
    '''
    Collect the spiegel.de archive teasers published on the given date.

    Walks the archive page of every section and returns a list of dicts
    with the keys date, description, imageUrl, section, title, title1,
    title2 and url.  Teasers dated differently, or lacking a description,
    an image or a title containing ': ', are skipped.
    '''
    sections = [
        'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
        'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
    ]
    teaserRe = re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL)
    dateTimeRe = re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL)
    descriptionRe = re.compile('<p>(.*?)<', re.DOTALL)
    imageRe = re.compile('<img src="(.*?)"')
    titleRe = re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL)
    headlineRe = re.compile('<h4>(.*?)</h4>', re.DOTALL)
    linkRe = re.compile('<a href="(.*?)"')

    dt = datetime(year, month, day)
    dayOfYear = int(dt.strftime('%j'))
    date = dt.strftime('%d.%m.%Y')
    news = []
    for section in sections:
        url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, dayOfYear)
        # today's page is read uncached (presumably because it still changes)
        if date == time.strftime('%d.%m.%Y', time.localtime()):
            html = ox.net.readUrl(url)
        else:
            html = ox.cache.readUrl(url)
        for item in teaserRe.findall(html):
            stamp = stripTags(dateTimeRe.findall(item)[0]).strip()
            try:
                description = formatString(descriptionRe.findall(item)[0])
            except:
                description = ''
            try:
                imageUrl = imageRe.findall(item)[0]
            except:
                imageUrl = ''
            try:
                title = formatString(titleRe.findall(item)[0]).replace(' : ', ': ').replace('::', ':')
            except:
                title = ''
            if stamp[:10] != date or not description or not imageUrl or title.find(': ') == -1:
                # the original kept (disabled) debug output here for teasers
                # of the right day without description or image
                continue
            # stamp is 'dd.mm.yyyy' optionally followed by a time of day
            if len(stamp) == 10:
                clock = '00:00'
            else:
                clock = '%s:%s' % (stamp[12:14], stamp[15:17])
            title = formatString(title)
            # title1 is the title cut to the length of the <h4> headline;
            # '\xdf' counts double for the cut
            headline = formatString(headlineRe.findall(item)[0])
            title1 = title.replace('\xdf', '\xdf\xdf')[:len(headline)].replace('\xdf\xdf', '\xdf')
            if title1[-1:] == ':':
                title1 = title1[0:-1]
            link = linkRe.findall(item)[0]
            if link[:1] == '/':
                link = 'http://www.spiegel.de' + link
            news.append({
                'date': '%s-%s-%s %s' % (stamp[6:10], stamp[3:5], stamp[:2], clock),
                # fix decodeHtml
                # 'description': formatString(decodeHtml(description)),
                'description': formatString(description),
                'imageUrl': imageUrl,
                'section': formatSection(section),
                'title': title,
                'title1': title1,
                'title2': title[len(title1) + 2:],
                'url': link,
            })
    return news
|
||||||
|
|
||||||
|
def splitTitle(title):
    '''
    Split a title at its first ': ' and return [before, after].

    Raises IndexError when the title contains no ': '.
    '''
    patterns = ('(.*?): ', ': (.*?)$')
    return [re.compile(pattern).findall(title)[0] for pattern in patterns]
|
||||||
|
|
||||||
|
def formatString(string):
    '''
    Clean up a text fragment scraped from spiegel.de.

    Removes the optional-break spans, turns newlines into spaces, collapses
    double spaces, strips the ends and decodes the &amp;, &apos; / &#39;
    and &quot; entities.

    The entity literals had been decoded in this source, which left no-op
    replaces and an unbalanced quote; they are written out again here.
    '''
    string = string.replace('<span class="spOptiBreak"> </span>', '')
    string = string.replace('\n', ' ').replace('  ', ' ').strip()
    # &amp; goes first so that doubly escaped entities are decoded as well
    string = string.replace('&amp;', '&')
    string = string.replace('&apos;', '\'').replace('&#39;', '\'')
    string = string.replace('&quot;', '"')
    return string
|
||||||
|
|
||||||
|
def formatSection(string):
    '''
    Return the display name of a section: first letter upper-cased and
    'spiegel' in the rest written as 'SPIEGEL'.
    '''
    initial, rest = string[:1], string[1:]
    return initial.upper() + rest.replace('spiegel', 'SPIEGEL')
|
||||||
|
|
||||||
|
def formatSubsection(string):
    '''
    Return the display name of a subsection.

    Known subsections come from a fixed table (with u'\\xc3' written out as
    'ae'); anything else just gets its first letter upper-cased.
    '''
    # SPIEGEL, SPIEGEL special
    subsection = {
        'abi': 'Abi - und dann?',
        'formel1': 'Formel 1',
        'jobundberuf': 'Job & Beruf',
        'leben': 'Leben U21',
        'mensch': 'Mensch & Technik',
        'sonst': '',
        'staedte': u'St\xc3dte',
        'ussports': 'US-Sports',
        'wunderbar': 'wunderBAR'
    }
    label = subsection.get(string)
    if label is not None:
        return label.replace(u'\xc3', 'ae')
    return string[:1].upper() + string[1:]
|
||||||
|
|
||||||
|
def getIssue(year, week):
    '''
    collect cover url, table of contents and page image urls of one issue

    returns None if the cover image does not exist, otherwise a dict with
    the keys 'pages', 'contents', 'coverUrl' and 'pageUrl'

    NOTE(review): the link pattern has a single group, so findall() yields
    strings and item[1] is one character - verify the pattern against the
    live markup. 'page' also stays unbound when no link matches.
    '''
    coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
    if not ox.net.exists(coverUrl):
        return None
    linkRe = re.compile('<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>')
    pageRe = re.compile('&SE=(.*?)"')
    url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
    data = ox.cache.readUrl(url)
    contents = []
    for item in linkRe.findall(data):
        item = item[1]
        page = int(pageRe.findall(item)[0])
        title = stripTags(item).strip()
        contents.append({'title': title, 'page': page})
    # the last page number seen in the table of contents, plus two
    pages = page + 2
    pageUrl = {}
    # probe a few pages beyond the estimate; missing pages map to ''
    for page in range(1, pages + 10):
        url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
        if not ox.cache.exists(url):
            url = ''
        pageUrl[page] = url
    return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
|
||||||
|
|
||||||
|
|
||||||
|
def archiveIssues(archivePath='/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'):
    '''
    this is just an example of an archiving application

    walks back from the current week to 1994 and stores, per issue found by
    getIssue(), a json dump, a plain-text table of contents, the cover and
    all page images below archivePath; existing files are left untouched.
    prints min / average / max page count after every archived issue
    '''
    import os
    import simplejson
    import time

    def write(filename, data, mode):
        # always release the file handle, even if the write fails
        f = open(filename, mode)
        try:
            f.write(data)
        finally:
            f.close()

    p = {}
    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    week = int(time.strftime('%W', localtime))
    for y in range(year, 1993, -1):
        if y == year:
            wMax = week + 1
        else:
            wMax = 53
        for w in range(wMax, 0, -1):
            print('getIssue(%d, %d)' % (y, w))
            issue = getIssue(y, w)
            if not issue:
                continue
            dirname = '%s/%d/%02d' % (archivePath, y, w)
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
            if not os.path.exists(filename):
                write(filename, simplejson.dumps(issue, ensure_ascii = False), 'w')
            filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
            if not os.path.exists(filename):
                lines = ['%3d %s' % (item['page'], item['title']) for item in issue['contents']]
                write(filename, '\n'.join(lines), 'w')
            # images are binary data: 'wb' keeps them intact on platforms
            # that translate line endings in text mode
            filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
            if not os.path.exists(filename):
                write(filename, ox.cache.readUrl(issue['coverUrl']), 'wb')
            for page in issue['pageUrl']:
                url = issue['pageUrl'][page]
                if url:
                    filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
                    if not os.path.exists(filename):
                        write(filename, ox.cache.readUrl(url), 'wb')
            # running page-count statistics over all archived issues
            if not p:
                p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
            else:
                p['num'] += 1
                p['sum'] += issue['pages']
                if issue['pages'] < p['min']:
                    p['min'] = issue['pages']
                if issue['pages'] > p['max']:
                    p['max'] = issue['pages']
            # '//' keeps the integer average on Python 2 and 3 alike
            print('%s %s %s' % (p['min'], p['sum'] // p['num'], p['max']))
|
||||||
|
|
||||||
|
|
||||||
|
def archiveNews(archivePath='/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'):
    '''
    this is just an example of an archiving application

    walks back day by day from yesterday to 1 Jan 2000, stores every item
    returned by getNews() as json and txt (always rewritten) plus its image
    (only if missing) below archivePath, then prints per-section counts and
    all titles whose stored title1/title2 differ from splitTitle()
    '''
    import calendar
    import os
    import simplejson
    import time

    def write(filename, data, mode):
        # always release the file handle, even if the write fails
        f = open(filename, mode)
        try:
            f.write(data)
        finally:
            f.close()

    count = {}
    colon = []

    localtime = time.localtime()
    year = int(time.strftime('%Y', localtime))
    month = int(time.strftime('%m', localtime))
    day = int(time.strftime('%d', localtime)) - 1
    for y in range(year, 1999, -1):
        if y == year:
            mMax = month
        else:
            mMax = 12
        for m in range(mMax, 0, -1):
            if y == year and m == month:
                dMax = day
            else:
                # calendar knows the real leap-year rule; the previous test
                # (y % 4 == 0 and y % 400 != 0) dropped 29 Feb 2000
                dMax = calendar.monthrange(y, m)[1]
            for d in range(dMax, 0, -1):
                print('getNews(%d, %d, %d)' % (y, m, d))
                news = getNews(y, m ,d)
                for new in news:
                    dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    if new['url'][-5:] == '.html':
                        filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
                    else:
                        filename = dirname + '/' + new['url'] + '.json'
                    # json and txt are rewritten on every run (the old guard
                    # was "if not os.path.exists(filename) or True")
                    write(filename, simplejson.dumps(new, ensure_ascii = False), 'w')
                    filename = filename[:-5] + '.txt'
                    data = splitTitle(new['title'])
                    data.append(new['description'])
                    write(filename, '\n'.join(data), 'w')
                    filename = dirname + '/' + new['imageUrl'].split('/')[-1]
                    if not os.path.exists(filename):
                        # image data is binary, so open with 'wb'
                        write(filename, ox.cache.readUrl(new['imageUrl']), 'wb')

                    strings = new['url'].split('/')
                    string = strings[3]
                    if len(strings) == 6:
                        string += '/' + strings[4]
                    if string not in count:
                        count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
                    else:
                        count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
                    strings = splitTitle(new['title'])
                    if strings[0] != new['title1'] or strings[1] != new['title2']:
                        colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
    for key in sortDictByKey(count):
        print('%6d %-24s %s' % (count[key]['count'], key, count[key]['string']))
    for value in colon:
        print(value)
|
||||||
|
|
||||||
|
def sortDictByKey(d):
    '''
    return the keys of d as a new list in ascending order
    '''
    # sorted() accepts any iterable; calling .sort() on d.keys() only works
    # while keys() returns a list, which is no longer true on Python 3
    return sorted(d.keys())
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # earlier experiments, all disabled:
    #
    # spiegel = Spiegel(2008, 8)
    # print spiegel.getContents()
    # news = News(2001, 9, 10)
    # output(news.getNews())
    #
    # x = []
    # for d in range(10, 30):
    #     print '2/%d' % d
    #     news = getNews(2008, 2, d)
    #     for new in news:
    #         strings = new['url'].split('/')
    #         string = formatSection(strings[3])
    #         if len(strings) == 6:
    #             string += '/' + formatSubsection(strings[4])
    #         if not string in x:
    #             x.append(string)
    # print x
    #
    # archiveIssues()

    # running the module as a script archives the online news
    archiveNews()
|
122
ox/web/thepiratebay.py
Normal file
122
ox/web/thepiratebay.py
Normal file
|
@ -0,0 +1,122 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
from datetime import datetime
|
||||||
|
import re
|
||||||
|
import socket
|
||||||
|
from urllib import quote, urlencode
|
||||||
|
from urllib2 import URLError
|
||||||
|
|
||||||
|
from ox.cache import readUrl, readUrlUnicode
|
||||||
|
from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
|
||||||
|
from ox.normalize import normalizeImdbId
|
||||||
|
import ox
|
||||||
|
|
||||||
|
from torrent import Torrent
|
||||||
|
|
||||||
|
cache_timeout = 24*60*60 # cache search only for 24 hours
|
||||||
|
|
||||||
|
season_episode = re.compile("S..E..", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
    '''
    cache.readUrl() with the cookie 'language=en_EN' added to the headers

    works on a copy of headers, so the mapping passed in is not modified
    NOTE(review): valid is accepted but not passed on to cache.readUrl -
    confirm that this is intended
    '''
    requestHeaders = headers.copy()
    requestHeaders['Cookie'] = 'language=en_EN'
    return cache.readUrl(url, data, requestHeaders, timeout)
|
||||||
|
|
||||||
|
def _readUrlUnicode(url, timeout=cache.cache_timeout):
    '''
    cache.readUrlUnicode() that fetches through this module's _readUrl
    '''
    options = {'_readUrl': _readUrl, 'timeout': timeout}
    return cache.readUrlUnicode(url, **options)
|
||||||
|
|
||||||
|
def findMovies(query, max_results=10):
    '''
    search thepiratebay.org for query and return movie torrents

    follows the "next" link for at most three result pages and returns a
    list of (title, url, '') tuples for rows of category 201
    '''
    site = "http://thepiratebay.org"
    rowRe = re.compile('''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>''', re.DOTALL)
    nextRe = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>')
    results = []
    pending = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
    pagesLeft = 3
    while pending and pagesLeft > 0:
        pagesLeft -= 1
        url = pending[0]
        # "next" links may be relative; make them absolute
        if not url.startswith('http'):
            if not url.startswith('/'):
                url = "/" + url
            url = site + url
        data = _readUrlUnicode(url, timeout=cache_timeout)
        for torrentType, path, title in rowRe.findall(data):
            torrentLink = site + path
            torrentTitle = decodeHtml(title)
            # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
            if torrentType in ['201']:
                results.append((torrentTitle, torrentLink, ''))
            if len(results) >= max_results:
                return results
        pending = nextRe.findall(data)
    return results
|
||||||
|
|
||||||
|
def findMovieByImdb(imdb):
    '''
    search torrents by imdb id: findMovies() for 'tt' + normalized id
    '''
    query = "tt" + normalizeImdbId(imdb)
    return findMovies(query)
|
||||||
|
|
||||||
|
def getId(piratebayId):
    '''
    reduce a thepiratebay url to the torrent id it contains

    input matching neither "tor/<digits>" nor "torrent/<digits>" is
    returned unchanged, apart from dropping the torrents.thepiratebay.org
    prefix
    '''
    if piratebayId.startswith('http://torrents.thepiratebay.org/'):
        piratebayId = piratebayId.split('org/')[1]
    for pattern in ("tor/(\d+)", "torrent/(\d+)"):
        found = findRe(piratebayId, pattern)
        if found:
            piratebayId = found
    return piratebayId
|
||||||
|
|
||||||
|
def exists(piratebayId):
    '''
    ask ox.net.exists() whether the torrent page for piratebayId exists
    '''
    url = "http://thepiratebay.org/torrent/%s" % getId(piratebayId)
    return ox.net.exists(url)
|
||||||
|
|
||||||
|
def getData(piratebayId):
    '''
    scrape the torrent page of piratebayId into a dict

    returns None when the page has no recognizable title; otherwise the
    dict holds id, domain, links, title, imdbId, every <dt>/<dd> pair of
    the page (some keys renamed via _key_map), description and the parsed
    torrent file as 'torrent_info'
    '''
    _key_map = {
        'spoken language(s)': u'language',
        'texted language(s)': u'subtitle language',
        'by': u'uploader',
        'leechers': 'leecher',
        'seeders': 'seeder',
    }
    piratebayId = getId(piratebayId)
    commentLink = 'http://thepiratebay.org/torrent/%s' % piratebayId
    torrent = {
        u'id': piratebayId,
        u'domain': 'thepiratebay.org',
        u'comment_link': commentLink,
    }

    data = _readUrlUnicode(commentLink)
    torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
    if not torrent[u'title']:
        return None
    torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
    quotedTitle = quote(torrent['title'].encode('utf-8'))
    torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quotedTitle)
    detailRe = re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL)
    for label, content in detailRe.findall(data):
        label = label.lower().strip()
        torrent[_key_map.get(label, label)] = decodeHtml(stripTags(content.strip()))
    description = findRe(data, '<div class="nfo">(.*?)</div>')
    if description:
        description = normalizeNewlines(decodeHtml(stripTags(description))).strip()
    torrent[u'description'] = description
    # fetch and parse the .torrent file itself
    torrent[u'torrent_info'] = getTorrentInfo(_readUrl(torrent[u'torrent_link']))
    return torrent
|
||||||
|
|
||||||
|
class Thepiratebay(Torrent):
    '''
    Torrent filled from getData(); stays an empty dict if getData()
    returns nothing

    >>> Thepiratebay('123')
    {}

    >>> Thepiratebay('3951349')['infohash']
    '4e84415d36ed7b54066160c05a0b0f061898d12b'
    '''
    def __init__(self, piratebayId):
        self.data = getData(piratebayId)
        if self.data:
            Torrent.__init__(self)
            # drop the timezone suffix (' GMT' / ' +xxxx') before parsing
            uploaded = self.data['uploaded'].replace(' GMT', '')
            uploaded = uploaded.split(' +')[0]
            self['published'] = datetime.strptime(uploaded, "%Y-%m-%d %H:%M:%S")
|
||||||
|
|
37
ox/web/torrent.py
Normal file
37
ox/web/torrent.py
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
from ox import intValue
|
||||||
|
|
||||||
|
|
||||||
|
class Torrent(dict):
    '''
    dict of normalized torrent metadata

    subclasses assign self.data before calling Torrent.__init__(), which
    copies the known keys from it: string keys default to u'', int keys to
    -1, dict keys to {} and list keys to []. infohash, size, announce and
    files are then derived from data['torrent_info']

    >>> Torrent()
    {'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
    '''
    _string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
                    'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
    _int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
    _dict_keys = ('torrent_info', )
    _list_keys = ()
    # class-level default, shared by every instance that does not set its
    # own data - __init__ must therefore never hand out its inner objects
    data = {'torrent_info': {}}

    def __init__(self):
        data = self.data
        for key in self._string_keys:
            self[key] = data.get(key, u'')
        for key in self._dict_keys:
            # copy, so changing one instance cannot leak into the class
            # default or into other instances
            self[key] = dict(data.get(key) or {})
        for key in self._list_keys:
            self[key] = list(data.get(key) or [])
        for key in self._int_keys:
            value = data.get(key, -1)
            if not isinstance(value, int):
                value = int(intValue(value))
            self[key] = value
        # tolerate data without 'torrent_info' instead of raising KeyError
        info = data.get('torrent_info') or {}
        self['infohash'] = info.get('hash', '')
        self['size'] = info.get('size', -1)
        self['announce'] = info.get('announce', '')
        if 'files' in info:
            self['files'] = len(info['files'])
        else:
            self['files'] = 1
|
||||||
|
|
32
ox/web/tv.py
Normal file
32
ox/web/tv.py
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
|
from ox import stripTags, findRe
|
||||||
|
from ox.cache import readUrlUnicode
|
||||||
|
|
||||||
|
|
||||||
|
def getEpisodeData(url):
    '''
    parses information on tv.com episode pages
    returns dict with title, show, description and episode score, plus
    season, episode and air date when the page lists them
    example:
        getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')

    NOTE(review): the whitespace inside the patterns below should be
    checked against the live markup
    '''
    data = readUrlUnicode(url)
    summary = findRe(data, 'div id="main-col">.*?<div>(.*?)</div')
    r = {}
    r['description'] = stripTags(summary.split('\r')[0])
    r['show'] = findRe(data, '<h1>(.*?)</h1>')
    r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
    #episode score
    r['episode score'] = findRe(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')

    match = re.compile('Episode Number: (\d*?) Season Num: (\d*?) First Aired: (.*?)  ').findall(data)
    if match:
        episode, season, aired = match[0]
        r['season'] = int(season)
        r['episode'] = int(episode)
        #'Wednesday September 29, 2004' -> 2004-09-29
        r['air date'] = time.strftime('%Y-%m-%d', time.strptime(aired, '%A %B %d, %Y'))
    return r
|
||||||
|
|
120
ox/web/wikipedia.py
Normal file
120
ox/web/wikipedia.py
Normal file
|
@ -0,0 +1,120 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
from urllib import urlencode
|
||||||
|
|
||||||
|
import simplejson
|
||||||
|
from ox.cache import readUrl, readUrlUnicode
|
||||||
|
from ox import findRe, decodeHtml
|
||||||
|
|
||||||
|
|
||||||
|
def getId(url):
    '''
    return whatever follows the last '/' of url (the whole url if it has
    no '/')
    '''
    return url.rsplit("/", 1)[-1]
|
||||||
|
|
||||||
|
def getUrl(id):
    '''
    return the english wikipedia article url for the page name id
    '''
    return "http://en.wikipedia.org/wiki/" + "%s" % id
|
||||||
|
|
||||||
|
|
||||||
|
def getMovieId(title, director='', year=''):
    '''
    search for '"<title>" film <director> <year>' and return the url of
    the first hit, '' if there is none
    '''
    matches = find('"%s" film %s %s' % (title, director, year), 1)
    if not matches:
        return ''
    return matches[0][1]
|
||||||
|
|
||||||
|
def getUrlByImdbId(imdbId):
    '''
    search for the quoted imdbId and return the url of the first hit,
    "" if there is none
    '''
    matches = find('"%s"'% imdbId)
    if not matches:
        return ""
    return matches[0][1]
|
||||||
|
|
||||||
|
def getUrlByImdb(imdbId):
    '''
    deprecated, use getUrlByImdbId()
    '''
    url = getUrlByImdbId(imdbId)
    return url
|
||||||
|
|
||||||
|
def getUrlByAllmovieId(allmovieId):
    '''
    search for '"amg_id = 1:<allmovieId>"' and return the url of the
    first hit, '' if there is none
    '''
    matches = find('"amg_id = 1:%s"'% allmovieId)
    if not matches:
        return ''
    return matches[0][1]
|
||||||
|
|
||||||
|
def getWikiData(wikipediaUrl):
    '''
    fetch an article via index.php with action=raw and return the result
    of readUrlUnicode()
    '''
    rawUrl = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
    return readUrlUnicode("%s&action=raw" % rawUrl)
|
||||||
|
|
||||||
|
def getMovieData(wikipediaUrl):
    '''
    parse the film infobox of an article into a dict

    wikipediaUrl may be a full url or just the page name. every
    "key = value" row of the {{Infobox Film}} template becomes an entry;
    imdb_id, amg_id, rottentomatoes_id, google_video_id and title_sort are
    added when the matching templates occur in the article text.
    rows containing more than one '=' are skipped
    '''
    if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl)
    data = getWikiData(wikipediaUrl)
    filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''')
    filmbox = {}
    # rows are separated by a leading '|' or, in some articles, a trailing one
    _box = filmbox_data.strip().split('\n|')
    if len(_box) == 1:
        _box = _box[0].split('|\n')
    for row in _box:
        d = row.split('=')
        if len(d) == 2:
            key = d[0].strip()
            # startswith() is safe for an empty key, where key[0] raised
            # IndexError
            if key.startswith('|'):
                key = key[1:]
            value = d[1].strip()
            filmbox[key] = value
    if 'imdb title' in data:
        filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|')
    elif 'imdb episode' in data:
        filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|')
    if 'Amg movie' in data:
        filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|')
    # the id may also come from an infobox row, written as "1:<id>"
    if 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'):
        filmbox['amg_id'] = filmbox['amg_id'][2:]

    if 'rotten-tomatoes' in data:
        filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|')
        if not filmbox['rottentomatoes_id']:
            filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|')
    if 'google video' in data:
        filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)\|')
    if 'DEFAULTSORT' in data:
        filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
    return filmbox
|
||||||
|
|
||||||
|
def getImageUrl(name):
    '''
    load the Image:<name> page and return the first
    http://upload.wikimedia.org/ link found on it
    '''
    page = readUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name)
    return findRe(page, 'href="(http://upload.wikimedia.org/.*?)"')
|
||||||
|
|
||||||
|
def getPosterUrl(wikipediaUrl):
    '''
    return the url of the infobox image of an article, '' if the infobox
    has no 'image' entry; accepts a full url or a page name
    '''
    if not wikipediaUrl.startswith('http'):
        wikipediaUrl = getUrl(wikipediaUrl)
    filmbox = getMovieData(wikipediaUrl)
    if 'image' not in filmbox:
        return ''
    return getImageUrl(filmbox['image'])
|
||||||
|
|
||||||
|
def getMoviePoster(wikipediaUrl):
    '''
    deprecated, use getPosterUrl()
    '''
    url = getPosterUrl(wikipediaUrl)
    return url
|
||||||
|
|
||||||
|
def getAllmovieId(wikipediaUrl):
    '''
    return the 'amg_id' entry of getMovieData(), '' if it has none
    '''
    return getMovieData(wikipediaUrl).get('amg_id', '')
|
||||||
|
|
||||||
|
def find(query, max_results=10):
    '''
    full-text search through the wikipedia api

    returns a list of (title, url, '') tuples; an empty first response is
    requested once more with timeout=0
    '''
    params = {'action': 'query', 'list':'search', 'format': 'json',
              'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
    url = "http://en.wikipedia.org/w/api.php?" + urlencode(params)
    data = readUrl(url)
    if not data:
        data = readUrl(url, timeout=0)
    result = simplejson.loads(data)
    if not result or 'query' not in result:
        return []
    results = []
    for hit in result['query']['search']:
        title = hit['title']
        link = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
        results.append((title, link, ''))
    return results
|
||||||
|
|
107
ox/web/youtube.py
Normal file
107
ox/web/youtube.py
Normal file
|
@ -0,0 +1,107 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
from urllib import quote, unquote
|
||||||
|
import httplib
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
import re
|
||||||
|
|
||||||
|
import feedparser
|
||||||
|
from ox.cache import readUrl, readUrlUnicode
|
||||||
|
from ox import findString, findRe
|
||||||
|
|
||||||
|
|
||||||
|
def getVideoKey(youtubeId):
    '''
    return the unquoted token found in the get_video_info response for
    youtubeId, False if the response contains none
    '''
    info = readUrl("http://www.youtube.com/get_video_info?&video_id=%s" % youtubeId)
    tokens = re.compile("token=(.+)&thumbnail").findall(info)
    if not tokens:
        return False
    return unquote(tokens[0])
|
||||||
|
|
||||||
|
def getVideoUrl(youtubeId, format='mp4'):
    '''
    build the get_video.php download url for youtubeId

    format is one of '1080p', '720p', 'mp4' or 'high'; any other value
    gives the url without an explicit fmt parameter
    '''
    # one lookup table instead of an if chain: '1080p' used to sit in an
    # "if" of its own, so the else branch of the following chain replaced
    # its url and fmt=37 was never returned
    formats = {'1080p': 37, '720p': 22, 'mp4': 18, 'high': 35}
    youtubeKey = getVideoKey(youtubeId)
    url = "http://youtube.com/get_video.php?video_id=%s&t=%s" % (youtubeId, youtubeKey)
    if format in formats:
        url += "&fmt=%s" % formats[format]
    return url
|
||||||
|
|
||||||
|
def getMovieInfo(youtubeId, video_url_base=None):
    '''
    load the gdata feed of one video and return getInfoFromAtom() for its
    first entry
    '''
    feedUrl = "http://gdata.youtube.com/feeds/api/videos/%s" % youtubeId
    feed = feedparser.parse(readUrl(feedUrl))
    return getInfoFromAtom(feed.entries[0], video_url_base)
|
||||||
|
|
||||||
|
def getInfoFromAtom(entry, video_url_base=None):
    '''
    turn one feed entry into an info dict

    with video_url_base the flv/mp4 links point below that base, otherwise
    they are built with getVideoUrl() for every known format
    '''
    info = {
        'title': entry['title'],
        'description': entry['description'],
        'author': entry['author'],
    }
    #info['published'] = entry['published_parsed']
    if 'media_keywords' in entry:
        info['keywords'] = entry['media_keywords'].split(', ')
    info['url'] = entry['links'][0]['href']
    videoId = findString(info['url'], "/watch?v=")
    info['id'] = videoId
    info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % videoId
    if video_url_base:
        for extension in ('flv', 'mp4'):
            info[extension] = "%s/%s.%s" % (video_url_base, videoId, extension)
    else:
        # (dict key, getVideoUrl format) in the original call order
        for key, format in (('flv', 'flv'), ('flv_high', 'high'), ('mp4', 'mp4'),
                            ('720p', '720p'), ('1080p', '1080p')):
            info[key] = getVideoUrl(videoId, format)
    info['embed'] = '<object width="425" height="355"><param name="movie" value="http://www.youtube.com/v/%s&hl=en"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/%s&hl=en" type="application/x-shockwave-flash" wmode="transparent" width="425" height="355"></embed></object>' % (videoId, videoId)
    return info
|
||||||
|
|
||||||
|
def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None):
    '''
    search the gdata video feed and return a list of info dicts, one per
    entry as built by getInfoFromAtom(), stopping once max_results is
    reached
    '''
    url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (quote(query), orderBy, offset, max_results)
    feed = feedparser.parse(readUrlUnicode(url))
    videos = []
    for entry in feed.entries:
        videos.append(getInfoFromAtom(entry, video_url_base))
        if len(videos) >= max_results:
            break
    return videos
|
||||||
|
|
||||||
|
'''
|
||||||
|
def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None):
|
||||||
|
url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query)
|
||||||
|
data = readUrlUnicode(url)
|
||||||
|
regx = re.compile(' <a href="/watch.v=(.*?)" title="(.*?)" ')
|
||||||
|
regx = re.compile('<a href="/watch\?v=(\w*?)" ><img src="(.*?)" class="vimg120" title="(.*?)" alt="video">')
|
||||||
|
id_title = regx.findall(data)
|
||||||
|
data_flat = data.replace('\n', ' ')
|
||||||
|
videos = {}
|
||||||
|
for video in id_title:
|
||||||
|
vid = video[0]
|
||||||
|
if vid not in videos:
|
||||||
|
v = dict()
|
||||||
|
v['id'] = vid
|
||||||
|
v['link'] = "http//youtube.com/watch.v=%s" % v['id']
|
||||||
|
v['title'] = video[2].strip()
|
||||||
|
if video_url_base:
|
||||||
|
v['video_link'] = "%s/%s" % (video_url_base, v['id'])
|
||||||
|
else:
|
||||||
|
v['video_url'] = getVideoUrl(v['id'])
|
||||||
|
v['description'] = findRe(data, 'BeginvidDesc%s">(.*?)</span>' % v['id']).strip().replace('<b>', ' ').replace('</b>', '')
|
||||||
|
v['thumbnail'] = video[1]
|
||||||
|
videos[vid] = v
|
||||||
|
if len(videos) >= max_results:
|
||||||
|
return videos.values()
|
||||||
|
return videos.values()
|
||||||
|
'''
|
||||||
|
|
Loading…
Reference in a new issue