depend on ox, install as ox.web, migrate getUrl to readUrl

Author: j · 2009-10-12 13:47:43 +02:00
parent d2849d44ef
commit 16eeaf8b25
28 changed files with 169 additions and 172 deletions
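
The whole change is mechanical: the oxlib package becomes ox, the oxweb package is installed as the ox.web subpackage, and the cached fetch helpers getUrl/getUrlUnicode are renamed readUrl/readUrlUnicode. A minimal before/after sketch of a typical call site (the URL is illustrative):

    # before: oxlib, getUrlUnicode
    from oxlib.cache import getUrlUnicode
    html = getUrlUnicode('http://example.com/page')

    # after: ox, readUrlUnicode
    from ox.cache import readUrlUnicode
    html = readUrlUnicode('http://example.com/page')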

README
View file

@@ -2,7 +2,7 @@ python-oxweb the internet is a dict
 Depends:
 python2.5
-python-oxlib (bzr branch http://code.0xdb.org/python-oxlib)
+python-ox (bzr branch http://code.0xdb.org/python-ox)
 python-beautifulsoup (http://www.crummy.com/software/BeautifulSoup/)
 python-feedparser (http://www.feedparser.org/)
 (there seam to be some issues if not using the one from ubuntu/debian)
@@ -17,4 +17,4 @@ Install:
 }
 Test:
-nosetests --with-doctest oxweb
+nosetests --with-doctest web

View file

@@ -1 +1 @@
-oxlib
+ox

View file

@@ -19,8 +19,8 @@ setup(
 url="http://code.0xdb.org/oxweb",
 download_url="http://code.0xdb.org/oxweb/download",
 license="GPLv3",
-packages=['oxweb'],
-zip_safe=False,
+package_dir = {'ox.web': 'web'},
+packages=['ox.web'],
 keywords = [
 ],
 classifiers = [
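
With the package_dir mapping above, the repository's web/ directory installs as the ox.web subpackage, so the site modules in the following diffs are reached through ox. A hypothetical consumer (the imdb id is illustrative):

    import ox.web.imdb

    info = ox.web.imdb.getMovieInfo('0133093')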

View file

@ -1,6 +1,6 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8 # encoding: utf-8
__version__ = '0.1.0' __version__ = '1.0.0'
import imdb import imdb
import wikipedia import wikipedia

View file

@@ -3,8 +3,8 @@
 import re
 import time
-from oxlib import stripTags, findRe
-from oxlib.cache import getUrlUnicode
+from ox import stripTags, findRe
+from ox.cache import readUrlUnicode
 def getId(url):
@@ -24,7 +24,7 @@ def getData(id):
 data = {
 "url": getUrl(id)
 }
-html = getUrlUnicode(data["url"])
+html = readUrlUnicode(data["url"])
 data['aka'] = parseList(html, 'AKA')
 data['category'] = findRe(html, 'http://allmovie.com/explore/category/.*?">(.*?)</a>')
 data['countries'] = parseList(html, 'Countries')
@@ -42,11 +42,11 @@ def getData(id):
 data['themes'] = parseList(html, 'Themes')
 data['types'] = parseList(html, 'Types')
 data['year'] = findRe(html, '"http://allmovie.com/explore/year/(.*?)"')
-html = getUrlUnicode("http://allmovie.com/work/%s/cast" % id)
+html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id)
 data['cast'] = parseTable(html)
-html = getUrlUnicode("http://allmovie.com/work/%s/credits" % id)
+html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id)
 data['credits'] = parseTable(html)
-html = getUrlUnicode("http://allmovie.com/work/%s/review" % id)
+html = readUrlUnicode("http://allmovie.com/work/%s/review" % id)
 data['review'] = parseText(html, 'Review')
 return data

View file

@@ -4,8 +4,6 @@
 import os
 import simplejson
-import oxlib
 def get(key):
 user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))

View file

@@ -2,10 +2,10 @@
 # vi:si:et:sw=4:sts=4:ts=4
 import re
-import oxlib.cache
-from oxlib.cache import getUrlUnicode
-from oxlib.html import stripTags
-from oxlib.text import findRe, removeSpecialCharacters
+import ox.cache
+from ox.cache import readUrlUnicode
+from ox.html import stripTags
+from ox.text import findRe, removeSpecialCharacters
 import imdb
@@ -30,9 +30,9 @@ def getData(id):
 "url": getUrl(id)
 }
 try:
-html = getUrlUnicode(data["url"])
+html = readUrlUnicode(data["url"])
 except:
-html = oxlib.cache.getUrl(data["url"])
+html = ox.cache.getUrl(data["url"])
 data["number"] = findRe(html, "<p class=\"spinenumber\">(.*?)</p>")
 data["title"] = findRe(html, "<h2 class=\"movietitle\">(.*?)</h2>")
 data["director"] = findRe(html, "<h2 class=\"director\">(.*?)</h2>")
@@ -48,7 +48,7 @@ def getData(id):
 if not "/boxsets/" in result:
 data["posters"] = [result]
 else:
-html_ = getUrlUnicode(result)
+html_ = readUrlUnicode(result)
 result = findRe(html_, "<a href=\"http://www.criterion.com/films/%s\">(.*?)</a>" % id)
 result = findRe(result, "src=\"(.*?)\"")
 data["posters"] = [result.replace("_w100", "")]
@@ -64,7 +64,7 @@ def getData(id):
 def getIds():
 ids = []
-html = getUrlUnicode("http://www.criterion.com/library/dvd")
+html = readUrlUnicode("http://www.criterion.com/library/dvd")
 results = re.compile("page=(.*?)\"").findall(html)
 pages = int(results[len(results) - 2])
 for page in range(pages, 0, -1):
@@ -74,13 +74,13 @@ def getIds():
 def getIdsByPage(page):
 ids = []
-html = getUrlUnicode("http://www.criterion.com/library/dvd?page=%s" % page)
+html = readUrlUnicode("http://www.criterion.com/library/dvd?page=%s" % page)
 results = re.compile("films/(.*?)\"").findall(html)
 for result in results:
 ids.append(result)
 results = re.compile("boxsets/(.*?)\"").findall(html)
 for result in results:
-html = getUrlUnicode("http://www.criterion.com/boxsets/" + result)
+html = readUrlUnicode("http://www.criterion.com/boxsets/" + result)
 results = re.compile("films/(.*?)\"").findall(html)
 for result in results:
 ids.append(result)

View file

@@ -2,7 +2,7 @@
 # vi:si:et:sw=4:sts=4:ts=4
 import re
 from urllib import unquote
-from oxlib.cache import getUrl
+from ox.cache import readUrl
 def getVideoUrl(url):
@@ -13,7 +13,7 @@ def getVideoUrl(url):
 >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?key')[0]
 'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv'
 '''
-data = getUrl(url)
+data = readUrl(url)
 video = re.compile('''video", "(.*?)"''').findall(data)
 for v in video:
 v = unquote(v).split('@@')[0]

View file

@@ -3,8 +3,8 @@
 import re
 import time
-from oxlib import stripTags, findRe
-from oxlib.cache import getUrlUnicode
+from ox import stripTags, findRe
+from ox.cache import readUrlUnicode
 import google
@@ -21,7 +21,7 @@ def getShowUrl(title):
 return None
 def getShowData(url):
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 r = {}
 r['title'] = stripTags(findRe(data, '<h1>(.*?)</h1>'))
 r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')

View file

@@ -10,8 +10,8 @@ import Queue
 import simplejson
-import oxlib
-from oxlib import stripTags
+import ox
+from ox import stripTags
 '''
@@ -30,15 +30,15 @@ FIXME: how search depper than first page?
 DEFAULT_MAX_RESULTS = 10
 DEFAULT_TIMEOUT = 24*60*60
-def getUrl(url, data=None, headers=oxlib.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
-return oxlib.cache.getUrl(url, data, headers, timeout)
+def readUrl(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
+return ox.cache.readUrl(url, data, headers, timeout)
 def quote_plus(s):
 return urllib.quote_plus(s.encode('utf-8'))
 def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
 url = "http://www.google.com/search?q=%s" % quote_plus(query)
-data = getUrl(url, timeout=timeout)
+data = readUrl(url, timeout=timeout)
 link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
 r'.*?(?:<br>|<table.*?>)' + \
 r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
@@ -52,6 +52,6 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
 def _find(query):
 url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s' % quote_plus(query)
-results = simplejson.loads(getUrlUnicode(url))['responseData']['results']
+results = simplejson.loads(ox.cache.readUrlUnicode(url))['responseData']['results']
 return results

View file

@@ -8,19 +8,19 @@ import time
 from BeautifulSoup import BeautifulSoup
 import chardet
-import oxlib
-from oxlib import stripTags, decodeHtml, findRe, findString
-import oxlib.cache
-from oxlib.normalize import normalizeTitle, normalizeImdbId
-from oxlib import *
+import ox
+from ox import stripTags, decodeHtml, findRe, findString
+import ox.cache
+from ox.normalize import normalizeTitle, normalizeImdbId
+from ox import *
 import google
 '''
 never timeout imdb data, to update cache remove data from cache folder
 '''
-def getUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1):
-return oxlib.cache.getUrlUnicode(url, data, headers, timeout)
+def readUrlUnicode(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=-1):
+return ox.cache.readUrlUnicode(url, data, headers, timeout)
 '''
 check if result is valid while updating
@@ -28,8 +28,8 @@ def validate(result, header):
 return header['status'] == u'200'
 try:
-d = oxlib.cache.getUrlUnicode(url, data, headers, timeout=0, valid=validate)
-except oxlib.cache.InvalidResult, e:
+d = ox.cache.readUrlUnicode(url, data, headers, timeout=0, valid=validate)
+except ox.cache.InvalidResult, e:
 print e.headers
 '''
@@ -76,7 +76,7 @@ def getRawMovieData(imdbId):
 return data
 def getMovieInfo(imdbId):
-data = getUrlUnicode(getUrlBase(imdbId))
+data = readUrlUnicode(getUrlBase(imdbId))
 info = dict()
 info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
 if info['poster'] and '_V' in info['poster']:
@@ -246,7 +246,7 @@ def getMovieAKATitles(imdbId):
 (u'Women of the Night', u'(undefined)')]
 '''
 url = "%sreleaseinfo" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 titles = findRe(data, 'name="akas".*?<table.*?>(.*?)</table>')
 titles = re.compile("td>(.*?)</td>\n\n<td>(.*)</td>").findall(titles)
 return titles
@@ -268,7 +268,7 @@ def creditList(data, section=None):
 def getMovieCredits(imdbId):
 credits = dict()
 url = "%sfullcredits" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 groups = data.split('<h5>')
 for g in groups:
 section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
@@ -278,7 +278,7 @@ def getMovieCredits(imdbId):
 def getMovieTrailers(imdbId):
 url = "%strailers" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 soup = BeautifulSoup(data)
 videos = soup('div', {'class':"video-gallery"})
 trailers = []
@@ -288,27 +288,27 @@ def getMovieTrailers(imdbId):
 url = 'http://www.imdb.com' + a['href']
 videoId = findRe(url, '/(vi\d*?)/')
 iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
-iframe = getUrlUnicode(iframeUrl)
+iframe = readUrlUnicode(iframeUrl)
 videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
 trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
 return trailers
 def getMovieQuotes(imdbId):
 url = "%squotes" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(findString(data, '<a name="q'))
 quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
 return quotes
 def getMoviePlot(imdbId):
 url = "%splotsummary" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 plot = findRe(data, '<p class="plotpar">(.*?)<i>').split('</p>')[0]
 return plot.strip()
 def getMovieTechnical(imdbId):
 url = "%stechnical" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 results = {}
 for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
 results[t[0].strip()] = t[1].strip()
@@ -316,7 +316,7 @@ def getMovieTechnical(imdbId):
 def getMovieCompanyCredits(imdbId):
 url = "%scompanycredits" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 results = {}
 for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
 results[field.strip()] = []
@@ -326,7 +326,7 @@ def getMovieCompanyCredits(imdbId):
 def getMovieLocations(imdbId):
 url = "%slocations" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 locations = re.compile('<dt><a href="/List.*?>(.*?)</a></dt>').findall(data)
 return locations
@@ -334,7 +334,7 @@ def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
 photos = {}
 for key in keys:
 url = "%smediaindex?refine=%s" % (getUrlBase(imdbId), key)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 photos[key] = {}
 for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
 img= "%s.jpg" % s[1].split('._V')[0]
@@ -358,7 +358,7 @@ def getMoviePosters(imdbId):
 def getMovieTrivia(imdbId):
 url = "%strivia" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 trivia = re.compile('<li>(.*?)</li>').findall(data)
 def clean(t):
 t = decodeHtml(t)
@@ -371,7 +371,7 @@ def getMovieTrivia(imdbId):
 def getMovieConnections(imdbId):
 url = "%smovieconnections" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 connections={}
 for c in re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL).findall(data):
 connections[unicode(c[0])] = re.compile('''<a href="/title/tt(\d{7})/">''').findall(c[1])
@@ -379,7 +379,7 @@ def getMovieConnections(imdbId):
 def getMovieKeywords(imdbId):
 url = "%skeywords" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 keywords = []
 for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data):
 keyword = decodeHtml(keyword)
@@ -389,7 +389,7 @@ def getMovieKeywords(imdbId):
 def getMovieExternalReviews(imdbId):
 url = "%sexternalreviews" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 _reviews = re.compile('<li><a href="(.*?)">(.*?)</a></li>').findall(data)
 reviews = {}
 for r in _reviews:
@@ -430,7 +430,7 @@ def _parseDate(d):
 def getMovieReleaseDates(imdbId):
 url = "%sreleaseinfo" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 releasedates = []
 regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>'''
@@ -468,7 +468,7 @@ def getMovieFlimingDates(imdbId):
 def getMovieBusiness(imdbId):
 url = "%sbusiness" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 business = {}
 for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
 key = stripTags(r[0]).strip().lower()
@@ -478,7 +478,7 @@ def getMovieBusiness(imdbId):
 def getMovieEpisodes(imdbId):
 url = "%sepisodes" % getUrlBase(imdbId)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 episodes = {}
 regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
 for r in re.compile(regexp, re.DOTALL).findall(data):
@@ -514,7 +514,7 @@ class IMDb:
 self.pageUrl = getUrlBase(imdbId)
 def getPage(self):
-return getUrlUnicode(self.pageUrl)
+return readUrlUnicode(self.pageUrl)
 def parse_raw_value(self, key, value):
 if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
@@ -682,10 +682,10 @@ def guess(title, director=''):
 search = 'site:imdb.com "%s"' % title
 for (name, url, desc) in google.find(search, 2):
 if url.startswith('http://www.imdb.com/title/tt'):
-return normalizeImdbId(int(oxlib.intValue(url)))
+return normalizeImdbId(int(ox.intValue(url)))
 try:
-req = urllib2.Request(imdb_url, None, oxlib.net.DEFAULT_HEADERS)
+req = urllib2.Request(imdb_url, None, ox.net.DEFAULT_HEADERS)
 u = urllib2.urlopen(req)
 data = u.read()
 return_url = u.url
@@ -700,7 +700,7 @@ def guess(title, director=''):
 return imdb_id
 imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
-req = urllib2.Request(imdb_url, None, oxlib.net.DEFAULT_HEADERS)
+req = urllib2.Request(imdb_url, None, ox.net.DEFAULT_HEADERS)
 u = urllib2.urlopen(req)
 data = u.read()
 return_url = u.url
@@ -737,7 +737,7 @@ def getEpisodeData(title, episode, show_url = None):
 def getPersonData(imdbId):
 imdbId = normalizeImdbId(imdbId)
 url = u'http://www.imdb.com/name/nm%s/' % imdbId
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 info = dict()
 info['name'] = findRe(data, u'<title>(.*?)</title>')
 filmo = data.split(u'<h3>Additional Details</h3>')[0]

View file

@@ -2,9 +2,9 @@
 # encoding: utf-8
 import re
-from oxlib.cache import getUrlUnicode
-from oxlib.html import stripTags
-from oxlib.text import findRe
+from ox.cache import readUrlUnicode
+from ox.html import stripTags
+from ox.text import findRe
 import imdb
@@ -22,7 +22,7 @@ def getData(id):
 data = {
 'url': getUrl(id)
 }
-html = getUrlUnicode(data['url'])
+html = readUrlUnicode(data['url'])
 data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ')
 data['title'] = stripTags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
 data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
@@ -31,11 +31,11 @@ def getData(id):
 for result in results:
 result = result.replace('_xlg.html', '.html')
 url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
-html = getUrlUnicode(url)
+html = readUrlUnicode(url)
 result = findRe(html, '<a href = (\w*?_xlg.html)')
 if result:
 url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
-html = getUrlUnicode(url)
+html = readUrlUnicode(url)
 poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
 else:
 poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)" alt='))
@@ -55,7 +55,7 @@ def getId(url):
 def getIds():
 ids = []
-html = getUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 60*60)
+html = readUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 60*60)
 pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
 for page in range(pages, 0, -1):
 for id in getIdsByPage(page):
@@ -65,7 +65,7 @@ def getIds():
 def getIdsByPage(page):
 ids = []
-html = getUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1)
+html = readUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1)
 results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
 for result in results:
 url = 'http://impawards.com/%s' % result
@@ -74,7 +74,7 @@ def getIdsByPage(page):
 def getUrl(id):
 url = "http://www.impawards.com/%s.html" % id
-html = getUrlUnicode(url)
+html = readUrlUnicode(url)
 if findRe(html, "No Movie Posters on This Page"):
 url = "http://www.impawards.com/%s_ver1.html" % id
 return url

View file

@@ -3,10 +3,10 @@
 import re
 import urllib
-from oxlib.cache import getUrl
-from oxlib.html import decodeHtml, stripTags
-from oxlib.text import findRe
-from oxlib.text import findString
+from ox.cache import readUrl
+from ox.html import decodeHtml, stripTags
+from ox.text import findRe
+from ox.text import findString
 # to sniff itunes traffic, use something like
@@ -113,14 +113,14 @@ class ItunesAlbum:
 def getId(self):
 url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
-xml = getUrl(url, headers = ITUNES_HEADERS)
+xml = readUrl(url, headers = ITUNES_HEADERS)
 id = findRe(xml, 'viewAlbum\?id=(.*?)&')
 return id
 def getData(self):
 data = {'id': self.id}
 url = composeUrl('viewAlbum', {'id': self.id})
-xml = getUrl(url, None, ITUNES_HEADERS)
+xml = readUrl(url, None, ITUNES_HEADERS)
 data['albumName'] = findRe(xml, '<B>(.*?)</B>')
 data['artistName'] = findRe(xml, '<b>(.*?)</b>')
 data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
@@ -144,14 +144,14 @@ class ItunesMovie:
 def getId(self):
 url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
-xml = getUrl(url, headers = ITUNES_HEADERS)
+xml = readUrl(url, headers = ITUNES_HEADERS)
 id = findRe(xml, 'viewMovie\?id=(.*?)&')
 return id
 def getData(self):
 data = {'id': self.id}
 url = composeUrl('viewMovie', {'id': self.id})
-xml = getUrl(url, None, ITUNES_HEADERS)
+xml = readUrl(url, None, ITUNES_HEADERS)
 f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
 f.write(xml)
 f.close()

View file

@ -1,24 +1,24 @@
import re import re
from oxlib import cache from ox import cache
from oxlib.html import stripTags from ox.html import stripTags
from oxlib.text import findRe from ox.text import findRe
import auth import auth
def _getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None): def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
headers = headers.copy() headers = headers.copy()
headers["Cookie"] = auth.get("karagarga.cookie") headers["Cookie"] = auth.get("karagarga.cookie")
return cache.getUrl(url, data, headers, timeout) return cache.readUrl(url, data, headers, timeout)
def getUrlUnicode(url, timeout=cache.cache_timeout): def readUrlUnicode(url, timeout=cache.cache_timeout):
return cache.getUrlUnicode(url, _getUrl=_getUrl, timeout=timeout) return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
def getData(id): def getData(id):
data = { data = {
"url": getUrl(id) "url": getUrl(id)
} }
html = getUrlUnicode("%s%s" % (data["url"], "&filelist=1")) html = readUrlUnicode("%s%s" % (data["url"], "&filelist=1"))
if 'No torrent with ID' in html: if 'No torrent with ID' in html:
return False return False
data['added'] = stripTags(parseTable(html, 'Added')) data['added'] = stripTags(parseTable(html, 'Added'))
@ -87,7 +87,7 @@ def getId(url):
return url.split("=")[-1] return url.split("=")[-1]
def getTorrent(id): def getTorrent(id):
return _getUrl(getData(id)['torrent']) return readUrl(getData(id)['torrent'])
def getIds(lastId = 20): def getIds(lastId = 20):
lastId = '%s' % lastId lastId = '%s' % lastId
@ -105,7 +105,7 @@ def getIds(lastId = 20):
def getIdsByPage(page): def getIdsByPage(page):
ids = [] ids = []
url = 'http://karagarga.net/browse.php?page=%s&cat=1&sort=added&d=DESC' % page url = 'http://karagarga.net/browse.php?page=%s&cat=1&sort=added&d=DESC' % page
html = getUrlUnicode(url, timeout = 23*60*60) #get new ids once per day html = readUrlUnicode(url, timeout = 23*60*60) #get new ids once per day
strings = html.split('<td width="42" style="padding:0px;">') strings = html.split('<td width="42" style="padding:0px;">')
strings.pop(0) strings.pop(0)
for string in strings: for string in strings:
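
The karagarga module (like thepiratebay.py below) keeps a per-site wrapper around the shared cache: its readUrl injects the site's auth cookie, and the _readUrl= keyword hands that cookie-aware fetch back to the generic unicode helper. A standalone sketch of the pattern, with a placeholder cookie value in place of the auth.get() lookup:

    from ox import cache

    AUTH_COOKIE = 'uid=0; pass=0'  # placeholder; the real module calls auth.get('karagarga.cookie')

    def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS,
                timeout=cache.cache_timeout, valid=None):
        headers = headers.copy()  # don't mutate the shared default headers
        headers['Cookie'] = AUTH_COOKIE
        return cache.readUrl(url, data, headers, timeout)

    def readUrlUnicode(url, timeout=cache.cache_timeout):
        # _readUrl= routes the generic unicode helper through the cookie-aware fetch
        return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)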

View file

@ -1,15 +1,15 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from oxlib.cache import getUrl from ox.cache import readUrl
from oxlib.html import decodeHtml from ox.html import decodeHtml
from oxlib.text import findRe from ox.text import findRe
def getLyrics(title, artist): def getLyrics(title, artist):
html = getUrl('http://lyricsfly.com/api/') html = readUrl('http://lyricsfly.com/api/')
key = findRe(html, '<font color=green><b>(.*?)</b></font>') key = findRe(html, '<font color=green><b>(.*?)</b></font>')
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title) url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
xml = getUrl(url) xml = readUrl(url)
lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com') lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = lyrics.replace('\n', '').replace('\r', '') lyrics = lyrics.replace('\n', '').replace('\r', '')
lyrics = lyrics.replace('[br]', '\n').strip() lyrics = lyrics.replace('[br]', '\n').strip()

View file

@@ -3,14 +3,14 @@
 import re
 from urllib import quote
-from oxlib.cache import getUrl, getUrlUnicode
-from oxlib import findRe, decodeHtml, stripTags
+from ox.cache import readUrl, readUrlUnicode
+from ox import findRe, decodeHtml, stripTags
 def getMetacriticShowUrl(title):
 title = quote(title)
 url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
-data = getUrl(url)
+data = readUrl(url)
 return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
 def getData(title, url=None):
@@ -18,7 +18,7 @@ def getData(title, url=None):
 url = getMetacriticShowUrl(title)
 if not url:
 return None
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 score = findRe(data, 'ALT="Metascore: (.*?)"')
 if score:
 score = int(score)

View file

@@ -5,10 +5,10 @@ import re
 import socket
 from urllib import quote
-from oxlib.cache import getUrl, getUrlUnicode
-from oxlib import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
-from oxlib.normalize import normalizeImdbId
-import oxlib
+from ox.cache import readUrl, readUrlUnicode
+from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
+from ox.normalize import normalizeImdbId
+import ox
 from torrent import Torrent
@@ -31,7 +31,7 @@ def findMovie(query, max_results=10):
 '''search for torrents on mininova
 '''
 url = "http://www.mininova.org/search/%s/seeds" % quote(query)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 return _parseResultsPage(data, max_results)
 def findMovieByImdb(imdbId):
@@ -39,7 +39,7 @@ def findMovieByImdb(imdbId):
 '''
 results = []
 imdbId = normalizeImdbId(imdbId)
-data = getUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
+data = readUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
 return _parseResultsPage(data)
 def getId(mininovaId):
@@ -55,7 +55,7 @@ def getId(mininovaId):
 def exists(mininovaId):
 mininovaId = getId(mininovaId)
-data = oxlib.net.getUrl("http://www.mininova.org/tor/%s" % mininovaId)
+data = ox.net.readUrl("http://www.mininova.org/tor/%s" % mininovaId)
 if not data or 'Torrent not found...' in data:
 return False
 if 'tracker</a> of this torrent requires registration.' in data:
@@ -74,7 +74,7 @@ def getData(mininovaId):
 torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
 torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
-data = getUrlUnicode(torrent['comment_link']) + getUrlUnicode(torrent['details_link'])
+data = readUrlUnicode(torrent['comment_link']) + readUrlUnicode(torrent['details_link'])
 if '<h1>Torrent not found...</h1>' in data:
 return None
@@ -89,7 +89,7 @@ def getData(mininovaId):
 torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
 if torrent['description']:
 torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
-t = getUrl(torrent[u'torrent_link'])
+t = readUrl(torrent[u'torrent_link'])
 torrent[u'torrent_info'] = getTorrentInfo(t)
 return torrent

View file

@@ -3,8 +3,8 @@
 import re
-from oxlib.cache import getUrlUnicode
-from oxlib import findRe
+from ox.cache import readUrlUnicode
+from ox import findRe
 def getData(id):
 '''
@@ -24,7 +24,7 @@ def getId(url):
 def getPostersByUrl(url, group=True):
 posters = []
-html = getUrlUnicode(url)
+html = readUrlUnicode(url)
 if url in html:
 if group:
 results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
@@ -32,7 +32,7 @@ def getPostersByUrl(url, group=True):
 posters += getPostersByUrl(result, False)
 results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
 for result in results:
-html = getUrlUnicode(result)
+html = readUrlUnicode(result)
 posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
 return posters

View file

@@ -3,9 +3,9 @@
 import re
 import feedparser
-from oxlib.cache import getUrl, getUrlUnicode
-import oxlib
-from oxlib import langCode2To3, langTo3Code
+from ox.cache import readUrl, readUrlUnicode
+import ox
+from ox import langCode2To3, langTo3Code
 def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
 if len(language) == 2:
@@ -16,7 +16,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
 if language:
 url += "sublanguageid-%s/" % language
 url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
-data = getUrl(url)
+data = readUrl(url)
 if "title>opensubtitles.com - search results</title" in data:
 fd = feedparser.parse(data)
 opensubtitleId = None
@@ -26,16 +26,16 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
 if opensubtitleId:
 opensubtitleId = opensubtitleId[0]
 else:
-opensubtitleId = oxlib.findRe(data, '/en/subtitles/(.*?)/')
+opensubtitleId = ox.findRe(data, '/en/subtitles/(.*?)/')
 return opensubtitleId
 def downloadSubtitleById(opensubtitle_id):
 srts = {}
-data = getUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
+data = readUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
 reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
 for f in re.compile(reg_exp, re.DOTALL).findall(data):
-name = oxlib.stripTags(f[1]).split('\n')[0]
+name = ox.stripTags(f[1]).split('\n')[0]
 url = "http://www.opensubtitles.com%s" % f[0]
-srts[name] = getUrlUnicode(url)
+srts[name] = readUrlUnicode(url)
 return srts

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import oxlib.cache import ox.cache
def getPosterUrl(id): def getPosterUrl(id):
url = "http://0xdb.org/%s/poster.0xdb.jpg" % id url = "http://0xdb.org/%s/poster.0xdb.jpg" % id

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import oxlib.cache import ox.cache
from oxlib.cache import exists from ox.cache import exists
def getPosterUrl(id): def getPosterUrl(id):

View file

@@ -2,11 +2,11 @@
 # vi:si:et:sw=4:sts=4:ts=4
 import re
-from oxlib.cache import getHeaders, getUrl, getUrlUnicode
-from oxlib import findRe, stripTags
+from ox.cache import getHeaders, readUrl, readUrlUnicode
+from ox import findRe, stripTags
-def getUrlByImdb(imdb):
+def readUrlByImdb(imdb):
 #this would also wor but does not cache:
 '''
 from urllib2 import urlopen
@@ -14,7 +14,7 @@ def getUrlByImdb(imdb):
 return u.url
 '''
 url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
-data = getUrl(url)
+data = readUrl(url)
 if "movie_title" in data:
 movies = re.compile('(/m/.*?/)').findall(data)
 if movies:
@@ -22,7 +22,7 @@ def getUrlByImdb(imdb):
 return None
 def getData(url):
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 r = {}
 r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
 if '(' in r['title']:

View file

@@ -6,9 +6,9 @@ import time
 from BeautifulSoup import BeautifulSoup
-import oxlib.cache
-from oxlib.html import decodeHtml, stripTags
-import oxlib.net
+import ox.cache
+from ox.html import decodeHtml, stripTags
+import ox.net
 def getNews(year, month, day):
@@ -23,9 +23,9 @@ def getNews(year, month, day):
 for section in sections:
 url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
 if date == time.strftime('%d.%m.%Y', time.localtime()):
-html = oxlib.net.getUrl(url)
+html = ox.net.readUrl(url)
 else:
-html = oxlib.cache.getUrl(url)
+html = ox.cache.readUrl(url)
 for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
 dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
 try:
@@ -102,11 +102,11 @@ def formatSubsection(string):
 def getIssue(year, week):
 coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
-if not oxlib.net.exists(coverUrl):
+if not ox.net.exists(coverUrl):
 return None
 url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
 contents = []
-soup = BeautifulSoup(oxlib.cache.getUrl(url))
+soup = BeautifulSoup(ox.cache.readUrl(url))
 for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):
 item = str(item)
 page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
@@ -116,7 +116,7 @@ def getIssue(year, week):
 pages = page + 2
 for page in range(1, pages + 10):
 url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
-if oxlib.cache.exists(url):
+if ox.cache.exists(url):
 pageUrl[page] = url
 else:
 pageUrl[page] = ''
@@ -164,7 +164,7 @@ def archiveIssues():
 f.close()
 filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
 if not os.path.exists(filename):
-data = oxlib.cache.getUrl(issue['coverUrl'])
+data = ox.cache.readUrl(issue['coverUrl'])
 f = open(filename, 'w')
 f.write(data)
 f.close()
@@ -173,7 +173,7 @@ def archiveIssues():
 if url:
 filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
 if not os.path.exists(filename):
-data = oxlib.cache.getUrl(url)
+data = ox.cache.readUrl(url)
 f = open(filename, 'w')
 f.write(data)
 f.close()
@@ -244,7 +244,7 @@ def archiveNews():
 f.close()
 filename = dirname + '/' + new['imageUrl'].split('/')[-1]
 if not os.path.exists(filename):
-data = oxlib.cache.getUrl(new['imageUrl'])
+data = ox.cache.readUrl(new['imageUrl'])
 f = open(filename, 'w')
 f.write(data)
 f.close()

View file

@@ -6,10 +6,10 @@ import socket
 from urllib import quote, urlencode
 from urllib2 import URLError
-from oxlib.cache import getUrl, getUrlUnicode
-from oxlib import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
-from oxlib.normalize import normalizeImdbId
-import oxlib
+from ox.cache import readUrl, readUrlUnicode
+from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
+from ox.normalize import normalizeImdbId
+import ox
 from torrent import Torrent
@@ -18,13 +18,13 @@ cache_timeout = 24*60*60 # cache search only for 24 hours
 season_episode = re.compile("S..E..", re.IGNORECASE)
-def _getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
+def _readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
 headers = headers.copy()
 headers['Cookie'] = 'language=en_EN'
-return cache.getUrl(url, data, headers, timeout)
-def _getUrlUnicode(url, timeout=cache.cache_timeout):
-return cache.getUrlUnicode(url, _getUrl=_getUrl, timeout=timeout)
+return cache.readUrl(url, data, headers, timeout)
+def _readUrlUnicode(url, timeout=cache.cache_timeout):
+return cache.readUrlUnicode(url, _readUrl=_readUrl, timeout=timeout)
 def findMovies(query, max_results=10):
 results = []
@@ -37,7 +37,7 @@ def findMovies(query, max_results=10):
 if not url.startswith('/'):
 url = "/" + url
 url = "http://thepiratebay.org" + url
-data = _getUrlUnicode(url, timeout=cache_timeout)
+data = _readUrlUnicode(url, timeout=cache_timeout)
 regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
 for row in re.compile(regexp, re.DOTALL).findall(data):
 torrentType = row[0]
@@ -67,7 +67,7 @@ def getId(piratebayId):
 def exists(piratebayId):
 piratebayId = getId(piratebayId)
-return oxlib.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
+return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
 def getData(piratebayId):
 _key_map = {
@@ -83,7 +83,7 @@ def getData(piratebayId):
 torrent[u'domain'] = 'thepiratebay.org'
 torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
-data = _getUrlUnicode(torrent['comment_link'])
+data = _readUrlUnicode(torrent['comment_link'])
 torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
 if not torrent[u'title']:
 return None
@@ -99,7 +99,7 @@ def getData(piratebayId):
 torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
 if torrent[u'description']:
 torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
-t = _getUrl(torrent[u'torrent_link'])
+t = _readUrl(torrent[u'torrent_link'])
 torrent[u'torrent_info'] = getTorrentInfo(t)
 return torrent

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from oxlib import intValue from ox import intValue
class Torrent(dict): class Torrent(dict):

View file

@@ -3,8 +3,8 @@
 import re
 import time
-from oxlib import stripTags, findRe
-from oxlib.cache import getUrlUnicode
+from ox import stripTags, findRe
+from ox.cache import readUrlUnicode
 def getEpisodeData(url):
@@ -14,7 +14,7 @@ def getEpisodeData(url):
 example:
 getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
 '''
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 r = {}
 r['description'] = stripTags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
 r['show'] = findRe(data, '<h1>(.*?)</h1>')

View file

@@ -3,8 +3,8 @@
 from urllib import urlencode
 import simplejson
-from oxlib.cache import getUrlUnicode
-from oxlib import findRe, decodeHtml
+from ox.cache import readUrl, readUrlUnicode
+from ox import findRe, decodeHtml
 def getId(url):
@@ -44,7 +44,7 @@ def getUrlByAllmovieId(allmovieId):
 def getWikiData(wikipediaUrl):
 url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
 url = "%s&action=raw" % url
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 return data
 def getMovieData(wikipediaUrl):
@@ -83,7 +83,7 @@ def getMovieData(wikipediaUrl):
 return filmbox
 def getImageUrl(name):
-data = getUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name)
+data = readUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name)
 url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
 return url
@@ -103,13 +103,12 @@ def getAllmovieId(wikipediaUrl):
 return data.get('amg_id', '')
 def find(query, max_results=10):
-from oxlib.cache import getUrl
 query = {'action': 'query', 'list':'search', 'format': 'json',
 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
 url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
-data = getUrl(url)
+data = readUrl(url)
 if not data:
-data = getUrl(url, timeout=0)
+data = readUrl(url, timeout=0)
 result = simplejson.loads(data)
 results = []
 if result and 'query' in result:

View file

@@ -6,12 +6,12 @@ import xml.etree.ElementTree as ET
 import re
 import feedparser
-from oxlib.cache import getUrl, getUrlUnicode
-from oxlib import findString, findRe
+from ox.cache import readUrl, readUrlUnicode
+from ox import findString, findRe
 def getVideoKey(youtubeId):
-data = getUrl("http://www.youtube.com/get_video_info?&video_id=%s" % youtubeId)
+data = readUrl("http://www.youtube.com/get_video_info?&video_id=%s" % youtubeId)
 match = re.compile("token=(.+)&thumbnail").findall(data)
 if match:
 return unquote(match[0])
@@ -31,7 +31,7 @@ def getVideoUrl(youtubeId, format='mp4'):
 def getMovieInfo(youtubeId, video_url_base=None):
 url = "http://gdata.youtube.com/feeds/api/videos/%s" % youtubeId
-data = getUrl(url)
+data = readUrl(url)
 fd = feedparser.parse(data)
 return getInfoFromAtom(fd.entries[0], video_url_base)
@@ -59,7 +59,7 @@ def getInfoFromAtom(entry, video_url_base=None):
 def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None):
 query = quote(query)
 url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 fd = feedparser.parse(data)
 videos = []
 for entry in fd.entries:
@@ -72,7 +72,7 @@ def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=No
 '''
 def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None):
 url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query)
-data = getUrlUnicode(url)
+data = readUrlUnicode(url)
 regx = re.compile(' <a href="/watch.v=(.*?)" title="(.*?)" ')
 regx = re.compile('<a href="/watch\?v=(\w*?)" ><img src="(.*?)" class="vimg120" title="(.*?)" alt="video">')
 id_title = regx.findall(data)