depend on ox, install as ox.web, migrate getUrl to readUrl

j 2009-10-12 13:47:43 +02:00
parent d2849d44ef
commit 16eeaf8b25
28 changed files with 169 additions and 172 deletions
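
For consumers, the entire migration reduces to renaming imports: the oxlib package becomes ox, this package installs as ox.web, and the cache helpers change from get* to read*. A minimal before/after sketch, grounded in the diffs below:

    # before: python-oxlib
    from oxlib.cache import getUrlUnicode
    html = getUrlUnicode(url)

    # after: python-ox, with this package installed as ox.web
    from ox.cache import readUrlUnicode
    html = readUrlUnicode(url)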

README
View File

@ -2,7 +2,7 @@ python-oxweb - the internet is a dict
Depends:
python2.5
python-oxlib (bzr branch http://code.0xdb.org/python-oxlib)
python-ox (bzr branch http://code.0xdb.org/python-ox)
python-beautifulsoup (http://www.crummy.com/software/BeautifulSoup/)
python-feedparser (http://www.feedparser.org/)
(there seem to be some issues if not using the one from Ubuntu/Debian)
@ -17,4 +17,4 @@ Install:
}
Test:
nosetests --with-doctest oxweb
nosetests --with-doctest web

View File

@ -1 +1 @@
oxlib
ox

View File

@ -19,8 +19,8 @@ setup(
url="http://code.0xdb.org/oxweb",
download_url="http://code.0xdb.org/oxweb/download",
license="GPLv3",
packages=['oxweb'],
zip_safe=False,
package_dir = {'ox.web': 'web'},
packages=['ox.web'],
keywords = [
],
classifiers = [

View File

@ -1,6 +1,6 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
__version__ = '0.1.0'
__version__ = '1.0.0'
import imdb
import wikipedia
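
With package_dir = {'ox.web': 'web'} in setup.py, the in-tree web/ directory installs as the ox.web subpackage; a hypothetical usage sketch (the IMDb id is illustrative):

    # assumes python-ox and this package are both installed
    from ox.web import imdb
    info = imdb.getMovieInfo('0133093')   # hypothetical IMDb id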

View File

@ -3,8 +3,8 @@
import re
import time
from oxlib import stripTags, findRe
from oxlib.cache import getUrlUnicode
from ox import stripTags, findRe
from ox.cache import readUrlUnicode
def getId(url):
@ -24,7 +24,7 @@ def getData(id):
data = {
"url": getUrl(id)
}
html = getUrlUnicode(data["url"])
html = readUrlUnicode(data["url"])
data['aka'] = parseList(html, 'AKA')
data['category'] = findRe(html, 'http://allmovie.com/explore/category/.*?">(.*?)</a>')
data['countries'] = parseList(html, 'Countries')
@ -42,11 +42,11 @@ def getData(id):
data['themes'] = parseList(html, 'Themes')
data['types'] = parseList(html, 'Types')
data['year'] = findRe(html, '"http://allmovie.com/explore/year/(.*?)"')
html = getUrlUnicode("http://allmovie.com/work/%s/cast" % id)
html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id)
data['cast'] = parseTable(html)
html = getUrlUnicode("http://allmovie.com/work/%s/credits" % id)
html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id)
data['credits'] = parseTable(html)
html = getUrlUnicode("http://allmovie.com/work/%s/review" % id)
html = readUrlUnicode("http://allmovie.com/work/%s/review" % id)
data['review'] = parseText(html, 'Review')
return data
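
A hedged usage sketch for the migrated module (the work id is illustrative):

    import allmovie   # ox.web.allmovie once installed
    data = allmovie.getData('279914')   # hypothetical allmovie work id
    print data['year'], data['countries']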

View File

@ -4,8 +4,6 @@
import os
import simplejson
import oxlib
def get(key):
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))

View File

@ -2,10 +2,10 @@
# vi:si:et:sw=4:sts=4:ts=4
import re
import oxlib.cache
from oxlib.cache import getUrlUnicode
from oxlib.html import stripTags
from oxlib.text import findRe, removeSpecialCharacters
import ox.cache
from ox.cache import readUrlUnicode
from ox.html import stripTags
from ox.text import findRe, removeSpecialCharacters
import imdb
@ -30,9 +30,9 @@ def getData(id):
"url": getUrl(id)
}
try:
html = getUrlUnicode(data["url"])
html = readUrlUnicode(data["url"])
except:
html = oxlib.cache.getUrl(data["url"])
html = ox.cache.getUrl(data["url"])
data["number"] = findRe(html, "<p class=\"spinenumber\">(.*?)</p>")
data["title"] = findRe(html, "<h2 class=\"movietitle\">(.*?)</h2>")
data["director"] = findRe(html, "<h2 class=\"director\">(.*?)</h2>")
@ -48,7 +48,7 @@ def getData(id):
if not "/boxsets/" in result:
data["posters"] = [result]
else:
html_ = getUrlUnicode(result)
html_ = readUrlUnicode(result)
result = findRe(html_, "<a href=\"http://www.criterion.com/films/%s\">(.*?)</a>" % id)
result = findRe(result, "src=\"(.*?)\"")
data["posters"] = [result.replace("_w100", "")]
@ -64,7 +64,7 @@ def getData(id):
def getIds():
ids = []
html = getUrlUnicode("http://www.criterion.com/library/dvd")
html = readUrlUnicode("http://www.criterion.com/library/dvd")
results = re.compile("page=(.*?)\"").findall(html)
pages = int(results[len(results) - 2])
for page in range(pages, 0, -1):
@ -74,13 +74,13 @@ def getIds():
def getIdsByPage(page):
ids = []
html = getUrlUnicode("http://www.criterion.com/library/dvd?page=%s" % page)
html = readUrlUnicode("http://www.criterion.com/library/dvd?page=%s" % page)
results = re.compile("films/(.*?)\"").findall(html)
for result in results:
ids.append(result)
results = re.compile("boxsets/(.*?)\"").findall(html)
for result in results:
html = getUrlUnicode("http://www.criterion.com/boxsets/" + result)
html = readUrlUnicode("http://www.criterion.com/boxsets/" + result)
results = re.compile("films/(.*?)\"").findall(html)
for result in results:
ids.append(result)

View File

@ -2,7 +2,7 @@
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import unquote
from oxlib.cache import getUrl
from ox.cache import readUrl
def getVideoUrl(url):
@ -13,7 +13,7 @@ def getVideoUrl(url):
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?key')[0]
'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv'
'''
data = getUrl(url)
data = readUrl(url)
video = re.compile('''video", "(.*?)"''').findall(data)
for v in video:
v = unquote(v).split('@@')[0]

View File

@ -3,8 +3,8 @@
import re
import time
from oxlib import stripTags, findRe
from oxlib.cache import getUrlUnicode
from ox import stripTags, findRe
from ox.cache import readUrlUnicode
import google
@ -21,7 +21,7 @@ def getShowUrl(title):
return None
def getShowData(url):
data = getUrlUnicode(url)
data = readUrlUnicode(url)
r = {}
r['title'] = stripTags(findRe(data, '<h1>(.*?)</h1>'))
r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')

View File

@ -10,8 +10,8 @@ import Queue
import simplejson
import oxlib
from oxlib import stripTags
import ox
from ox import stripTags
'''
@ -30,15 +30,15 @@ FIXME: how to search deeper than the first page?
DEFAULT_MAX_RESULTS = 10
DEFAULT_TIMEOUT = 24*60*60
def getUrl(url, data=None, headers=oxlib.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
return oxlib.cache.getUrl(url, data, headers, timeout)
def readUrl(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
return ox.cache.readUrl(url, data, headers, timeout)
def quote_plus(s):
return urllib.quote_plus(s.encode('utf-8'))
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
url = "http://www.google.com/search?q=%s" % quote_plus(query)
data = getUrl(url, timeout=timeout)
data = readUrl(url, timeout=timeout)
link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
r'.*?(?:<br>|<table.*?>)' + \
r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
@ -52,6 +52,6 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
def _find(query):
url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s' % quote_plus(query)
results = simplejson.loads(getUrlUnicode(url))['responseData']['results']
results = simplejson.loads(ox.cache.readUrlUnicode(url))['responseData']['results']
return results
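
The module keeps its thin readUrl wrapper so that every query shares ox.net's default headers and the 24-hour cache timeout. A minimal usage sketch:

    import google   # this module
    for name, url, desc in google.find('python-oxweb', max_results=3):
        print url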

View File

@ -8,19 +8,19 @@ import time
from BeautifulSoup import BeautifulSoup
import chardet
import oxlib
from oxlib import stripTags, decodeHtml, findRe, findString
import oxlib.cache
from oxlib.normalize import normalizeTitle, normalizeImdbId
from oxlib import *
import ox
from ox import stripTags, decodeHtml, findRe, findString
import ox.cache
from ox.normalize import normalizeTitle, normalizeImdbId
from ox import *
import google
'''
never time out imdb data; to update the cache, remove the data from the cache folder
'''
def getUrlUnicode(url, data=None, headers=oxlib.cache.DEFAULT_HEADERS, timeout=-1):
return oxlib.cache.getUrlUnicode(url, data, headers, timeout)
def readUrlUnicode(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=-1):
return ox.cache.readUrlUnicode(url, data, headers, timeout)
'''
check if result is valid while updating
@ -28,8 +28,8 @@ def validate(result, header):
return header['status'] == u'200'
try:
d = oxlib.cache.getUrlUnicode(url, data, headers, timeout=0, valid=validate)
except oxlib.cache.InvalidResult, e:
d = ox.cache.readUrlUnicode(url, data, headers, timeout=0, valid=validate)
except ox.cache.InvalidResult, e:
print e.headers
'''
@ -76,7 +76,7 @@ def getRawMovieData(imdbId):
return data
def getMovieInfo(imdbId):
data = getUrlUnicode(getUrlBase(imdbId))
data = readUrlUnicode(getUrlBase(imdbId))
info = dict()
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
if info['poster'] and '_V' in info['poster']:
@ -246,7 +246,7 @@ def getMovieAKATitles(imdbId):
(u'Women of the Night', u'(undefined)')]
'''
url = "%sreleaseinfo" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
titles = findRe(data, 'name="akas".*?<table.*?>(.*?)</table>')
titles = re.compile("td>(.*?)</td>\n\n<td>(.*)</td>").findall(titles)
return titles
@ -268,7 +268,7 @@ def creditList(data, section=None):
def getMovieCredits(imdbId):
credits = dict()
url = "%sfullcredits" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
groups = data.split('<h5>')
for g in groups:
section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
@ -278,7 +278,7 @@ def getMovieCredits(imdbId):
def getMovieTrailers(imdbId):
url = "%strailers" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
soup = BeautifulSoup(data)
videos = soup('div', {'class':"video-gallery"})
trailers = []
@ -288,27 +288,27 @@ def getMovieTrailers(imdbId):
url = 'http://www.imdb.com' + a['href']
videoId = findRe(url, '/(vi\d*?)/')
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
iframe = getUrlUnicode(iframeUrl)
iframe = readUrlUnicode(iframeUrl)
videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
return trailers
def getMovieQuotes(imdbId):
url = "%squotes" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
quotes = re.compile('<b>(.*?)</b>:(.*?)<br>', re.DOTALL).findall(findString(data, '<a name="q'))
quotes = [(q[0].strip(),q[1].strip()) for q in quotes]
return quotes
def getMoviePlot(imdbId):
url = "%splotsummary" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
plot = findRe(data, '<p class="plotpar">(.*?)<i>').split('</p>')[0]
return plot.strip()
def getMovieTechnical(imdbId):
url = "%stechnical" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
results = {}
for t in re.compile('<h5>(.*?)</h5>(.*?)<br/>', re.DOTALL).findall(data):
results[t[0].strip()] = t[1].strip()
@ -316,7 +316,7 @@ def getMovieTechnical(imdbId):
def getMovieCompanyCredits(imdbId):
url = "%scompanycredits" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
results = {}
for field, c in re.compile('<h2>(.*?)</h2><ul>(.*?)</ul>').findall(data):
results[field.strip()] = []
@ -326,7 +326,7 @@ def getMovieCompanyCredits(imdbId):
def getMovieLocations(imdbId):
url = "%slocations" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
locations = re.compile('<dt><a href="/List.*?>(.*?)</a></dt>').findall(data)
return locations
@ -334,7 +334,7 @@ def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
photos = {}
for key in keys:
url = "%smediaindex?refine=%s" % (getUrlBase(imdbId), key)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
photos[key] = {}
for s in re.compile('''<img alt="(.*?)".*?src="(http://ia.media-imdb.com/.*?.jpg)''').findall(data):
img= "%s.jpg" % s[1].split('._V')[0]
@ -358,7 +358,7 @@ def getMoviePosters(imdbId):
def getMovieTrivia(imdbId):
url = "%strivia" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
trivia = re.compile('<li>(.*?)</li>').findall(data)
def clean(t):
t = decodeHtml(t)
@ -371,7 +371,7 @@ def getMovieTrivia(imdbId):
def getMovieConnections(imdbId):
url = "%smovieconnections" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
connections={}
for c in re.compile('''<h5>(.*?)</h5>(.*?)\n\n''', re.DOTALL).findall(data):
connections[unicode(c[0])] = re.compile('''<a href="/title/tt(\d{7})/">''').findall(c[1])
@ -379,7 +379,7 @@ def getMovieConnections(imdbId):
def getMovieKeywords(imdbId):
url = "%skeywords" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
keywords = []
for keyword in re.compile('''<a.*?href="/keyword.*?>(.*?)</a>''').findall(data):
keyword = decodeHtml(keyword)
@ -389,7 +389,7 @@ def getMovieKeywords(imdbId):
def getMovieExternalReviews(imdbId):
url = "%sexternalreviews" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
_reviews = re.compile('<li><a href="(.*?)">(.*?)</a></li>').findall(data)
reviews = {}
for r in _reviews:
@ -430,7 +430,7 @@ def _parseDate(d):
def getMovieReleaseDates(imdbId):
url = "%sreleaseinfo" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
releasedates = []
regexp = '''<tr><td>(.*?)</td>.*?<td align="right">(.*?)</td>.*?<td>(.*?)</td></tr>'''
@ -468,7 +468,7 @@ def getMovieFlimingDates(imdbId):
def getMovieBusiness(imdbId):
url = "%sbusiness" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
business = {}
for r in re.compile('''<h5>(.*?)</h5>(.*?)<br/>.<br/>''', re.DOTALL).findall(data):
key = stripTags(r[0]).strip().lower()
@ -478,7 +478,7 @@ def getMovieBusiness(imdbId):
def getMovieEpisodes(imdbId):
url = "%sepisodes" % getUrlBase(imdbId)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
episodes = {}
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
for r in re.compile(regexp, re.DOTALL).findall(data):
@ -514,7 +514,7 @@ class IMDb:
self.pageUrl = getUrlBase(imdbId)
def getPage(self):
return getUrlUnicode(self.pageUrl)
return readUrlUnicode(self.pageUrl)
def parse_raw_value(self, key, value):
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
@ -682,10 +682,10 @@ def guess(title, director=''):
search = 'site:imdb.com "%s"' % title
for (name, url, desc) in google.find(search, 2):
if url.startswith('http://www.imdb.com/title/tt'):
return normalizeImdbId(int(oxlib.intValue(url)))
return normalizeImdbId(int(ox.intValue(url)))
try:
req = urllib2.Request(imdb_url, None, oxlib.net.DEFAULT_HEADERS)
req = urllib2.Request(imdb_url, None, ox.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
@ -700,7 +700,7 @@ def guess(title, director=''):
return imdb_id
imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
req = urllib2.Request(imdb_url, None, oxlib.net.DEFAULT_HEADERS)
req = urllib2.Request(imdb_url, None, ox.net.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
@ -737,7 +737,7 @@ def getEpisodeData(title, episode, show_url = None):
def getPersonData(imdbId):
imdbId = normalizeImdbId(imdbId)
url = u'http://www.imdb.com/name/nm%s/' % imdbId
data = getUrlUnicode(url)
data = readUrlUnicode(url)
info = dict()
info['name'] = findRe(data, u'<title>(.*?)</title>')
filmo = data.split(u'<h3>Additional Details</h3>')[0]

View File

@ -2,9 +2,9 @@
# encoding: utf-8
import re
from oxlib.cache import getUrlUnicode
from oxlib.html import stripTags
from oxlib.text import findRe
from ox.cache import readUrlUnicode
from ox.html import stripTags
from ox.text import findRe
import imdb
@ -22,7 +22,7 @@ def getData(id):
data = {
'url': getUrl(id)
}
html = getUrlUnicode(data['url'])
html = readUrlUnicode(data['url'])
data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ')
data['title'] = stripTags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
@ -31,11 +31,11 @@ def getData(id):
for result in results:
result = result.replace('_xlg.html', '.html')
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = getUrlUnicode(url)
html = readUrlUnicode(url)
result = findRe(html, '<a href = (\w*?_xlg.html)')
if result:
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = getUrlUnicode(url)
html = readUrlUnicode(url)
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
else:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)" alt='))
@ -55,7 +55,7 @@ def getId(url):
def getIds():
ids = []
html = getUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 60*60)
html = readUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 60*60)
pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
for page in range(pages, 0, -1):
for id in getIdsByPage(page):
@ -65,7 +65,7 @@ def getIds():
def getIdsByPage(page):
ids = []
html = getUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1)
html = readUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1)
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
for result in results:
url = 'http://impawards.com/%s' % result
@ -74,7 +74,7 @@ def getIdsByPage(page):
def getUrl(id):
url = "http://www.impawards.com/%s.html" % id
html = getUrlUnicode(url)
html = readUrlUnicode(url)
if findRe(html, "No Movie Posters on This Page"):
url = "http://www.impawards.com/%s_ver1.html" % id
return url

View File

@ -3,10 +3,10 @@
import re
import urllib
from oxlib.cache import getUrl
from oxlib.html import decodeHtml, stripTags
from oxlib.text import findRe
from oxlib.text import findString
from ox.cache import readUrl
from ox.html import decodeHtml, stripTags
from ox.text import findRe
from ox.text import findString
# to sniff itunes traffic, use something like
@ -113,14 +113,14 @@ class ItunesAlbum:
def getId(self):
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
xml = getUrl(url, headers = ITUNES_HEADERS)
xml = readUrl(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewAlbum\?id=(.*?)&')
return id
def getData(self):
data = {'id': self.id}
url = composeUrl('viewAlbum', {'id': self.id})
xml = getUrl(url, None, ITUNES_HEADERS)
xml = readUrl(url, None, ITUNES_HEADERS)
data['albumName'] = findRe(xml, '<B>(.*?)</B>')
data['artistName'] = findRe(xml, '<b>(.*?)</b>')
data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
@ -144,14 +144,14 @@ class ItunesMovie:
def getId(self):
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
xml = getUrl(url, headers = ITUNES_HEADERS)
xml = readUrl(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewMovie\?id=(.*?)&')
return id
def getData(self):
data = {'id': self.id}
url = composeUrl('viewMovie', {'id': self.id})
xml = getUrl(url, None, ITUNES_HEADERS)
xml = readUrl(url, None, ITUNES_HEADERS)
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
f.write(xml)
f.close()

View File

@ -1,24 +1,24 @@
import re
from oxlib import cache
from oxlib.html import stripTags
from oxlib.text import findRe
from ox import cache
from ox.html import stripTags
from ox.text import findRe
import auth
def _getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
headers = headers.copy()
headers["Cookie"] = auth.get("karagarga.cookie")
return cache.getUrl(url, data, headers, timeout)
return cache.readUrl(url, data, headers, timeout)
def getUrlUnicode(url, timeout=cache.cache_timeout):
return cache.getUrlUnicode(url, _getUrl=_getUrl, timeout=timeout)
def readUrlUnicode(url, timeout=cache.cache_timeout):
return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
def getData(id):
data = {
"url": getUrl(id)
}
html = getUrlUnicode("%s%s" % (data["url"], "&filelist=1"))
html = readUrlUnicode("%s%s" % (data["url"], "&filelist=1"))
if 'No torrent with ID' in html:
return False
data['added'] = stripTags(parseTable(html, 'Added'))
@ -87,7 +87,7 @@ def getId(url):
return url.split("=")[-1]
def getTorrent(id):
return _getUrl(getData(id)['torrent'])
return readUrl(getData(id)['torrent'])
def getIds(lastId = 20):
lastId = '%s' % lastId
@ -105,7 +105,7 @@ def getIds(lastId = 20):
def getIdsByPage(page):
ids = []
url = 'http://karagarga.net/browse.php?page=%s&cat=1&sort=added&d=DESC' % page
html = getUrlUnicode(url, timeout = 23*60*60) #get new ids once per day
html = readUrlUnicode(url, timeout = 23*60*60) #get new ids once per day
strings = html.split('<td width="42" style="padding:0px;">')
strings.pop(0)
for string in strings:
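
Because every Karagarga request needs a session cookie, all fetches go through the module's readUrl/readUrlUnicode pair defined above; a hypothetical call, assuming ~/.ox/auth.json holds a karagarga.cookie entry:

    import karagarga   # this module
    data = karagarga.getData('123456')   # hypothetical torrent id
    if data:
        print data['added']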

View File

@ -1,15 +1,15 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from oxlib.cache import getUrl
from oxlib.html import decodeHtml
from oxlib.text import findRe
from ox.cache import readUrl
from ox.html import decodeHtml
from ox.text import findRe
def getLyrics(title, artist):
html = getUrl('http://lyricsfly.com/api/')
html = readUrl('http://lyricsfly.com/api/')
key = findRe(html, '<font color=green><b>(.*?)</b></font>')
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
xml = getUrl(url)
xml = readUrl(url)
lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = lyrics.replace('\n', '').replace('\r', '')
lyrics = lyrics.replace('[br]', '\n').strip()

View File

@ -3,14 +3,14 @@
import re
from urllib import quote
from oxlib.cache import getUrl, getUrlUnicode
from oxlib import findRe, decodeHtml, stripTags
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, decodeHtml, stripTags
def getMetacriticShowUrl(title):
title = quote(title)
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
data = getUrl(url)
data = readUrl(url)
return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
def getData(title, url=None):
@ -18,7 +18,7 @@ def getData(title, url=None):
url = getMetacriticShowUrl(title)
if not url:
return None
data = getUrlUnicode(url)
data = readUrlUnicode(url)
score = findRe(data, 'ALT="Metascore: (.*?)"')
if score:
score = int(score)

View File

@ -5,10 +5,10 @@ import re
import socket
from urllib import quote
from oxlib.cache import getUrl, getUrlUnicode
from oxlib import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from oxlib.normalize import normalizeImdbId
import oxlib
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from ox.normalize import normalizeImdbId
import ox
from torrent import Torrent
@ -31,7 +31,7 @@ def findMovie(query, max_results=10):
'''search for torrents on mininova
'''
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
return _parseResultsPage(data, max_results)
def findMovieByImdb(imdbId):
@ -39,7 +39,7 @@ def findMovieByImdb(imdbId):
'''
results = []
imdbId = normalizeImdbId(imdbId)
data = getUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
data = readUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
return _parseResultsPage(data)
def getId(mininovaId):
@ -55,7 +55,7 @@ def getId(mininovaId):
def exists(mininovaId):
mininovaId = getId(mininovaId)
data = oxlib.net.getUrl("http://www.mininova.org/tor/%s" % mininovaId)
data = ox.net.readUrl("http://www.mininova.org/tor/%s" % mininovaId)
if not data or 'Torrent not found...' in data:
return False
if 'tracker</a> of this torrent requires registration.' in data:
@ -74,7 +74,7 @@ def getData(mininovaId):
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
data = getUrlUnicode(torrent['comment_link']) + getUrlUnicode(torrent['details_link'])
data = readUrlUnicode(torrent['comment_link']) + readUrlUnicode(torrent['details_link'])
if '<h1>Torrent not found...</h1>' in data:
return None
@ -89,7 +89,7 @@ def getData(mininovaId):
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = getUrl(torrent[u'torrent_link'])
t = readUrl(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent

View File

@ -3,8 +3,8 @@
import re
from oxlib.cache import getUrlUnicode
from oxlib import findRe
from ox.cache import readUrlUnicode
from ox import findRe
def getData(id):
'''
@ -24,7 +24,7 @@ def getId(url):
def getPostersByUrl(url, group=True):
posters = []
html = getUrlUnicode(url)
html = readUrlUnicode(url)
if url in html:
if group:
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
@ -32,7 +32,7 @@ def getPostersByUrl(url, group=True):
posters += getPostersByUrl(result, False)
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
for result in results:
html = getUrlUnicode(result)
html = readUrlUnicode(result)
posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
return posters

View File

@ -3,9 +3,9 @@
import re
import feedparser
from oxlib.cache import getUrl, getUrlUnicode
import oxlib
from oxlib import langCode2To3, langTo3Code
from ox.cache import readUrl, readUrlUnicode
import ox
from ox import langCode2To3, langTo3Code
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
if len(language) == 2:
@ -16,7 +16,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
if language:
url += "sublanguageid-%s/" % language
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
data = getUrl(url)
data = readUrl(url)
if "title>opensubtitles.com - search results</title" in data:
fd = feedparser.parse(data)
opensubtitleId = None
@ -26,16 +26,16 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
if opensubtitleId:
opensubtitleId = opensubtitleId[0]
else:
opensubtitleId = oxlib.findRe(data, '/en/subtitles/(.*?)/')
opensubtitleId = ox.findRe(data, '/en/subtitles/(.*?)/')
return opensubtitleId
def downloadSubtitleById(opensubtitle_id):
srts = {}
data = getUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
data = readUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
for f in re.compile(reg_exp, re.DOTALL).findall(data):
name = oxlib.stripTags(f[1]).split('\n')[0]
name = ox.stripTags(f[1]).split('\n')[0]
url = "http://www.opensubtitles.com%s" % f[0]
srts[name] = getUrlUnicode(url)
srts[name] = readUrlUnicode(url)
return srts
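
An end-to-end sketch under the new names, assuming a seven-digit IMDb id:

    import opensubtitles   # this module
    sub_id = opensubtitles.findSubtitlesByImdb('0133093', parts=1, language='eng')
    if sub_id:
        srts = opensubtitles.downloadSubtitleById(sub_id)   # name -> srt text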

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import oxlib.cache
import ox.cache
def getPosterUrl(id):
url = "http://0xdb.org/%s/poster.0xdb.jpg" % id

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import oxlib.cache
from oxlib.cache import exists
import ox.cache
from ox.cache import exists
def getPosterUrl(id):

View File

@ -2,11 +2,11 @@
# vi:si:et:sw=4:sts=4:ts=4
import re
from oxlib.cache import getHeaders, getUrl, getUrlUnicode
from oxlib import findRe, stripTags
from ox.cache import getHeaders, readUrl, readUrlUnicode
from ox import findRe, stripTags
def getUrlByImdb(imdb):
def readUrlByImdb(imdb):
#this would also work, but does not cache:
'''
from urllib2 import urlopen
@ -14,7 +14,7 @@ def getUrlByImdb(imdb):
return u.url
'''
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
data = getUrl(url)
data = readUrl(url)
if "movie_title" in data:
movies = re.compile('(/m/.*?/)').findall(data)
if movies:
@ -22,7 +22,7 @@ def getUrlByImdb(imdb):
return None
def getData(url):
data = getUrlUnicode(url)
data = readUrlUnicode(url)
r = {}
r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
if '(' in r['title']:

View File

@ -6,9 +6,9 @@ import time
from BeautifulSoup import BeautifulSoup
import oxlib.cache
from oxlib.html import decodeHtml, stripTags
import oxlib.net
import ox.cache
from ox.html import decodeHtml, stripTags
import ox.net
def getNews(year, month, day):
@ -23,9 +23,9 @@ def getNews(year, month, day):
for section in sections:
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
if date == time.strftime('%d.%m.%Y', time.localtime()):
html = oxlib.net.getUrl(url)
html = ox.net.readUrl(url)
else:
html = oxlib.cache.getUrl(url)
html = ox.cache.readUrl(url)
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
try:
@ -102,11 +102,11 @@ def formatSubsection(string):
def getIssue(year, week):
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
if not oxlib.net.exists(coverUrl):
if not ox.net.exists(coverUrl):
return None
url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
contents = []
soup = BeautifulSoup(oxlib.cache.getUrl(url))
soup = BeautifulSoup(ox.cache.readUrl(url))
for item in soup('a', {'href': re.compile('http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=')}):
item = str(item)
page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
@ -116,7 +116,7 @@ def getIssue(year, week):
pages = page + 2
for page in range(1, pages + 10):
url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
if oxlib.cache.exists(url):
if ox.cache.exists(url):
pageUrl[page] = url
else:
pageUrl[page] = ''
@ -164,7 +164,7 @@ def archiveIssues():
f.close()
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
if not os.path.exists(filename):
data = oxlib.cache.getUrl(issue['coverUrl'])
data = ox.cache.readUrl(issue['coverUrl'])
f = open(filename, 'w')
f.write(data)
f.close()
@ -173,7 +173,7 @@ def archiveIssues():
if url:
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
if not os.path.exists(filename):
data = oxlib.cache.getUrl(url)
data = ox.cache.readUrl(url)
f = open(filename, 'w')
f.write(data)
f.close()
@ -244,7 +244,7 @@ def archiveNews():
f.close()
filename = dirname + '/' + new['imageUrl'].split('/')[-1]
if not os.path.exists(filename):
data = oxlib.cache.getUrl(new['imageUrl'])
data = ox.cache.readUrl(new['imageUrl'])
f = open(filename, 'w')
f.write(data)
f.close()

View File

@ -6,10 +6,10 @@ import socket
from urllib import quote, urlencode
from urllib2 import URLError
from oxlib.cache import getUrl, getUrlUnicode
from oxlib import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxlib.normalize import normalizeImdbId
import oxlib
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from ox.normalize import normalizeImdbId
import ox
from torrent import Torrent
@ -18,13 +18,13 @@ cache_timeout = 24*60*60 # cache search only for 24 hours
season_episode = re.compile("S..E..", re.IGNORECASE)
def _getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
def _readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
headers = headers.copy()
headers['Cookie'] = 'language=en_EN'
return cache.getUrl(url, data, headers, timeout)
return cache.readUrl(url, data, headers, timeout)
def _getUrlUnicode(url, timeout=cache.cache_timeout):
return cache.getUrlUnicode(url, _getUrl=_getUrl, timeout=timeout)
def _readUrlUnicode(url, timeout=cache.cache_timeout):
return cache.readUrlUnicode(url, _readUrl=_readUrl, timeout=timeout)
def findMovies(query, max_results=10):
results = []
@ -37,7 +37,7 @@ def findMovies(query, max_results=10):
if not url.startswith('/'):
url = "/" + url
url = "http://thepiratebay.org" + url
data = _getUrlUnicode(url, timeout=cache_timeout)
data = _readUrlUnicode(url, timeout=cache_timeout)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
@ -67,7 +67,7 @@ def getId(piratebayId):
def exists(piratebayId):
piratebayId = getId(piratebayId)
return oxlib.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
def getData(piratebayId):
_key_map = {
@ -83,7 +83,7 @@ def getData(piratebayId):
torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
data = _getUrlUnicode(torrent['comment_link'])
data = _readUrlUnicode(torrent['comment_link'])
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
@ -99,7 +99,7 @@ def getData(piratebayId):
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _getUrl(torrent[u'torrent_link'])
t = _readUrl(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent
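
Like karagarga, this module wraps the cache calls, here to pin the language=en_EN cookie; a hedged sketch (the id is illustrative):

    import thepiratebay   # this module
    data = thepiratebay.getData('3281664')   # hypothetical piratebay id
    if data:
        print data['title'], data['torrent_link']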

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from oxlib import intValue
from ox import intValue
class Torrent(dict):

View File

@ -3,8 +3,8 @@
import re
import time
from oxlib import stripTags, findRe
from oxlib.cache import getUrlUnicode
from ox import stripTags, findRe
from ox.cache import readUrlUnicode
def getEpisodeData(url):
@ -14,7 +14,7 @@ def getEpisodeData(url):
example:
getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
'''
data = getUrlUnicode(url)
data = readUrlUnicode(url)
r = {}
r['description'] = stripTags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['show'] = findRe(data, '<h1>(.*?)</h1>')

View File

@ -3,8 +3,8 @@
from urllib import urlencode
import simplejson
from oxlib.cache import getUrlUnicode
from oxlib import findRe, decodeHtml
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, decodeHtml
def getId(url):
@ -44,7 +44,7 @@ def getUrlByAllmovieId(allmovieId):
def getWikiData(wikipediaUrl):
url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
url = "%s&action=raw" % url
data = getUrlUnicode(url)
data = readUrlUnicode(url)
return data
def getMovieData(wikipediaUrl):
@ -83,7 +83,7 @@ def getMovieData(wikipediaUrl):
return filmbox
def getImageUrl(name):
data = getUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name)
data = readUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name)
url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
return url
@ -103,13 +103,12 @@ def getAllmovieId(wikipediaUrl):
return data.get('amg_id', '')
def find(query, max_results=10):
from oxlib.cache import getUrl
query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
data = getUrl(url)
data = readUrl(url)
if not data:
data = getUrl(url, timeout=0)
data = readUrl(url, timeout=0)
result = simplejson.loads(data)
results = []
if result and 'query' in result:

View File

@ -6,12 +6,12 @@ import xml.etree.ElementTree as ET
import re
import feedparser
from oxlib.cache import getUrl, getUrlUnicode
from oxlib import findString, findRe
from ox.cache import readUrl, readUrlUnicode
from ox import findString, findRe
def getVideoKey(youtubeId):
data = getUrl("http://www.youtube.com/get_video_info?&video_id=%s" % youtubeId)
data = readUrl("http://www.youtube.com/get_video_info?&video_id=%s" % youtubeId)
match = re.compile("token=(.+)&thumbnail").findall(data)
if match:
return unquote(match[0])
@ -31,7 +31,7 @@ def getVideoUrl(youtubeId, format='mp4'):
def getMovieInfo(youtubeId, video_url_base=None):
url = "http://gdata.youtube.com/feeds/api/videos/%s" % youtubeId
data = getUrl(url)
data = readUrl(url)
fd = feedparser.parse(data)
return getInfoFromAtom(fd.entries[0], video_url_base)
@ -59,7 +59,7 @@ def getInfoFromAtom(entry, video_url_base=None):
def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None):
query = quote(query)
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
fd = feedparser.parse(data)
videos = []
for entry in fd.entries:
@ -72,7 +72,7 @@ def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=No
'''
def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None):
url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query)
data = getUrlUnicode(url)
data = readUrlUnicode(url)
regx = re.compile(' <a href="/watch.v=(.*?)" title="(.*?)" ')
regx = re.compile('<a href="/watch\?v=(\w*?)" ><img src="(.*?)" class="vimg120" title="(.*?)" alt="video">')
id_title = regx.findall(data)
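
The same rename applies to the YouTube helpers; a hedged sketch, assuming a standard eleven-character video id:

    import youtube   # this module
    key = youtube.getVideoKey('dQw4w9WgXcQ')   # hypothetical video id
    url = youtube.getVideoUrl('dQw4w9WgXcQ')   # defaults to format='mp4'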