ox gets some tests too

This commit is contained in:
j 2008-05-05 20:33:23 +02:00
parent b3d3f44d20
commit 1b93ae048d
5 changed files with 50 additions and 19 deletions

3
README
View file

@ -6,3 +6,6 @@ Depends:
python-beautifulsoup (http://www.crummy.com/software/BeautifulSoup/)
python-feedparser (http://www.feedparser.org/)
Test:
nosetests --with-doctest ox

View file

@ -3,6 +3,13 @@ from urllib import unquote
from oxutils.cache import getUrl
def getVideoUrl(url):
'''
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv?key=0a710ad6ffbfe980b1252569d16f957313399d0'
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv?key=08a18365ca6962c5ff7526f69872c36813399d4'
'''
data = getUrl(url)
video = re.compile('''video", "(.*?)"''').findall(data)
for v in video:
@ -10,6 +17,3 @@ def getVideoUrl(url):
return "http://www.dailymotion.com" + v
return ''
if __name__ == '__main__':
print getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
print getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')

View file

@ -6,9 +6,10 @@ from datetime import datetime
import re
import socket
from urllib import quote
import sha
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, intValue
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from oxutils.normalize import normalizeImdbId
from torrent import Torrent
@ -45,6 +46,7 @@ def findMovieByImdb(imdbId):
return _parseResultsPage(data)
def getId(mininovaId):
mininovaId = unicode(mininovaId)
d = findRegexp(mininovaId, "/(\d+)")
if d:
return d
@ -80,15 +82,22 @@ def getData(mininovaId):
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
torrent[u'description'] = findRegexp(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = decodeHtml(stripTags(torrent['description'])).strip()
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = getUrl(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent
class Mininova(Torrent):
'''
>>> Mininova('123')
{}
>>> sha.sha(unicode(Mininova('1072195'))).hexdigest()
'ec98268a0aeaef8292f7bcf3585d0bc3910b3fac'
'''
def __init__(self, mininovaId):
self.data = getData(mininovaId)
if not self.data:
return
Torrent.__init__(self)
ratio = self.data['share ratio'].split(',')
self['seeder'] = int(intValue(ratio[0].replace(',','').strip()))

View file

@ -7,9 +7,10 @@ import re
import socket
from urllib import quote, urlencode
from urllib2 import URLError
import sha
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils.normalize import normalizeImdbId
from torrent import Torrent
@ -19,13 +20,13 @@ socket.setdefaulttimeout(10.0)
season_episode = re.compile("S..E..", re.IGNORECASE)
def getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout):
def _getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout):
headers = cache.DEFAULT_HEADERS
headers['Cookie'] = 'language=en_EN'
return cache.getUrl(url, data, headers, timeout)
def getUrlUnicode(url):
return cache.getUrlUnicode(url, _getUrl=getUrl)
def _getUrlUnicode(url):
return cache.getUrlUnicode(url, _getUrl=_getUrl)
def findMovies(query, max_results=10):
results = []
@ -38,7 +39,7 @@ def findMovies(query, max_results=10):
if not url.startswith('/'):
url = "/" + url
url = "http://thepiratebay.org" + url
data = getUrlUnicode(url)
data = _getUrlUnicode(url)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/tor/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
@ -79,7 +80,7 @@ def getData(piratebayId):
torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
data = getUrlUnicode(torrent['comment_link'])
data = _getUrlUnicode(torrent['comment_link'])
torrent[u'title'] = findRegexp(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
@ -91,15 +92,25 @@ def getData(piratebayId):
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
torrent[u'description'] = decodeHtml(stripTags(findRegexp(data, '<div class="nfo">(.*?)</div>'))).strip()
t = getUrl(torrent[u'torrent_link'])
torrent[u'description'] = findRegexp(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _getUrl(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent
class Thepiratebay(Torrent):
'''
>>> Thepiratebay('123')
{}
>>> sha.sha(unicode(Thepiratebay('3951349'))).hexdigest()
'ef64e438e3eef6e6a05cac4eea56b9f0289d3f22'
'''
def __init__(self, piratebayId):
self.data = getData(piratebayId)
if not self.data:
return
Torrent.__init__(self)
published = self.data['uploaded']
published = published.replace(' GMT', '').split(' +')[0]

View file

@ -6,12 +6,16 @@ from oxutils import intValue
class Torrent(dict):
'''
>>> Torrent()
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
'''
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
_dict_keys = ('torrent_info', )
_list_keys = ()
data = {}
data = {'torrent_info': {}}
def __init__(self):
for key in self._string_keys:
@ -25,9 +29,9 @@ class Torrent(dict):
if not isinstance(value, int):
value = int(intValue(value))
self[key] = value
self['infohash'] = self.data['torrent_info']['hash']
self['size'] = self.data['torrent_info']['size']
self['announce'] = self.data['torrent_info']['announce']
self['infohash'] = self.data['torrent_info'].get('hash', '')
self['size'] = self.data['torrent_info'].get('size', -1)
self['announce'] = self.data['torrent_info'].get('announce', '')
if 'files' in self.data['torrent_info']:
self['files'] = len(self.data['torrent_info']['files'])
else: