Rename findRegexp to findRe; update setup.py metadata and dependencies

This commit is contained in:
j 2008-05-07 11:45:00 +02:00
parent 1b93ae048d
commit 8e8f8f3896
4 changed files with 37 additions and 33 deletions

View file

@ -12,7 +12,7 @@ import time
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
import chardet import chardet
import oxutils import oxutils
from oxutils import stripTags, htmldecode, findRegexp, findString from oxutils import stripTags, htmldecode, findRe, findString
from oxutils.cache import getUrl, getUrlUnicode from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle, normalizeImdbId from oxutils.normalize import normalizeTitle, normalizeImdbId
@ -57,7 +57,7 @@ def getMovieInfo(imdbId):
data = getUrl(getUrlBase(imdbId)) data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data) soup = BeautifulSoup(data)
info = dict() info = dict()
info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"') info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data): for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
title = stripTags(i[0]).strip().lower() title = stripTags(i[0]).strip().lower()
@ -93,11 +93,12 @@ def getMovieInfo(imdbId):
html_title = html_title.replace('<br />', ' ').replace(' ', ' ') html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = htmldecode(html_title) title = htmldecode(html_title)
title = stripTags(title) title = stripTags(title)
year = findRegexp(title, '\((\d{4})\)') year = findRe(title, '\((\d{4})\)')
if not year: if not year:
year = findRegexp(title, '\((\d{4})') year = findRe(title, '\((\d{4})')
title = re.sub('\(\d{4}\)', '', title) _y = findRe(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
title = re.sub('\(\d{4}/I*\)', '', title) if _y:
title = title.replace(_y, '')
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'): for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '') title = title.replace(t, '')
title = title.strip() title = title.strip()
@ -109,14 +110,14 @@ def getMovieInfo(imdbId):
info['year'] = year info['year'] = year
''' '''
#Rating #Rating
rating = findRegexp(data, '<b>(.*?)/10</b>') rating = findRe(data, '<b>(.*?)/10</b>')
if rating: if rating:
info['rating'] = int(float(rating) * 1000) info['rating'] = int(float(rating) * 1000)
else: else:
info['rating'] = -1 info['rating'] = -1
#Votes #Votes
votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>') votes = findRe(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
if votes: if votes:
info['votes'] = int(votes.replace(',', '')) info['votes'] = int(votes.replace(',', ''))
else: else:
@ -171,10 +172,10 @@ def getMovieTrailers(imdbId):
for a in videos[0]('a'): for a in videos[0]('a'):
title = stripTags(unicode(a)).strip() title = stripTags(unicode(a)).strip()
url = 'http://www.imdb.com' + a['href'] url = 'http://www.imdb.com' + a['href']
videoId = findRegexp(url, '/(vi\d*?)/') videoId = findRe(url, '/(vi\d*?)/')
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
iframe = getUrlUnicode(iframeUrl) iframe = getUrlUnicode(iframeUrl)
videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.*?)"')) videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl}) trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
return trailers return trailers
@ -260,7 +261,7 @@ def getMovieConnections(imdbId):
cs = BeautifulSoup(c) cs = BeautifulSoup(c)
if connection: if connection:
#relation -> list of imdb ids #relation -> list of imdb ids
connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})] connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
return connections return connections
def getMovieKeywords(imdbId): def getMovieKeywords(imdbId):
@ -315,11 +316,11 @@ class IMDb:
value = unicode(value, 'utf-8') value = unicode(value, 'utf-8')
value = stripTags(value).strip() value = stripTags(value).strip()
if key == 'runtime': if key == 'runtime':
parsed_value = findRegexp(value, '(.*?) min') parsed_value = findRe(value, '(.*?) min')
parsed_value = findRegexp(parsed_value, '([0-9]+)') parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value: if not parsed_value:
parsed_value = findRegexp(value, '(.*?) sec') parsed_value = findRe(value, '(.*?) sec')
parsed_value = findRegexp(parsed_value, '([0-9]+)') parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value: if not parsed_value:
parsed_value = 0 parsed_value = 0
else: else:
@ -598,7 +599,7 @@ def guess(title, director=''):
if return_url.startswith('http://www.imdb.com/title/tt'): if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35] return return_url[28:35]
if data: if data:
imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)') imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id: if imdb_id:
return imdb_id return imdb_id

View file

@ -9,7 +9,7 @@ from urllib import quote
import sha import sha
from oxutils.cache import getUrl, getUrlUnicode from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from oxutils.normalize import normalizeImdbId from oxutils.normalize import normalizeImdbId
from torrent import Torrent from torrent import Torrent
@ -47,7 +47,7 @@ def findMovieByImdb(imdbId):
def getId(mininovaId): def getId(mininovaId):
mininovaId = unicode(mininovaId) mininovaId = unicode(mininovaId)
d = findRegexp(mininovaId, "/(\d+)") d = findRe(mininovaId, "/(\d+)")
if d: if d:
return d return d
mininovaId = mininovaId.split('/') mininovaId = mininovaId.split('/')
@ -78,9 +78,9 @@ def getData(mininovaId):
value = decodeHtml(stripTags(d[1].strip())) value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value torrent[key] = value
torrent[u'title'] = findRegexp(data, '<title>(.*?):.*?</title>') torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})') torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'description'] = findRegexp(data, '<div id="description">(.*?)</div>') torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
if torrent['description']: if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip() torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = getUrl(torrent[u'torrent_link']) t = getUrl(torrent[u'torrent_link'])

View file

@ -10,7 +10,7 @@ from urllib2 import URLError
import sha import sha
from oxutils.cache import getUrl, getUrlUnicode from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils.normalize import normalizeImdbId from oxutils.normalize import normalizeImdbId
from torrent import Torrent from torrent import Torrent
@ -61,7 +61,7 @@ def getId(piratebayId):
piratebayId = piratebayId.split('org/')[1] piratebayId = piratebayId.split('org/')[1]
if 'tor/' in piratebayId: if 'tor/' in piratebayId:
piratebayId = piratebayId.split('tor/')[1] piratebayId = piratebayId.split('tor/')[1]
d = findRegexp(piratebayId, "/(\d+)") d = findRe(piratebayId, "/(\d+)")
if d: if d:
piratebayId = d piratebayId = d
return piratebayId return piratebayId
@ -81,18 +81,18 @@ def getData(piratebayId):
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
data = _getUrlUnicode(torrent['comment_link']) data = _getUrlUnicode(torrent['comment_link'])
torrent[u'title'] = findRegexp(data, '<title>(.*?) \(download torrent\) - TPB</title>') torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']: if not torrent[u'title']:
return None return None
torrent[u'title'] = decodeHtml(torrent[u'title']).strip() torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})') torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quote(torrent['title'])) torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quote(torrent['title']))
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data): for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip() key = d[0].lower().strip()
key = _key_map.get(key, key) key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip())) value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value torrent[key] = value
torrent[u'description'] = findRegexp(data, '<div class="nfo">(.*?)</div>') torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']: if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip() torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _getUrl(torrent[u'torrent_link']) t = _getUrl(torrent[u'torrent_link'])

View file

@ -8,16 +8,19 @@ import os
setup( setup(
name="ox", name="ox",
version="0.1", version="0.1",
# uncomment the following lines if you fill them out in release.py
description="collection of scrapers for various websites", description="collection of scrapers for various websites",
author="bot", author="0x",
author_email="bot@0xdb.org", author_email="code@0xdb.org",
url="http://ox.0xdb.org", url="http://code.0xdb.org/ox",
download_url="http://ox.0xdb.org/download", download_url="http://code.0xdb.org/ox/download",
license="GPL", license="GPLv3",
packages=find_packages(), packages=find_packages(),
zip_safe=False, zip_safe=False,
install_requires=[
'oxutils',
'feedparser',
'beautifulsoup',
],
keywords = [ keywords = [
], ],
classifiers = [ classifiers = [