findRegexp -> findRe, update setup.py

j 2008-05-07 11:45:00 +02:00
parent 1b93ae048d
commit 8e8f8f3896
4 changed files with 37 additions and 33 deletions
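Note: the helper being renamed lives in oxutils and is not part of this diff; only the call sites change. As a reference point, here is a minimal sketch of what a findRe-style helper could look like, with the signature inferred from the call sites below (text first, pattern second, empty string when nothing matches); the actual oxutils implementation may differ:

import re

def findRe(string, pattern):
    # Hypothetical stand-in for oxutils.findRe, inferred from this diff's
    # call sites: return the first captured group of the first match,
    # or '' when the pattern does not match.
    match = re.search(pattern, string, re.DOTALL)
    if match:
        return match.group(1) if match.groups() else match.group(0)
    return ''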

View file

@@ -12,7 +12,7 @@ import time
from BeautifulSoup import BeautifulSoup
import chardet
import oxutils
from oxutils import stripTags, htmldecode, findRegexp, findString
from oxutils import stripTags, htmldecode, findRe, findString
from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle, normalizeImdbId
@@ -57,7 +57,7 @@ def getMovieInfo(imdbId):
data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data)
info = dict()
info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
title = stripTags(i[0]).strip().lower()
@@ -93,11 +93,12 @@ def getMovieInfo(imdbId):
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = htmldecode(html_title)
title = stripTags(title)
year = findRegexp(title, '\((\d{4})\)')
year = findRe(title, '\((\d{4})\)')
if not year:
year = findRegexp(title, '\((\d{4})')
title = re.sub('\(\d{4}\)', '', title)
title = re.sub('\(\d{4}/I*\)', '', title)
year = findRe(title, '\((\d{4})')
_y = findRe(title, r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
if _y:
title = title.replace(_y, '')
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
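The hunk above replaces the two year-stripping re.sub calls with a single pattern that also covers IMDb's disambiguation suffixes: a year may contain '?' for unknown digits and may carry a Roman-numeral suffix such as /II. A quick illustration with made-up titles, reusing the hypothetical findRe sketch above:

pattern = r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)'
findRe('Some Title (2008)', pattern)     # -> '2008'
findRe('Some Title (2008/II)', pattern)  # -> '2008/II'
findRe('Some Title (199?)', pattern)     # -> '199?'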
@@ -109,14 +110,14 @@ def getMovieInfo(imdbId):
info['year'] = year
'''
#Rating
rating = findRegexp(data, '<b>(.*?)/10</b>')
rating = findRe(data, '<b>(.*?)/10</b>')
if rating:
info['rating'] = int(float(rating) * 1000)
else:
info['rating'] = -1
#Votes
votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
votes = findRe(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
if votes:
info['votes'] = int(votes.replace(',', ''))
else:
@@ -171,10 +172,10 @@ def getMovieTrailers(imdbId):
for a in videos[0]('a'):
title = stripTags(unicode(a)).strip()
url = 'http://www.imdb.com' + a['href']
videoId = findRegexp(url, '/(vi\d*?)/')
videoId = findRe(url, '/(vi\d*?)/')
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
iframe = getUrlUnicode(iframeUrl)
videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.*?)"'))
videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
return trailers
@@ -260,7 +261,7 @@ def getMovieConnections(imdbId):
cs = BeautifulSoup(c)
if connection:
#relation -> list of imdb ids
connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
return connections
def getMovieKeywords(imdbId):
@@ -315,11 +316,11 @@ class IMDb:
value = unicode(value, 'utf-8')
value = stripTags(value).strip()
if key == 'runtime':
parsed_value = findRegexp(value, '(.*?) min')
parsed_value = findRegexp(parsed_value, '([0-9]+)')
parsed_value = findRe(value, '(.*?) min')
parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = findRegexp(value, '(.*?) sec')
parsed_value = findRegexp(parsed_value, '([0-9]+)')
parsed_value = findRe(value, '(.*?) sec')
parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = 0
else:
@@ -598,7 +599,7 @@ def guess(title, director=''):
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id:
return imdb_id

View file

@@ -9,7 +9,7 @@ from urllib import quote
import sha
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from oxutils.normalize import normalizeImdbId
from torrent import Torrent
@@ -47,7 +47,7 @@ def findMovieByImdb(imdbId):
def getId(mininovaId):
mininovaId = unicode(mininovaId)
d = findRegexp(mininovaId, "/(\d+)")
d = findRe(mininovaId, "/(\d+)")
if d:
return d
mininovaId = mininovaId.split('/')
@@ -78,9 +78,9 @@ def getData(mininovaId):
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
torrent[u'title'] = findRegexp(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
torrent[u'description'] = findRegexp(data, '<div id="description">(.*?)</div>')
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = getUrl(torrent[u'torrent_link'])
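The torrent scrapers here and in the next file pull each field with one findRe call per pattern. A rough illustration against a made-up snippet (not actual Mininova or Pirate Bay markup), again using the hypothetical findRe sketch above:

html = '<title>Example Torrent: details</title> <a href="/title/tt0123456">imdb</a>'
findRe(html, '<title>(.*?):.*?</title>')  # -> 'Example Torrent'
findRe(html, 'title/tt(\d{7})')           # -> '0123456'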

View file

@@ -10,7 +10,7 @@ from urllib2 import URLError
import sha
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils.normalize import normalizeImdbId
from torrent import Torrent
@@ -61,7 +61,7 @@ def getId(piratebayId):
piratebayId = piratebayId.split('org/')[1]
if 'tor/' in piratebayId:
piratebayId = piratebayId.split('tor/')[1]
d = findRegexp(piratebayId, "/(\d+)")
d = findRe(piratebayId, "/(\d+)")
if d:
piratebayId = d
return piratebayId
@@ -81,18 +81,18 @@ def getData(piratebayId):
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
data = _getUrlUnicode(torrent['comment_link'])
torrent[u'title'] = findRegexp(data, '<title>(.*?) \(download torrent\) - TPB</title>')
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quote(torrent['title']))
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
torrent[u'description'] = findRegexp(data, '<div class="nfo">(.*?)</div>')
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _getUrl(torrent[u'torrent_link'])

View file

@@ -8,16 +8,19 @@ import os
setup(
name="ox",
version="0.1",
# uncomment the following lines if you fill them out in release.py
description="collection of scrapers for various websites",
author="bot",
author_email="bot@0xdb.org",
url="http://ox.0xdb.org",
download_url="http://ox.0xdb.org/download",
license="GPL",
author="0x",
author_email="code@0xdb.org",
url="http://code.0xdb.org/ox",
download_url="http://code.0xdb.org/ox/download",
license="GPLv3",
packages=find_packages(),
zip_safe=False,
install_requires=[
'oxutils',
'feedparser',
'beautifulsoup',
],
keywords = [
],
classifiers = [