Rename findRegexp to findRe, rework IMDb title/year cleanup, update setup.py metadata
This commit is contained in:
parent
1b93ae048d
commit
8e8f8f3896
4 changed files with 37 additions and 33 deletions
33
ox/imdb.py
33
ox/imdb.py
|
@ -12,7 +12,7 @@ import time
|
|||
from BeautifulSoup import BeautifulSoup
|
||||
import chardet
|
||||
import oxutils
|
||||
from oxutils import stripTags, htmldecode, findRegexp, findString
|
||||
from oxutils import stripTags, htmldecode, findRe, findString
|
||||
from oxutils.cache import getUrl, getUrlUnicode
|
||||
from oxutils.normalize import normalizeTitle, normalizeImdbId
|
||||
|
||||
|
@ -57,7 +57,7 @@ def getMovieInfo(imdbId):
|
|||
data = getUrl(getUrlBase(imdbId))
|
||||
soup = BeautifulSoup(data)
|
||||
info = dict()
|
||||
info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
|
||||
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
|
||||
|
||||
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
|
||||
title = stripTags(i[0]).strip().lower()
|
||||
|
@ -93,11 +93,12 @@ def getMovieInfo(imdbId):
|
|||
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
|
||||
title = htmldecode(html_title)
|
||||
title = stripTags(title)
|
||||
year = findRegexp(title, '\((\d{4})\)')
|
||||
year = findRe(title, '\((\d{4})\)')
|
||||
if not year:
|
||||
year = findRegexp(title, '\((\d{4})')
|
||||
title = re.sub('\(\d{4}\)', '', title)
|
||||
title = re.sub('\(\d{4}/I*\)', '', title)
|
||||
year = findRe(title, '\((\d{4})')
|
||||
_y = findRe(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
|
||||
if _y:
|
||||
title = title.replace(_y, '')
|
||||
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
|
||||
title = title.replace(t, '')
|
||||
title = title.strip()
|
||||
|
@ -109,14 +110,14 @@ def getMovieInfo(imdbId):
|
|||
info['year'] = year
|
||||
'''
|
||||
#Rating
|
||||
rating = findRegexp(data, '<b>(.*?)/10</b>')
|
||||
rating = findRe(data, '<b>(.*?)/10</b>')
|
||||
if rating:
|
||||
info['rating'] = int(float(rating) * 1000)
|
||||
else:
|
||||
info['rating'] = -1
|
||||
|
||||
#Votes
|
||||
votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
|
||||
votes = findRe(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
|
||||
if votes:
|
||||
info['votes'] = int(votes.replace(',', ''))
|
||||
else:
|
||||
|
@ -171,10 +172,10 @@ def getMovieTrailers(imdbId):
|
|||
for a in videos[0]('a'):
|
||||
title = stripTags(unicode(a)).strip()
|
||||
url = 'http://www.imdb.com' + a['href']
|
||||
videoId = findRegexp(url, '/(vi\d*?)/')
|
||||
videoId = findRe(url, '/(vi\d*?)/')
|
||||
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
|
||||
iframe = getUrlUnicode(iframeUrl)
|
||||
videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.*?)"'))
|
||||
videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
|
||||
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
|
||||
return trailers
|
||||
|
||||
|
@ -260,7 +261,7 @@ def getMovieConnections(imdbId):
|
|||
cs = BeautifulSoup(c)
|
||||
if connection:
|
||||
#relation -> list of imdb ids
|
||||
connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
|
||||
connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
|
||||
return connections
|
||||
|
||||
def getMovieKeywords(imdbId):
|
||||
|
@ -315,11 +316,11 @@ class IMDb:
|
|||
value = unicode(value, 'utf-8')
|
||||
value = stripTags(value).strip()
|
||||
if key == 'runtime':
|
||||
parsed_value = findRegexp(value, '(.*?) min')
|
||||
parsed_value = findRegexp(parsed_value, '([0-9]+)')
|
||||
parsed_value = findRe(value, '(.*?) min')
|
||||
parsed_value = findRe(parsed_value, '([0-9]+)')
|
||||
if not parsed_value:
|
||||
parsed_value = findRegexp(value, '(.*?) sec')
|
||||
parsed_value = findRegexp(parsed_value, '([0-9]+)')
|
||||
parsed_value = findRe(value, '(.*?) sec')
|
||||
parsed_value = findRe(parsed_value, '([0-9]+)')
|
||||
if not parsed_value:
|
||||
parsed_value = 0
|
||||
else:
|
||||
|
@ -598,7 +599,7 @@ def guess(title, director=''):
|
|||
if return_url.startswith('http://www.imdb.com/title/tt'):
|
||||
return return_url[28:35]
|
||||
if data:
|
||||
imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
|
||||
imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
|
||||
if imdb_id:
|
||||
return imdb_id
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ from urllib import quote
|
|||
import sha
|
||||
|
||||
from oxutils.cache import getUrl, getUrlUnicode
|
||||
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
|
||||
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
|
||||
from oxutils.normalize import normalizeImdbId
|
||||
|
||||
from torrent import Torrent
|
||||
|
@ -47,7 +47,7 @@ def findMovieByImdb(imdbId):
|
|||
|
||||
def getId(mininovaId):
|
||||
mininovaId = unicode(mininovaId)
|
||||
d = findRegexp(mininovaId, "/(\d+)")
|
||||
d = findRe(mininovaId, "/(\d+)")
|
||||
if d:
|
||||
return d
|
||||
mininovaId = mininovaId.split('/')
|
||||
|
@ -78,9 +78,9 @@ def getData(mininovaId):
|
|||
value = decodeHtml(stripTags(d[1].strip()))
|
||||
torrent[key] = value
|
||||
|
||||
torrent[u'title'] = findRegexp(data, '<title>(.*?):.*?</title>')
|
||||
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
|
||||
torrent[u'description'] = findRegexp(data, '<div id="description">(.*?)</div>')
|
||||
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
|
||||
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
|
||||
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
|
||||
if torrent['description']:
|
||||
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
|
||||
t = getUrl(torrent[u'torrent_link'])
|
||||
|
|
|
@ -10,7 +10,7 @@ from urllib2 import URLError
|
|||
import sha
|
||||
|
||||
from oxutils.cache import getUrl, getUrlUnicode
|
||||
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
|
||||
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
|
||||
from oxutils.normalize import normalizeImdbId
|
||||
|
||||
from torrent import Torrent
|
||||
|
@ -61,7 +61,7 @@ def getId(piratebayId):
|
|||
piratebayId = piratebayId.split('org/')[1]
|
||||
if 'tor/' in piratebayId:
|
||||
piratebayId = piratebayId.split('tor/')[1]
|
||||
d = findRegexp(piratebayId, "/(\d+)")
|
||||
d = findRe(piratebayId, "/(\d+)")
|
||||
if d:
|
||||
piratebayId = d
|
||||
return piratebayId
|
||||
|
@ -81,18 +81,18 @@ def getData(piratebayId):
|
|||
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
|
||||
|
||||
data = _getUrlUnicode(torrent['comment_link'])
|
||||
torrent[u'title'] = findRegexp(data, '<title>(.*?) \(download torrent\) - TPB</title>')
|
||||
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
|
||||
if not torrent[u'title']:
|
||||
return None
|
||||
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
|
||||
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
|
||||
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
|
||||
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quote(torrent['title']))
|
||||
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
|
||||
key = d[0].lower().strip()
|
||||
key = _key_map.get(key, key)
|
||||
value = decodeHtml(stripTags(d[1].strip()))
|
||||
torrent[key] = value
|
||||
torrent[u'description'] = findRegexp(data, '<div class="nfo">(.*?)</div>')
|
||||
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
|
||||
if torrent[u'description']:
|
||||
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
|
||||
t = _getUrl(torrent[u'torrent_link'])
|
||||
|
|
17
setup.py
17
setup.py
|
@ -8,16 +8,19 @@ import os
|
|||
setup(
|
||||
name="ox",
|
||||
version="0.1",
|
||||
|
||||
# uncomment the following lines if you fill them out in release.py
|
||||
description="collection of scrapers for various websites",
|
||||
author="bot",
|
||||
author_email="bot@0xdb.org",
|
||||
url="http://ox.0xdb.org",
|
||||
download_url="http://ox.0xdb.org/download",
|
||||
license="GPL",
|
||||
author="0x",
|
||||
author_email="code@0xdb.org",
|
||||
url="http://code.0xdb.org/ox",
|
||||
download_url="http://code.0xdb.org/ox/download",
|
||||
license="GPLv3",
|
||||
packages=find_packages(),
|
||||
zip_safe=False,
|
||||
install_requires=[
|
||||
'oxutils',
|
||||
'feedparser',
|
||||
'beautifulsoup',
|
||||
],
|
||||
keywords = [
|
||||
],
|
||||
classifiers = [
|
||||
|
|
Loading…
Reference in a new issue