findRegexp -> findRe, update setup.py
This commit is contained in:
parent
1b93ae048d
commit
8e8f8f3896
4 changed files with 37 additions and 33 deletions
33
ox/imdb.py
33
ox/imdb.py
|
@ -12,7 +12,7 @@ import time
|
||||||
from BeautifulSoup import BeautifulSoup
|
from BeautifulSoup import BeautifulSoup
|
||||||
import chardet
|
import chardet
|
||||||
import oxutils
|
import oxutils
|
||||||
from oxutils import stripTags, htmldecode, findRegexp, findString
|
from oxutils import stripTags, htmldecode, findRe, findString
|
||||||
from oxutils.cache import getUrl, getUrlUnicode
|
from oxutils.cache import getUrl, getUrlUnicode
|
||||||
from oxutils.normalize import normalizeTitle, normalizeImdbId
|
from oxutils.normalize import normalizeTitle, normalizeImdbId
|
||||||
|
|
||||||
|
@ -57,7 +57,7 @@ def getMovieInfo(imdbId):
|
||||||
data = getUrl(getUrlBase(imdbId))
|
data = getUrl(getUrlBase(imdbId))
|
||||||
soup = BeautifulSoup(data)
|
soup = BeautifulSoup(data)
|
||||||
info = dict()
|
info = dict()
|
||||||
info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
|
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
|
||||||
|
|
||||||
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
|
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
|
||||||
title = stripTags(i[0]).strip().lower()
|
title = stripTags(i[0]).strip().lower()
|
||||||
|
@ -93,11 +93,12 @@ def getMovieInfo(imdbId):
|
||||||
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
|
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
|
||||||
title = htmldecode(html_title)
|
title = htmldecode(html_title)
|
||||||
title = stripTags(title)
|
title = stripTags(title)
|
||||||
year = findRegexp(title, '\((\d{4})\)')
|
year = findRe(title, '\((\d{4})\)')
|
||||||
if not year:
|
if not year:
|
||||||
year = findRegexp(title, '\((\d{4})')
|
year = findRe(title, '\((\d{4})')
|
||||||
title = re.sub('\(\d{4}\)', '', title)
|
_y = findRe(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
|
||||||
title = re.sub('\(\d{4}/I*\)', '', title)
|
if _y:
|
||||||
|
title = title.replace(_y, '')
|
||||||
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
|
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
|
||||||
title = title.replace(t, '')
|
title = title.replace(t, '')
|
||||||
title = title.strip()
|
title = title.strip()
|
||||||
|
@ -109,14 +110,14 @@ def getMovieInfo(imdbId):
|
||||||
info['year'] = year
|
info['year'] = year
|
||||||
'''
|
'''
|
||||||
#Rating
|
#Rating
|
||||||
rating = findRegexp(data, '<b>(.*?)/10</b>')
|
rating = findRe(data, '<b>(.*?)/10</b>')
|
||||||
if rating:
|
if rating:
|
||||||
info['rating'] = int(float(rating) * 1000)
|
info['rating'] = int(float(rating) * 1000)
|
||||||
else:
|
else:
|
||||||
info['rating'] = -1
|
info['rating'] = -1
|
||||||
|
|
||||||
#Votes
|
#Votes
|
||||||
votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
|
votes = findRe(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
|
||||||
if votes:
|
if votes:
|
||||||
info['votes'] = int(votes.replace(',', ''))
|
info['votes'] = int(votes.replace(',', ''))
|
||||||
else:
|
else:
|
||||||
|
@ -171,10 +172,10 @@ def getMovieTrailers(imdbId):
|
||||||
for a in videos[0]('a'):
|
for a in videos[0]('a'):
|
||||||
title = stripTags(unicode(a)).strip()
|
title = stripTags(unicode(a)).strip()
|
||||||
url = 'http://www.imdb.com' + a['href']
|
url = 'http://www.imdb.com' + a['href']
|
||||||
videoId = findRegexp(url, '/(vi\d*?)/')
|
videoId = findRe(url, '/(vi\d*?)/')
|
||||||
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
|
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
|
||||||
iframe = getUrlUnicode(iframeUrl)
|
iframe = getUrlUnicode(iframeUrl)
|
||||||
videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.*?)"'))
|
videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
|
||||||
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
|
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
|
||||||
return trailers
|
return trailers
|
||||||
|
|
||||||
|
@ -260,7 +261,7 @@ def getMovieConnections(imdbId):
|
||||||
cs = BeautifulSoup(c)
|
cs = BeautifulSoup(c)
|
||||||
if connection:
|
if connection:
|
||||||
#relation -> list of imdb ids
|
#relation -> list of imdb ids
|
||||||
connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
|
connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
|
||||||
return connections
|
return connections
|
||||||
|
|
||||||
def getMovieKeywords(imdbId):
|
def getMovieKeywords(imdbId):
|
||||||
|
@ -315,11 +316,11 @@ class IMDb:
|
||||||
value = unicode(value, 'utf-8')
|
value = unicode(value, 'utf-8')
|
||||||
value = stripTags(value).strip()
|
value = stripTags(value).strip()
|
||||||
if key == 'runtime':
|
if key == 'runtime':
|
||||||
parsed_value = findRegexp(value, '(.*?) min')
|
parsed_value = findRe(value, '(.*?) min')
|
||||||
parsed_value = findRegexp(parsed_value, '([0-9]+)')
|
parsed_value = findRe(parsed_value, '([0-9]+)')
|
||||||
if not parsed_value:
|
if not parsed_value:
|
||||||
parsed_value = findRegexp(value, '(.*?) sec')
|
parsed_value = findRe(value, '(.*?) sec')
|
||||||
parsed_value = findRegexp(parsed_value, '([0-9]+)')
|
parsed_value = findRe(parsed_value, '([0-9]+)')
|
||||||
if not parsed_value:
|
if not parsed_value:
|
||||||
parsed_value = 0
|
parsed_value = 0
|
||||||
else:
|
else:
|
||||||
|
@ -598,7 +599,7 @@ def guess(title, director=''):
|
||||||
if return_url.startswith('http://www.imdb.com/title/tt'):
|
if return_url.startswith('http://www.imdb.com/title/tt'):
|
||||||
return return_url[28:35]
|
return return_url[28:35]
|
||||||
if data:
|
if data:
|
||||||
imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
|
imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
|
||||||
if imdb_id:
|
if imdb_id:
|
||||||
return imdb_id
|
return imdb_id
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ from urllib import quote
|
||||||
import sha
|
import sha
|
||||||
|
|
||||||
from oxutils.cache import getUrl, getUrlUnicode
|
from oxutils.cache import getUrl, getUrlUnicode
|
||||||
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
|
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
|
||||||
from oxutils.normalize import normalizeImdbId
|
from oxutils.normalize import normalizeImdbId
|
||||||
|
|
||||||
from torrent import Torrent
|
from torrent import Torrent
|
||||||
|
@ -47,7 +47,7 @@ def findMovieByImdb(imdbId):
|
||||||
|
|
||||||
def getId(mininovaId):
|
def getId(mininovaId):
|
||||||
mininovaId = unicode(mininovaId)
|
mininovaId = unicode(mininovaId)
|
||||||
d = findRegexp(mininovaId, "/(\d+)")
|
d = findRe(mininovaId, "/(\d+)")
|
||||||
if d:
|
if d:
|
||||||
return d
|
return d
|
||||||
mininovaId = mininovaId.split('/')
|
mininovaId = mininovaId.split('/')
|
||||||
|
@ -78,9 +78,9 @@ def getData(mininovaId):
|
||||||
value = decodeHtml(stripTags(d[1].strip()))
|
value = decodeHtml(stripTags(d[1].strip()))
|
||||||
torrent[key] = value
|
torrent[key] = value
|
||||||
|
|
||||||
torrent[u'title'] = findRegexp(data, '<title>(.*?):.*?</title>')
|
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
|
||||||
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
|
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
|
||||||
torrent[u'description'] = findRegexp(data, '<div id="description">(.*?)</div>')
|
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
|
||||||
if torrent['description']:
|
if torrent['description']:
|
||||||
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
|
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
|
||||||
t = getUrl(torrent[u'torrent_link'])
|
t = getUrl(torrent[u'torrent_link'])
|
||||||
|
|
|
@ -10,7 +10,7 @@ from urllib2 import URLError
|
||||||
import sha
|
import sha
|
||||||
|
|
||||||
from oxutils.cache import getUrl, getUrlUnicode
|
from oxutils.cache import getUrl, getUrlUnicode
|
||||||
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
|
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
|
||||||
from oxutils.normalize import normalizeImdbId
|
from oxutils.normalize import normalizeImdbId
|
||||||
|
|
||||||
from torrent import Torrent
|
from torrent import Torrent
|
||||||
|
@ -61,7 +61,7 @@ def getId(piratebayId):
|
||||||
piratebayId = piratebayId.split('org/')[1]
|
piratebayId = piratebayId.split('org/')[1]
|
||||||
if 'tor/' in piratebayId:
|
if 'tor/' in piratebayId:
|
||||||
piratebayId = piratebayId.split('tor/')[1]
|
piratebayId = piratebayId.split('tor/')[1]
|
||||||
d = findRegexp(piratebayId, "/(\d+)")
|
d = findRe(piratebayId, "/(\d+)")
|
||||||
if d:
|
if d:
|
||||||
piratebayId = d
|
piratebayId = d
|
||||||
return piratebayId
|
return piratebayId
|
||||||
|
@ -81,18 +81,18 @@ def getData(piratebayId):
|
||||||
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
|
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
|
||||||
|
|
||||||
data = _getUrlUnicode(torrent['comment_link'])
|
data = _getUrlUnicode(torrent['comment_link'])
|
||||||
torrent[u'title'] = findRegexp(data, '<title>(.*?) \(download torrent\) - TPB</title>')
|
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
|
||||||
if not torrent[u'title']:
|
if not torrent[u'title']:
|
||||||
return None
|
return None
|
||||||
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
|
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
|
||||||
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
|
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
|
||||||
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quote(torrent['title']))
|
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quote(torrent['title']))
|
||||||
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
|
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
|
||||||
key = d[0].lower().strip()
|
key = d[0].lower().strip()
|
||||||
key = _key_map.get(key, key)
|
key = _key_map.get(key, key)
|
||||||
value = decodeHtml(stripTags(d[1].strip()))
|
value = decodeHtml(stripTags(d[1].strip()))
|
||||||
torrent[key] = value
|
torrent[key] = value
|
||||||
torrent[u'description'] = findRegexp(data, '<div class="nfo">(.*?)</div>')
|
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
|
||||||
if torrent[u'description']:
|
if torrent[u'description']:
|
||||||
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
|
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
|
||||||
t = _getUrl(torrent[u'torrent_link'])
|
t = _getUrl(torrent[u'torrent_link'])
|
||||||
|
|
17
setup.py
17
setup.py
|
@ -8,16 +8,19 @@ import os
|
||||||
setup(
|
setup(
|
||||||
name="ox",
|
name="ox",
|
||||||
version="0.1",
|
version="0.1",
|
||||||
|
|
||||||
# uncomment the following lines if you fill them out in release.py
|
|
||||||
description="collection of scrapers for various websites",
|
description="collection of scrapers for various websites",
|
||||||
author="bot",
|
author="0x",
|
||||||
author_email="bot@0xdb.org",
|
author_email="code@0xdb.org",
|
||||||
url="http://ox.0xdb.org",
|
url="http://code.0xdb.org/ox",
|
||||||
download_url="http://ox.0xdb.org/download",
|
download_url="http://code.0xdb.org/ox/download",
|
||||||
license="GPL",
|
license="GPLv3",
|
||||||
packages=find_packages(),
|
packages=find_packages(),
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
|
install_requires=[
|
||||||
|
'oxutils',
|
||||||
|
'feedparser',
|
||||||
|
'beautifulsoup',
|
||||||
|
],
|
||||||
keywords = [
|
keywords = [
|
||||||
],
|
],
|
||||||
classifiers = [
|
classifiers = [
|
||||||
|
|
Loading…
Reference in a new issue