diff --git a/ox/imdb.py b/ox/imdb.py
index d31afde..78b84b7 100644
--- a/ox/imdb.py
+++ b/ox/imdb.py
@@ -12,7 +12,7 @@ import time
from BeautifulSoup import BeautifulSoup
import chardet
import oxutils
-from oxutils import stripTags, htmldecode, findRegexp, findString
+from oxutils import stripTags, decodeHtml, findRe, findString
from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle, normalizeImdbId
@@ -57,13 +57,14 @@ def getMovieInfo(imdbId):
data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data)
info = dict()
- info['poster'] = findRegexp(data, 'name="poster".*?
(.*?):(.*?)
', ' ').replace(' ', ' ')
- title = htmldecode(html_title)
+ title = decodeHtml(html_title)
title = stripTags(title)
- year = findRegexp(title, '\((\d{4})\)')
+ year = findRe(title, '\((\d{4})\)')
if not year:
- year = findRegexp(title, '\((\d{4})')
- title = re.sub('\(\d{4}\)', '', title)
- title = re.sub('\(\d{4}/I*\)', '', title)
+ year = findRe(title, '\((\d{4})')
+    _y = findRe(title, r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
+ if _y:
+ title = title.replace(_y, '')
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
@@ -109,14 +110,14 @@ def getMovieInfo(imdbId):
info['year'] = year
'''
#Rating
- rating = findRegexp(data, '(.*?)/10')
+ rating = findRe(data, '(.*?)/10')
if rating:
info['rating'] = int(float(rating) * 1000)
else:
info['rating'] = -1
#Votes
- votes = findRegexp(data, '\((.*?) votes\)')
+ votes = findRe(data, '\((.*?) votes\)')
if votes:
info['votes'] = int(votes.replace(',', ''))
else:
@@ -171,10 +172,10 @@ def getMovieTrailers(imdbId):
for a in videos[0]('a'):
title = stripTags(unicode(a)).strip()
url = 'http://www.imdb.com' + a['href']
- videoId = findRegexp(url, '/(vi\d*?)/')
+ videoId = findRe(url, '/(vi\d*?)/')
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
iframe = getUrlUnicode(iframeUrl)
- videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.*?)"'))
+ videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
return trailers
@@ -209,7 +210,7 @@ def getMovieLocations(imdbId):
soup = BeautifulSoup(data)
locations = []
for key in soup('a', {'href': re.compile('^/List')}):
- locations.append(htmldecode(key.string))
+ locations.append(decodeHtml(key.string))
return locations
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
@@ -260,7 +261,7 @@ def getMovieConnections(imdbId):
cs = BeautifulSoup(c)
if connection:
#relation -> list of imdb ids
- connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
+ connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
return connections
def getMovieKeywords(imdbId):
@@ -269,7 +270,7 @@ def getMovieKeywords(imdbId):
soup = BeautifulSoup(data)
keywords = []
for key in soup('a', {'href': re.compile('^/keyword/')}):
- k = htmldecode(key.string)
+ k = decodeHtml(key.string)
k = k.replace(u'\xa0', ' ')
keywords.append(k)
return keywords
@@ -315,11 +316,11 @@ class IMDb:
value = unicode(value, 'utf-8')
value = stripTags(value).strip()
if key == 'runtime':
- parsed_value = findRegexp(value, '(.*?) min')
- parsed_value = findRegexp(parsed_value, '([0-9]+)')
+ parsed_value = findRe(value, '(.*?) min')
+ parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value:
- parsed_value = findRegexp(value, '(.*?) sec')
- parsed_value = findRegexp(parsed_value, '([0-9]+)')
+ parsed_value = findRe(value, '(.*?) sec')
+ parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = 0
else:
@@ -508,7 +509,7 @@ class IMDb:
episodes[episode]['title'] = match[3].strip()
if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])):
episodes[episode]['title'] = u''
- description = htmldecode(match[5])
+ description = decodeHtml(match[5])
description = stripTags(description.split('Next US airings:')[0])
episodes[episode]['description'] = description
episodes[episode]['date'] = ''
@@ -598,7 +599,7 @@ def guess(title, director=''):
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
- imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?- .*?(.*?):.*?')
- torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
- torrent[u'description'] = findRegexp(data, '
(.*?)
')
+ torrent[u'title'] = findRe(data, '(.*?):.*?')
+ torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
+ torrent[u'description'] = findRe(data, '(.*?)
')
if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = getUrl(torrent[u'torrent_link'])
diff --git a/ox/thepiratebay.py b/ox/thepiratebay.py
index 897c4f8..b5ea4c0 100644
--- a/ox/thepiratebay.py
+++ b/ox/thepiratebay.py
@@ -10,7 +10,7 @@ from urllib2 import URLError
import sha
from oxutils.cache import getUrl, getUrlUnicode
-from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
+from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils.normalize import normalizeImdbId
from torrent import Torrent
@@ -61,7 +61,7 @@ def getId(piratebayId):
piratebayId = piratebayId.split('org/')[1]
if 'tor/' in piratebayId:
piratebayId = piratebayId.split('tor/')[1]
- d = findRegexp(piratebayId, "/(\d+)")
+ d = findRe(piratebayId, "/(\d+)")
if d:
piratebayId = d
return piratebayId
@@ -81,18 +81,18 @@ def getData(piratebayId):
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
data = _getUrlUnicode(torrent['comment_link'])
- torrent[u'title'] = findRegexp(data, '(.*?) \(download torrent\) - TPB')
+ torrent[u'title'] = findRe(data, '(.*?) \(download torrent\) - TPB')
if not torrent[u'title']:
return None
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
- torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
+ torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quote(torrent['title']))
for d in re.compile('dt>(.*?):.*?(.*?)', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
- torrent[u'description'] = findRegexp(data, '(.*?)
')
+ torrent[u'description'] = findRe(data, '(.*?)
')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _getUrl(torrent[u'torrent_link'])
diff --git a/setup.py b/setup.py
index 4877b36..4840537 100644
--- a/setup.py
+++ b/setup.py
@@ -8,16 +8,19 @@ import os
setup(
name="ox",
version="0.1",
-
- # uncomment the following lines if you fill them out in release.py
description="collection of scrapers for various websites",
- author="bot",
- author_email="bot@0xdb.org",
- url="http://ox.0xdb.org",
- download_url="http://ox.0xdb.org/download",
- license="GPL",
+ author="0x",
+ author_email="code@0xdb.org",
+ url="http://code.0xdb.org/ox",
+ download_url="http://code.0xdb.org/ox/download",
+ license="GPLv3",
packages=find_packages(),
zip_safe=False,
+ install_requires=[
+ 'oxutils',
+ 'feedparser',
+ 'beautifulsoup',
+ ],
keywords = [
],
classifiers = [