diff --git a/ox/imdb.py b/ox/imdb.py index d31afde..78b84b7 100644 --- a/ox/imdb.py +++ b/ox/imdb.py @@ -12,7 +12,7 @@ import time from BeautifulSoup import BeautifulSoup import chardet import oxutils -from oxutils import stripTags, htmldecode, findRegexp, findString +from oxutils import stripTags, decodeHtml, findRe, findString from oxutils.cache import getUrl, getUrlUnicode from oxutils.normalize import normalizeTitle, normalizeImdbId @@ -57,13 +57,13 @@ def getMovieInfo(imdbId): data = getUrl(getUrlBase(imdbId)) soup = BeautifulSoup(data) info = dict() - info['poster'] = findRegexp(data, 'name="poster".*?(.*?):(.*?)
', ' ').replace(' ', ' ') - title = htmldecode(html_title) + title = decodeHtml(html_title) title = stripTags(title) - year = findRegexp(title, '\((\d{4})\)') + year = findRe(title, '\((\d{4})\)') if not year: - year = findRegexp(title, '\((\d{4})') - title = re.sub('\(\d{4}\)', '', title) - title = re.sub('\(\d{4}/I*\)', '', title) + year = findRe(title, '\((\d{4})') + _y = findRe(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)') + if _y: + title = title.replace(_y, '') for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'): title = title.replace(t, '') title = title.strip() @@ -109,14 +110,14 @@ def getMovieInfo(imdbId): info['year'] = year ''' #Rating - rating = findRegexp(data, '(.*?)/10') + rating = findRe(data, '(.*?)/10') if rating: info['rating'] = int(float(rating) * 1000) else: info['rating'] = -1 #Votes - votes = findRegexp(data, '\((.*?) votes\)') + votes = findRe(data, '\((.*?) votes\)') if votes: info['votes'] = int(votes.replace(',', '')) else: @@ -171,10 +172,10 @@ def getMovieTrailers(imdbId): for a in videos[0]('a'): title = stripTags(unicode(a)).strip() url = 'http://www.imdb.com' + a['href'] - videoId = findRegexp(url, '/(vi\d*?)/') + videoId = findRe(url, '/(vi\d*?)/') iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId iframe = getUrlUnicode(iframeUrl) - videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.*?)"')) + videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"')) trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl}) return trailers @@ -209,7 +210,7 @@ def getMovieLocations(imdbId): soup = BeautifulSoup(data) locations = [] for key in soup('a', {'href': re.compile('^/List')}): - locations.append(htmldecode(key.string)) + locations.append(decodeHtml(key.string)) return locations def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')): @@ -260,7 +261,7 @@ def getMovieConnections(imdbId): cs = BeautifulSoup(c) if connection: #relation -> list of imdb ids - connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})] + connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})] return connections def getMovieKeywords(imdbId): @@ -269,7 +270,7 @@ def getMovieKeywords(imdbId): soup = BeautifulSoup(data) keywords = [] for key in soup('a', {'href': re.compile('^/keyword/')}): - k = htmldecode(key.string) + k = decodeHtml(key.string) k = k.replace(u'\xa0', ' ') keywords.append(k) return keywords @@ -315,11 +316,11 @@ class IMDb: value = unicode(value, 'utf-8') value = stripTags(value).strip() if key == 'runtime': - parsed_value = findRegexp(value, '(.*?) min') - parsed_value = findRegexp(parsed_value, '([0-9]+)') + parsed_value = findRe(value, '(.*?) min') + parsed_value = findRe(parsed_value, '([0-9]+)') if not parsed_value: - parsed_value = findRegexp(value, '(.*?) sec') - parsed_value = findRegexp(parsed_value, '([0-9]+)') + parsed_value = findRe(value, '(.*?) sec') + parsed_value = findRe(parsed_value, '([0-9]+)') if not parsed_value: parsed_value = 0 else: @@ -508,7 +509,7 @@ class IMDb: episodes[episode]['title'] = match[3].strip() if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])): episodes[episode]['title'] = u'' - description = htmldecode(match[5]) + description = decodeHtml(match[5]) description = stripTags(description.split('Next US airings:')[0]) episodes[episode]['description'] = description episodes[episode]['date'] = '' @@ -598,7 +599,7 @@ def guess(title, director=''): if return_url.startswith('http://www.imdb.com/title/tt'): return return_url[28:35] if data: - imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?
  1. .*?(.*?):.*?') - torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})') - torrent[u'description'] = findRegexp(data, '
    (.*?)
    ') + torrent[u'title'] = findRe(data, '(.*?):.*?') + torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})') + torrent[u'description'] = findRe(data, '
    (.*?)
    ') if torrent['description']: torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip() t = getUrl(torrent[u'torrent_link']) diff --git a/ox/thepiratebay.py b/ox/thepiratebay.py index 897c4f8..b5ea4c0 100644 --- a/ox/thepiratebay.py +++ b/ox/thepiratebay.py @@ -10,7 +10,7 @@ from urllib2 import URLError import sha from oxutils.cache import getUrl, getUrlUnicode -from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines +from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines from oxutils.normalize import normalizeImdbId from torrent import Torrent @@ -61,7 +61,7 @@ def getId(piratebayId): piratebayId = piratebayId.split('org/')[1] if 'tor/' in piratebayId: piratebayId = piratebayId.split('tor/')[1] - d = findRegexp(piratebayId, "/(\d+)") + d = findRe(piratebayId, "/(\d+)") if d: piratebayId = d return piratebayId @@ -81,18 +81,18 @@ def getData(piratebayId): torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId data = _getUrlUnicode(torrent['comment_link']) - torrent[u'title'] = findRegexp(data, '(.*?) \(download torrent\) - TPB') + torrent[u'title'] = findRe(data, '(.*?) \(download torrent\) - TPB') if not torrent[u'title']: return None torrent[u'title'] = decodeHtml(torrent[u'title']).strip() - torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})') + torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})') torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quote(torrent['title'])) for d in re.compile('dt>(.*?):.*?(.*?)', re.DOTALL).findall(data): key = d[0].lower().strip() key = _key_map.get(key, key) value = decodeHtml(stripTags(d[1].strip())) torrent[key] = value - torrent[u'description'] = findRegexp(data, '
    (.*?)
    ') + torrent[u'description'] = findRe(data, '
    (.*?)
    ') if torrent[u'description']: torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip() t = _getUrl(torrent[u'torrent_link']) diff --git a/setup.py b/setup.py index 4877b36..4840537 100644 --- a/setup.py +++ b/setup.py @@ -8,16 +8,19 @@ import os setup( name="ox", version="0.1", - - # uncomment the following lines if you fill them out in release.py description="collection of scrapers for various websites", - author="bot", - author_email="bot@0xdb.org", - url="http://ox.0xdb.org", - download_url="http://ox.0xdb.org/download", - license="GPL", + author="0x", + author_email="code@0xdb.org", + url="http://code.0xdb.org/ox", + download_url="http://code.0xdb.org/ox/download", + license="GPLv3", packages=find_packages(), zip_safe=False, + install_requires=[ + 'oxutils', + 'feedparser', + 'beautifulsoup', + ], keywords = [ ], classifiers = [