From 7695a9c015e3b5a2453325522bac0b5b4f5abed8 Mon Sep 17 00:00:00 2001 From: j Date: Sat, 21 May 2016 15:19:25 +0200 Subject: [PATCH] fix some tests and urls --- ox/web/google.py | 8 +-- ox/web/imdb.py | 66 +++++++++++----------- ox/web/impawards.py | 23 ++++---- ox/web/mininova.py | 121 ----------------------------------------- ox/web/startpage.py | 4 +- ox/web/thepiratebay.py | 39 ++++--------- ox/web/torrent.py | 37 ------------- 7 files changed, 60 insertions(+), 238 deletions(-) delete mode 100644 ox/web/mininova.py delete mode 100644 ox/web/torrent.py diff --git a/ox/web/google.py b/ox/web/google.py index fc1f420..72aa32f 100644 --- a/ox/web/google.py +++ b/ox/web/google.py @@ -21,11 +21,11 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT): """ Return max_results tuples with title, url, description - >>> find("The Matrix site:imdb.com", 1)[0][0] - u'The Matrix (1999) - IMDb' + >>> str(find("The Matrix site:imdb.com", 1)[0][0]) + 'The Matrix (1999) - IMDb' - >>> find("The Matrix site:imdb.com", 1)[0][1] - u'http://www.imdb.com/title/tt0133093/' + >>> str(find("The Matrix site:imdb.com", 1)[0][1]) + 'http://www.imdb.com/title/tt0133093/' """ results = [] offset = 0 diff --git a/ox/web/imdb.py b/ox/web/imdb.py index cc0cc48..fa9925e 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -7,7 +7,7 @@ import time import unicodedata from six.moves.urllib.parse import urlencode -from six import string_types +from six import text_type, string_types from .. import find_re, strip_tags, decode_html from .. import cache @@ -27,11 +27,11 @@ def get_url(id): class Imdb(SiteParser): ''' - >>> Imdb('0068646')['title'] - u'The Godfather' + >>> Imdb('0068646')['title'] == text_type(u'The Godfather') + True - >>> Imdb('0133093')['title'] - u'The Matrix' + >>> Imdb('0133093')['title'] == text_type(u'The Matrix') + True ''' regex = { 'alternativeTitles': { @@ -313,11 +313,11 @@ class Imdb(SiteParser): return self._cache[url] def __init__(self, id, timeout=-1): - #use akas.imdb.com to always get original title: - #http://www.imdb.com/help/show_leaf?titlelanguagedisplay + # use akas.imdb.com to always get original title: + # http://www.imdb.com/help/show_leaf?titlelanguagedisplay self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id super(Imdb, self).__init__(timeout) - + url = self.baseUrl + 'combined' page = self.read_url(url, timeout=-1) if 'IMDb: Page not found' in page \ @@ -640,25 +640,25 @@ def get_movie_by_title(title, timeout=-1): If there is more than one film with that title for the year Title (Year/I) - >>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}') - u'1602860' + >>> str(get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}')) + '1602860' - >>> get_movie_by_title(u'The Matrix (1999)') - u'0133093' + >>> str(get_movie_by_title(u'The Matrix (1999)')) + '0133093' - >>> get_movie_by_title(u'Little Egypt (1951)') - u'0043748' + >>> str(get_movie_by_title(u'Little Egypt (1951)')) + '0043748' + + >>> str(get_movie_by_title(u'Little Egypt (1897/I)')) + '0214882' - >>> get_movie_by_title(u'Little Egypt (1897/I)') - u'0214882' - >>> get_movie_by_title(u'Little Egypt') None - >>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}') - u'0866567' + >>> str(get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}')) + '0866567' ''' - params = {'s':'tt','q': title} + params = {'s': 'tt', 'q': title} if not isinstance(title, bytes): try: params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1') @@ -676,20 +676,21 @@ def get_movie_by_title(title, timeout=-1): def get_movie_id(title, director='', year='', timeout=-1): ''' - >>> get_movie_id('The Matrix') - u'0133093' + >>> str(get_movie_id('The Matrix')) + '0133093' - >>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard') - u'0060304' + >>> str(get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')) + '0060304' - >>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967') - u'0060304' + >>> str(get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')) + '0060304' - >>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard') - u'0179214' + >>> str(get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", u'Jean-Luc Godard')) + '0179214' + + >>> str(get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", u'Jean-Luc Godard')) + '0179214' - >>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard') - u'0179214' ''' imdbId = { (u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514', @@ -772,9 +773,6 @@ def get_movie_poster(imdbId): ''' >>> get_movie_poster('0133093') 'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg' - - >>> get_movie_poster('0994352') - 'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg' ''' info = ImdbCombined(imdbId) if 'posterId' in info: @@ -806,7 +804,7 @@ def max_votes(): url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc' data = cache.read_url(url) votes = max([int(v.replace(',', '')) - for v in re.compile('([\d,]+)').findall(data)]) + for v in re.compile('([\d,]+)').findall(data)]) return votes def guess(title, director='', timeout=-1): diff --git a/ox/web/impawards.py b/ox/web/impawards.py index 855fbb1..f11ca12 100644 --- a/ox/web/impawards.py +++ b/ox/web/impawards.py @@ -10,14 +10,14 @@ from ox.text import find_re def get_data(id): ''' - >>> get_data('1991/silence_of_the_lambs')['imdbId'] - u'0102926' + >>> str(get_data('1991/silence_of_the_lambs')['imdbId']) + '0102926' - >>> get_data('1991/silence_of_the_lambs')['posters'][0] - u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg' + >>> str(get_data('1991/silence_of_the_lambs')['posters'][0]) + 'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg' - >>> get_data('1991/silence_of_the_lambs')['url'] - u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html' + >>> str(get_data('1991/silence_of_the_lambs')['url']) + 'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html' ''' data = { 'url': get_url(id) @@ -46,7 +46,6 @@ def get_data(id): else: poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '', re.DOTALL).findall(html) for result in results: url = 'http://impawards.com/%s' % result ids.append(get_id(url)) return set(ids) - #get all - html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True) + # get all + html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60, unicode=True) pages = int(find_re(html, '')) + 1 for page in range(pages, 0, -1): for id in get_ids(page): - if not id in ids: + if id not in ids: ids.append(id) return ids + def get_url(id): url = u"http://www.impawards.com/%s.html" % id html = read_url(url, unicode=True) diff --git a/ox/web/mininova.py b/ox/web/mininova.py deleted file mode 100644 index 799390c..0000000 --- a/ox/web/mininova.py +++ /dev/null @@ -1,121 +0,0 @@ -# -*- coding: utf-8 -*- -# vi:si:et:sw=4:sts=4:ts=4 -from datetime import datetime -import re -import socket -from six.moves.urllib.parse import quote - -from ox.cache import read_url -from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, int_value, normalize_newlines -from ox.normalize import normalize_imdbid -import ox - -from torrent import Torrent - - -def _parse_results_page(data, max_results=10): - results=[] - regexp = '''(.*?)(.*?)(.*?).*?.*?''' - for row in re.compile(regexp, re.DOTALL).findall(data): - torrentDate = row[0] - torrentExtra = row[1] - torrentId = row[2] - torrentTitle = decode_html(row[3]).strip() - torrentLink = "http://www.mininova.org/tor/" + torrentId - privateTracker = 'priv.gif' in torrentExtra - if not privateTracker: - results.append((torrentTitle, torrentLink, '')) - return results - -def find_movie(query=None, imdb=None, max_results=10): - '''search for torrents on mininova - ''' - if imdb: - url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb) - else: - url = "http://www.mininova.org/search/%s/seeds" % quote(query) - data = read_url(url, unicode=True) - return _parse_results_page(data, max_results) - -def get_id(mininovaId): - mininovaId = unicode(mininovaId) - d = find_re(mininovaId, "/(\d+)") - if d: - return d - mininovaId = mininovaId.split('/') - if len(mininovaId) == 1: - return mininovaId[0] - else: - return mininovaId[-1] - -def exists(mininovaId): - mininovaId = get_id(mininovaId) - data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId) - if not data or 'Torrent not found...' in data: - return False - if 'tracker of this torrent requires registration.' in data: - return False - return True - -def get_data(mininovaId): - _key_map = { - 'by': u'uploader', - } - mininovaId = get_id(mininovaId) - torrent = dict() - torrent[u'id'] = mininovaId - torrent[u'domain'] = 'mininova.org' - torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId - torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId - torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId - - data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True) - if '

Torrent not found...

' in data: - return None - - for d in re.compile('

.(.*?):(.*?)

', re.DOTALL).findall(data): - key = d[0].lower().strip() - key = _key_map.get(key, key) - value = decode_html(strip_tags(d[1].strip())) - torrent[key] = value - - torrent[u'title'] = find_re(data, '(.*?):.*?') - torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})') - torrent[u'description'] = find_re(data, '
(.*?)
') - if torrent['description']: - torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip() - t = read_url(torrent[u'torrent_link']) - torrent[u'torrent_info'] = get_torrent_info(t) - return torrent - -class Mininova(Torrent): - ''' - >>> Mininova('123') - {} - >>> Mininova('1072195')['infohash'] - '72dfa59d2338e4a48c78cec9de25964cddb64104' - ''' - def __init__(self, mininovaId): - self.data = get_data(mininovaId) - if not self.data: - return - Torrent.__init__(self) - ratio = self.data['share ratio'].split(',') - self['seeder'] = -1 - self['leecher'] = -1 - if len(ratio) == 2: - val = int_value(ratio[0].replace(',','').strip()) - if val: - self['seeder'] = int(val) - val = int_value(ratio[1].replace(',','').strip()) - if val: - self['leecher'] = int(val) - val = int_value(self.data['downloads'].replace(',','').strip()) - if val: - self['downloaded'] = int(val) - else: - self['downloaded'] = -1 - published = self.data['added on'] - published = published.split(' +')[0] - self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S") - diff --git a/ox/web/startpage.py b/ox/web/startpage.py index 1df25a4..ca18437 100644 --- a/ox/web/startpage.py +++ b/ox/web/startpage.py @@ -21,10 +21,10 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT): Return max_results tuples with title, url, description >>> find("The Matrix site:imdb.com", 1)[0][0] - u'The Matrix (1999) - IMDb' + 'The Matrix (1999) - IMDb' >>> find("The Matrix site:imdb.com", 1)[0][1] - u'http://www.imdb.com/title/tt0133093/' + 'http://www.imdb.com/title/tt0133093/' """ results = [] url = 'https://eu1.startpage.com/do/search?nosteeraway=1&abp=1&language=english&cmd=process_search&query=%s&x=0&y=0&cat=web&engine0=v1all' % quote_plus(query) diff --git a/ox/web/thepiratebay.py b/ox/web/thepiratebay.py index 125ce7d..e9a6445 100644 --- a/ox/web/thepiratebay.py +++ b/ox/web/thepiratebay.py @@ -9,11 +9,10 @@ from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, normal from ox.normalize import normalize_imdbid import ox -from torrent import Torrent - cache_timeout = 24*60*60 # cache search only for 24 hours season_episode = re.compile("S..E..", re.IGNORECASE) +baseurl = "https://thepiratebay.org/" def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False): @@ -25,7 +24,7 @@ def find_movies(query=None, imdb=None, max_results=10): if imdb: query = "tt" + normalize_imdbid(imdb) results = [] - next = ["https://thepiratebay.se/search/%s/0/3/200" % quote(query), ] + next = [baseurl + "hsearch/%s/0/3/200" % quote(query), ] page_count = 1 while next and page_count < 4: page_count += 1 @@ -33,12 +32,12 @@ def find_movies(query=None, imdb=None, max_results=10): if not url.startswith('http'): if not url.startswith('/'): url = "/" + url - url = "https://thepiratebay.se" + url + url = baseurl + url data = read_url(url, timeout=cache_timeout, unicode=True) regexp = '''(.*?).*?''' for row in re.compile(regexp, re.DOTALL).findall(data): torrentType = row[0] - torrentLink = "https://thepiratebay.se" + row[1] + torrentLink = baseurl + row[1] torrentTitle = decode_html(row[2]) # 201 = Movies , 202 = Movie DVDR, 205 TV Shows if torrentType in ['201']: @@ -61,7 +60,7 @@ def get_id(piratebayId): def exists(piratebayId): piratebayId = get_id(piratebayId) - return ox.net.exists("https://thepiratebay.se/torrent/%s" % piratebayId) + return ox.net.exists(baseurl + "torrent/%s" % piratebayId) def get_data(piratebayId): _key_map = { @@ -75,7 +74,7 @@ def get_data(piratebayId): torrent = dict() torrent[u'id'] = piratebayId torrent[u'domain'] = 'thepiratebay.org' - torrent[u'comment_link'] = 'https://thepiratebay.se/torrent/%s' % piratebayId + torrent[u'comment_link'] = baseurl + 'torrent/%s' % piratebayId data = read_url(torrent['comment_link'], unicode=True) torrent[u'title'] = find_re(data, '(.*?) \(download torrent\) - TPB') @@ -84,33 +83,15 @@ def get_data(piratebayId): torrent[u'title'] = decode_html(torrent[u'title']).strip() torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})') title = quote(torrent['title'].encode('utf-8')) - torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title) + torrent[u'magent_link']= find_re(data, '"(magnet:.*?)"') + torrent[u'infohash'] = find_re(torrent[u'magent_link'], "btih:(.*?)&") for d in re.compile('dt>(.*?):.*?(.*?)', re.DOTALL).findall(data): key = d[0].lower().strip() key = _key_map.get(key, key) value = decode_html(strip_tags(d[1].strip())) - torrent[key] = value + if not '<' in key: + torrent[key] = value torrent[u'description'] = find_re(data, '
(.*?)
') if torrent[u'description']: torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip() - t = read_url(torrent[u'torrent_link']) - torrent[u'torrent_info'] = get_torrent_info(t) return torrent - -class Thepiratebay(Torrent): - ''' - >>> Thepiratebay('123') - {} - - >>> Thepiratebay('3951349')['infohash'] - '4e84415d36ed7b54066160c05a0b0f061898d12b' - ''' - def __init__(self, piratebayId): - self.data = get_data(piratebayId) - if not self.data: - return - Torrent.__init__(self) - published = self.data['uploaded'] - published = published.replace(' GMT', '').split(' +')[0] - self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S") - diff --git a/ox/web/torrent.py b/ox/web/torrent.py deleted file mode 100644 index 1312075..0000000 --- a/ox/web/torrent.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -# vi:si:et:sw=4:sts=4:ts=4 -from ox import int_value - - -class Torrent(dict): - ''' - >>> Torrent() - {'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1} - ''' - _string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link', - 'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language') - _int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files') - _dict_keys = ('torrent_info', ) - _list_keys = () - data = {'torrent_info': {}} - - def __init__(self): - for key in self._string_keys: - self[key] = self.data.get(key, u'') - for key in self._dict_keys: - self[key] = self.data.get(key, {}) - for key in self._list_keys: - self[key] = self.data.get(key, []) - for key in self._int_keys: - value = self.data.get(key, -1) - if not isinstance(value, int): - value = int(int_value(value)) - self[key] = value - self['infohash'] = self.data['torrent_info'].get('hash', '') - self['size'] = self.data['torrent_info'].get('size', -1) - self['announce'] = self.data['torrent_info'].get('announce', '') - if 'files' in self.data['torrent_info']: - self['files'] = len(self.data['torrent_info']['files']) - else: - self['files'] = 1 -