diff --git a/ox/form.py b/ox/form.py
index faa1551..5358afb 100644
--- a/ox/form.py
+++ b/ox/form.py
@@ -5,8 +5,7 @@ from __future__ import print_function
 
 import itertools
 import mimetypes
-import os
-import hashlib
+import random
 import sys
 
 from six import PY2
@@ -21,7 +20,8 @@ _fmt = '%%0%dd' % _width
 
 def _make_boundary():
     # Craft a random boundary.
-    boundary = ('=' * 15) + hashlib.sha1(os.urandom(32)).hexdigest() + '=='
+    token = random.randrange(sys.maxsize)
+    boundary = ('=' * 15) + (_fmt % token) + '=='
     return boundary
 
 class MultiPartForm(object):
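
The form.py hunk above replaces the hashlib/os.urandom boundary with the
random-token scheme used by the stdlib's email.generator. A minimal runnable
sketch of the resulting _make_boundary(), assuming _width is derived from
sys.maxsize as in that stdlib module (only _fmt appears in the hunk context,
so the _width line here is an assumption):

    import random
    import sys

    _width = len(repr(sys.maxsize - 1))  # assumed, mirrors email.generator
    _fmt = '%%0%dd' % _width             # context line shown in the hunk

    def _make_boundary():
        # 15 '=' chars + zero-padded random token + '==', unlikely to
        # collide with bytes in the form payload.
        token = random.randrange(sys.maxsize)
        return ('=' * 15) + (_fmt % token) + '=='

    print(_make_boundary())
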
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index b63dc69..c766ecf 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -27,52 +27,6 @@ def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_
 def get_url(id):
     return "http://www.imdb.com/title/tt%s/" % id
 
-
-def reference_section(id):
-    return {
-        'page': 'reference',
-        're': [
-            '<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
-            '<a href="/name/.*?>(.*?)</a>'
-        ],
-        'type': 'list'
-    }
-
-
-def zebra_list(label, more=None):
-    conditions = {
-        'page': 'reference',
-        're': [
-            '_label">' + label + '</td>.*?<ul(.*?)</ul>',
-            '<li.*?>(.*?)</li>'
-        ],
-        'type': 'list'
-    }
-    if more:
-        conditions['re'] += more
-    return conditions
-
-def zebra_table(label, more=None, type='string'):
-    conditions = {
-        'page': 'reference',
-        're': [
-            '_label">' + label + '</td>.*?<td>(.*?)</td>',
-        ],
-        'type': type,
-    }
-    if more:
-        conditions['re'] += more
-    return conditions
-
-
-'''
-'posterIds': {
-    'page': 'posters',
-    're': '/unknown-thumbnail/media/rm(.*?)/tt',
-    'type': 'list'
-},
-'''
-
 class Imdb(SiteParser):
     '''
     >>> Imdb('0068646')['title'] == text_type(u'The Godfather')
@@ -91,29 +45,49 @@ class Imdb(SiteParser):
             'type': 'list'
         },
         'aspectratio': {
-            'page': 'reference',
-            're': 'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.]+)',
+            'page': 'combined',
+            're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)',
             'type': 'float',
         },
-        'budget': zebra_table('Budget', more=[
-            lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
-        ], type='int'),
-        'cast': {
-            'page': 'reference',
+        'budget': {
+            'page': 'business',
             're': [
-                ' <table class="cast_list">(.*?)</table>',
-                '<td.*?itemprop="actor".*?>.*?>(.*?)</a>.*?<td class="character">(.*?)</td>',
+                '<h5>Budget</h5>\s*?\$(.*?)<br',
+                lambda data: find_re(data.replace(',', ''), '\d+')
+            ],
+            'type': 'int'
+        },
+        'cast': {
+            'page': 'combined',
+            're': [
+                '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
                 lambda ll: [strip_tags(l) for l in ll]
+            ],
+            'type': 'list'
+        },
+        'cinematographer': {
+            'page': 'combined',
+            're': [
+                lambda data: data.split('Series Crew')[0],
+                'Cinematography by</a>(.*?)</table>',
+                '<a href="/name/.*?>(.*?)</a>'
+            ],
             'type': 'list'
         },
-        'cinematographer': reference_section('cinematographers'),
         'connections': {
             'page': 'movieconnections',
             're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n  <a|<script)',
             'type': 'list'
         },
-        'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
+        'country': {
+            'page': 'combined',
+            're': [
+                '<div class="info"><h5>Country:</h5>.*?<div class="info">',
+                #'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
+                '<a.*?>(.*?)</a>',
+            ],
+            'type': 'list'
+        },
         'creator': {
             'page': '',
             're': [
@@ -123,12 +97,44 @@ class Imdb(SiteParser):
             ],
             'type': 'list'
         },
-        'director': reference_section('directors'),
-        'editor': reference_section('editors'),
-        'composer': reference_section('composers'),
+        'director': {
+            'page': 'combined',
+            're': [
+                lambda data: data.split('Series Crew')[0],
+                'Directed by</a>(.*?)</table>',
+                '<a href="/name/.*?>(.*?)</a>'
+            ],
+            'type': 'list'
+        },
+        '_director': {
+            'page': 'combined',
+            're': [
+                '<h5>Director:</h5>.*?<div class="info-content">(.*?)</div>',
+                '<a href="/name/.*?>(.*?)</a>'
+            ],
+            'type': 'list'
+        },
+        'editor': {
+            'page': 'combined',
+            're': [
+                lambda data: data.split('Series Crew')[0],
+                'Film Editing by</a>(.*?)</table>',
+                '<a href="/name/.*?>(.*?)</a>'
+            ],
+            'type': 'list'
+        },
+        'composer': {
+            'page': 'combined',
+            're': [
+                lambda data: data.split('Series Crew')[0],
+                'Original Music by</a>(.*?)</table>',
+                '<a href="/name/.*?>(.*?)</a>'
+            ],
+            'type': 'list'
+        },
         'episodeTitle': {
-            'page': 'reference',
-            're': '<h3 itemprop="name">(.*?)<',
+            'page': 'combined',
+            're': '<div id="tn15title">.*?<em>(.*?)</em>',
             'type': 'string'
         },
         'filmingLocations': {
@@ -139,44 +145,77 @@ class Imdb(SiteParser):
             ],
             'type': 'list'
         },
-        'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>', lambda x: x[0]]),
-        'gross': zebra_table('Cumulative Worldwide Gross', more=[
-            lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
-        ], type='int'),
-        'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
+        'genre': {
+            'page': 'combined',
+            're': [
+                '<h5>Genre:</h5>(.*?)<hr',
+                '<a href="/Sections/Genres/.*?/">(.*?)</a>'
+            ],
+            'type': 'list'
+        },
+        'gross': {
+            'page': 'business',
+            're': [
+                '<h5>Gross</h5>\s*?\$(.*?)<br',
+                lambda data: find_re(data.replace(',', ''), '\d+')
+            ],
+            'type': 'int'
+        },
+        'language': {
+            'page': 'combined',
+            're': [
+                '<div class="info"><h5>Language:</h5>.*?<div class="info">',
+                #'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
+                '<a.*?>(.*?)</a>',
+            ],
+            'type': 'list'
+        },
         'originalTitle': {
             'page': 'releaseinfo',
             're': '\(original title\)\s*(.*?)</td>',
             'type': 'string'
         },
-        'summary': zebra_table('Plot Summary', more=[
-            '<p>(.*?)<em',
-        ], type='string'),
-        'producer': reference_section('producers'),
-        'productionCompany': {
-            'page': 'reference',
+        'summary': {
+            'page': 'plotsummary',
+            're': '<p class="plotSummary">(.*?)<\/p>',
+            'type': 'string'
+        },
+        'posterId': {
+            'page': 'combined',
+            're': '<link rel="image_src" href="(.*?)">',
+            'type': 'string'
+        },
+        'posterIds': {
+            'page': 'posters',
+            're': '/unknown-thumbnail/media/rm(.*?)/tt',
+            'type': 'list'
+        },
+        'producer': {
+            'page': 'combined',
             're': [
-                'Production Companies.*?<ul(.*?)</ul>',
+                lambda data: data.split('Series Crew')[0],
+                'Produced by</a>(.*?)</table>',
+                '<a href="/name/.*?>(.*?)</a>'
+            ],
+            'type': 'list'
+        },
+        'productionCompany': {
+            'page': 'combined',
+            're': [
+                'Production Companies</b><ul>(.*?)</ul>',
                 '<a href="/company/.*?/">(.*?)</a>'
             ],
             'type': 'list'
         },
         'rating': {
-            'page': 'reference',
-            're': [
-                '<div class="ipl-rating-star ">(.*?)</div>',
-                'ipl-rating-star__rating">([\d,.]+?)</span>',
-            ],
+            'page': 'combined',
+            're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
             'type': 'float'
         },
         'releasedate': {
@@ -187,43 +226,59 @@ class Imdb(SiteParser):
             ],
             'type': 'list'
         },
-        #FIXME using some /offsite/ redirect now
-        #'reviews': {
-        #    'page': 'externalreviews',
-        #    're': [
-        #        '<ul class="simpleList">(.*?)</ul>',
-        #        '<li>.*?<a href="(http.*?)".*?>(.*?)</a>.*?</li>'
-        #    ],
-        #    'type': 'list'
-        #},
-        'runtime': zebra_list('Runtime'),
-        'color': zebra_list('Color', more=['<a.*?>(.*?)</a>']),
-        'sound': zebra_list('Sound Mix', more=['<a.*?>(.*?)</a>', lambda x: x[0]]),
-
-        'season': {
-            'page': 'reference',
+        'reviews': {
+            'page': 'externalreviews',
             're': [
-                '<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
-                'Season (\d+)',
+                '<ol>(.*?)</ol>',
+                '<li><a href="(http.*?)".*?>(.*?)</a></li>'
+            ],
+            'type': 'list'
+        },
+        'runtime': {
+            'page': 'combined',
+            're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
+            'type': 'string'
+        },
+        'color': {
+            'page': 'combined',
+            're': [
+                '<h5>Color:</h5><div class="info-content">(.*?)</div>',
+                '<a.*?>(.*?)</a>'
+            ],
+            'type': 'list'
+        },
+        'sound': {
+            'page': 'combined',
+            're': [
+                '<h5>Sound Mix:</h5><div class="info-content">(.*?)</div>',
+                '<a.*?>(.*?)</a>'
+            ],
+            'type': 'list'
+        },
+        'season': {
+            'page': 'combined',
+            're': [
+                '<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
+                '\(Season (\d+), Episode \d+\)',
             ],
             'type': 'int'
         },
         'episode': {
-            'page': 'reference',
+            'page': 'combined',
             're': [
-                '<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
-                'Episode (\d+)',
+                '<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
+                '\(Season \d+, Episode (\d+)\)',
             ],
             'type': 'int'
         },
         'series': {
-            'page': 'reference',
-            're': '<h4 itemprop="name">.*?<a href="/title/tt(\d{7})',
+            'page': 'combined',
+            're': '<div class="parent">.*?<a href="/title/tt(\d{7})/"',
             'type': 'string'
         },
         'isSeries': {
-            'page': 'reference',
-            're': '<span class="titlereference-title-year">.*?(TV series|TV mini-series) ',
+            'page': 'combined',
+            're': '<span class="tv-extra">.*?(TV series|TV mini-series) ',
            'type': 'string'
         },
         'title': {
@@ -240,17 +295,22 @@ class Imdb(SiteParser):
             'type': 'list',
         },
         'votes': {
-            'page': 'reference',
-            're': [
-                'class="ipl-rating-star__total-votes">\((.*?)\)',
-                lambda r: r.replace(',', '')
-            ],
+            'page': 'combined',
+            're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
             'type': 'string'
         },
-        'writer': reference_section('writers'),
+        'writer': {
+            'page': 'combined',
+            're': [
+                lambda data: data.split('Series Crew')[0],
+                'Writing credits</a>(.*?)</table>',
+                '<a href="/name/.*?>(.*?)</a>'
+            ],
+            'type': 'list'
+        },
         'year': {
-            'page': 'reference',
-            're': '=["\']og:title["\'] content="[^"]*?\((\d{4}).*?"',
+            'page': 'combined',
+            're': '="og:title" content="[^"]*?\((\d{4}).*?"',
             'type': 'int'
         },
         'credits': {
@@ -275,7 +335,7 @@ class Imdb(SiteParser):
         self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
         super(Imdb, self).__init__(timeout)
 
-        url = self.baseUrl + 'reference'
+        url = self.baseUrl + 'combined'
         page = self.read_url(url, timeout=-1)
         if 'IMDb: Page not found' in page \
             or 'The requested URL was not found on our server.' in page:
@@ -293,6 +353,8 @@ class Imdb(SiteParser):
 
         if 'country' in self:
             self['country'] = [normalize_country_name(c) or c for c in self['country']]
+        if 'sound' in self:
+            self['sound'] = list(set(self['sound']))
 
         def cleanup_title(title):
             if title.startswith('"') and title.endswith('"'):
@@ -327,8 +389,6 @@ class Imdb(SiteParser):
                 del self['alternativeTitles']
 
         if 'runtime' in self and self['runtime']:
-            if isinstance(self['runtime'], list):
-                self['runtime'] = self['runtime'][0]
             if 'min' in self['runtime']:
                 base = 60
             else:
@@ -336,9 +396,8 @@ class Imdb(SiteParser):
             self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
         if 'runtime' in self and not self['runtime']:
             del self['runtime']
-
-        if 'sound' in self:
-            self['sound'] = list(sorted(set(self['sound'])))
+        if 'votes' in self:
+            self['votes'] = self['votes'].replace(',', '')
 
         if 'cast' in self:
             if isinstance(self['cast'][0], string_types):
@@ -346,7 +405,6 @@ class Imdb(SiteParser):
             self['actor'] = [c[0] for c in self['cast']]
             def cleanup_character(c):
                 c = c.replace('(uncredited)', '').strip()
-                c = re.sub('\s+', ' ', c)
                 return c
             self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
                             for x in self['cast']]
@@ -370,11 +428,18 @@ class Imdb(SiteParser):
                     return r
                 cc[rel] = list(map(get_conn,
                                    re.compile('<a href="/title/tt(\d{7})/?">(.*?)</a>(.*?)<\/div',
                                               re.DOTALL).findall(data)))
+            self['connections'] = cc
 
         for key in ('country', 'genre'):
             if key in self:
                 self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
+        #0092999
+        if '_director' in self:
+            if 'series' in self or 'isSeries' in self:
+                self['creator'] = self.pop('_director')
+            else:
+                del self['_director']
         if 'isSeries' in self:
             del self['isSeries']
             self['isSeries'] = True
@@ -493,7 +558,7 @@ class ImdbCombined(Imdb):
     def __init__(self, id, timeout=-1):
         _regex = {}
         for key in self.regex:
-            if self.regex[key]['page'] in ('releaseinfo', 'reference'):
+            if self.regex[key]['page'] in ('combined', 'releaseinfo'):
                 _regex[key] = self.regex[key]
         self.regex = _regex
         super(ImdbCombined, self).__init__(id, timeout)
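
For review context: the regex entries above hand SiteParser raw strings, and
the cleanup code in Imdb.__init__ normalizes them afterwards. The runtime
hunks are the easiest to check by hand - the combined page yields '102 min'
or '6080 sec' style strings, which the retained cleanup turns into seconds.
A self-contained sketch of that step, with find_re standing in for ox.find_re:

    import re

    def find_re(data, regex):
        # stand-in for ox.find_re: first match of the group, or ''
        result = re.compile(regex, re.DOTALL).findall(data)
        return result[0].strip() if result else ''

    def runtime_to_seconds(runtime):
        # mirrors the cleanup in Imdb.__init__ after this diff:
        # 'min' selects a base of 60, anything else is taken as seconds
        base = 60 if 'min' in runtime else 1
        return int(find_re(runtime, '([0-9]+)')) * base

    assert runtime_to_seconds('102 min') == 6120
    assert runtime_to_seconds('6080 sec') == 6080
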
diff --git a/ox/web/opensubtitles.py b/ox/web/opensubtitles.py
index 2346a7d..7684402 100644
--- a/ox/web/opensubtitles.py
+++ b/ox/web/opensubtitles.py
@@ -2,12 +2,12 @@
 # vi:si:et:sw=4:sts=4:ts=4
 import re
 
+import feedparser
 from ox.cache import read_url
 from ox import find_re, strip_tags
 from ox.iso import langCode2To3, langTo3Code
 
 def find_subtitles(imdb, parts = 1, language = "eng"):
-    import feedparser
     if len(language) == 2:
         language = langCode2To3(language)
     elif len(language) != 3:
diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py
index 61a79bd..fa21948 100644
--- a/ox/web/siteparser.py
+++ b/ox/web/siteparser.py
@@ -33,7 +33,7 @@ class SiteParser(dict):
         return "%s%s" % (self.baseUrl, page)
 
     def read_url(self, url, timeout):
-        if url not in self._cache:
+        if not url in self._cache:
             self._cache[url] = read_url(url, timeout=timeout, unicode=True)
         return self._cache[url]
 
diff --git a/ox/web/youtube.py b/ox/web/youtube.py
index 805f716..6a0ac3f 100644
--- a/ox/web/youtube.py
+++ b/ox/web/youtube.py
@@ -7,6 +7,7 @@ import re
 from xml.dom.minidom import parseString
 import json
 
+import feedparser
 import ox
 from ox.cache import read_url, cache_timeout
 
@@ -60,7 +61,6 @@ def get_video_info(id):
     return info
 
 def find(query, max_results=10, offset=1, orderBy='relevance'):
-    import feedparser
     query = quote(query)
     url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
     data = read_url(url)
diff --git a/requirements.txt b/requirements.txt
index b7509ec..e611b3e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 chardet
+feedparser
 six>=1.5.2
diff --git a/setup.py b/setup.py
index 89fc803..34bb2af 100644
--- a/setup.py
+++ b/setup.py
@@ -50,7 +50,7 @@ setup(
     download_url="https://code.0x2620.org/python-ox/download",
     license="GPLv3",
     packages=['ox', 'ox.torrent', 'ox.web'],
-    install_requires=['six>=1.5.2', 'chardet'],
+    install_requires=['six>=1.5.2', 'chardet', 'feedparser'],
     keywords=[
     ],
     classifiers=[
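
Note on the dependency change: feedparser moves from function-local imports
(which kept it optional) to module-level imports in opensubtitles.py and
youtube.py, so it is added to requirements.txt and setup.py. A quick check
that the hard dependency behaves as those helpers expect, using only the
documented feedparser.parse() entry point (the sample feed is invented for
the test):

    import feedparser

    SAMPLE = '''<?xml version="1.0"?>
    <rss version="2.0"><channel><title>demo</title>
    <item><title>first</title><link>http://example.com/1</link></item>
    </channel></rss>'''

    # parse() accepts a URL or a raw XML string and exposes .entries
    feed = feedparser.parse(SAMPLE)
    assert feed.entries[0].title == 'first'
    print(feed.entries[0].link)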