From cc1bad76cdcac8af5f53b3f12f67ed8c948f3037 Mon Sep 17 00:00:00 2001 From: j Date: Sat, 3 Aug 2019 23:35:16 +0200 Subject: [PATCH 1/5] update user agent --- ox/net.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/net.py b/ox/net.py index 2a5e71b..59e6abe 100644 --- a/ox/net.py +++ b/ox/net.py @@ -21,7 +21,7 @@ from chardet.universaldetector import UniversalDetector DEBUG = False # Default headers for HTTP requests. DEFAULT_HEADERS = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4', From 388f33ebb629c56751dc5bef6c6bbe2d33f60876 Mon Sep 17 00:00:00 2001 From: j Date: Sat, 3 Aug 2019 23:38:31 +0200 Subject: [PATCH 2/5] cache imdb urls in parallel --- ox/web/siteparser.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py index 61a79bd..8c212bf 100644 --- a/ox/web/siteparser.py +++ b/ox/web/siteparser.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 import re +from multiprocessing.pool import ThreadPool from six import string_types @@ -28,6 +29,7 @@ def cleanup(key, data, data_type): class SiteParser(dict): baseUrl = '' regex = {} + pool = ThreadPool(8) def get_url(self, page): return "%s%s" % (self.baseUrl, page) @@ -39,6 +41,9 @@ class SiteParser(dict): def __init__(self, timeout=-1): self._cache = {} + urls = list(set(self.get_url(self.regex[key]['page']) for key in self.regex)) + self.pool.map(self.get_url, urls) + for key in self.regex: url = self.get_url(self.regex[key]['page']) data = self.read_url(url, timeout) From 665a4038b2df1222b9584ec950c3c35f1fe81a01 Mon Sep 17 00:00:00 2001 From: j Date: Thu, 8 Aug 2019 17:08:13 +0200 Subject: [PATCH 3/5] space --- ox/web/wikipedia.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py index cb73758..de8b064 100644 --- a/ox/web/wikipedia.py +++ b/ox/web/wikipedia.py @@ -17,7 +17,7 @@ def get_id(url): def get_url(id=None, imdb=None, allmovie=None): if imdb: - query = '"%s"'% imdb + query = '"%s"' % imdb result = find(query) if result: url = result[0][1] @@ -26,7 +26,7 @@ def get_url(id=None, imdb=None, allmovie=None): return url return "" if allmovie: - query = '"amg_id = 1:%s"'% allmovie + query = '"amg_id = 1:%s"' % allmovie result = find(query) if result: url = result[0][1] @@ -140,7 +140,7 @@ def get_allmovie_id(wikipedia_url): return data.get('amg_id', '') def find(query, max_results=10): - query = {'action': 'query', 'list':'search', 'format': 'json', + query = {'action': 'query', 'list': 'search', 'format': 'json', 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')} url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query) data = read_url(url) From cef85fc4defb57be8442bf83bdd3fbc1751e3ce1 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 15 Nov 2019 14:51:13 +0100 Subject: [PATCH 4/5] depend on lxml --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index b7509ec..51c3f99 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ chardet six>=1.5.2 +lxml From 03c119155081f7b9f65e1f55d3a58708c9dc6704 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 15 Nov 2019 14:51:32 +0100 Subject: [PATCH 5/5] fall back to storyline for summary --- ox/web/imdb.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 7d91dc7..2f14e33 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -199,6 +199,11 @@ class Imdb(SiteParser): 'summary': zebra_table('Plot Summary', more=[ '

(.*?)Storyline.*?

(.*?)

', + 'type': 'string' + }, 'posterId': { 'page': 'reference', 're': '', @@ -517,10 +522,13 @@ class Imdb(SiteParser): ]) if self['releasedate'] == 'x': del self['releasedate'] + + if 'summary' not in self and 'storyline' in self: + self['summary'] = self.pop('storyline') if 'summary' in self: if isinstance(self['summary'], list): self['summary'] = self['summary'][0] - self['summary'] = self['summary'].split('