cache imdb urls in parallel

j 2019-08-03 23:38:31 +02:00
parent cc1bad76cd
commit 388f33ebb6


@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from multiprocessing.pool import ThreadPool
from six import string_types
@@ -28,6 +29,7 @@ def cleanup(key, data, data_type):
class SiteParser(dict):
    baseUrl = ''
    regex = {}
    pool = ThreadPool(8)
    def get_url(self, page):
        return "%s%s" % (self.baseUrl, page)
@@ -39,6 +41,9 @@ class SiteParser(dict):
    def __init__(self, timeout=-1):
        self._cache = {}
        urls = list(set(self.get_url(self.regex[key]['page']) for key in self.regex))
        self.pool.map(self.get_url, urls)
        for key in self.regex:
            url = self.get_url(self.regex[key]['page'])
            data = self.read_url(url, timeout)
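
The idea behind the change, as a rough sketch: collect the distinct page URLs referenced by the parser's regex table and hand them to a ThreadPool so the pages are fetched concurrently before the sequential parsing loop runs against an already-warm cache. The fetch_page/warm_cache helpers and the example IMDb URLs below are illustrative assumptions, not the library's actual API.

# Minimal sketch of the parallel cache-warming pattern (assumed helper names,
# not the code from this commit).
from multiprocessing.pool import ThreadPool

try:
    from urllib.request import urlopen   # Python 3
except ImportError:
    from urllib2 import urlopen          # Python 2 fallback

_cache = {}

def fetch_page(url, timeout=10):
    # Fetch each URL once and keep the body; repeat calls return the cached copy.
    if url not in _cache:
        _cache[url] = urlopen(url, timeout=timeout).read()
    return _cache[url]

def warm_cache(urls, workers=8):
    # Deduplicate, then fetch all pages concurrently. Threads fit here because
    # the work is I/O bound and the cache dict can be shared in-process.
    pool = ThreadPool(workers)
    try:
        pool.map(fetch_page, list(set(urls)))
    finally:
        pool.close()
        pool.join()

if __name__ == '__main__':
    pages = [
        'https://www.imdb.com/title/tt0133093/',
        'https://www.imdb.com/title/tt0133093/fullcredits',
    ]
    warm_cache(pages)
    # The sequential pass now reads from the warm cache with no extra requests.
    for url in pages:
        print('%s: %d bytes' % (url, len(fetch_page(url))))

Mapping a plain fetch function over the deduplicated URL set keeps the worker threads I/O bound, so a small pool (8, matching the diff) is enough to overlap the network round-trips without any process or serialization overhead.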