From 388f33ebb629c56751dc5bef6c6bbe2d33f60876 Mon Sep 17 00:00:00 2001
From: j
Date: Sat, 3 Aug 2019 23:38:31 +0200
Subject: [PATCH] cache imdb urls in parallel

---
Review notes (this section is ignored when the patch is applied):
- The thread pool now maps read_url(), the same call the loop below uses to
  fetch each page, with the constructor's timeout. The earlier revision
  mapped get_url(), which only concatenates baseUrl and the page path, so
  no page was requested by the pool.
- Presumably read_url() stores its result in self._cache so the serial loop
  then finds the pages already fetched; confirm against the full file.
- The blob id on the "index" line and the commit id on the first line
  predate this one-line correction.

 ox/web/siteparser.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py
index 61a79bd..8c212bf 100644
--- a/ox/web/siteparser.py
+++ b/ox/web/siteparser.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 import re
+from multiprocessing.pool import ThreadPool
 
 from six import string_types
 
@@ -28,6 +29,7 @@ def cleanup(key, data, data_type):
 class SiteParser(dict):
     baseUrl = ''
     regex = {}
+    pool = ThreadPool(8)
 
     def get_url(self, page):
         return "%s%s" % (self.baseUrl, page)
@@ -39,6 +41,9 @@ class SiteParser(dict):
 
     def __init__(self, timeout=-1):
         self._cache = {}
+        urls = list(set(self.get_url(self.regex[key]['page']) for key in self.regex))
+        self.pool.map(lambda url: self.read_url(url, timeout), urls)
+
         for key in self.regex:
             url = self.get_url(self.regex[key]['page'])
             data = self.read_url(url, timeout)