always utf-8, fixes #3209

fix keyword parsing
fix release date
2019-06-28 12:21:32 +02:00 · 2019-06-28 09:58:47 +02:00 · 2019-06-26 06:53:01 +02:00 · 2019-04-30 18:44:33 +02:00 · 2019-04-29 12:22:27 +02:00
4 changed files with 46 additions and 4 deletions
--- a/ox/api.py
+++ b/ox/api.py
@@ -227,6 +227,8 @@ def signin(url):
        url = 'https://%s/api/' % url
    else:
        site = url.split('/')[2]
+    if not url.endswith('/'):
+        url += '/'
    api = API(url)
    update = False
    try:
--- a/ox/cache.py
+++ b/ox/cache.py
@@ -456,7 +456,47 @@ class RedisCache(KVCache):
        self.backend = redis.from_url(self.url)


-if cache_path().startswith('fs:'):
+class FallbackCache(KVCache):
+    caches = []
+
+    def __init__(self):
+        fallback = cache_path()
+        for path in fallback.split('|'):
+            os.environ['oxCACHE'] = path
+            if path.startswith('redis:'):
+                store = RedisCache()
+            elif path.startswith('memcache:'):
+                store = MemCache()
+            self.caches.append(store)
+        os.environ['oxCACHE'] = fallback
+
+    def get(self, url, data, headers=None, timeout=-1, value="data"):
+        if timeout == 0:
+            return None
+
+        info_key, data_key = self._keys(url, data, headers)
+        for cache in self.caches:
+            try:
+                info = cache.backend.get(info_key)
+            except:
+                info = None
+            if info:
+                return cache.get(url, data, headers, timeout, value)
+        return None
+
+    def set(self, url, post_data, data, headers):
+        self.caches[0].set(url, post_data, data, headers)
+        for cache in self.caches[1:]:
+            cache.delete(url, post_data, headers)
+
+    def delete(self, url, data=None, headers=None):
+        for cache in self.caches:
+            cache.delete(url, data, headers)
+
+
+if '|' in cache_path():
+    store = FallbackCache()
+elif cache_path().startswith('fs:'):
    store = FileCache()
 elif cache_path().startswith('redis:'):
    store = RedisCache()
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -187,7 +187,7 @@ class Imdb(SiteParser):
        ], type='int'),
        'keyword': {
            'page': 'keywords',
-            're': '<a href="/keyword/.*?>(.*?)</a>',
+            're': 'data-item-keyword="(.*?)"',
            'type': 'list'
        },
        'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
@@ -224,7 +224,7 @@ class Imdb(SiteParser):
        'releasedate': {
            'page': 'releaseinfo',
            're': [
-                '<td class="release_date">(.*?)</td>',
+                '<td class="release-date-item__date".*?>(.*?)</td>',
                strip_tags,
            ],
            'type': 'list'
--- a/ox/web/piratecinema.py
+++ b/ox/web/piratecinema.py
@ -7,7 +7,7 @@ from ox.net import read_url

 def get_poster_url(id):
    url = 'http://piratecinema.org/posters/'
-    html = read_url(url, unicode=True)
+    html = read_url(url).decode('utf-8')
    results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
    for result in results:
        if result[1] == id:
Author	SHA1	Message	Date
j	2026b64faf	always utf-8, fixes #3209	2019-06-28 12:21:32 +02:00
j	0728847ffa	fix keyword parsing	2019-06-28 09:58:47 +02:00
j	8ecb14795f	fix release date	2019-06-26 06:53:01 +02:00
j	75b12dfb86	normalize api to end with /	2019-04-30 18:44:33 +02:00
j	8675edf19f	fallback cache	2019-04-29 12:22:27 +02:00