# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
"""Scrape film records from the Arsenal film database (films.arsenal-berlin.de).

Recovered from patch f086c64e51e66e5eb2147be8baec87f1c14f0ab7
("parse arsenal", j <0x006A@0x2620.org>, Thu, 6 Jun 2013 11:20:43 +0200),
which creates ox/web/arsenal.py.
"""
from datetime import datetime
try:
    from urllib import urlencode  # Python 2
except ImportError:
    from urllib.parse import urlencode  # Python 3
import json
import os
import re

from ox import find_re, strip_tags, decode_html
from ox.cache import read_url
from ox.net import open_url

_BASE_URL = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d'

# NOTE(review): the HTML tags inside the title, director, unit and field
# patterns were stripped from the text this module was recovered from (only
# "(.*?)", "Director: (.*?)" and "(.*?): (.*)" survived).  The tags below are
# a reconstruction -- confirm them against the upstream file / live page
# markup before relying on them.
_TITLE_RE = re.compile('<h1>(.*?)</h1>')
_DIRECTOR_RE = re.compile('<b>Director: (.*?)</b>')
_UNIT_RE = re.compile("<div class='unit'>(.*?)</div>", re.DOTALL)
_FIELD_RE = re.compile('<b>(.*?)</b>: (.*)', re.DOTALL)
# Raw string so the backslash escapes reach the regex engine untouched.
_IMAGE_RE = re.compile(r"caUI.initImageScroller\(\[\{url:'(.*?)'")

# Fields converted to int when their scraped value consists of digits only.
_INT_FIELDS = (
    'year',
    'length in metres',
    'forum participation year',
    'number of reels',
)


def get_data(id, language='en'):
    """Fetch and parse the detail page of one object.

    id       -- numeric object id, interpolated into the detail URL
    language -- 'de' requests the de_DE variant of the page; any other
                value uses the default URL

    Returns a dict with 'id', 'url' and whatever fields could be scraped
    (field labels are lower-cased and used as keys), or None when the page
    contains 'ID does not exist' or 'Willkommen in der Datenbank des Arsenal'
    (presumably the landing page served for unknown ids -- not verifiable
    from this file).

    Raises ValueError if the running-time field is present but not numeric.
    """
    url = _BASE_URL % id
    if language == 'de':
        url += '/lang/de_DE'
    html = read_url(url, unicode=True)
    if 'ID does not exist' in html or 'Willkommen in der Datenbank des Arsenal' in html:
        return None

    data = {u'id': id, u'url': url}

    # Single-valued fields: keep the first match, skip the key if none.
    single_fields = (
        (u'title', _TITLE_RE),
        (u'director', _DIRECTOR_RE),
        (u'image', _IMAGE_RE),
    )
    for name, pattern in single_fields:
        found = pattern.findall(html)
        if found:
            data[name] = found[0]

    # Each unit holds one "label: value" pair; units that do not match the
    # field pattern are ignored.
    for unit in _UNIT_RE.findall(html):
        fields = _FIELD_RE.findall(unit)
        if not fields:
            continue
        label, value = fields[0]
        key = label.lower()
        if key == "forum catalogue pdf":
            # Keep the link target instead of the link text.
            data[key] = find_re(value, '"(http:.*?)"')
        else:
            data[key] = strip_tags(value)

    if "running time (minutes)" in data:
        # Decimal comma -> decimal point, then minutes -> seconds.
        minutes = data.pop("running time (minutes)").replace(',', '.')
        data[u'runtime'] = float(minutes) * 60

    for key in _INT_FIELDS:
        if key in data and data[key].isdigit():
            data[key] = int(data[key])
    return data


def _save(filename, data):
    """Write *data* to *filename* as JSON, replacing any previous content."""
    with open(filename, 'w') as f:
        json.dump(data, f)


def backup(filename, stop=11872):
    """Scrape objects into the JSON file *filename*, resuming if it exists.

    filename -- path of the JSON dump; loaded first when it already exists
    stop     -- exclusive upper bound of the object ids to fetch

    Fetching starts at the highest id already present in the dump (or at 1
    for a fresh dump).  The file is rewritten every time the number of
    stored entries reaches a multiple of ten, and once more at the end.

    Returns the dict of all entries, keyed by int object id.
    """
    if os.path.exists(filename):
        with open(filename) as f:
            # JSON object keys are always strings; convert them back to int
            # so resumed and newly fetched entries share one key type.
            data = dict((int(key), value) for key, value in json.load(f).items())
    else:
        data = {}
    start = max(data) if data else 1
    for i in range(start, stop):
        info = get_data(i)
        if info:
            data[i] = info
            if len(data) % 10 == 0:
                print('save %s %d' % (filename, len(data)))
                _save(filename, data)
        else:
            print('ignore %d' % i)
    _save(filename, data)
    return data