# csv2mongodb.py
#
# Recovered from the patch "add import script"
# (commit 4d83132f206abf10c65d87050c506019489bef19,
#  j <0x006A@0x2620.org>, Mon, 5 Sep 2011 01:29:12 +0200).
"""Import cables from a CSV dump into the ``cablegates.cables`` collection.

Each CSV row is turned into one document.  The ``created`` column is parsed
into a ``datetime``, a ``released`` value is read from the matching HTML
file below ``html_cables``, and ``subject`` / ``tags`` are extracted from
the cable content when the corresponding markers are present.

The script targets Python 2 and the legacy pymongo API that the original
code was written against (``Connection``, ``Collection.save``).
"""
import csv
import re
from datetime import datetime

from pymongo import ASCENDING
from pymongo import Connection

# NOTE: the connection is opened at import time, exactly as in the original.
connection = Connection()

db = connection.cablegates
cables = db.cables

#http://cryptome.org/z/z.7z
csv_filename = 'data/cables.csv'

#http://88.80.16.63/torrent/cablegate/cablegate-201108300212.7z.torrent
html_cables = 'data/cables'

#incomplete/same info as csv
#http://file.wikileaks.org/torrent/cable_db_full.7z.torrent

# Column order of the CSV dump.  'id' is the integer id of the csv dump.
# NOTE(review): the original comment said "skip id", but the loop started at
# index 0 and stored it; that behaviour is kept here -- confirm intent.
KEYS = ('id', 'created', 'refid', 'origin', 'classification',
        'destination', 'header', 'content')

# Patterns are compiled once instead of once per row.
# NOTE(review): this pattern looks like it lost surrounding HTML markup
# somewhere along the way; as written it can only ever yield an empty string
# for 'released'.  Kept verbatim because the intended pattern is not
# recoverable from this file -- restore it from the upstream repository.
RELEASED_RE = re.compile("(.*?)")
SUBJECT_RE = re.compile('SUBJECT: (.*?)( \n \n|$)', re.DOTALL)
TAGS_RE = re.compile('TAGS: (.*?)( \n|$)', re.DOTALL)


def _released_from_html(created, refid):
    """Return the 'released' value read from the cable's HTML file.

    The file is looked up as ``<html_cables>/cable/<YYYY>/<MM>/<refid>.html``
    where year and month come from *created*.  Raises ``IOError`` if the file
    cannot be opened.
    """
    # Fix: the original called the non-existent method ``strftim``.
    path = '%s/cable/%s/%s.html' % (html_cables,
                                    created.strftime('%Y/%m'), refid)
    with open(path) as html_file:
        page = html_file.read()
    return RELEASED_RE.findall(page)[-1]


def _build_cable(row):
    """Build the cable document for one CSV *row*.

    Raises ``IndexError`` if the row has fewer columns than ``KEYS`` and
    ``ValueError`` if the ``created`` column is not ``%m/%d/%Y %H:%M``.
    """
    cable = dict((key, row[i].strip()) for i, key in enumerate(KEYS))
    cable['created'] = datetime.strptime(cable['created'], '%m/%d/%Y %H:%M')
    cable['released'] = _released_from_html(cable['created'], cable['refid'])

    content = cable['content']
    # The marker checks guarantee findall() returns at least one match,
    # because both patterns fall back to matching up to the end of the text.
    if 'SUBJECT: ' in content:
        subject = SUBJECT_RE.findall(content)[0][0]
        cable['subject'] = subject.replace('\n', '').strip()
    if 'TAGS: ' in content:
        tags = TAGS_RE.findall(content)[0][0]
        cable['tags'] = tags.replace('\n', '').strip().split(', ')
    return cable


def main():
    """Read ``csv_filename`` and save one document per row into ``cables``."""
    cables.create_index([('refid', ASCENDING)])

    # Raise the csv module's per-field size limit to 2 MiB.
    csv.field_size_limit(1024 * 1024 * 2)

    # Binary mode is what the Python 2 csv module expects; the ``with`` block
    # closes the handle that the original left open.
    with open(csv_filename, 'rb') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quotechar='"',
                            escapechar="\\")
        n = 0
        for row in reader:
            if not row:
                # Blank line in the dump: nothing to import.
                continue
            cable = _build_cable(row)
            if 'subject' in cable:
                print(cable['subject'])

            # Fix: the original indexed and saved ``data`` (the content
            # string) instead of the ``cable`` document.
            print('%s %s' % (n, cable['refid']))
            cables.save(cable)
            n += 1


if __name__ == "__main__":
    main()