add import script

This commit is contained in:
j 2011-09-05 01:29:12 +02:00
parent d63f020810
commit 4d83132f20
1 changed file with 58 additions and 0 deletions

58
csv2mongodb.py Normal file
View File

@ -0,0 +1,58 @@
import csv
import re
from datetime import datetime
from pymongo import Connection
from pymongo import ASCENDING

# MongoDB handles, created at import time with the driver's default
# connection settings: database "cablegates", collection "cables".
connection = Connection()
db = connection['cablegates']
cables = db['cables']

# CSV dump of the cables read by the import loop.
# Source noted by the original author:
#   http://cryptome.org/z/z.7z
csv_filename = 'data/cables.csv'

# Root directory of the per-cable HTML pages; the import loop reads
# <html_cables>/cable/<YYYY>/<MM>/<refid>.html from it.
# Source noted by the original author:
#   http://88.80.16.63/torrent/cablegate/cablegate-201108300212.7z.torrent
html_cables = 'data/cables'

# Original author's note: incomplete/same info as csv
#   http://file.wikileaks.org/torrent/cable_db_full.7z.torrent
# Column order of the CSV dump; every row is mapped onto these keys.
# NOTE(review): the original comment read "skip id, integer id of csv dump",
# yet the id column was always copied onto the document. That behaviour is
# kept here -- confirm whether the field should really be dropped.
_CSV_KEYS = ('id', 'created', 'refid', 'origin', 'classification',
             'destination', 'header', 'content')

# Patterns are compiled once here instead of once per CSV row.
_RELEASED_RE = re.compile("<a href='/reldate/.*?'>(.*?)</a>")
_SUBJECT_RE = re.compile('SUBJECT: (.*?)( \n \n|$)', re.DOTALL)
_TAGS_RE = re.compile('TAGS: (.*?)( \n|$)', re.DOTALL)


def _parse_subject(content):
    """Return the text following 'SUBJECT: ' in *content*, or None if absent.

    The subject runs up to the first ' \\n \\n' separator (or the end of the
    text); embedded newlines are removed and outer whitespace is stripped.
    """
    match = _SUBJECT_RE.search(content)
    if match is None:
        return None
    return match.group(1).replace('\n', '').strip()


def _parse_tags(content):
    """Return the list of tags following 'TAGS: ' in *content*, or None.

    The tag line runs up to the first ' \\n' (or the end of the text) and is
    split on ', '.
    """
    match = _TAGS_RE.search(content)
    if match is None:
        return None
    return match.group(1).replace('\n', '').strip().split(', ')


def _read_released(html_path):
    """Return the text of the last '/reldate/' link in the HTML file.

    Raises:
        IOError: if the HTML file cannot be opened.
        ValueError: if the page contains no '/reldate/' link.
    """
    with open(html_path) as html_file:
        links = _RELEASED_RE.findall(html_file.read())
    if not links:
        # Fail loudly, but say which file is at fault instead of raising a
        # bare IndexError from links[-1].
        raise ValueError('no release date link found in %s' % html_path)
    return links[-1]


def _build_cable(row):
    """Turn one CSV row into the document that is stored in MongoDB.

    Args:
        row: sequence of strings in ``_CSV_KEYS`` order; extra trailing
            columns are ignored.

    Returns:
        dict with the stripped CSV fields, 'created' parsed into a datetime,
        'released' read from the cable's HTML page, and 'subject' / 'tags'
        when the content contains them.

    Raises:
        ValueError: if the row has too few columns, the creation date does
            not match '%m/%d/%Y %H:%M', or the HTML page has no release link.
        IOError: if the cable's HTML page cannot be opened.
    """
    if len(row) < len(_CSV_KEYS):
        raise ValueError('expected at least %d columns, got %d: %r'
                         % (len(_CSV_KEYS), len(row), row))
    cable = dict((key, value.strip()) for key, value in zip(_CSV_KEYS, row))
    cable['created'] = datetime.strptime(cable['created'], '%m/%d/%Y %H:%M')

    # HTML pages are laid out as <root>/cable/<YYYY>/<MM>/<refid>.html.
    html_path = '%s/cable/%s/%s.html' % (
        html_cables, cable['created'].strftime('%Y/%m'), cable['refid'])
    cable['released'] = _read_released(html_path)

    subject = _parse_subject(cable['content'])
    if subject is not None:
        cable['subject'] = subject
    tags = _parse_tags(cable['content'])
    if tags is not None:
        cable['tags'] = tags
    return cable


def _main():
    """Import every row of the CSV dump into the ``cables`` collection."""
    cables.create_index([('refid', ASCENDING)])
    # Cable bodies can be far larger than the csv module's default field cap.
    csv.field_size_limit(1024 * 1024 * 2)
    # Binary mode is what the Python 2 csv module expects; the context
    # manager makes sure the dump is closed even if a row fails.
    with open(csv_filename, 'rb') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quotechar='"',
                            escapechar="\\")
        for n, row in enumerate(reader):
            cable = _build_cable(row)
            if 'subject' in cable:
                print(cable['subject'])
            # Progress output: row counter and the cable's reference id.
            print('%d %s' % (n, cable['refid']))
            # Store the whole document, not just its content string.
            cables.save(cable)


if __name__ == "__main__":
    _main()