add import script
This commit is contained in:
parent
d63f020810
commit
4d83132f20
1 changed files with 58 additions and 0 deletions
58
csv2mongodb.py
Normal file
58
csv2mongodb.py
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
import csv
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from pymongo import Connection
|
||||||
|
# Module-level MongoDB handles. Connection() is called without arguments,
# so pymongo's default host/port are used. Dictionary-style access is
# equivalent to the attribute-style access (connection.cablegates.cables).
connection = Connection()
db = connection['cablegates']
cables = db['cables']
|
||||||
|
|
||||||
|
from pymongo import ASCENDING
|
||||||
|
|
||||||
|
# Path of the CSV dump that the import reads.
# URL noted alongside it (presumably the download source):
#   http://cryptome.org/z/z.7z
csv_filename = "data/cables.csv"

# Root directory of the per-cable HTML pages.
# URL noted alongside it (presumably the download source):
#   http://88.80.16.63/torrent/cablegate/cablegate-201108300212.7z.torrent
html_cables = "data/cables"

# Alternative dump, noted as incomplete / same info as the CSV:
#   http://file.wikileaks.org/torrent/cable_db_full.7z.torrent
|
||||||
|
|
||||||
|
# Patterns are compiled once here instead of once per CSV row.
RELEASED_RE = re.compile("<a href='/reldate/.*?'>(.*?)</a>")
SUBJECT_RE = re.compile('SUBJECT: (.*?)( \n \n|$)', re.DOTALL)
TAGS_RE = re.compile('TAGS: (.*?)( \n|$)', re.DOTALL)

# Column order of the CSV dump. 'id' is the integer id of the CSV dump.
# NOTE(review): the original comment said "skip id", but the code has always
# stored it on the document; that behaviour is kept here -- confirm intent.
CSV_KEYS = ['id', 'created', 'refid', 'origin', 'classification',
            'destination', 'header', 'content']


def parse_row(row):
    """Build a cable dict from one CSV row.

    Every column named in CSV_KEYS is stripped of surrounding whitespace and
    'created' is parsed into a datetime ('%m/%d/%Y %H:%M').

    Raises IndexError if the row has fewer columns than CSV_KEYS and
    ValueError if 'created' does not match the expected format.
    """
    cable = {}
    for i, key in enumerate(CSV_KEYS):
        cable[key] = row[i].strip()
    cable['created'] = datetime.strptime(cable['created'], '%m/%d/%Y %H:%M')
    return cable


def cable_html_path(base, cable):
    """Return '<base>/cable/<YYYY>/<MM>/<refid>.html' for a parsed cable."""
    return '%s/cable/%s/%s.html' % (
        base, cable['created'].strftime('%Y/%m'), cable['refid'])


def extract_released(html):
    """Return the link text of the last '/reldate/' anchor in *html*.

    Raises IndexError if the page contains no such anchor.
    """
    return RELEASED_RE.findall(html)[-1]


def extract_subject(content):
    """Return the SUBJECT line(s) of a cable body joined into one string.

    Returns None when the body has no 'SUBJECT: ' marker.
    """
    if 'SUBJECT: ' not in content:
        return None
    return SUBJECT_RE.findall(content)[0][0].replace('\n', '').strip()


def extract_tags(content):
    """Return the TAGS of a cable body as a list split on ', '.

    Returns None when the body has no 'TAGS: ' marker.
    """
    if 'TAGS: ' not in content:
        return None
    return TAGS_RE.findall(content)[0][0].replace('\n', '').strip().split(', ')


def main():
    """Import every row of the CSV dump into the `cables` collection."""
    cables.create_index([('refid', ASCENDING)])

    # Cable bodies can be large; raise the per-field limit to 2 MiB.
    csv.field_size_limit(1024 * 1024 * 2)

    # 'rb' is what the Python 2 csv module expects; the with-block makes
    # sure the file is closed once the import finishes or fails.
    with open(csv_filename, 'rb') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quotechar='"',
                            escapechar="\\")
        for n, row in enumerate(reader):
            cable = parse_row(row)

            # The release date is only available in the HTML page.
            with open(cable_html_path(html_cables, cable)) as f:
                cable['released'] = extract_released(f.read())

            subject = extract_subject(cable['content'])
            if subject is not None:
                cable['subject'] = subject
                # Single-argument print(...) behaves the same on Python 2.
                print(subject)

            tags = extract_tags(cable['content'])
            if tags is not None:
                cable['tags'] = tags

            print('%s %s' % (n, cable['refid']))
            # Store the parsed document, not the raw content string.
            cables.save(cable)


if __name__ == "__main__":
    main()
|
||||||
|
|
Loading…
Reference in a new issue