add import script
This commit is contained in:
parent
d63f020810
commit
4d83132f20
1 changed files with 58 additions and 0 deletions
58
csv2mongodb.py
Normal file
58
csv2mongodb.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
import csv
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
from pymongo import ASCENDING, Connection

# Connection() is called without arguments, so pymongo's default host/port
# apply. Documents go into the `cables` collection of the `cablegates` db.
connection = Connection()
db = connection.cablegates
cables = db.cables

# CSV dump of the cables (presumably extracted from
# http://cryptome.org/z/z.7z).
csv_filename = 'data/cables.csv'

# Root directory of the per-cable HTML pages (presumably extracted from
# http://88.80.16.63/torrent/cablegate/cablegate-201108300212.7z.torrent).
html_cables = 'data/cables'

# Not used here: incomplete / same info as the csv
# http://file.wikileaks.org/torrent/cable_db_full.7z.torrent
|
||||
|
||||
if __name__ == "__main__":
    # Import every row of the CSV dump into the `cables` collection.
    # Each document gets the CSV columns plus:
    #   released - last '/reldate/' link text found in the cable's HTML page
    #   subject  - text following 'SUBJECT: ' in the content, if present
    #   tags     - list split from the text following 'TAGS: ', if present
    cables.create_index([('refid', ASCENDING)])

    # Raise the csv per-field size limit to 2 MiB.
    csv.field_size_limit(1024 * 1024 * 2)

    # Column order of the CSV dump.
    # NOTE(review): the original comment said the integer csv id should be
    # skipped, but the code has always stored it under 'id'; kept as-is.
    keys = ['id', 'created', 'refid', 'origin', 'classification',
            'destination', 'header', 'content']

    # Compiled once here instead of once per row.
    released_re = re.compile("<a href='/reldate/.*?'>(.*?)</a>")
    subject_re = re.compile('SUBJECT: (.*?)( \n \n|$)', re.DOTALL)
    tags_re = re.compile('TAGS: (.*?)( \n|$)', re.DOTALL)

    # 'rb' is the mode the Python 2 csv module expects; the `with` block
    # makes sure the file is closed even if a row fails to import.
    with open(csv_filename, 'rb') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quotechar='"',
                            escapechar="\\")

        for n, row in enumerate(reader):
            cable = {}
            # Index into the row (rather than zip) so a short row still
            # raises IndexError instead of being silently truncated.
            for i, key in enumerate(keys):
                cable[key] = row[i].strip()

            cable['created'] = datetime.strptime(cable['created'],
                                                 '%m/%d/%Y %H:%M')

            # HTML pages are laid out as <root>/cable/YYYY/MM/<refid>.html
            html = '%s/cable/%s/%s.html' % (
                html_cables,
                cable['created'].strftime('%Y/%m'),
                cable['refid'],
            )
            with open(html) as f:
                page = f.read()
            # The last '/reldate/' link on the page is taken as the release.
            cable['released'] = released_re.findall(page)[-1]

            content = cable['content']
            if 'SUBJECT: ' in content:
                # The '$' alternative guarantees a match once the marker
                # is present, so search() cannot return None here.
                subject = subject_re.search(content).group(1)
                cable['subject'] = subject.replace('\n', '').strip()
                print(cable['subject'])
            if 'TAGS: ' in content:
                tags = tags_re.search(content).group(1)
                cable['tags'] = tags.replace('\n', '').strip().split(', ')

            # Progress output; report and store the cable document itself,
            # not its raw content string.
            print('%s %s' % (n, cable['refid']))
            cables.save(cable)
|
||||
|
Loading…
Reference in a new issue