"""Import the cablegate CSV dump into the ``cablegates.cables`` MongoDB collection.

Each CSV row becomes one document.  The row is enriched with a ``released``
value read from the cable's HTML page and, when present in the cable body,
with ``subject`` and ``tags`` fields.
"""
import csv
import re
from datetime import datetime

from pymongo import Connection
from pymongo import ASCENDING

connection = Connection()
db = connection.cablegates
cables = db.cables

#http://cryptome.org/z/z.7z
csv_filename = 'data/cables.csv'

#http://88.80.16.63/torrent/cablegate/cablegate-201108300212.7z.torrent
html_cables = 'data/cables'

#incomplete/same info as csv
#http://file.wikileaks.org/torrent/cable_db_full.7z.torrent

# Column order of the CSV dump; 'id' is the integer id of the csv dump.
# NOTE(review): the original comment said "skip id", but the value has always
# been stored on the document -- confirm which behaviour is wanted.
CABLE_KEYS = ['id', 'created', 'refid', 'origin', 'classification',
              'destination', 'header', 'content']

# Compiled once instead of once per row.
# NOTE(review): "(.*?)" matches only empty strings, so 'released' always ends
# up as ''.  The pattern looks like it lost its surrounding HTML markup;
# restore the intended tags before relying on this field.
_RELEASED_RE = re.compile("(.*?)")
_SUBJECT_RE = re.compile('SUBJECT: (.*?)( \n \n|$)', re.DOTALL)
_TAGS_RE = re.compile('TAGS: (.*?)( \n|$)', re.DOTALL)


def _first_match(pattern, text):
    """Return the first capture of *pattern* in *text*, on one line, stripped."""
    return pattern.findall(text)[0][0].replace('\n', '').strip()


def _build_cable(row):
    """Turn one CSV row into a cable document (a dict).

    Raises IndexError if the row has fewer columns than CABLE_KEYS, and
    IOError if the cable's HTML page cannot be opened.
    """
    cable = {}
    for i, key in enumerate(CABLE_KEYS):
        cable[key] = row[i].strip()
    cable['created'] = datetime.strptime(cable['created'], '%m/%d/%Y %H:%M')

    # The HTML pages are laid out as <html_cables>/cable/YYYY/MM/<refid>.html
    html = '%s/cable/%s/%s.html' % (html_cables,
                                    cable['created'].strftime('%Y/%m'),
                                    cable['refid'])
    with open(html) as f:
        page = f.read()
    cable['released'] = _RELEASED_RE.findall(page)[-1]

    content = cable['content']
    if 'SUBJECT: ' in content:
        cable['subject'] = _first_match(_SUBJECT_RE, content)
        print(cable['subject'])
    if 'TAGS: ' in content:
        cable['tags'] = _first_match(_TAGS_RE, content).split(', ')
    return cable


def main():
    """Read every row of the CSV dump and save it to the cables collection."""
    cables.create_index([('refid', ASCENDING),])
    # Cable bodies can be large; raise the per-field limit to 2 MiB.
    csv.field_size_limit(1024*1024*2)
    with open(csv_filename, 'rb') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quotechar='"',
                            escapechar="\\")
        for n, row in enumerate(reader):
            cable = _build_cable(row)
            # Progress output: running row count and the cable's reference id.
            print('%s %s' % (n, cable['refid']))
            cables.save(cable)


if __name__ == "__main__":
    main()