import csv
import re
from datetime import datetime

from pymongo import Connection
# Open a connection using pymongo's default arguments.
connection = Connection()

# Select the "cablegates" database and, within it, the "cables" collection.
db = connection['cablegates']
cables = db['cables']

from pymongo import ASCENDING

# Path of the CSV dump read by the import loop below.
# Presumably downloaded from (TODO confirm): http://cryptome.org/z/z.7z
csv_filename = 'data/cables.csv'

# Root directory of the per-cable HTML pages (looked up as
# <root>/cable/<YYYY>/<MM>/<refid>.html).
# Presumably extracted from (TODO confirm):
# http://88.80.16.63/torrent/cablegate/cablegate-201108300212.7z.torrent
html_cables = 'data/cables'

# Another archive, noted as incomplete / carrying the same info as the CSV:
# http://file.wikileaks.org/torrent/cable_db_full.7z.torrent

# Column order of the CSV dump.  The first column is the dump's own integer
# id; it is stored under 'id' like every other column.
# NOTE(review): an earlier comment said to skip the id, but the code has
# always stored it, so it is kept for compatibility.
CABLE_FIELDS = ('id', 'created', 'refid', 'origin', 'classification',
                'destination', 'header', 'content')

# Timestamp format used by the 'created' column.
CREATED_FORMAT = '%m/%d/%Y %H:%M'

# Patterns are compiled once instead of once per row.
RELEASED_RE = re.compile("<a href='/reldate/.*?'>(.*?)</a>")
SUBJECT_RE = re.compile('SUBJECT: (.*?)( \n \n|$)', re.DOTALL)
TAGS_RE = re.compile('TAGS: (.*?)( \n|$)', re.DOTALL)


def parse_subject(content):
    """Return the text following 'SUBJECT: ' in *content*, or None.

    The subject runs up to the first ' \\n \\n' (or the end of the text);
    newlines inside it are removed and the result is stripped.
    """
    match = SUBJECT_RE.search(content)
    if match is None:
        return None
    return match.group(1).replace('\n', '').strip()


def parse_tags(content):
    """Return the list of tags following 'TAGS: ' in *content*, or None.

    The tag line runs up to the first ' \\n' (or the end of the text) and is
    split on ', '.
    """
    match = TAGS_RE.search(content)
    if match is None:
        return None
    return match.group(1).replace('\n', '').strip().split(', ')


def parse_released(html):
    """Return the text of the last '/reldate/' link in *html*, or None."""
    found = RELEASED_RE.findall(html)
    if not found:
        return None
    return found[-1]


def cable_html_path(root, created, refid):
    """Return the path of a cable's HTML page below *root*.

    Pages are laid out as <root>/cable/<YYYY>/<MM>/<refid>.html, keyed on the
    cable's creation datetime.
    """
    return '%s/cable/%s/%s.html' % (root, created.strftime('%Y/%m'), refid)


def build_cable(row):
    """Convert one CSV *row* into a cable document (a dict).

    Every column is stripped and stored under its name from CABLE_FIELDS,
    'created' is parsed into a datetime, and 'subject' / 'tags' are added
    when the content contains them.

    Raises ValueError if the row has fewer columns than CABLE_FIELDS or the
    'created' value does not match CREATED_FORMAT.
    """
    if len(row) < len(CABLE_FIELDS):
        raise ValueError('expected at least %d columns, got %d'
                         % (len(CABLE_FIELDS), len(row)))

    cable = dict((key, value.strip()) for key, value in zip(CABLE_FIELDS, row))
    cable['created'] = datetime.strptime(cable['created'], CREATED_FORMAT)

    subject = parse_subject(cable['content'])
    if subject is not None:
        cable['subject'] = subject

    tags = parse_tags(cable['content'])
    if tags is not None:
        cable['tags'] = tags

    return cable


def main():
    """Load every cable from the CSV dump into the `cables` collection.

    For each row the matching HTML page is read to pick up the release date,
    then the assembled document is saved.  Progress is printed as
    '<row number> <refid>'.
    """
    cables.create_index([('refid', ASCENDING)])

    # Cable bodies can be far larger than the csv module's default limit.
    csv.field_size_limit(1024 * 1024 * 2)

    # 'rb' is what the Python 2 csv module expects; the with-block makes sure
    # the handle is closed even if a row fails.
    with open(csv_filename, 'rb') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quotechar='"',
                            escapechar="\\")
        for n, row in enumerate(reader):
            cable = build_cable(row)

            page_path = cable_html_path(html_cables, cable['created'],
                                        cable['refid'])
            with open(page_path) as page:
                released = parse_released(page.read())
            # Like subject and tags, the field is only set when present.
            if released is not None:
                cable['released'] = released

            if 'subject' in cable:
                print(cable['subject'])

            print('%d %s' % (n, cable['refid']))
            # Save the whole document, not just its content string.
            cables.save(cable)


if __name__ == "__main__":
    main()