# -*- coding: utf-8 -*-
import codecs
import json
import os
import ox
import re
import sys
import urllib
# True when the last command line argument is '-cache'; read_url() then goes
# through ox.cache instead of ox.net.
CACHE = sys.argv[-1] == '-cache'

# Country metadata loaded from the JSONC source file.
try:
    DATA = ox.jsonc.loads(ox.file.read_file('../jsonc/countries.jsonc'))
except Exception:
    # Write a minified copy of the input next to it as a debugging aid, then
    # stop with a non-zero status so callers can tell the run failed.
    # (Exception, not a bare except, so Ctrl-C and SystemExit pass through.)
    print('parse error, see jsonc/debug.json')
    ox.file.write_file('../jsonc/debug.json', ox.js.minify(ox.file.read_file('../jsonc/countries.jsonc')))
    sys.exit(1)

# Selected fields of every Ox.Geo.json entry, keyed by the entry's 'code'.
GEO = {}
for country in json.loads(ox.file.read_file('../../../source/Ox.Geo/json/Ox.Geo.json')):
    GEO[country['code']] = dict(
        (key, country[key])
        for key in ['area', 'lat', 'lng', 'south', 'west', 'north', 'east']
    )

# Statistics collected during the run; dumped as JSON by the main block.
LOGS = {}
def decode_wikipedia_id(id):
    """Turn a Wikipedia page id into a readable title.

    Underscores become spaces and percent-escapes are decoded as UTF-8.

    :param id: page id as it appears in a Wikipedia URL, e.g.
        ``'C%C3%B4te_d%27Ivoire'``
    :returns: the decoded title as a unicode string
    """
    # Resolve a bytes-level unquote on either runtime: urllib.unquote only
    # exists on Python 2, urllib.parse.unquote_to_bytes only on Python 3.
    try:
        from urllib import unquote
    except ImportError:
        from urllib.parse import unquote_to_bytes as unquote
    raw = id.replace('_', ' ').encode('utf8')
    return unquote(raw).decode('utf8')
def encode_wikipedia_id(id):
    """Turn a page title into the id used in Wikipedia URLs.

    Spaces become underscores and the result is encoded as UTF-8.

    :param id: page title, e.g. ``u'ISO 3166-1 alpha-2'``
    :returns: the UTF-8 encoded id
    """
    # No percent-quoting is applied here; only spaces are substituted.
    underscored = '_'.join(id.split(' '))
    return underscored.encode('utf8')
# Builds the list of country dicts: seeded from DATA['wikipedia'], then
# extended with entries scraped from the Wikipedia pages 'ISO 3166-3' and
# 'ISO 3166-1 alpha-2'.
# NOTE(review): this region looks damaged (indentation is gone, one pattern is
# cut in half, and the function never returns although the main block uses
# its result) -- restore it from the upstream file before relying on it.
def get_countries():
# True if a country with the same 'name' is already in `countries`.
def exists(country):
for c in countries:
if c['name'] == country['name']:
return True
return False
# Cleans up the alpha-2 page before matching: drops everything from the
# 'previously exceptionally reserved' sentence onwards, replaces each encoded
# key of DATA['wikipedia_url'] with its encoded value, and removes every run
# of word/whitespace characters that is followed by ' !'.
def fix(html):
html = html.split('The following alpha-2 codes were previously exceptionally reserved')[0]
for key, value in DATA['wikipedia_url'].iteritems():
html = html.replace(encode_wikipedia_id(key), encode_wikipedia_id(value))
return re.sub('[\w\s]+ !', '', html)
# Builds one country dict from a regex match: either a (code, wikipedia id)
# tuple or a bare wikipedia id. If the decoded name has an entry in
# DATA['name'], that entry becomes 'name' and the decoded name is kept as
# 'wikipediaName'.
def parse(match):
country = {}
is_tuple = type(match) == tuple
name = decode_wikipedia_id(match[1] if is_tuple else match)
if is_tuple:
country['code'] = match[0]
if name in DATA['name']:
country['name'] = DATA['name'][name]
country['wikipediaName'] = name
else:
country['name'] = name
return country
# Sort key: the country code when present, otherwise u'ZZ ' + name
# (presumably so that entries without a code sort last -- confirm).
def sort(country):
return country['code'] if 'code' in country else u'ZZ ' + country['name']
countries = map(lambda x: parse(x), DATA['wikipedia'])
# ISO 3166-3
html = read_wikipedia_url('ISO 3166-3').replace('Rhodesia', 'Southern Rhodesia') # FIXME: can this be avoided?
# NOTE(review): the pattern below is an unterminated string literal that
# continues on the next line; its original text appears to be lost.
matches = re.compile('
.*?', re.DOTALL).findall(html)
countries += map(lambda x: parse(x), matches)
# ISO 3166-1 alpha-2
html = fix(read_wikipedia_url('ISO 3166-1 alpha-2'))
matches = re.compile('([A-Z]{2}) | \n .*? ?(.*?)\n').findall(html)
for match in matches:
code = match[0].upper()
# NOTE(review): from about here on the code uses `decode`, `imdb_countries`
# and LOGS['new countries'], none of which are defined in this function, and
# it ends by writing imdbCountries.json. This looks like the tail of
# get_imdb_countries() (called by the main block, no def visible) spliced
# onto get_countries(); the splice point is somewhere above this line.
name = re.sub('(.{2});', decode, match[1])
new = True
for country in countries:
if name == country['name'] or ('imdbName' in country and name == country['imdbName']):
new = False
if code == country['code']:
new = False
if name != country['name']:
imdb_countries[country['name']] = name
if not 'imdbName' in country or name != country['imdbName']:
LOGS['new countries'].append(name)
if not new:
break
if new:
print 'new', match
LOGS['new countries'].append(name)
ox.file.write_json('../json/imdbCountries.json', imdb_countries, indent=4, sort_keys=True)
# Reads http://www.imdb.com/language/, writes every language name found there
# to ../json/imdbLanguages.json (as keys with empty string values) and records
# names that are missing from DATA['languages'] in LOGS['new languages'].
def get_imdb_languages():
# re.sub callback: takes the matched text without its first three characters
# and its last character, parses it as hexadecimal and returns that code point.
def decode(match):
return unichr(int(match.group(0)[3:-1], 16))
LOGS['new languages'] = []
imdb_languages = {}
html = read_url('http://www.imdb.com/language/')
# NOTE(review): '(.*?)' has no surrounding context to anchor on; the pattern
# looks truncated (markup may have been stripped from it) -- confirm upstream.
matches = re.compile('(.*?)').findall(html)
for match in matches:
# NOTE(review): decode() drops three leading characters, but this pattern only
# matches two characters plus ';' -- a prefix seems to be missing; confirm.
language = re.sub('(.{2});', decode, match)
# Strip a trailing ' languages' or ' Sign Language' from the name.
language = re.sub('( languages| Sign Language)$', '', language)
imdb_languages[language] = ''
if not language in DATA['languages']:
LOGS['new languages'].append(language)
ox.file.write_json('../json/imdbLanguages.json', imdb_languages, indent=4, sort_keys=True)
def parse_txt(path='../txt/countries.txt'):
    """Parse the country history file into created/dissolved/independence maps.

    Every line that is neither blank nor a ``#`` comment must look like
    ``<date> <A> <operator> <B>``, where ``<A>`` and ``<B>`` are country names
    (several names are separated by ``' / '``). The operator decides what is
    recorded:

    ``*``  ``independence[B]`` (from A)
    ``=``  ``dissolved[A]`` and ``created[B]``, both labelled ``'renamed'``
    ``+``  ``dissolved[a]`` for each a in A, labelled ``'joined'``
    ``-``  ``created[b]`` for each b in B, labelled ``'split'``
    ``>``  ``dissolved[a]`` for each a in A and ``created[B]``, ``'merged'``
    ``<``  ``dissolved[A]`` labelled ``'split'`` and ``created[b]`` for each b

    :param path: UTF-8 text file to read
    :returns: dict with the keys ``'created'``, ``'dissolved'`` and
        ``'independence'``; each maps a country name to a record holding the
        related ``'country'`` list, the ``'date'`` string and, for created /
        dissolved records, the reason under the key of the same name
    :raises ValueError: if a line does not match the expected format
    """
    line_pattern = re.compile(r'([\d\-]+) +(.+) ([\*=\+\-><]) (.+)')
    data = {
        'created': {},
        'dissolved': {},
        'independence': {}
    }
    created = data['created']
    dissolved = data['dissolved']
    # The with block closes the file even if reading fails.
    with codecs.open(path, 'r', 'utf-8') as f:
        lines = [line.strip() for line in f.readlines()]
    for number, line in enumerate(lines, 1):
        # Blank lines used to raise IndexError on line[0]; skip them like
        # comments instead.
        if not line or line.startswith('#'):
            continue
        match = line_pattern.match(line)
        if match is None:
            raise ValueError(
                '%s:%d: cannot parse line: %r' % (path, number, line)
            )
        date, country_a, operator, country_b = match.groups()
        countries_a = country_a.split(' / ')
        countries_b = country_b.split(' / ')
        if operator == '*':
            data['independence'][country_b] = {
                'country': countries_a,
                'date': date
            }
        elif operator == '=':
            dissolved[country_a] = {
                'country': countries_b,
                'date': date,
                'dissolved': 'renamed'
            }
            created[country_b] = {
                'country': countries_a,
                'date': date,
                'created': 'renamed'
            }
        elif operator == '+':
            for country in countries_a:
                dissolved[country] = {
                    'country': countries_b,
                    'date': date,
                    'dissolved': 'joined'
                }
        elif operator == '-':
            for country in countries_b:
                created[country] = {
                    'country': countries_a,
                    'date': date,
                    'created': 'split'
                }
        elif operator == '>':
            for country in countries_a:
                dissolved[country] = {
                    'country': countries_b,
                    'date': date,
                    'dissolved': 'merged'
                }
            created[country_b] = {
                'country': countries_a,
                'date': date,
                'created': 'merged'
            }
        elif operator == '<':
            dissolved[country_a] = {
                'country': countries_b,
                'date': date,
                'dissolved': 'split'
            }
            for country in countries_b:
                # NOTE(review): 'merged' for the products of a split looks
                # like a copy-paste slip ('split' expected?). Kept unchanged
                # because the generated JSON may be consumed as is -- confirm.
                created[country] = {
                    'country': countries_a,
                    'date': date,
                    'created': 'merged'
                }
    return data
def read_url(url):
print 'reading', url
return ox.cache.read_url(url) if CACHE else ox.net.read_url(url)
def read_wikipedia_url(id):
    """Fetch the English Wikipedia page for ``id`` and return it as unicode.

    :param id: page title; it is passed through encode_wikipedia_id() to
        build the URL
    :returns: the page content decoded as UTF-8, or as ISO-8859-1 when the
        bytes are not valid UTF-8
    """
    html = read_url('http://en.wikipedia.org/wiki/' + encode_wikipedia_id(id))
    try:
        return unicode(html, 'utf8')
    except UnicodeDecodeError:
        # Only a decoding failure should trigger the fallback; a bare except
        # would also hide unrelated errors and keyboard interrupts.
        # ISO-8859-1 assigns a character to every byte value.
        return unicode(html, 'iso-8859-1')
if __name__ == '__main__':
    # Make the records parsed from the history file available through DATA.
    data = parse_txt()
    for section in ['created', 'dissolved', 'independence']:
        DATA[section] = data[section]

    countries = get_countries()
    ox.file.write_json('../json/countries.json', countries, indent=4, sort_keys=True)

    # Totals, plus the names of all countries lacking one of these fields.
    LOGS['total'] = len(countries)
    for key in ['code', 'continent', 'flagURL']:
        LOGS['no ' + key] = [entry['name'] for entry in countries if key not in entry]

    # One counter per combination of lifetime and status.
    for counter in [
        'current independent',
        'current dependent',
        'current disputed',
        'current exception',
        'dissolved independent',
        'dissolved dependent',
        'dissolved disputed',
        'dissolved exception'
    ]:
        LOGS[counter] = 0
    for country in countries:
        lifetime = 'dissolved' if 'dissolved' in country else 'current'
        # Precedence: exception, then disputed, then dependent.
        if 'exception' in country:
            status = 'exception'
        elif 'disputed' in country:
            status = 'disputed'
        elif 'dependency' in country:
            status = 'dependent'
        else:
            status = 'independent'
        key = ' '.join([lifetime, status])
        LOGS[key] += 1

    get_imdb_countries(countries)
    get_imdb_languages()
    print(json.dumps(LOGS, indent=4, sort_keys=True))
|