2011-05-23 19:38:52 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2012-04-02 22:20:18 +00:00
|
|
|
import codecs
|
2011-05-23 19:38:52 +00:00
|
|
|
import json
|
|
|
|
import os
|
|
|
|
import ox
|
|
|
|
import re
|
2011-11-23 14:53:17 +00:00
|
|
|
import sys
|
|
|
|
import urllib
|
2011-05-23 19:38:52 +00:00
|
|
|
|
2011-11-23 14:53:17 +00:00
|
|
|
CACHE = sys.argv[-1] == '-cache'
|
2012-04-02 21:13:05 +00:00
|
|
|
|
2011-11-30 14:28:01 +00:00
|
|
|
try:
|
|
|
|
DATA = ox.jsonc.loads(ox.file.read_file('../jsonc/countries.jsonc'))
|
|
|
|
except:
|
2012-04-02 21:13:05 +00:00
|
|
|
print 'parse error, see jsonc/debug.json'
|
2011-11-30 14:28:01 +00:00
|
|
|
ox.file.write_file('../jsonc/debug.json', ox.js.minify(ox.file.read_file('../jsonc/countries.jsonc')))
|
|
|
|
sys.exit()
|
2012-04-02 21:13:05 +00:00
|
|
|
|
|
|
|
GEO = {}
|
|
|
|
for country in json.loads(ox.file.read_file('../../../source/Ox.Geo/json/Ox.Geo.json')):
|
|
|
|
GEO[country['code']] = {}
|
|
|
|
for key in ['area', 'lat', 'lng', 'south', 'west', 'north', 'east']:
|
|
|
|
GEO[country['code']][key] = country[key]
|
|
|
|
|
2011-11-23 14:53:17 +00:00
|
|
|
LOGS = {}
|
2011-05-23 19:38:52 +00:00
|
|
|
|
2011-11-23 14:53:17 +00:00
|
|
|
def decode_wikipedia_id(id):
|
|
|
|
id = id.replace('_', ' ').encode('utf8')
|
|
|
|
return urllib.unquote(id).decode('utf8')
|
2011-05-23 19:38:52 +00:00
|
|
|
|
2011-11-23 14:53:17 +00:00
|
|
|
def encode_wikipedia_id(id):
    """Turn a page title into a Wikipedia URL id.

    Spaces become underscores and the result is encoded as UTF-8.
    The result is not percent-quoted.
    """
    underscored = id.replace(' ', '_')
    return underscored.encode('utf8')
|
2011-05-23 19:38:52 +00:00
|
|
|
|
|
|
|
def get_countries():
    """Collect all countries and return them as a list of dicts.

    Countries are gathered from DATA['wikipedia'] and from three Wikipedia
    pages (ISO 3166-3, ISO 3166-1 alpha-2, List of sovereign states), then
    completed via get_country_data(). Independence data is derived for
    countries created from a dependency, and flags are fetched as a side
    effect of get_flags().
    """

    def exists(country):
        # True if a country with the same name has already been collected
        return any(c['name'] == country['name'] for c in countries)

    def fix(html):
        # Cut off the trailing section of the alpha-2 page, apply the URL
        # overrides from DATA['wikipedia_url'] and strip sort key markup
        html = html.split('The following alpha-2 codes were previously exceptionally reserved')[0]
        for key, value in DATA['wikipedia_url'].iteritems():
            html = html.replace(encode_wikipedia_id(key), encode_wikipedia_id(value))
        return re.sub('<span style="display:none" class="sortkey">[\w\s]+ !</span><span class="sorttext">', '', html)

    def parse(match):
        # match is either a (code, wikipedia id) tuple or a bare wikipedia id
        if type(match) == tuple:
            country = {'code': match[0]}
            wikipedia_id = match[1]
        else:
            country = {}
            wikipedia_id = match
        name = decode_wikipedia_id(wikipedia_id)
        if name in DATA['name']:
            # The Wikipedia title differs from the name used here
            country['name'] = DATA['name'][name]
            country['wikipediaName'] = name
        else:
            country['name'] = name
        return country

    def sort(country):
        # Countries without a code sort after all two-letter codes
        if 'code' in country:
            return country['code']
        return u'ZZ ' + country['name']

    countries = [parse(item) for item in DATA['wikipedia']]

    # ISO 3166-3
    html = read_wikipedia_url('ISO 3166-3').replace('Rhodesia', 'Southern Rhodesia') # FIXME: can this be avoided?
    matches = re.compile('<td id="([A-Z]{4})">.*?<a href="/wiki/(.*?)".*?>', re.DOTALL).findall(html)
    countries += [parse(item) for item in matches]

    # ISO 3166-1 alpha-2
    html = fix(read_wikipedia_url('ISO 3166-1 alpha-2'))
    matches = re.compile('<tt>([A-Z]{2})</tt></td>\n<td><a href="/wiki/(.*?)"', re.DOTALL).findall(html)
    parsed = [parse(item) for item in matches]
    countries += [item for item in parsed if not exists(item)]

    # List of sovereign states
    html = read_wikipedia_url('List of sovereign states')
    matches = re.compile('> </span><a href="/wiki/(.*?)"', re.DOTALL).findall(html)
    parsed = [parse(item) for item in matches]
    countries += [item for item in parsed if not exists(item)]

    # Disabled: list states from the per-decade pages that are not known yet
    '''
    for year in range(1970, 2020, 10):
        html = read_wikipedia_url('List of sovereign states in the %ds' % year)
        matches = re.compile('class="thumbborder" />.*?</span> ?<a href="/wiki/(.*?)"', re.DOTALL).findall(html)
        print year, '-' * 64
        for x in map(lambda x: x['name'], filter(lambda x: not exists(x), map(lambda x: parse(x), matches))):
            print x
    sys.exit()
    '''

    # Country data
    countries = sorted(countries, key=sort)
    countries = [get_country_data(item) for item in countries]

    # Independence
    for country in countries:
        if 'created' not in country or 'dependency' in country:
            continue
        # Look up the first country this one was created from; if that one
        # was a dependency, this country became independent from its owner
        name = country['created']['country'][0]
        data = [item for item in countries if item['name'] == name][0]
        if 'dependency' in data:
            country['independence'] = {
                'country': data['dependency'],
                'date': country['created']['date']
            }

    # Flags
    countries = sorted(countries, key=sort)
    get_flags(countries)

    return countries
|
|
|
|
|
2011-11-23 14:53:17 +00:00
|
|
|
def get_country_data(country):
    """Complete a country dict with data from DATA, GEO and Wikipedia.

    country must have a 'name' and may have 'code' and 'wikipediaName'.
    The dict is updated in place and returned. A country for which no code
    can be determined is left without a 'code' key; the sections that need
    a code are skipped for it instead of raising KeyError.
    """
    name = country['name']
    html = read_wikipedia_url(country['wikipediaName'] if 'wikipediaName' in country else name)

    # code: explicit override first, otherwise look for an ISO 3166-2 link
    # or a two-letter "/wiki/.xx" link on the Wikipedia page
    if name in DATA['code']:
        country['code'] = DATA['code'][name]
    elif 'code' not in country:
        match = re.search('"/wiki/ISO_3166-2:(\w{2})"', html)
        if not match:
            match = re.search('"/wiki/\.(\w{2})"', html)
        if match:
            country['code'] = match.group(1).upper()
    # None if no code could be determined
    code = country.get('code')

    # continents and regions
    for continent, regions in DATA['continents'].iteritems():
        for region, countries in regions.iteritems():
            if name in countries:
                country['continent'] = continent
                country['region'] = region
                break

    # created
    if name in DATA['created']:
        country['created'] = DATA['created'][name]

    # dependencies (keys may name several countries, separated by '; ')
    for c, d in DATA['dependencies'].iteritems():
        c = c.split('; ')
        if name in c:
            country['dependencies'] = d if 'dependencies' not in country else country['dependencies'] + d
        elif name in d:
            country['dependency'] = c if 'dependency' not in country else country['dependency'] + c

    # disputes (keys may name several countries, separated by '; ')
    for c, d in DATA['disputes'].iteritems():
        c = c.split('; ')
        if name in c:
            country['disputes'] = d if 'disputes' not in country else country['disputes'] + d
        elif name in d:
            country['disputed'] = c if 'disputed' not in country else country['disputed'] + c

    # dissolved
    if name in DATA['dissolved']:
        country['dissolved'] = DATA['dissolved'][name]

    # exception (only applicable if a code is known)
    if code is not None and code in DATA['exception']:
        country['exception'] = True

    # flag: explicit file name first (defaulting to .svg), otherwise the
    # first matching file link on the Wikipedia page
    if name in DATA['flag']:
        flag_file = DATA['flag'][name]
        if flag_file[-4:] != '.png':
            flag_file += '.svg'
        country['flagURL'] = get_flag('File:' + flag_file)
    else:
        match = re.search('vertical-align:middle;?"( align="center")?><a href="/wiki/(File:.*?)"', html)
        if not match:
            match = re.search('"/wiki/(File:Flag_.*?\.svg)"', html)
        if match:
            # The file id is the last group in either pattern
            country['flagURL'] = get_flag(match.group(len(match.groups())))

    # google
    if name in DATA['google']:
        country['googleName'] = DATA['google'][name]

    # imdb
    if name in DATA['imdb']:
        country['imdbName'] = DATA['imdb'][name]

    # independence
    if name in DATA['independence']:
        country['independence'] = DATA['independence'][name]

    # languages
    for language, c in DATA['languages'].iteritems():
        if c == name:
            if 'languages' not in country:
                country['languages'] = [language]
            else:
                country['languages'].append(language)

    # area, lat, lng, south, west, north, east (only if a code is known)
    if code is not None and code in GEO:
        for key in GEO[code]:
            country[key] = GEO[code][key]

    return country
|
|
|
|
|
|
|
|
def get_flag(id):
    """Return the URL of the full-size image on a Wikipedia file page.

    id is the page id of the file (e.g. 'File:...'). The link found on the
    page is prefixed with 'http:'.
    """
    page = read_wikipedia_url(id)
    link = re.search('<div class="fullImageLink" id="file"><a href="(.*?)"', page)
    return 'http:' + link.group(1)
|
|
|
|
|
|
|
|
def get_flags(countries):
    """Download flag images and link countries that share a flag.

    The first country seen with a given flag URL gets the image file; every
    later country with the same URL gets links to that file. Returns a dict
    that maps the code of each linked country to the code of the country
    that owns the file.

    Countries without a 'flagURL' or without a 'code' are skipped, since
    both are needed to build the file names.
    """

    def sort(country):
        # Current independent countries first, then dependencies and
        # dissolved countries, then countries listed in DATA['flag_link'],
        # so that the earlier groups end up owning shared flag files
        index = 1 if 'dependency' in country or 'dissolved' in country else 0
        if country['name'] in DATA['flag_link']:
            index = 2
        return index

    flags = {}
    flag_countries = {}
    for country in sorted(countries, key=sort):
        # account for errors
        if 'flagURL' not in country or 'code' not in country:
            continue
        url = country['flagURL']
        code = country['code']
        extension = url[-3:]
        path = '../%s/flags/%s.%s' % (extension, code, extension)
        if url not in flag_countries:
            # First country with this flag: fetch the image and write it
            # unless the file on disk is already identical
            flag_countries[url] = code
            img = read_url(url)
            if not os.path.exists(path) or ox.file.read_file(path) != img:
                ox.file.write_path(path)
                ox.file.write_file(path, img)
        else:
            # Flag is shared: link to the files of the owning country
            flags[code] = flag_countries[url]
            if not os.path.lexists(path):
                ox.file.write_link(flags[code] + '.' + extension, path)
            path = path.replace('/flags/', '/icons/')
            if not os.path.lexists(path):
                ox.file.write_link(flags[code] + '.' + extension, path)
            for size in [4096, 1024, 256, 64, 16]:
                path = '../png/icons/%d/%s.png' % (size, code)
                if not os.path.lexists(path):
                    ox.file.write_path(path)
                    ox.file.write_link(flags[code] + '.png', path)
    return flags
|
|
|
|
|
|
|
|
def get_imdb_countries(countries):
|
|
|
|
def decode(match):
|
|
|
|
return unichr(int(match.group(0)[3:-1], 16))
|
|
|
|
LOGS['new countries'] = []
|
|
|
|
imdb_countries = DATA['imdb']
|
|
|
|
html = read_url('http://www.imdb.com/country/')
|
|
|
|
matches = re.compile('<a href="/country/(.*?)">(.*?)\n</a>').findall(html)
|
|
|
|
for match in matches:
|
|
|
|
code = match[0].upper()
|
|
|
|
name = re.sub('&#x(.{2});', decode, match[1])
|
|
|
|
new = True
|
|
|
|
for country in countries:
|
|
|
|
if name == country['name'] or ('imdbName' in country and name == country['imdbName']):
|
|
|
|
new = False
|
|
|
|
if code == country['code']:
|
|
|
|
new = False
|
|
|
|
if name != country['name']:
|
|
|
|
imdb_countries[country['name']] = name
|
|
|
|
if not 'imdbName' in country or name != country['imdbName']:
|
|
|
|
LOGS['new countries'].append(name)
|
|
|
|
if not new:
|
2011-05-23 19:38:52 +00:00
|
|
|
break
|
2011-11-23 14:53:17 +00:00
|
|
|
if new:
|
|
|
|
print 'new', match
|
|
|
|
LOGS['new countries'].append(name)
|
|
|
|
ox.file.write_json('../json/imdbCountries.json', imdb_countries, indent=4, sort_keys=True)
|
|
|
|
|
|
|
|
def get_imdb_languages():
    """Read the IMDb language list and log languages missing from DATA.

    Every language found on the IMDb language page is written, with an
    empty string as value, to ../json/imdbLanguages.json. Languages that
    are not in DATA['languages'] are appended to LOGS['new languages'].
    """

    def decode(match):
        # Replace a two-digit hexadecimal character reference
        return unichr(int(match.group(0)[3:-1], 16))

    LOGS['new languages'] = []
    imdb_languages = {}
    page = read_url('http://www.imdb.com/language/')
    for raw_name in re.findall('<a href="/language/.*?">(.*?)</a>', page):
        language = re.sub('&#x(.{2});', decode, raw_name)
        # Strip generic suffixes
        language = re.sub('( languages| Sign Language)$', '', language)
        imdb_languages[language] = ''
        if language not in DATA['languages']:
            LOGS['new languages'].append(language)
    ox.file.write_json('../json/imdbLanguages.json', imdb_languages, indent=4, sort_keys=True)
|
|
|
|
|
2012-04-02 21:13:05 +00:00
|
|
|
def parse_txt(path='../txt/countries.txt'):
    """Parse the country history file.

    Each line that is not empty and does not start with '#' has the form

        <date> <country a> <operator> <country b>

    where either side may list several countries separated by ' / '.
    The operators are:

        *   b gained independence from a
        =   a was renamed to b
        +   a joined b (a is dissolved)
        -   b split off from a (b is created)
        >   the countries in a merged into b
        <   a split into the countries in b

    Returns a dict with the keys 'created', 'dissolved' and 'independence',
    each mapping a country name to a dict with 'country' (list of names on
    the other side), 'date' and, for 'created' and 'dissolved', the kind of
    event under the same key.

    Raises ValueError if a line does not match the expected form.
    """

    def created(countries, date, how):
        return {'country': countries, 'date': date, 'created': how}

    def dissolved(countries, date, how):
        return {'country': countries, 'date': date, 'dissolved': how}

    data = {
        'created': {},
        'dissolved': {},
        'independence': {}
    }
    pattern = re.compile(r'([\d\-]+) +(.+) ([\*=\+\-><]) (.+)')
    with codecs.open(path, 'r', 'utf-8') as f:
        lines = [line.strip() for line in f.readlines()]
    for line in lines:
        # Skip blank lines and comments
        if not line or line[0] == '#':
            continue
        match = pattern.match(line)
        if not match:
            raise ValueError('cannot parse line: %r' % line)
        date, country_a, operator, country_b = match.groups()
        countries_a = country_a.split(' / ')
        countries_b = country_b.split(' / ')
        if operator == '*':
            data['independence'][country_b] = {
                'country': countries_a,
                'date': date
            }
        elif operator == '=':
            data['dissolved'][country_a] = dissolved(countries_b, date, 'renamed')
            data['created'][country_b] = created(countries_a, date, 'renamed')
        elif operator == '+':
            for country in countries_a:
                data['dissolved'][country] = dissolved(countries_b, date, 'joined')
        elif operator == '-':
            for country in countries_b:
                data['created'][country] = created(countries_a, date, 'split')
        elif operator == '>':
            for country in countries_a:
                data['dissolved'][country] = dissolved(countries_b, date, 'merged')
            data['created'][country_b] = created(countries_a, date, 'merged')
        elif operator == '<':
            data['dissolved'][country_a] = dissolved(countries_b, date, 'split')
            # The successors of a split are created by 'split', matching
            # the 'dissolved' entry of the country they came from
            for country in countries_b:
                data['created'][country] = created(countries_a, date, 'split')
    return data
|
|
|
|
|
2011-11-23 14:53:17 +00:00
|
|
|
def read_url(url):
|
2011-05-23 19:38:52 +00:00
|
|
|
print 'reading', url
|
2011-11-23 14:53:17 +00:00
|
|
|
return ox.cache.readUrl(url) if CACHE else ox.net.readUrl(url)
|
2011-05-23 19:38:52 +00:00
|
|
|
|
2011-11-23 14:53:17 +00:00
|
|
|
def read_wikipedia_url(id):
    """Fetch the English Wikipedia page for id and return it as unicode.

    id is a page title or URL id; it is passed through
    encode_wikipedia_id(). The page is decoded as UTF-8, falling back to
    ISO-8859-1 if it is not valid UTF-8.
    """
    url = 'http://en.wikipedia.org/wiki/' + encode_wikipedia_id(id)
    html = read_url(url)
    try:
        html = unicode(html, 'utf8')
    except UnicodeDecodeError:
        # Only a decoding failure triggers the fallback; anything else
        # (including KeyboardInterrupt) propagates
        html = unicode(html, 'iso-8859-1')
    return html
|
2011-05-23 19:38:52 +00:00
|
|
|
|
|
|
|
if __name__ == '__main__':
|
2012-04-02 21:13:05 +00:00
|
|
|
data = parse_txt()
|
|
|
|
DATA['created'] = data['created']
|
|
|
|
DATA['dissolved'] = data['dissolved']
|
|
|
|
DATA['independence'] = data['independence']
|
2011-05-23 19:38:52 +00:00
|
|
|
countries = get_countries()
|
2011-11-23 14:53:17 +00:00
|
|
|
ox.file.write_json('../json/countries.json', countries, indent=4, sort_keys=True)
|
|
|
|
LOGS['total'] = len(countries)
|
|
|
|
for key in ['code', 'continent', 'flagURL']:
|
|
|
|
LOGS['no ' + key] = map(lambda x: x['name'], filter(lambda x: not key in x, countries))
|
|
|
|
LOGS['current independent'] = 0
|
|
|
|
LOGS['current dependent'] = 0
|
|
|
|
LOGS['current disputed'] = 0
|
2012-04-02 21:13:05 +00:00
|
|
|
LOGS['current exception'] = 0
|
2011-11-23 14:53:17 +00:00
|
|
|
LOGS['dissolved independent'] = 0
|
|
|
|
LOGS['dissolved dependent'] = 0
|
|
|
|
LOGS['dissolved disputed'] = 0
|
2012-04-02 21:13:05 +00:00
|
|
|
LOGS['dissolved exception'] = 0
|
2011-11-23 14:53:17 +00:00
|
|
|
for country in countries:
|
2012-04-02 21:13:05 +00:00
|
|
|
key = ' '.join([
|
2011-11-23 14:53:17 +00:00
|
|
|
'dissolved' if 'dissolved' in country else 'current',
|
2012-04-02 21:13:05 +00:00
|
|
|
'exception' if 'exception' in country else (
|
|
|
|
'disputed' if 'disputed' in country else (
|
|
|
|
'dependent' if 'dependency' in country else 'independent'
|
|
|
|
)
|
|
|
|
)
|
2011-11-23 14:53:17 +00:00
|
|
|
])
|
|
|
|
LOGS[key] += 1
|
|
|
|
get_imdb_countries(countries)
|
2011-05-23 19:38:52 +00:00
|
|
|
get_imdb_languages()
|
2011-11-23 14:53:17 +00:00
|
|
|
print json.dumps(LOGS, indent=4, sort_keys=True)
|
|
|
|
|
|
|
|
|