2011-05-23 21:38:52 +02:00
# -*- coding: utf-8 -*-
import json
import os
import ox
import re
2011-11-23 15:53:17 +01:00
import sys
import urllib
2011-05-23 21:38:52 +02:00
2011-11-23 15:53:17 +01:00
# Diagnostics collected while the script runs; the script entry point dumps
# this dict as JSON at the end.
LOGS = {}

# Lookup tables and overrides, parsed from the JSONC file below.
DATA = ox.jsonc.loads(ox.file.read_file('../jsonc/countries.jsonc'))

# True when the last command-line argument is '-cache'; read_url() then goes
# through ox.cache instead of ox.net.
CACHE = (sys.argv[-1] == '-cache')
def decode_wikipedia_id(id):
id = id.replace('_', ' ').encode('utf8')
return urllib.unquote(id).decode('utf8')
2011-05-23 21:38:52 +02:00
2011-11-23 15:53:17 +01:00
def encode_wikipedia_id(id):
    """Turn a page name into the form used in Wikipedia URLs.

    Spaces become underscores and the result is UTF-8 encoded.
    """
    # NOTE: the title is not percent-quoted; an earlier urllib2.quote()
    # variant of this function was disabled.
    underscored = id.replace(' ', '_')
    return underscored.encode('utf8')
def get_countries():
2011-11-23 15:53:17 +01:00
def exists(country):
for c in countries:
if c['name'] == country['name']:
return True
return False
def fix(html):
html = html.split('The following alpha-2 codes were previously exceptionally reserved')[0]
for key, value in DATA['wikipedia_url'].iteritems():
html = html.replace(encode_wikipedia_id(key), encode_wikipedia_id(value))
return re.sub('<span style="display:none" class="sortkey">[\w\s]+ !</span><span class="sorttext">', '', html)
def parse(match):
country = {}
is_tuple = type(match) == tuple
name = decode_wikipedia_id(match[1] if is_tuple else match)
if is_tuple:
country['code'] = match[0]
if name in DATA['name']:
country['name'] = DATA['name'][name]
country['wikipediaName'] = name
2011-11-23 15:53:17 +01:00
country['name'] = name
return country
def sort(country):
return country['code'] if 'code' in country else u'ZZ ' + country['name']
countries = map(lambda x: parse(x), DATA['wikipedia'])
# ISO 3166-3
html = read_wikipedia_url('ISO 3166-3')
matches = re.compile('<td id="([A-Z]{4})">.*?<a href="/wiki/(.*?)".*?>', re.DOTALL).findall(html)
countries += map(lambda x: parse(x), matches)
print sorted(map(lambda x: x['name'], countries))
# ISO 3166-1 alpha-2
html = fix(read_wikipedia_url('ISO 3166-1 alpha-2'))
matches = re.compile('<tt>([A-Z]{2})</tt></td>\n<td><a href="/wiki/(.*?)"', re.DOTALL).findall(html)
countries += filter(lambda x: not exists(x), map(lambda x: parse(x), matches))
print sorted(map(lambda x: x['name'], countries))
# List of sovereign states
html = read_wikipedia_url('List of sovereign states')
matches = re.compile('>&#160;</span><a href="/wiki/(.*?)"', re.DOTALL).findall(html)
countries += filter(lambda x: not exists(x), map(lambda x: parse(x), matches))
print sorted(map(lambda x: x['name'], countries))
for year in range(1970, 2020, 10):
html = read_wikipedia_url('List of sovereign states in the %ds' % year)
matches = re.compile('class="thumbborder" />.*?</span> ?<a href="/wiki/(.*?)"', re.DOTALL).findall(html)
print year, '-' * 64
for x in map(lambda x: x['name'], filter(lambda x: not exists(x), map(lambda x: parse(x), matches))):
print x
# Country data
countries = sorted(countries, key=sort)
countries = map(lambda x: get_country_data(x), countries)
# Flags
countries = sorted(countries, key=sort)
flags = get_flags(countries)
for country in countries:
if country['code'] in flags:
country['flag'] = flags[country['code']]
2011-05-23 21:38:52 +02:00
return countries
2011-11-23 15:53:17 +01:00
def get_country_data(country):
    """Enrich a country dict from its Wikipedia page and the DATA tables.

    The dict is modified in place and also returned. Keys that may be set:
    'code', 'continent', 'region', 'created', 'dissolved', 'dependencies',
    'dependency', 'disputes', 'disputed', 'flagURL', 'googleName',
    'imdbName', 'languages', plus every key in DATA['location'][name].

    country must have a 'name'; if it has a 'wikipediaName', that page is
    fetched instead of the page called name.
    """
    name = country['name']
    html = read_wikipedia_url(country.get('wikipediaName', name))
    # code: a DATA override wins, then a code already present, then the page
    if name in DATA['code']:
        country['code'] = DATA['code'][name]
    elif 'code' not in country:
        match = re.search(r'"/wiki/ISO_3166-2:(\w{2})"', html)
        if not match:
            match = re.search(r'"/wiki/\.(\w{2})"', html)
        if match:
            country['code'] = match.group(1).upper()
    # continents and regions
    for continent, regions in DATA['continents'].iteritems():
        for region, members in regions.iteritems():
            if name in members:
                country['continent'] = continent
                country['region'] = region
    # created and dissolved
    if name in DATA['created']:
        country['created'] = DATA['created'][name]
    if name in DATA['dissolved']:
        country['dissolved'] = DATA['dissolved'][name]
    # A country listed as the result of another country's merge or split is
    # recorded as created from that country on the same date.
    for c, d in DATA['dissolved'].iteritems():
        if d['dissolved'] in ['merged', 'split']:
            cs = d['country'] if isinstance(d['country'], list) else [d['country']]
            if name in cs:
                country['created'] = {
                    'country': c,
                    'date': d['date']
                }
    # dependencies
    for c, d in DATA['dependencies'].iteritems():
        c = c.split(', ')
        if name in c:
            # Fixed: the key was misspelled 'dependecies', so the
            # accumulation branch could never be reached.
            country['dependencies'] = d if 'dependencies' not in country else country['dependencies'] + d
        elif name in d:
            country['dependency'] = c if 'dependency' not in country else country['dependency'] + c
    # disputes
    for c, d in DATA['disputes'].iteritems():
        c = c.split(', ')
        if name in c:
            country['disputes'] = d if 'disputes' not in country else country['disputes'] + d
        elif name in d:
            country['disputed'] = c if 'disputed' not in country else country['disputed'] + c
    # flag: DATA['flag'] names the file explicitly ('.svg' is appended
    # unless the name ends in '.png'); otherwise look for a link on the page
    if name in DATA['flag']:
        filename = DATA['flag'][name]
        if not filename.endswith('.png'):
            filename += '.svg'
        country['flagURL'] = get_flag('File:' + filename)
    else:
        # NOTE(review): this "else:" is reconstructed; an unconditional
        # search would overwrite the explicit override above.
        match = re.search('vertical-align:middle;?"( align="center")?><a href="/wiki/(File:.*?)"', html)
        if not match:
            match = re.search(r'"/wiki/(File:Flag_.*?\.svg)"', html)
        if match:
            # The file id is the last group of whichever pattern matched.
            country['flagURL'] = get_flag(match.group(len(match.groups())))
    # google
    if name in DATA['google']:
        country['googleName'] = DATA['google'][name]
    # imdb
    if name in DATA['imdb']:
        country['imdbName'] = DATA['imdb'][name]
    # languages
    for language, c in DATA['languages'].iteritems():
        if c == name:
            # NOTE(review): the append branch is reconstructed; only the
            # "create the list" branch is visible in the source.
            country.setdefault('languages', []).append(language)
    # location
    if name in DATA['location']:
        for key, value in DATA['location'][name].iteritems():
            country[key] = value
    return country
def get_flag(id):
    """Return the full-size image URL found on a Wikipedia file page.

    id is the page id (callers pass 'File:...'). The first href inside the
    "fullImageLink" div is returned with 'http:' prepended, so the href is
    presumably protocol-relative.

    Raises AttributeError if the page has no such link.
    """
    page = read_wikipedia_url(id)
    link = re.search('<div class="fullImageLink" id="file"><a href="(.*?)"', page)
    return 'http:' + link.group(1)
def get_flags(countries):
    """Download flag images and link countries that share a flag.

    Countries are processed in priority order: ordinary countries first,
    then dependent or dissolved ones, then those named in
    DATA['flag_link']. The first country seen with a given 'flagURL' gets
    the image written to ../<ext>/flags/<code>.<ext>. Every later country
    with the same URL gets links pointing at that first country's files.

    Returns a dict mapping the code of each linked country to the code of
    the country that owns the flag file.
    """
    def sort(country):
        index = 1 if 'dependency' in country or 'dissolved' in country else 0
        if country['name'] in DATA['flag_link']:
            index = 2
        return index

    flags = {}
    flag_countries = {}
    for country in sorted(countries, key=sort):
        if 'flagURL' in country: # account for errors
            extension = country['flagURL'][-3:]
            file = '../%s/flags/%s.%s' % (extension, country['code'], extension)
            if country['flagURL'] not in flag_countries:
                flag_countries[country['flagURL']] = country['code']
                img = read_url(country['flagURL'])
                # Only rewrite the file when its content actually changed.
                if not os.path.exists(file) or ox.file.read_file(file) != img:
                    ox.file.write_file(file, img)
            else:
                # NOTE(review): this "else:" and the nesting below it are
                # reconstructed. flags is only consulted for countries that
                # share a flag, and every link target below reads
                # flags[country['code']], which exists only in this branch.
                flags[country['code']] = flag_countries[country['flagURL']]
                if not os.path.lexists(file):
                    ox.file.write_link(flags[country['code']] + '.' + extension, file)
                file = file.replace('/flags/', '/icons/')
                if not os.path.lexists(file):
                    ox.file.write_link(flags[country['code']] + '.' + extension, file)
                for size in [4096, 1024, 256, 64, 16]:
                    file = '../png/icons/%d/%s.png' % (size, country['code'])
                    if not os.path.lexists(file):
                        ox.file.write_link(flags[country['code']] + '.png', file)
    return flags
def get_imdb_countries(countries):
def decode(match):
return unichr(int(match.group(0)[3:-1], 16))
LOGS['new countries'] = []
imdb_countries = DATA['imdb']
html = read_url('http://www.imdb.com/country/')
matches = re.compile('<a href="/country/(.*?)">(.*?)\n</a>').findall(html)
for match in matches:
code = match[0].upper()
name = re.sub('&#x(.{2});', decode, match[1])
new = True
for country in countries:
if name == country['name'] or ('imdbName' in country and name == country['imdbName']):
new = False
if code == country['code']:
new = False
if name != country['name']:
imdb_countries[country['name']] = name
if not 'imdbName' in country or name != country['imdbName']:
LOGS['new countries'].append(name)
if not new:
2011-05-23 21:38:52 +02:00
2011-11-23 15:53:17 +01:00
if new:
print 'new', match
LOGS['new countries'].append(name)
ox.file.write_json('../json/imdbCountries.json', imdb_countries, indent=4, sort_keys=True)
def get_imdb_languages():
    """Collect the language names listed on IMDb's language page.

    Hex character references are decoded and a trailing ' languages' or
    ' Sign Language' is removed. Names missing from DATA['languages'] are
    appended to LOGS['new languages']. All names are written, as keys with
    empty-string values, to ../json/imdbLanguages.json.
    """
    def unescape(entity):
        # entity.group(0) looks like '&#xE9;': strip '&#x' and ';', parse hex.
        hex_digits = entity.group(0)[3:-1]
        return unichr(int(hex_digits, 16))

    LOGS['new languages'] = []
    found = {}
    page = read_url('http://www.imdb.com/language/')
    for label in re.findall('<a href="/language/.*?">(.*?)</a>', page):
        label = re.sub('&#x(.{2});', unescape, label)
        label = re.sub('( languages| Sign Language)$', '', label)
        found[label] = ''
        if label in DATA['languages']:
            continue
        LOGS['new languages'].append(label)
    ox.file.write_json('../json/imdbLanguages.json', found, indent=4, sort_keys=True)
def read_url(url):
2011-05-23 21:38:52 +02:00
print 'reading', url
2011-11-23 15:53:17 +01:00
return ox.cache.readUrl(url) if CACHE else ox.net.readUrl(url)
2011-05-23 21:38:52 +02:00
2011-11-23 15:53:17 +01:00
def read_wikipedia_url(id):
    """Fetch the English Wikipedia page for id and return it as unicode.

    id is a page name; it is turned into a URL title with
    encode_wikipedia_id(). The response is decoded as UTF-8, falling back
    to ISO-8859-1 when it is not valid UTF-8.
    """
    url = 'http://en.wikipedia.org/wiki/' + encode_wikipedia_id(id)
    html = read_url(url)
    # NOTE(review): the try/except is reconstructed; the source only shows
    # the two unicode() calls back to back, which cannot both run on the
    # same value.
    try:
        html = unicode(html, 'utf8')
    except UnicodeDecodeError:
        html = unicode(html, 'iso-8859-1')
    return html
if __name__ == '__main__':
countries = get_countries()
2011-11-23 15:53:17 +01:00
ox.file.write_json('../json/countries.json', countries, indent=4, sort_keys=True)
LOGS['total'] = len(countries)
for key in ['code', 'continent', 'flagURL']:
LOGS['no ' + key] = map(lambda x: x['name'], filter(lambda x: not key in x, countries))
LOGS['current independent'] = 0
LOGS['current dependent'] = 0
LOGS['current disputed'] = 0
LOGS['dissolved independent'] = 0
LOGS['dissolved dependent'] = 0
LOGS['dissolved disputed'] = 0
for country in countries:
key = ' '.join([
'dissolved' if 'dissolved' in country else 'current',
'disputed' if 'disputed' in country else ('dependent' if 'dependency' in country else 'independent')
LOGS[key] += 1
2011-05-23 21:38:52 +02:00
2011-11-23 15:53:17 +01:00
print json.dumps(LOGS, indent=4, sort_keys=True)