# -*- coding: utf-8 -*-
import json
import os
import ox
import re
import sys
import urllib
CACHE = sys.argv[-1] == '-cache'
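# pass '-cache' as the last argument to read urls via ox.cache instead of
# ox.net (see read_url below)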
try:
    DATA = ox.jsonc.loads(ox.file.read_file('../jsonc/countries.jsonc'))
except:
    # if the JSONC source fails to parse, dump a minified copy for debugging
    ox.file.write_file('../jsonc/debug.json', ox.js.minify(ox.file.read_file('../jsonc/countries.jsonc')))
    sys.exit()
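
# LOGS collects counts and unmatched names, printed as JSON at the end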
LOGS = {}

def decode_wikipedia_id(id):
    # e.g. 'C%C3%B4te_d%27Ivoire' -> u"Côte d'Ivoire"
    id = id.replace('_', ' ').encode('utf8')
    return urllib.unquote(id).decode('utf8')

def encode_wikipedia_id(id):
    # try:
    #     id = id.encode('utf8')
    # except:
    #     pass
    # return urllib2.quote(id.replace(' ', '_').encode('utf8'))
    return id.replace(' ', '_').encode('utf8')

def get_countries():

    def exists(country):
        # True if a country with the same name has already been parsed
        for c in countries:
            if c['name'] == country['name']:
                return True
        return False

    def fix(html):
        # cut off the exceptionally reserved codes, map alternate wikipedia
        # urls to canonical ones, and strip the hidden sort keys
        html = html.split('The following alpha-2 codes were previously exceptionally reserved')[0]
        for key, value in DATA['wikipedia_url'].iteritems():
            html = html.replace(encode_wikipedia_id(key), encode_wikipedia_id(value))
        # NOTE: the original pattern lost its HTML tags in transit; this
        # assumes the target was wikipedia's hidden sortkey spans
        # ('<span class="sortkey">Foo !</span>')
        return re.sub('<span[^>]*class="sortkey"[^>]*>[\w\s]+ !</span>', '', html)

    def parse(match):
        country = {}
        is_tuple = type(match) == tuple
        name = decode_wikipedia_id(match[1] if is_tuple else match)
        if is_tuple:
            country['code'] = match[0]
        if name in DATA['name']:
            country['name'] = DATA['name'][name]
            country['wikipediaName'] = name
        else:
            country['name'] = name
        return country

    def sort(country):
        # countries with a code sort by code, the rest sort by name after all
        # coded ones (hence the u'ZZ ' prefix)
        return country['code'] if 'code' in country else u'ZZ ' + country['name']
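
    # build the list: seed from DATA['wikipedia'], then add former countries
    # (ISO 3166-3) and current codes (ISO 3166-1 alpha-2) from wikipedia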
    countries = map(lambda x: parse(x), DATA['wikipedia'])
    # ISO 3166-3
    html = read_wikipedia_url('ISO 3166-3').replace('Rhodesia', 'Southern Rhodesia') # FIXME: can this be avoided?
    # NOTE: the original pattern lost its HTML tags in transit; this assumes a
    # table cell carrying the four-letter code, followed by the country link
    matches = re.compile('<td id="([A-Z]{4})".*?<a href="/wiki/(.*?)"', re.DOTALL).findall(html)
    countries += map(lambda x: parse(x), matches)
    # ISO 3166-1 alpha-2
    html = fix(read_wikipedia_url('ISO 3166-1 alpha-2'))
    # NOTE: the original pattern lost its HTML tags in transit; this assumes a
    # table cell carrying the two-letter code, followed by the country link
    matches = re.compile('<td id="([A-Z]{2})".*?<a href="/wiki/(.*?)"', re.DOTALL).findall(html)
    # the tail of this function was lost; minimal reconstruction: only add
    # codes whose names are not present yet, then sort and return
    for match in matches:
        country = parse(match)
        if not exists(country):
            countries.append(country)
    countries.sort(key=sort)
    return countries

def get_imdb_countries(countries):
    # reconstructed: the lines defining this function were lost; the helper
    # and the setup below mirror get_imdb_languages
    def decode(match):
        return unichr(int(match.group(0)[3:-1], 16))
    LOGS['new countries'] = []
    imdb_countries = {}
    html = read_url('http://www.imdb.com/country/')  # assumed URL
    # NOTE: assumed pattern, the original was lost with its HTML tags
    matches = re.compile('<a href="/country/(.*?)">(.*?)</a>').findall(html)
    for match in matches:
        code = match[0].upper()
        # decode hex entities like '&#xE9;' (the '&#x' was missing here)
        name = re.sub('&#x(.{2});', decode, match[1])
        new = True
        for country in countries:
            if name == country['name'] or ('imdbName' in country and name == country['imdbName']):
                new = False
            if 'code' in country and code == country['code']:  # guard: not every country has a code
                new = False
                if name != country['name']:
                    imdb_countries[country['name']] = name
                    if not 'imdbName' in country or name != country['imdbName']:
                        LOGS['new countries'].append(name)
            if not new:
                break
        if new:
            print 'new', match
            LOGS['new countries'].append(name)
    ox.file.write_json('../json/imdbCountries.json', imdb_countries, indent=4, sort_keys=True)

def get_imdb_languages():
    def decode(match):
        return unichr(int(match.group(0)[3:-1], 16))
    LOGS['new languages'] = []
    imdb_languages = {}
    html = read_url('http://www.imdb.com/language/')
    # NOTE: assumed pattern, the original was lost with its HTML tags
    matches = re.compile('<a href="/language/[^"]*">(.*?)</a>').findall(html)
    for match in matches:
        # decode hex entities like '&#xE9;' (the '&#x' was missing here)
        language = re.sub('&#x(.{2});', decode, match)
        language = re.sub('( languages| Sign Language)$', '', language)
        imdb_languages[language] = ''
        if not language in DATA['languages']:
            LOGS['new languages'].append(language)
    ox.file.write_json('../json/imdbLanguages.json', imdb_languages, indent=4, sort_keys=True)

def read_url(url):
    print 'reading', url
    return ox.cache.readUrl(url) if CACHE else ox.net.readUrl(url)

def read_wikipedia_url(id):
    url = 'http://en.wikipedia.org/wiki/' + encode_wikipedia_id(id)
    html = read_url(url)
    try:
        html = unicode(html, 'utf8')
    except:
        # fall back to latin-1 if the page is not valid utf-8
        html = unicode(html, 'iso-8859-1')
    return html
if __name__ == '__main__':
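    # writes ../json/countries.json, ../json/imdbCountries.json and
    # ../json/imdbLanguages.json, then prints the collected LOGS stats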
    countries = get_countries()
    ox.file.write_json('../json/countries.json', countries, indent=4, sort_keys=True)
    LOGS['total'] = len(countries)
    for key in ['code', 'continent', 'flagURL']:
        LOGS['no ' + key] = map(lambda x: x['name'], filter(lambda x: not key in x, countries))
    LOGS['current independent'] = 0
    LOGS['current dependent'] = 0
    LOGS['current disputed'] = 0
    LOGS['dissolved independent'] = 0
    LOGS['dissolved dependent'] = 0
    LOGS['dissolved disputed'] = 0
    LOGS['alias'] = 0
    for country in countries:
        key = 'alias' if 'alias' in country else ' '.join([
            'dissolved' if 'dissolved' in country else 'current',
            'disputed' if 'disputed' in country else ('dependent' if 'dependency' in country else 'independent')
        ])
        LOGS[key] += 1
    get_imdb_countries(countries)
    get_imdb_languages()
    print json.dumps(LOGS, indent=4, sort_keys=True)