# -*- coding: utf-8 -*- import json import os import ox import re import sys import urllib CACHE = sys.argv[-1] == '-cache' DATA = ox.jsonc.loads(ox.file.read_file('../jsonc/countries.jsonc')) LOGS = {} def decode_wikipedia_id(id): id = id.replace('_', ' ').encode('utf8') return urllib.unquote(id).decode('utf8') def encode_wikipedia_id(id): # try: # id = id.encode('utf8') # except: # pass # return urllib2.quote(id.replace(' ', '_').encode('utf8')) return id.replace(' ', '_').encode('utf8') def get_countries(): def exists(country): for c in countries: if c['name'] == country['name']: return True return False def fix(html): html = html.split('The following alpha-2 codes were previously exceptionally reserved')[0] for key, value in DATA['wikipedia_url'].iteritems(): html = html.replace(encode_wikipedia_id(key), encode_wikipedia_id(value)) return re.sub('', '', html) def parse(match): country = {} is_tuple = type(match) == tuple name = decode_wikipedia_id(match[1] if is_tuple else match) if is_tuple: country['code'] = match[0] if name in DATA['name']: country['name'] = DATA['name'][name] country['wikipediaName'] = name else: country['name'] = name return country def sort(country): return country['code'] if 'code' in country else u'ZZ ' + country['name'] countries = map(lambda x: parse(x), DATA['wikipedia']) # ISO 3166-3 html = read_wikipedia_url('ISO 3166-3') matches = re.compile('.*?', re.DOTALL).findall(html) countries += map(lambda x: parse(x), matches) # ISO 3166-1 alpha-2 html = fix(read_wikipedia_url('ISO 3166-1 alpha-2')) matches = re.compile('([A-Z]{2})\n .*? ?(.*?)\n').findall(html) for match in matches: code = match[0].upper() name = re.sub('&#x(.{2});', decode, match[1]) new = True for country in countries: if name == country['name'] or ('imdbName' in country and name == country['imdbName']): new = False if code == country['code']: new = False if name != country['name']: imdb_countries[country['name']] = name if not 'imdbName' in country or name != country['imdbName']: LOGS['new countries'].append(name) if not new: break if new: print 'new', match LOGS['new countries'].append(name) ox.file.write_json('../json/imdbCountries.json', imdb_countries, indent=4, sort_keys=True) def get_imdb_languages(): def decode(match): return unichr(int(match.group(0)[3:-1], 16)) LOGS['new languages'] = [] imdb_languages = {} html = read_url('http://www.imdb.com/language/') matches = re.compile('(.*?)').findall(html) for match in matches: language = re.sub('&#x(.{2});', decode, match) language = re.sub('( languages| Sign Language)$', '', language) imdb_languages[language] = '' if not language in DATA['languages']: LOGS['new languages'].append(language) ox.file.write_json('../json/imdbLanguages.json', imdb_languages, indent=4, sort_keys=True) def read_url(url): print 'reading', url return ox.cache.readUrl(url) if CACHE else ox.net.readUrl(url) def read_wikipedia_url(id): url = 'http://en.wikipedia.org/wiki/' + encode_wikipedia_id(id) html = read_url(url) try: html = unicode(html, 'utf8') except: html = unicode(html, 'iso-8859-1') return html if __name__ == '__main__': countries = get_countries() ox.file.write_json('../json/countries.json', countries, indent=4, sort_keys=True) LOGS['total'] = len(countries) for key in ['code', 'continent', 'flagURL']: LOGS['no ' + key] = map(lambda x: x['name'], filter(lambda x: not key in x, countries)) LOGS['current independent'] = 0 LOGS['current dependent'] = 0 LOGS['current disputed'] = 0 LOGS['dissolved independent'] = 0 LOGS['dissolved dependent'] = 0 LOGS['dissolved disputed'] = 0 for country in countries: key = ' '.join([ 'dissolved' if 'dissolved' in country else 'current', 'disputed' if 'disputed' in country else ('dependent' if 'dependency' in country else 'independent') ]) LOGS[key] += 1 get_imdb_countries(countries) get_imdb_languages() print json.dumps(LOGS, indent=4, sort_keys=True)