From a7c50d840b5f0106f8642e7ff25003307b877193 Mon Sep 17 00:00:00 2001
From: rolux
Date: Mon, 2 Apr 2012 23:13:05 +0200
Subject: [PATCH] update Geo module

---
 tools/geo/py/geo.py | 137 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 117 insertions(+), 20 deletions(-)

diff --git a/tools/geo/py/geo.py b/tools/geo/py/geo.py
index 5ecfd7d1..9f6d6996 100644
--- a/tools/geo/py/geo.py
+++ b/tools/geo/py/geo.py
@@ -8,11 +8,20 @@ import sys
 import urllib
 
 CACHE = sys.argv[-1] == '-cache'
+
 try:
     DATA = ox.jsonc.loads(ox.file.read_file('../jsonc/countries.jsonc'))
 except:
+    print 'parse error, see jsonc/debug.json'
     ox.file.write_file('../jsonc/debug.json', ox.js.minify(ox.file.read_file('../jsonc/countries.jsonc')))
     sys.exit()
+
+GEO = {}
+for country in json.loads(ox.file.read_file('../../../source/Ox.Geo/json/Ox.Geo.json')):
+    GEO[country['code']] = {}
+    for key in ['area', 'lat', 'lng', 'south', 'west', 'north', 'east']:
+        GEO[country['code']][key] = country[key]
+
 LOGS = {}
 
 def decode_wikipedia_id(id):
@@ -77,12 +86,20 @@ def get_countries():
     # Country data
     countries = sorted(countries, key=sort)
     countries = map(lambda x: get_country_data(x), countries)
+    # Independence
+    for i, country in enumerate(countries):
+        if 'created' in country and not 'dependency' in country:
+            name = country['created']['country'][0]
+            data = filter(lambda x: x['name'] == name, countries)[0]
+            if 'dependency' in data:
+                countries[i]['independence'] = {
+                    'country': data['dependency'],
+                    'date': country['created']['date']
+                }
+    sys.exit()
     # Flags
     countries = sorted(countries, key=sort)
     flags = get_flags(countries)
-    for country in countries:
-        if country['code'] in flags:
-            country['flag'] = flags[country['code']]
     return countries
 
 def get_country_data(country):
@@ -97,9 +114,6 @@
     match = re.search('"/wiki/\.(\w{2})"', html)
     if match:
         country['code'] = match.group(1).upper()
-    # alias
-    if country['code'] in DATA['alias']:
-        country['alias'] = True
     # continents and regions
     for continent, regions in DATA['continents'].iteritems():
         for region, countries in regions.iteritems():
@@ -107,19 +121,9 @@ if name in countries:
                 country['continent'] = continent
                 country['region'] = region
                 break
-    # created and dissolved
+    # created
     if name in DATA['created']:
         country['created'] = DATA['created'][name]
-    if name in DATA['dissolved']:
-        country['dissolved'] = DATA['dissolved'][name]
-    for c, d in DATA['dissolved'].iteritems():
-        if d['dissolved'] in ['merged', 'split']:
-            cs = d['country'] if type(d['country']) == list else [d['country']]
-            if name in cs:
-                country['created'] = {
-                    'country': c,
-                    'date': d['date']
-                }
     # dependencies
     for c, d in DATA['dependencies'].iteritems():
         c = c.split(', ')
@@ -134,6 +138,12 @@
             country['disputes'] = d if not 'disputes' in country else country['disputes'] + d
         elif name in d:
             country['disputed'] = c if not 'disputed' in country else country['disputed'] + c
+    # dissolved
+    if name in DATA['dissolved']:
+        country['dissolved'] = DATA['dissolved'][name]
+    # exception
+    if country['code'] in DATA['exception']:
+        country['exception'] = True
     # flag
     if name in DATA['flag']:
         file = DATA['flag'][name] if DATA['flag'][name][-4:] == '.png' else DATA['flag'][name] + '.svg'
@@ -150,6 +160,9 @@
     # imdb
     if name in DATA['imdb']:
         country['imdbName'] = DATA['imdb'][name]
+    # independence
+    if name in DATA['independence']:
+        country['independence'] = DATA['independence'][name]
     # languages
     for language, c in DATA['languages'].iteritems():
         if c == name:
@@ -157,6 +170,10 @@
                 country['languages'] = [language]
             else:
                 country['languages'].append(language)
+    # area, lat, lng, south, west, north, east
+    if country['code'] in GEO:
+        for key in GEO[country['code']]:
+            country[key] = GEO[country['code']][key]
     return country
 
 def get_flag(id):
@@ -238,6 +255,77 @@ def get_imdb_languages():
                 LOGS['new languages'].append(language)
     ox.file.write_json('../json/imdbLanguages.json', imdb_languages, indent=4, sort_keys=True)
 
+def parse_txt():
+    data = {
+        'created': {},
+        'dissolved': {},
+        'independence': {}
+    }
+    f = open('../txt/countries.txt')
+    lines = map(lambda x: x.strip(), f.readlines())
+    f.close()
+    for line in filter(lambda x: x[0] != '#', lines):
+        date, country_a, operator, country_b = re.compile(
+            '([\d\-]+) +(.+) ([\*=\+\-><]) (.+)'
+        ).match(line).groups()
+        countries_a = country_a.split(' / ')
+        countries_b = country_b.split(' / ')
+        if operator == '*':
+            data['independence'][country_b] = {
+                'country': countries_a,
+                'date': date
+            }
+        elif operator == '=':
+            data['dissolved'][country_a] = {
+                'country': countries_b,
+                'date': date,
+                'dissolved': 'renamed'
+            }
+            data['created'][country_b] = {
+                'country': countries_a,
+                'date': date,
+                'created': 'renamed'
+            }
+        elif operator == '+':
+            for country in countries_a:
+                data['dissolved'][country] = {
+                    'country': countries_b,
+                    'date': date,
+                    'dissolved': 'joined'
+                }
+        elif operator == '-':
+            for country in countries_b:
+                data['created'][country] = {
+                    'country': countries_a,
+                    'date': date,
+                    'created': 'split'
+                }
+        elif operator == '>':
+            for country in countries_a:
+                data['dissolved'][country] = {
+                    'country': countries_b,
+                    'date': date,
+                    'dissolved': 'merged'
+                }
+            data['created'][country_b] = {
+                'country': countries_a,
+                'date': date,
+                'created': 'merged'
+            }
+        elif operator == '<':
+            data['dissolved'][country_a] = {
+                'country': countries_b,
+                'date': date,
+                'dissolved': 'split'
+            }
+            for country in countries_b:
+                data['created'][country] = {
+                    'country': countries_a,
+                    'date': date,
+                    'created': 'merged'
+                }
+    return data
+
 def read_url(url):
     print 'reading', url
     return ox.cache.readUrl(url) if CACHE else ox.net.readUrl(url)
@@ -252,6 +340,10 @@ def read_wikipedia_url(id):
     return html
 
 if __name__ == '__main__':
+    data = parse_txt()
+    DATA['created'] = data['created']
+    DATA['dissolved'] = data['dissolved']
+    DATA['independence'] = data['independence']
     countries = get_countries()
     ox.file.write_json('../json/countries.json', countries, indent=4, sort_keys=True)
     LOGS['total'] = len(countries)
@@ -260,14 +352,19 @@
     LOGS['current independent'] = 0
     LOGS['current dependent'] = 0
     LOGS['current disputed'] = 0
+    LOGS['current exception'] = 0
     LOGS['dissolved independent'] = 0
     LOGS['dissolved dependent'] = 0
     LOGS['dissolved disputed'] = 0
-    LOGS['alias'] = 0
+    LOGS['dissolved exception'] = 0
     for country in countries:
-        key = 'alias' if 'alias' in country else ' '.join([
+        key = ' '.join([
             'dissolved' if 'dissolved' in country else 'current',
-            'disputed' if 'disputed' in country else ('dependent' if 'dependency' in country else 'independent')
+            'exception' if 'exception' in country else (
+                'disputed' if 'disputed' in country else (
+                    'dependent' if 'dependency' in country else 'independent'
+                )
+            )
         ])
         LOGS[key] += 1
     get_imdb_countries(countries)