# -*- coding: utf-8 -*- import Image import re from geo import geo, read_url, write_log from geo import write_json # special is unused, geo in geo.py is used special = { 'code': { # incorrect on wikipedia 'Democratic Republic of the Congo': 'CD', # missing on wikipedia 'Neutral Zone': 'NTHH', 'Réunion': 'RE', # unrecognized 'Abkhazia': 'GE-AB', 'Kosovo': 'RS-KO', 'Nagorno-Karabakh': 'AZ-NK', 'Northern Cyprus': 'CY-NC', 'South Ossetia': 'GE-SO', 'Somaliland': 'SO-SO', 'South Sudan': 'SD-SS', 'Transnistria': 'MD-TR', # dependent 'Adélie Land': 'FR-AQ', 'Akrotiri and Dhekelia': 'GB-AD', 'Antártica': 'CL-AQ', 'Argentine Antarctica': 'AR-AQ', 'Ashmore and Cartier Islands': 'AU-AC', 'Australian Antarctic Territory': 'AU-AQ', 'Coral Sea Islands': 'AU-CS', 'England': 'GB-ENG', 'Northern Ireland': 'GB-NIR', 'Peter I Island': 'NO-PI', 'Ross Dependency': 'NZ-AQ', 'Scotland': 'GB-SCT', 'Wales': 'GB-WLS', # exceptional 'Canary Islands': 'IC', 'Ceuta and Melilla': 'EA', 'UK': 'UK', # former 'Korea': 'KOHH', 'Netherlands Antilles': 'ANHH', 'Siam': 'SITH', 'West Germany': 'DEDE' }, 'dependency': { 'Argentina': [ 'Argentine Antarctica' ], 'Argentina, Australia, Chile, France, New Zealand, Norway, United Kingdom': [ 'Antarctica' ], 'Australia': [ 'Ashmore and Cartier Islands', 'Australian Antarctic Territory', 'Christmas Island', 'Cocos Islands', 'Coral Sea Islands', 'Heard Island and McDonald Islands', 'Norfolk Island' ], 'Chile': [ 'Antártica' ], 'China': [ 'Hong Kong', 'Macau' ], 'Denmark': [ 'Faroe Islands', 'Greenland' ], 'Finland': [ 'Åland' ], 'France': [ 'Adélie Land', 'Clipperton Island', 'French Afar and Issas', 'French Guiana', 'French Polynesia', 'French Southern and Antarctic Territories', 'French Southern Territories', 'Guadeloupe', 'Martinique', 'Mayotte', 'New Caledonia', 'Réunion', 'Saint Barthélemy', 'Saint Martin', 'Saint Pierre and Miquelon', 'Wallis and Futuna' ], 'France, United Kingdom': [ 'New Hebrides' ], 'Iraq, Saudi Arabia': [ 'Neutral Zone' ], 
'Netherlands': [ 'Aruba', 'Bonaire, Saint Eustatius and Saba', 'Curaçao', 'Netherlands Antilles', 'Sint Maarten' ], 'New Zealand': [ 'Cook Islands', 'Niue', 'Ross Dependency', 'Tokelau' ], 'Norway': [ 'Bouvet Island', 'Peter I Island', 'Queen Maud Land', 'Svalbard and Jan Mayen' ], 'Spain': [ 'Canary Islands', 'Ceuta and Melilla' ], 'Soviet Union': [ 'Byelorussian Soviet Socialist Republic' ], 'United Kingdom': [ 'Akrotiri and Dhekelia', 'Anguilla', 'Ascension Island', 'Bermuda', 'British Antarctic Territory', 'British Indian Ocean Territory', 'British Virgin Islands', 'Cayman Islands', 'Diego Garcia', 'England', 'Falkland Islands', 'Gibraltar', 'Gilbert and Ellice Islands', 'Guernsey', 'Isle of Man', 'Jersey', 'Montserrat', 'Northern Ireland', 'Pitcairn Islands', 'Saint Helena', 'Scotland', 'South Georgia and the South Sandwich Islands', 'Tristan da Cunha', 'Turks and Caicos Islands', 'Wales' ], 'United Kingdom, United States': [ 'Canton and Enderbury Islands' ], 'United States': [ 'American Samoa', 'Guam', 'Northern Mariana Islands', 'Johnston Island', 'Midway Islands', 'Pacific Islands', 'Panama Canal Zone', 'Puerto Rico', 'United States Minor Outlying Islands', 'United States Miscellaneous Pacific Islands', 'United States Virgin Islands', 'Wake Island' ] }, 'flag': { 'Australian Antarctic Territory': 'Flag_of_Australia.svg', 'Antarctica': 'Flag_of_the_Antarctic_Treaty.svg', 'Antártica': 'Flag_of_Magallanes,_Chile.svg', 'Ashmore and Cartier Islands': 'Flag_of_Australia.svg', 'Bonaire, Saint Eustatius and Saba': 'Flag_of_Bonaire.svg', 'Burma': 'Flag_of_Myanmar_(1974-2010).svg', 'Byelorussian Soviet Socialist Republic': 'Flag_of_Byelorussian_SSR.svg', 'Canton and Enderbury Islands': 'Flag_of_Gilbert_and_Ellice_Islands.svg', 'Ceuta and Melilla': 'Flag_Ceuta.svg', 'Coral Sea Islands': 'Flag_of_Australia.svg', 'Diego Garcia': 'Flag_of_the_British_Indian_Ocean_Territory.svg', 'French Guiana': 'Flag_of_French_Guiana.svg', 'Korea': 'Flag_of_Korea_1882.svg', # 'Libya': 
'Flag_of_the_Libyan_Jamahiriya_1977.svg', 'Metropolitan France': 'Flag_of_France.svg', 'Neutral Zone': 'Flag_of_the_United_Nations.svg', 'New Hebrides': 'Flag_of_Anglo-French_Joint_Naval_Commission.svg', 'Northern Ireland': 'Ulster_banner.svg', 'Panama Canal Zone': 'Panama_Canal_Zone_Flag.png', 'Peter I Island': 'Flag_of_Norway.svg', 'Réunion': 'Drapeau_Reunion_APDR.png', 'Saint Martin': 'Flag_of_Saint-Martin_(local).svg', 'Siam': 'State_Flag_of_Thailand_(1916).svg', 'Svalbard and Jan Mayen': 'Flag_of_Norway.svg', 'United States Miscellaneous Pacific Islands': 'Flag_of_the_United_States.svg', 'Wallis and Futuna': 'Flag_of_Wallis_and_Futuna.svg', 'Yugoslavia': 'Flag_of_SFR_Yugoslavia.svg' }, 'name': { # simplified names (ambiguous in wikipedia) 'Cocos (Keeling) Islands': 'Cocos Islands', 'Collectivity of Saint Martin': 'Saint Martin', 'Federated States of Micronesia': 'Micronesia', 'French Territory of the Afars and the Issas': 'French Afar and Issas', 'Georgia (country)': 'Georgia', 'Nagorno-Karabakh Republic': 'Nagorno-Karabakh', 'People\'s Republic of China': 'China', 'Republic of China': 'Taiwan', 'Republic of Dahomey': 'Dahomey', 'Republic of Ireland': 'Ireland', 'Republic of Kosovo': 'Kosovo', 'Republic of Macedonia': 'Macedonia', 'Republic of Upper Volta': 'Upper Volta', 'Sahrawi Arab Democratic Republic': 'Sahrawi', 'Saudi-Iraqi neutral zone': 'Neutral Zone', 'State of Palestine': 'Palestine', 'Trust Territory of the Pacific Islands': 'Pacific Islands' }, 'url': { 'Alderney': '', # dependency of Guernsey 'Herm': '', # dependency of Guernsey 'Sark': '', # dependency of Guernsey 'Azad_Kashmir': '', # territory of Pakistan 'Gilgit-Baltistan': '', # territory of Pakistan 'Coral_Sea_Islands_Territory': 'Coral_Sea_Islands', # wrong in "List of sovereign states" 'Kingdom_of_the_Netherlands': 'Netherlands', # wrong in "List of sovereign states" 'Saint-Barth%C3%A9lemy': 'Saint_Barth%C3%A9lemy', # wrong in "List of sovereign states" 'Saint_Martin': 
'Collectivity_of_Saint_Martin', # wrong in "List of sovereign states" 'Caribbean_Netherlands': 'Bonaire,_Saint_Eustatius_and_Saba', # wrong in "ISO 3166-1 alpha-2" 'Ceuta': 'Ceuta_and_Melilla', # wrong in "ISO 3166-1 alpha-2" 'Palestinian_territories': 'State_of_Palestine', # wrong in "ISO 3166-1 alpha-2" 'Saudi%E2%80%93Iraqi_neutral_zone': 'Saudi-Iraqi_neutral_zone', # wrong in "ISO 3166-1 alpha-2" 'Western_Sahara': 'Sahrawi_Arab_Democratic_Republic', # wrong in "ISO 3166-1 alpha-2" 'Johnston_Atoll': 'Johnston_Island', # wrong in "ISO 3166-3" 'Midway_Atoll': 'Midway_Islands', # wrong in "ISO 3166-3" '%C3%85land_Islands': 'Åland', # wrong in all 'East Timor': 'Timor-Leste', # wrong in all 'Cocos_(Keeling)_Islands': 'Cocos_Islands', # wrong in all 'French_Southern_and_Antarctic_Lands': 'French_Southern_and_Antarctic_Territories', # wrong in all 'Saint_Helena,_Ascension_and_Tristan_da_Cunha': 'Saint_Helena', # wrong in all 'The_Bahamas': 'Bahamas', # wrong in all 'The_Gambia': 'Gambia' # wrong in all }, 'urls': [ # not in any list 'Adélie_Land', 'Antártica', 'Argentine_Antarctica', # Antarctic claims 'England', 'Northern_Ireland', 'Scotland', 'Wales', # Sports 'Korea', 'Siam', 'West_Germany', # IMDb 'East Timor', 'French_Southern_Territories', 'Peter_I_Island', 'South_Sudan', 'UK' ] } """ geo = { # disambiguation 'url': { 'Acheng': 'Acheng_District', 'Guatemala': 'Guatemala_City', 'Jilin': 'Jilin_City' } } """ wiki = 'http://en.wikipedia.org/wiki/' def fix_html(html): # in lieu of a better regexp html = html.replace('', '') html = html.replace('', '') html = html.replace('', '') html = html.replace('"/wiki/Taiwan"', '"/wiki/Republic_of_China"') return html def get_cities(): write_log('../log/no_latlng.log', None) write_log('../log/no_pop.log', None) cities = [] for url in get_city_urls(): cities.append(get_city_data(url)) write_json('../json/_cities.json', sorted(cities, key=lambda x: -x['population'] if x['population'] else 0), True) return cities def 
get_city_data(url): data = {'wikipediaURL': url} html = read_url(url) # name results = re.compile('

(.*?)

', re.DOTALL).findall(html) data['name'] = results[0] # latitude, longitude data['latitude'] = None data['longitude'] = None results = re.compile('geohack.php\?.*?&params=(.*?[WE])').findall(html) if results: result = results[0] values = [0, 0] lat = 'S' if 'S' in result else 'N' lng = 'W' if 'W' in result else 'E' halves = result[:-1].split('_' + lat + '_') for h, half in enumerate(halves): parts = half.split('_') for p, part in enumerate(parts): if part: value = float(part) if p == 0: values[h] += value elif p == 1: values[h] += value / 60 elif p == 2: values[h] += value / 3600 data['latitude'] = -values[0] if lat == 'S' else values[0] data['longitude'] = -values[1] if lng == 'W' else values[1] print results[0], data['latitude'], data['longitude'] else: write_log('../log/no_latlng.log', data['wikipediaURL']) # population results = re.compile('Population.*?.*?(\d+,\d+[\d,]+).*?', re.DOTALL).findall(html) """ if not results: results = re.compile('population of ([\d,\.])').findall(html) """ data['population'] = None if results: data['population'] = int(results[0].replace(',', '')) print results[0], data['population'] else: write_log('../log/no_pop.log', data['wikipediaURL']) return data def get_city_urls(): urls = [] html = read_url(wiki + 'List_of_capital_cities') urls += re.compile('(.*?)', re.DOTALL).findall(html) if not results: results = re.compile('

(.*?)

', re.DOTALL).findall(html) data['name'] = geo['wikipedia_name'][results[0]] if results[0] in geo['wikipedia_name'] else results[0] # code if data['name'] in geo['code']: data['code'] = geo['code'][data['name']] else: html_ = fix_html(read_url(wiki + 'ISO_3166-3')) url_ = url.replace(wiki, '') for k, v in geo['wikipedia_url'].iteritems(): if v == url_: url_ = k break results = re.compile('([A-Z]{2})\n 
[A-Z]{2}\n.*?', re.DOTALL).findall(html) urls = map(lambda x: geo['wikipedia_url'][x] if x in geo['wikipedia_url'] else x, urls) urls = list(set(urls)) # make unique urls.remove('') return map(lambda x: wiki + x, sorted(urls))
# NOTE(review): reading guide for this module, limited to what the text above
# actually shows.
# - Python 2 code (print statements, dict.iteritems). It pulls geo, read_url,
#   write_log and write_json from the local geo module.
# - `special` is marked above as unused; the lookups in the functions go
#   through the imported `geo` dict instead.
# - fix_html(html): three replace('', '') calls that swap the empty string for
#   the empty string (no effect as written; the search strings look lost),
#   then rewrites "/wiki/Taiwan" links to "/wiki/Republic_of_China" and
#   returns the html.
# - get_cities(): calls write_log(path, None) for both log files (presumably
#   to reset them; confirm in geo.py), collects get_city_data(url) for every
#   URL from get_city_urls(), writes the list sorted by descending population
#   (entries without a population use sort key 0) to ../json/_cities.json and
#   returns the unsorted list.
# - get_city_data(url): returns a dict with wikipediaURL, name, latitude,
#   longitude and population. Coordinates come from the geohack.php "params"
#   value: degree/minute/second parts are summed into decimal degrees and
#   negated for S or W. Population is the first comma-grouped number matched
#   after "Population". Missing values stay None and the URL is written to
#   ../log/no_latlng.log or ../log/no_pop.log.
# - TODO(review): several regex literals contain only "(.*?)" between line
#   breaks, and the statements after the first findall() in get_city_urls()
#   use `results`, `data` and `url`, which that function never defines. Part
#   of the file appears to be missing; compare against version control before
#   changing any of this code.