# -*- coding: utf-8 -*-
import Image
import re
from geo import geo, read_url, write_log
from geo import write_json
# Manual corrections and additions for country/territory data, grouped by
# the kind of value being overridden (see the per-section comments below).
# Per the original note: special is unused, geo in geo.py is used.
special = {
# name -> code. Two-letter ('CD'), hyphenated ('GE-AB') and four-letter
# ('NTHH') forms all occur; presumably ISO 3166-style codes -- confirm.
'code': {
# incorrect on wikipedia
'Democratic Republic of the Congo': 'CD',
# missing on wikipedia
'Neutral Zone': 'NTHH',
'Réunion': 'RE',
# unrecognized
'Abkhazia': 'GE-AB',
'Kosovo': 'RS-KO',
'Nagorno-Karabakh': 'AZ-NK',
'Northern Cyprus': 'CY-NC',
'South Ossetia': 'GE-SO',
'Somaliland': 'SO-SO',
'South Sudan': 'SD-SS',
'Transnistria': 'MD-TR',
# dependent
'Adélie Land': 'FR-AQ',
'Akrotiri and Dhekelia': 'GB-AD',
'Antártica': 'CL-AQ',
'Argentine Antarctica': 'AR-AQ',
'Ashmore and Cartier Islands': 'AU-AC',
'Australian Antarctic Territory': 'AU-AQ',
'Coral Sea Islands': 'AU-CS',
'England': 'GB-ENG',
'Northern Ireland': 'GB-NIR',
'Peter I Island': 'NO-PI',
'Ross Dependency': 'NZ-AQ',
'Scotland': 'GB-SCT',
'Wales': 'GB-WLS',
# exceptional
'Canary Islands': 'IC',
'Ceuta and Melilla': 'EA',
'UK': 'UK',
# former
'Korea': 'KOHH',
'Netherlands Antilles': 'ANHH',
'Siam': 'SITH',
'West Germany': 'DEDE'
},
# governing country name (or a comma-separated list when several share
# the territory) -> list of dependent territory names
'dependency': {
'Argentina': [
'Argentine Antarctica'
],
'Argentina, Australia, Chile, France, New Zealand, Norway, United Kingdom': [
'Antarctica'
],
'Australia': [
'Ashmore and Cartier Islands', 'Australian Antarctic Territory', 'Christmas Island', 'Cocos Islands', 'Coral Sea Islands',
'Heard Island and McDonald Islands', 'Norfolk Island'
],
'Chile': [
'Antártica'
],
'China': [
'Hong Kong', 'Macau'
],
'Denmark': [
'Faroe Islands', 'Greenland'
],
'Finland': [
'Åland'
],
'France': [
'Adélie Land', 'Clipperton Island', 'French Afar and Issas', 'French Guiana', 'French Polynesia',
'French Southern and Antarctic Territories', 'French Southern Territories', 'Guadeloupe', 'Martinique', 'Mayotte',
'New Caledonia', 'Réunion', 'Saint Barthélemy', 'Saint Martin', 'Saint Pierre and Miquelon',
'Wallis and Futuna'
],
'France, United Kingdom': [
'New Hebrides'
],
'Iraq, Saudi Arabia': [
'Neutral Zone'
],
'Netherlands': [
'Aruba', 'Bonaire, Saint Eustatius and Saba', 'Curaçao', 'Netherlands Antilles', 'Sint Maarten'
],
'New Zealand': [
'Cook Islands', 'Niue', 'Ross Dependency', 'Tokelau'
],
'Norway': [
'Bouvet Island', 'Peter I Island', 'Queen Maud Land', 'Svalbard and Jan Mayen'
],
'Spain': [
'Canary Islands', 'Ceuta and Melilla'
],
'Soviet Union': [
'Byelorussian Soviet Socialist Republic'
],
'United Kingdom': [
'Akrotiri and Dhekelia', 'Anguilla', 'Ascension Island', 'Bermuda', 'British Antarctic Territory',
'British Indian Ocean Territory', 'British Virgin Islands', 'Cayman Islands', 'Diego Garcia', 'England',
'Falkland Islands', 'Gibraltar', 'Gilbert and Ellice Islands', 'Guernsey', 'Isle of Man',
'Jersey', 'Montserrat', 'Northern Ireland', 'Pitcairn Islands', 'Saint Helena',
'Scotland', 'South Georgia and the South Sandwich Islands', 'Tristan da Cunha', 'Turks and Caicos Islands', 'Wales'
],
'United Kingdom, United States': [
'Canton and Enderbury Islands'
],
'United States': [
'American Samoa', 'Guam', 'Northern Mariana Islands', 'Johnston Island', 'Midway Islands',
'Pacific Islands', 'Panama Canal Zone', 'Puerto Rico', 'United States Minor Outlying Islands', 'United States Miscellaneous Pacific Islands',
'United States Virgin Islands', 'Wake Island'
]
},
# name -> flag image file name (.svg or .png); presumably file names on
# Wikipedia/Wikimedia Commons -- confirm against the code that fetches them
'flag': {
'Australian Antarctic Territory': 'Flag_of_Australia.svg',
'Antarctica': 'Flag_of_the_Antarctic_Treaty.svg',
'Antártica': 'Flag_of_Magallanes,_Chile.svg',
'Ashmore and Cartier Islands': 'Flag_of_Australia.svg',
'Bonaire, Saint Eustatius and Saba': 'Flag_of_Bonaire.svg',
'Burma': 'Flag_of_Myanmar_(1974-2010).svg',
'Byelorussian Soviet Socialist Republic': 'Flag_of_Byelorussian_SSR.svg',
'Canton and Enderbury Islands': 'Flag_of_Gilbert_and_Ellice_Islands.svg',
'Ceuta and Melilla': 'Flag_Ceuta.svg',
'Coral Sea Islands': 'Flag_of_Australia.svg',
'Diego Garcia': 'Flag_of_the_British_Indian_Ocean_Territory.svg',
'French Guiana': 'Flag_of_French_Guiana.svg',
'Korea': 'Flag_of_Korea_1882.svg',
# 'Libya': 'Flag_of_the_Libyan_Jamahiriya_1977.svg',
'Metropolitan France': 'Flag_of_France.svg',
'Neutral Zone': 'Flag_of_the_United_Nations.svg',
'New Hebrides': 'Flag_of_Anglo-French_Joint_Naval_Commission.svg',
'Northern Ireland': 'Ulster_banner.svg',
'Panama Canal Zone': 'Panama_Canal_Zone_Flag.png',
'Peter I Island': 'Flag_of_Norway.svg',
'Réunion': 'Drapeau_Reunion_APDR.png',
'Saint Martin': 'Flag_of_Saint-Martin_(local).svg',
'Siam': 'State_Flag_of_Thailand_(1916).svg',
'Svalbard and Jan Mayen': 'Flag_of_Norway.svg',
'United States Miscellaneous Pacific Islands': 'Flag_of_the_United_States.svg',
'Wallis and Futuna': 'Flag_of_Wallis_and_Futuna.svg',
'Yugoslavia': 'Flag_of_SFR_Yugoslavia.svg'
},
# long name -> short name used elsewhere in this table; the keys look like
# Wikipedia article titles (e.g. 'Georgia (country)') -- confirm
'name': {
# simplified names (ambiguous on Wikipedia)
'Cocos (Keeling) Islands': 'Cocos Islands',
'Collectivity of Saint Martin': 'Saint Martin',
'Federated States of Micronesia': 'Micronesia',
'French Territory of the Afars and the Issas': 'French Afar and Issas',
'Georgia (country)': 'Georgia',
'Nagorno-Karabakh Republic': 'Nagorno-Karabakh',
'People\'s Republic of China': 'China',
'Republic of China': 'Taiwan',
'Republic of Dahomey': 'Dahomey',
'Republic of Ireland': 'Ireland',
'Republic of Kosovo': 'Kosovo',
'Republic of Macedonia': 'Macedonia',
'Republic of Upper Volta': 'Upper Volta',
'Sahrawi Arab Democratic Republic': 'Sahrawi',
'Saudi-Iraqi neutral zone': 'Neutral Zone',
'State of Palestine': 'Palestine',
'Trust Territory of the Pacific Islands': 'Pacific Islands'
},
# page slug as found in a source list -> slug to use instead. The trailing
# comment on each entry names the list in which the slug is wrong. Entries
# mapped to '' have no replacement (presumably they are skipped -- confirm).
'url': {
'Alderney': '', # dependency of Guernsey
'Herm': '', # dependency of Guernsey
'Sark': '', # dependency of Guernsey
'Azad_Kashmir': '', # territory of Pakistan
'Gilgit-Baltistan': '', # territory of Pakistan
'Coral_Sea_Islands_Territory': 'Coral_Sea_Islands', # wrong in "List of sovereign states"
'Kingdom_of_the_Netherlands': 'Netherlands', # wrong in "List of sovereign states"
'Saint-Barth%C3%A9lemy': 'Saint_Barth%C3%A9lemy', # wrong in "List of sovereign states"
'Saint_Martin': 'Collectivity_of_Saint_Martin', # wrong in "List of sovereign states"
'Caribbean_Netherlands': 'Bonaire,_Saint_Eustatius_and_Saba', # wrong in "ISO 3166-1 alpha-2"
'Ceuta': 'Ceuta_and_Melilla', # wrong in "ISO 3166-1 alpha-2"
'Palestinian_territories': 'State_of_Palestine', # wrong in "ISO 3166-1 alpha-2"
'Saudi%E2%80%93Iraqi_neutral_zone': 'Saudi-Iraqi_neutral_zone', # wrong in "ISO 3166-1 alpha-2"
'Western_Sahara': 'Sahrawi_Arab_Democratic_Republic', # wrong in "ISO 3166-1 alpha-2"
'Johnston_Atoll': 'Johnston_Island', # wrong in "ISO 3166-3"
'Midway_Atoll': 'Midway_Islands', # wrong in "ISO 3166-3"
'%C3%85land_Islands': 'Åland', # wrong in all
# NOTE(review): this key contains a space, while every other slug in this
# section uses underscores -- confirm whether 'East_Timor' was intended.
'East Timor': 'Timor-Leste', # wrong in all
'Cocos_(Keeling)_Islands': 'Cocos_Islands', # wrong in all
'French_Southern_and_Antarctic_Lands': 'French_Southern_and_Antarctic_Territories', # wrong in all
'Saint_Helena,_Ascension_and_Tristan_da_Cunha': 'Saint_Helena', # wrong in all
'The_Bahamas': 'Bahamas', # wrong in all
'The_Gambia': 'Gambia' # wrong in all
},
# additional page slugs to process
'urls': [
# not in any list
'Adélie_Land', 'Antártica', 'Argentine_Antarctica', # Antarctic claims
'England', 'Northern_Ireland', 'Scotland', 'Wales', # Sports
'Korea', 'Siam', 'West_Germany', # IMDb
# NOTE(review): written with a space, unlike the underscore-separated
# slugs around it -- confirm whether 'East_Timor' was intended.
'East Timor',
'French_Southern_Territories',
'Peter_I_Island',
'South_Sudan',
'UK'
]
}
# Disabled disambiguation table, kept as a bare string literal so that it is
# never executed. If it were enabled, it would rebind the name `geo` that is
# imported from geo.py at the top of this file.
"""
geo = {
# disambiguation
'url': {
'Acheng': 'Acheng_District',
'Guatemala': 'Guatemala_City',
'Jilin': 'Jilin_City'
}
}
"""
# URL prefix of English Wikipedia article pages.
wiki = 'http://en.wikipedia.org/wiki/'
def fix_html(html):
    """Return html with links to the Taiwan article retargeted.

    Every occurrence of the quoted href "/wiki/Taiwan" is replaced by
    "/wiki/Republic_of_China". All other text is returned unchanged.

    html -- page source as a string
    """
    # NOTE(review): three statements of the form html.replace('', '') were
    # removed here. Replacing the empty string with the empty string returns
    # the text unchanged, so they had no effect. If they were meant to strip
    # specific markup, the intended search strings must be restored.

    # in lieu of a better regexp
    return html.replace('"/wiki/Taiwan"', '"/wiki/Republic_of_China"')
def get_cities():
    """Collect data for every city URL, write it out as JSON and return it.

    The list written to ../json/_cities.json is ordered by descending
    population; entries whose population is falsy (None or 0) come last.
    The returned list keeps the order in which get_city_urls() yielded
    the URLs.
    """
    # Presumably passing None resets each log before the run -- confirm
    # against geo.write_log.
    for log_path in ('../log/no_latlng.log', '../log/no_pop.log'):
        write_log(log_path, None)

    cities = [get_city_data(city_url) for city_url in get_city_urls()]

    def population_rank(city):
        # Negated so that larger populations sort first; a missing
        # population maps to 0, which sorts after every negative key.
        population = city['population']
        if population:
            return -population
        return 0

    ranked = sorted(cities, key=population_rank)
    write_json('../json/_cities.json', ranked, True)
    return cities
def get_city_data(url):
data = {'wikipediaURL': url}
html = read_url(url)
# name
results = re.compile('(.*?)
', re.DOTALL).findall(html)
data['name'] = results[0]
# latitude, longitude
data['latitude'] = None
data['longitude'] = None
results = re.compile('geohack.php\?.*?¶ms=(.*?[WE])').findall(html)
if results:
result = results[0]
values = [0, 0]
lat = 'S' if 'S' in result else 'N'
lng = 'W' if 'W' in result else 'E'
halves = result[:-1].split('_' + lat + '_')
for h, half in enumerate(halves):
parts = half.split('_')
for p, part in enumerate(parts):
if part:
value = float(part)
if p == 0:
values[h] += value
elif p == 1:
values[h] += value / 60
elif p == 2:
values[h] += value / 3600
data['latitude'] = -values[0] if lat == 'S' else values[0]
data['longitude'] = -values[1] if lng == 'W' else values[1]
print results[0], data['latitude'], data['longitude']
else:
write_log('../log/no_latlng.log', data['wikipediaURL'])
# population
results = re.compile('Population.*?