357 lines
No EOL
15 KiB
Python
357 lines
No EOL
15 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
import Image
|
|
import re
|
|
from geo import geo, read_url, write_log
|
|
from geo import write_json
|
|
|
|
# special is unused, geo in geo.py is used
# NOTE(review): this literal appears to be an older copy of the override
# tables that now live in geo.py -- confirm before deleting it.
special = {
    # name -> code, for entries whose code cannot be taken from wikipedia
    'code': {
        # incorrect on wikipedia
        'Democratic Republic of the Congo': 'CD',
        # missing on wikipedia
        'Neutral Zone': 'NTHH',
        'Réunion': 'RE',
        # unrecognized
        'Abkhazia': 'GE-AB',
        'Kosovo': 'RS-KO',
        'Nagorno-Karabakh': 'AZ-NK',
        'Northern Cyprus': 'CY-NC',
        'South Ossetia': 'GE-SO',
        'Somaliland': 'SO-SO',
        'South Sudan': 'SD-SS',
        'Transnistria': 'MD-TR',
        # dependent
        'Adélie Land': 'FR-AQ',
        'Akrotiri and Dhekelia': 'GB-AD',
        'Antártica': 'CL-AQ',
        'Argentine Antarctica': 'AR-AQ',
        'Ashmore and Cartier Islands': 'AU-AC',
        'Australian Antarctic Territory': 'AU-AQ',
        'Coral Sea Islands': 'AU-CS',
        'England': 'GB-ENG',
        'Northern Ireland': 'GB-NIR',
        'Peter I Island': 'NO-PI',
        'Ross Dependency': 'NZ-AQ',
        'Scotland': 'GB-SCT',
        'Wales': 'GB-WLS',
        # exceptional
        'Canary Islands': 'IC',
        'Ceuta and Melilla': 'EA',
        'UK': 'UK',
        # former
        'Korea': 'KOHH',
        'Netherlands Antilles': 'ANHH',
        'Siam': 'SITH',
        'West Germany': 'DEDE'
    },
    # governing country (or comma-separated countries) -> list of dependencies
    'dependency': {
        'Argentina': [
            'Argentine Antarctica'
        ],
        'Argentina, Australia, Chile, France, New Zealand, Norway, United Kingdom': [
            'Antarctica'
        ],
        'Australia': [
            'Ashmore and Cartier Islands', 'Australian Antarctic Territory', 'Christmas Island', 'Cocos Islands', 'Coral Sea Islands',
            'Heard Island and McDonald Islands', 'Norfolk Island'
        ],
        'Chile': [
            'Antártica'
        ],
        'China': [
            'Hong Kong', 'Macau'
        ],
        'Denmark': [
            'Faroe Islands', 'Greenland'
        ],
        'Finland': [
            'Åland'
        ],
        'France': [
            'Adélie Land', 'Clipperton Island', 'French Afar and Issas', 'French Guiana', 'French Polynesia',
            'French Southern and Antarctic Territories', 'French Southern Territories', 'Guadeloupe', 'Martinique', 'Mayotte',
            'New Caledonia', 'Réunion', 'Saint Barthélemy', 'Saint Martin', 'Saint Pierre and Miquelon',
            'Wallis and Futuna'
        ],
        'France, United Kingdom': [
            'New Hebrides'
        ],
        'Iraq, Saudi Arabia': [
            'Neutral Zone'
        ],
        'Netherlands': [
            'Aruba', 'Bonaire, Saint Eustatius and Saba', 'Curaçao', 'Netherlands Antilles', 'Sint Maarten'
        ],
        'New Zealand': [
            'Cook Islands', 'Niue', 'Ross Dependency', 'Tokelau'
        ],
        'Norway': [
            'Bouvet Island', 'Peter I Island', 'Queen Maud Land', 'Svalbard and Jan Mayen'
        ],
        'Spain': [
            'Canary Islands', 'Ceuta and Melilla'
        ],
        'Soviet Union': [
            'Byelorussian Soviet Socialist Republic'
        ],
        'United Kingdom': [
            'Akrotiri and Dhekelia', 'Anguilla', 'Ascension Island', 'Bermuda', 'British Antarctic Territory',
            'British Indian Ocean Territory', 'British Virgin Islands', 'Cayman Islands', 'Diego Garcia', 'England',
            'Falkland Islands', 'Gibraltar', 'Gilbert and Ellice Islands', 'Guernsey', 'Isle of Man',
            'Jersey', 'Montserrat', 'Northern Ireland', 'Pitcairn Islands', 'Saint Helena',
            'Scotland', 'South Georgia and the South Sandwich Islands', 'Tristan da Cunha', 'Turks and Caicos Islands', 'Wales'
        ],
        'United Kingdom, United States': [
            'Canton and Enderbury Islands'
        ],
        'United States': [
            'American Samoa', 'Guam', 'Northern Mariana Islands', 'Johnston Island', 'Midway Islands',
            'Pacific Islands', 'Panama Canal Zone', 'Puerto Rico', 'United States Minor Outlying Islands', 'United States Miscellaneous Pacific Islands',
            'United States Virgin Islands', 'Wake Island'
        ]
    },
    # name -> flag image file name
    # (presumably a wikipedia "File:" page name -- confirm against geo.py)
    'flag': {
        'Australian Antarctic Territory': 'Flag_of_Australia.svg',
        'Antarctica': 'Flag_of_the_Antarctic_Treaty.svg',
        'Antártica': 'Flag_of_Magallanes,_Chile.svg',
        'Ashmore and Cartier Islands': 'Flag_of_Australia.svg',
        'Bonaire, Saint Eustatius and Saba': 'Flag_of_Bonaire.svg',
        'Burma': 'Flag_of_Myanmar_(1974-2010).svg',
        'Byelorussian Soviet Socialist Republic': 'Flag_of_Byelorussian_SSR.svg',
        'Canton and Enderbury Islands': 'Flag_of_Gilbert_and_Ellice_Islands.svg',
        'Ceuta and Melilla': 'Flag_Ceuta.svg',
        'Coral Sea Islands': 'Flag_of_Australia.svg',
        'Diego Garcia': 'Flag_of_the_British_Indian_Ocean_Territory.svg',
        'French Guiana': 'Flag_of_French_Guiana.svg',
        'Korea': 'Flag_of_Korea_1882.svg',
        # 'Libya': 'Flag_of_the_Libyan_Jamahiriya_1977.svg',
        'Metropolitan France': 'Flag_of_France.svg',
        'Neutral Zone': 'Flag_of_the_United_Nations.svg',
        'New Hebrides': 'Flag_of_Anglo-French_Joint_Naval_Commission.svg',
        'Northern Ireland': 'Ulster_banner.svg',
        'Panama Canal Zone': 'Panama_Canal_Zone_Flag.png',
        'Peter I Island': 'Flag_of_Norway.svg',
        'Réunion': 'Drapeau_Reunion_APDR.png',
        'Saint Martin': 'Flag_of_Saint-Martin_(local).svg',
        'Siam': 'State_Flag_of_Thailand_(1916).svg',
        'Svalbard and Jan Mayen': 'Flag_of_Norway.svg',
        'United States Miscellaneous Pacific Islands': 'Flag_of_the_United_States.svg',
        'Wallis and Futuna': 'Flag_of_Wallis_and_Futuna.svg',
        'Yugoslavia': 'Flag_of_SFR_Yugoslavia.svg'
    },
    # wikipedia page title -> simplified name
    'name': {
        # simplified names (ambiguous in wikipedia)
        'Cocos (Keeling) Islands': 'Cocos Islands',
        'Collectivity of Saint Martin': 'Saint Martin',
        'Federated States of Micronesia': 'Micronesia',
        'French Territory of the Afars and the Issas': 'French Afar and Issas',
        'Georgia (country)': 'Georgia',
        'Nagorno-Karabakh Republic': 'Nagorno-Karabakh',
        'People\'s Republic of China': 'China',
        'Republic of China': 'Taiwan',
        'Republic of Dahomey': 'Dahomey',
        'Republic of Ireland': 'Ireland',
        'Republic of Kosovo': 'Kosovo',
        'Republic of Macedonia': 'Macedonia',
        'Republic of Upper Volta': 'Upper Volta',
        'Sahrawi Arab Democratic Republic': 'Sahrawi',
        'Saudi-Iraqi neutral zone': 'Neutral Zone',
        'State of Palestine': 'Palestine',
        'Trust Territory of the Pacific Islands': 'Pacific Islands'
    },
    # page name as linked from the wikipedia lists -> page name to use instead
    # (an empty string presumably marks an entry to be dropped -- confirm)
    'url': {
        'Alderney': '', # dependency of Guernsey
        'Herm': '', # dependency of Guernsey
        'Sark': '', # dependency of Guernsey
        'Azad_Kashmir': '', # territory of Pakistan
        'Gilgit-Baltistan': '', # territory of Pakistan
        'Coral_Sea_Islands_Territory': 'Coral_Sea_Islands', # wrong in "List of sovereign states"
        'Kingdom_of_the_Netherlands': 'Netherlands', # wrong in "List of sovereign states"
        'Saint-Barth%C3%A9lemy': 'Saint_Barth%C3%A9lemy', # wrong in "List of sovereign states"
        'Saint_Martin': 'Collectivity_of_Saint_Martin', # wrong in "List of sovereign states"
        'Caribbean_Netherlands': 'Bonaire,_Saint_Eustatius_and_Saba', # wrong in "ISO 3166-1 alpha-2"
        'Ceuta': 'Ceuta_and_Melilla', # wrong in "ISO 3166-1 alpha-2"
        'Palestinian_territories': 'State_of_Palestine', # wrong in "ISO 3166-1 alpha-2"
        'Saudi%E2%80%93Iraqi_neutral_zone': 'Saudi-Iraqi_neutral_zone', # wrong in "ISO 3166-1 alpha-2"
        'Western_Sahara': 'Sahrawi_Arab_Democratic_Republic', # wrong in "ISO 3166-1 alpha-2"
        'Johnston_Atoll': 'Johnston_Island', # wrong in "ISO 3166-3"
        'Midway_Atoll': 'Midway_Islands', # wrong in "ISO 3166-3"
        '%C3%85land_Islands': 'Åland', # wrong in all
        # NOTE(review): the key below uses a space where every other key uses
        # '_' -- confirm whether 'East_Timor' was intended.
        'East Timor': 'Timor-Leste', # wrong in all
        'Cocos_(Keeling)_Islands': 'Cocos_Islands', # wrong in all
        'French_Southern_and_Antarctic_Lands': 'French_Southern_and_Antarctic_Territories', # wrong in all
        'Saint_Helena,_Ascension_and_Tristan_da_Cunha': 'Saint_Helena', # wrong in all
        'The_Bahamas': 'Bahamas', # wrong in all
        'The_Gambia': 'Gambia' # wrong in all
    },
    # extra page names that none of the scraped lists contain
    'urls': [
        # not in any list
        'Adélie_Land', 'Antártica', 'Argentine_Antarctica', # Antarctic claims
        'England', 'Northern_Ireland', 'Scotland', 'Wales', # Sports
        'Korea', 'Siam', 'West_Germany', # IMDb
        'East Timor',
        'French_Southern_Territories',
        'Peter_I_Island',
        'South_Sudan',
        'UK'
    ]
}
|
|
|
|
"""
|
|
geo = {
|
|
# disambiguation
|
|
'url': {
|
|
'Acheng': 'Acheng_District',
|
|
'Guatemala': 'Guatemala_City',
|
|
'Jilin': 'Jilin_City'
|
|
}
|
|
}
|
|
"""
|
|
|
|
# Base URL of the English Wikipedia; page names are appended to it to build
# the URLs that are fetched by the functions below.
wiki = 'http://en.wikipedia.org/wiki/'
|
|
|
|
def fix_html(html):
    """Return *html* with a fixed set of literal substitutions applied.

    Three hidden "sortkey" span prefixes (Reunion, Us Miscellaneous Pacific
    Islands, Ussr) are removed, and the link target "/wiki/Taiwan" is
    rewritten to "/wiki/Republic_of_China". Input without any of these
    fragments is returned unchanged.
    """
    # in lieu of a better regexp: plain (old, new) pairs, applied in order
    substitutions = (
        ('<span style="display:none" class="sortkey">Reunion !</span><span class="sorttext">', ''),
        ('<span style="display:none" class="sortkey">Us Miscellaneous Pacific Islands !</span><span class="sorttext">', ''),
        ('<span style="display:none" class="sortkey">Ussr !</span><span class="sorttext">', ''),
        ('"/wiki/Taiwan"', '"/wiki/Republic_of_China"'),
    )
    cleaned = html
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
|
|
|
|
def get_cities():
    """Collect the data of every city and write it to '../json/_cities.json'.

    Returns the list of city dicts in the order in which get_city_urls()
    yielded their URLs; the copy written to disk is sorted by population,
    largest first, with cities of unknown (or zero) population keyed as 0.
    """
    # NOTE(review): passing None presumably resets the log file -- confirm
    # against write_log in geo.py.
    for log_path in ('../log/no_latlng.log', '../log/no_pop.log'):
        write_log(log_path, None)

    cities = [get_city_data(city_url) for city_url in get_city_urls()]

    def population_key(city):
        # negated so that an ascending sort puts the largest city first
        population = city['population']
        return -population if population else 0

    ranked = sorted(cities, key=population_key)
    write_json('../json/_cities.json', ranked, True)
    return cities
|
|
|
|
def get_city_data(url):
    """Scrape name, coordinates and population from a city's wikipedia page.

    Returns a dict with the keys 'wikipediaURL', 'name', 'latitude',
    'longitude' and 'population'. Latitude and longitude are decimal degrees
    (negative for S and W) or None when the page has no geohack link;
    population is an int or None when no figure was found. Pages lacking
    either value are logged to '../log/no_latlng.log' or '../log/no_pop.log'.

    Raises IndexError when the page has no "firstHeading" element.
    """
    html = read_url(url)
    data = {'wikipediaURL': url}

    # name
    heading = re.compile('<h1 id="firstHeading" class="firstHeading">(.*?)</h1>', re.DOTALL)
    data['name'] = heading.findall(html)[0]

    # latitude, longitude
    data['latitude'] = None
    data['longitude'] = None
    matches = re.compile(r'geohack.php\?.*?&params=(.*?[WE])').findall(html)
    if not matches:
        write_log('../log/no_latlng.log', data['wikipediaURL'])
    else:
        params = matches[0]
        lat = 'S' if 'S' in params else 'N'
        lng = 'W' if 'W' in params else 'E'
        # degrees, minutes and seconds are weighted by position within a half
        divisors = (1, 60, 3600)
        degrees = [0, 0]
        # the trailing W/E letter is cut off, then the N/S marker separates
        # the latitude half from the longitude half
        for h, half in enumerate(params[:-1].split('_' + lat + '_')):
            for p, part in enumerate(half.split('_')):
                if not part:
                    continue
                value = float(part)
                if p < len(divisors):
                    degrees[h] += value / divisors[p]
        data['latitude'] = degrees[0] if lat != 'S' else -degrees[0]
        data['longitude'] = degrees[1] if lng != 'W' else -degrees[1]
        print('%s %s %s' % (params, data['latitude'], data['longitude']))

    # population
    matches = re.compile(r'Population.*?<td>.*?(\d+,\d+[\d,]+).*?</td>', re.DOTALL).findall(html)
    # disabled fallback, kept for reference:
    #     if not results:
    #         results = re.compile('population of ([\d,\.])').findall(html)
    data['population'] = None
    if not matches:
        write_log('../log/no_pop.log', data['wikipediaURL'])
    else:
        figure = matches[0]
        data['population'] = int(figure.replace(',', ''))
        print('%s %s' % (figure, data['population']))
    return data
|
|
|
|
def get_city_urls():
    """Return the sorted, duplicate-free list of wikipedia URLs of all cities.

    Page names are collected from the table links of "List of capital cities"
    and "List of countries with multiple capitals", and from the list links
    of every "/cityname:_..." sub-page of "List of towns and cities with
    100,000 or more inhabitants". Names found in geo['url'] are replaced by
    their override, and each result is prefixed with the wiki base URL.
    """
    table_link = re.compile('<td><a href="/wiki/(.*?)"')
    list_link = re.compile('<li><a href="/wiki/(.*?)"')
    subpage_link = re.compile('(/cityname:_.*?)"')

    names = []
    for capitals_page in ('List_of_capital_cities', 'List_of_countries_with_multiple_capitals'):
        names += table_link.findall(read_url(wiki + capitals_page))

    page = 'List_of_towns_and_cities_with_100,000_or_more_inhabitants'
    for suffix in subpage_link.findall(read_url(wiki + page)):
        names += list_link.findall(read_url(wiki + page + suffix))

    # Apply the overrides BEFORE removing duplicates: an overridden name and
    # its target may both be present, and de-duplicating first would let the
    # same city through twice. get_country_urls() uses the same order.
    overrides = geo['url']
    unique = set(overrides[name] if name in overrides else name for name in names)
    return [wiki + name for name in sorted(unique)]
|
|
|
|
def get_countries():
    """Return the data dict of every country.

    Calls get_country_data() once per URL returned by get_country_urls(),
    preserving the order of those URLs.
    """
    return [get_country_data(country_url) for country_url in get_country_urls()]
|
|
|
|
def get_country_data(url):
    """Scrape name, code and flag URL from a country's wikipedia page.

    Returns a dict with the keys 'wikipediaURL', 'name', 'code' and
    'flagURL'.

    The name is the "Redirected from" link text if present, else the page
    heading, mapped through geo['wikipedia_name']. The code comes from
    geo['code'] if listed; otherwise it is looked up, in this order, in the
    "ISO 3166-3" table, the "ISO 3166-1 alpha-2" table, an "ISO_3166-2:" link
    on the country page and finally a "/wiki/.xx" link on the country page.
    The flag page comes from geo['flag'] if listed, else from the first
    matching link on the country page.

    Raises IndexError when no name, code or flag link can be found.
    """
    data = {'wikipediaURL': url}
    html = read_url(url)

    # name
    results = re.compile('Redirected from <a.*?>(.*?)</a>', re.DOTALL).findall(html)
    if not results:
        results = re.compile('<h1 id="firstHeading" class="firstHeading">(.*?)</h1>', re.DOTALL).findall(html)
    data['name'] = geo['wikipedia_name'][results[0]] if results[0] in geo['wikipedia_name'] else results[0]

    # code
    if data['name'] in geo['code']:
        data['code'] = geo['code'][data['name']]
    else:
        # the ISO tables link to the original page name, so undo the
        # geo['wikipedia_url'] override that produced this URL, if any
        url_ = url.replace(wiki, '')
        for k, v in geo['wikipedia_url'].items():
            if v == url_:
                url_ = k
                break
        # Page names may contain regex metacharacters, e.g. the parentheses
        # in 'Cocos_(Keeling)_Islands'; unescaped they would form a group and
        # never match the literal link in the ISO tables.
        link = re.escape(url_)
        html_ = fix_html(read_url(wiki + 'ISO_3166-3'))
        results = re.compile('<td id="([A-Z]{4})"><a href="/wiki/' + link + '"', re.DOTALL).findall(html_)
        if not results:
            html_ = fix_html(read_url(wiki + 'ISO_3166-1_alpha-2'))
            results = re.compile('<tt>([A-Z]{2})</tt></td>\n<td><a href="/wiki/' + link + '"', re.DOTALL).findall(html_)
        if not results:
            results = re.compile('"/wiki/ISO_3166-2:(.*?)"', re.DOTALL).findall(html)
        if not results:
            results = re.compile(r'"/wiki/\.(\w{2})"', re.DOTALL).findall(html)
        data['code'] = results[0].upper()

    # flag_url
    if data['name'] in geo['flag']:
        flag_url = wiki + 'File:' + geo['flag'][data['name']]
    else:
        results = re.compile('style="width:58%; vertical-align:middle;"><a href="/wiki/(.*?)"', re.DOTALL).findall(html)
        if not results:
            results = re.compile('align="center" style="vertical-align:middle;"><a href="/wiki/(.*?)"', re.DOTALL).findall(html)
        if not results:
            results = re.compile('href="/wiki/(File:Flag.*?)"', re.DOTALL).findall(html)
        flag_url = wiki + results[0]
    data['flagURL'] = get_country_flag_url(data['code'], flag_url)
    return data
|
|
|
|
def get_country_flag_url(code, url):
    """Return the full-size image URL found on a wikipedia file page.

    *url* is the address of the file page. The href of its "fullImageLink"
    anchor is returned with 'http:' prepended. *code* is accepted but not
    used.

    Raises IndexError when the page contains no such anchor.
    """
    file_page = read_url(url)
    image_link = re.compile('<div class="fullImageLink" id="file"><a href="(.*?)"', re.DOTALL)
    hrefs = image_link.findall(file_page)
    return 'http:' + hrefs[0]
|
|
|
|
def get_country_urls():
    """Return the sorted, duplicate-free list of wikipedia URLs of all countries.

    Starts from geo['wikipedia_urls'] and adds the page names linked from
    "List of sovereign states", from "ISO 3166-1 alpha-2" (only the part
    before the "Indeterminate reservations" section) and from "ISO 3166-3".
    Names are mapped through geo['wikipedia_url']; names mapped to the empty
    string are dropped. Each result is prefixed with the wiki base URL.
    """
    # Copy the list: extending geo['wikipedia_urls'] in place would make the
    # shared list grow with every call.
    urls = list(geo['wikipedia_urls'])
    html = read_url(wiki + 'List_of_sovereign_states')
    urls += re.compile('> </span><a href="/wiki/(.*?)"', re.DOTALL).findall(html)
    html = fix_html(read_url(wiki + 'ISO_3166-1_alpha-2').split('Edit section: Indeterminate reservations')[0])
    urls += re.compile('<tt>[A-Z]{2}</tt></td>\n<td><a href="/wiki/(.*?)"', re.DOTALL).findall(html)
    html = fix_html(read_url(wiki + 'ISO_3166-3'))
    urls += re.compile('<td id="[A-Z]{4}">.*?<a href="/wiki/(.*?)".*?>', re.DOTALL).findall(html)
    overrides = geo['wikipedia_url']
    unique = set(overrides[x] if x in overrides else x for x in urls)  # make unique
    # discard() rather than remove(): no error if nothing was mapped to ''
    unique.discard('')
    return [wiki + x for x in sorted(unique)]