oxjs/tools/geo/py/wikipedia.py

357 lines
15 KiB
Python
Raw Normal View History

2011-05-23 19:38:52 +00:00
# -*- coding: utf-8 -*-
import Image
import re
from geo import geo, read_url, write_log
from geo import write_json
# special is unused, geo in geo.py is used
2011-05-23 19:38:52 +00:00
# Hand-maintained overrides for scraping country data from Wikipedia.
# Per the note next to the imports, this table is not read by this module
# (the ``geo`` object imported from geo.py is used instead).
#
#   'code'        country name -> country code
#   'dependency'  sovereign state(s) -> list of dependent territories
#   'flag'        country name -> flag image file name
#   'name'        Wikipedia page title -> simplified name
#   'url'         page name as linked on Wikipedia -> page name to use
#   'urls'        extra page names that appear in none of the scraped lists
special = {
    'code': {
        # incorrect on wikipedia
        'Democratic Republic of the Congo': 'CD',
        # missing on wikipedia
        'Neutral Zone': 'NTHH',
        'Réunion': 'RE',
        # unrecognized
        'Abkhazia': 'GE-AB',
        'Kosovo': 'RS-KO',
        'Nagorno-Karabakh': 'AZ-NK',
        'Northern Cyprus': 'CY-NC',
        'South Ossetia': 'GE-SO',
        'Somaliland': 'SO-SO',
        'South Sudan': 'SD-SS',
        'Transnistria': 'MD-TR',
        # dependent
        'Adélie Land': 'FR-AQ',
        'Akrotiri and Dhekelia': 'GB-AD',
        'Antártica': 'CL-AQ',
        'Argentine Antarctica': 'AR-AQ',
        'Ashmore and Cartier Islands': 'AU-AC',
        'Australian Antarctic Territory': 'AU-AQ',
        'Coral Sea Islands': 'AU-CS',
        'England': 'GB-ENG',
        'Northern Ireland': 'GB-NIR',
        'Peter I Island': 'NO-PI',
        'Ross Dependency': 'NZ-AQ',
        'Scotland': 'GB-SCT',
        'Wales': 'GB-WLS',
        # exceptional
        'Canary Islands': 'IC',
        'Ceuta and Melilla': 'EA',
        'UK': 'UK',
        # former
        'Korea': 'KOHH',
        'Netherlands Antilles': 'ANHH',
        'Siam': 'SITH',
        'West Germany': 'DEDE'
    },
    'dependency': {
        'Argentina': [
            'Argentine Antarctica'
        ],
        'Argentina, Australia, Chile, France, New Zealand, Norway, United Kingdom': [
            'Antarctica'
        ],
        'Australia': [
            'Ashmore and Cartier Islands', 'Australian Antarctic Territory', 'Christmas Island', 'Cocos Islands', 'Coral Sea Islands',
            'Heard Island and McDonald Islands', 'Norfolk Island'
        ],
        'Chile': [
            'Antártica'
        ],
        'China': [
            'Hong Kong', 'Macau'
        ],
        'Denmark': [
            'Faroe Islands', 'Greenland'
        ],
        'Finland': [
            'Åland'
        ],
        'France': [
            'Adélie Land', 'Clipperton Island', 'French Afar and Issas', 'French Guiana', 'French Polynesia',
            'French Southern and Antarctic Territories', 'French Southern Territories', 'Guadeloupe', 'Martinique', 'Mayotte',
            'New Caledonia', 'Réunion', 'Saint Barthélemy', 'Saint Martin', 'Saint Pierre and Miquelon',
            'Wallis and Futuna'
        ],
        'France, United Kingdom': [
            'New Hebrides'
        ],
        'Iraq, Saudi Arabia': [
            'Neutral Zone'
        ],
        'Netherlands': [
            'Aruba', 'Bonaire, Saint Eustatius and Saba', 'Curaçao', 'Netherlands Antilles', 'Sint Maarten'
        ],
        'New Zealand': [
            'Cook Islands', 'Niue', 'Ross Dependency', 'Tokelau'
        ],
        'Norway': [
            'Bouvet Island', 'Peter I Island', 'Queen Maud Land', 'Svalbard and Jan Mayen'
        ],
        'Spain': [
            'Canary Islands', 'Ceuta and Melilla'
        ],
        'Soviet Union': [
            'Byelorussian Soviet Socialist Republic'
        ],
        'United Kingdom': [
            'Akrotiri and Dhekelia', 'Anguilla', 'Ascension Island', 'Bermuda', 'British Antarctic Territory',
            'British Indian Ocean Territory', 'British Virgin Islands', 'Cayman Islands', 'Diego Garcia', 'England',
            'Falkland Islands', 'Gibraltar', 'Gilbert and Ellice Islands', 'Guernsey', 'Isle of Man',
            'Jersey', 'Montserrat', 'Northern Ireland', 'Pitcairn Islands', 'Saint Helena',
            'Scotland', 'South Georgia and the South Sandwich Islands', 'Tristan da Cunha', 'Turks and Caicos Islands', 'Wales'
        ],
        'United Kingdom, United States': [
            'Canton and Enderbury Islands'
        ],
        'United States': [
            'American Samoa', 'Guam', 'Northern Mariana Islands', 'Johnston Island', 'Midway Islands',
            'Pacific Islands', 'Panama Canal Zone', 'Puerto Rico', 'United States Minor Outlying Islands', 'United States Miscellaneous Pacific Islands',
            'United States Virgin Islands', 'Wake Island'
        ]
    },
    'flag': {
        'Australian Antarctic Territory': 'Flag_of_Australia.svg',
        'Antarctica': 'Flag_of_the_Antarctic_Treaty.svg',
        'Antártica': 'Flag_of_Magallanes,_Chile.svg',
        'Ashmore and Cartier Islands': 'Flag_of_Australia.svg',
        'Bonaire, Saint Eustatius and Saba': 'Flag_of_Bonaire.svg',
        'Burma': 'Flag_of_Myanmar_(1974-2010).svg',
        'Byelorussian Soviet Socialist Republic': 'Flag_of_Byelorussian_SSR.svg',
        'Canton and Enderbury Islands': 'Flag_of_Gilbert_and_Ellice_Islands.svg',
        'Ceuta and Melilla': 'Flag_Ceuta.svg',
        'Coral Sea Islands': 'Flag_of_Australia.svg',
        'Diego Garcia': 'Flag_of_the_British_Indian_Ocean_Territory.svg',
        'French Guiana': 'Flag_of_French_Guiana.svg',
        'Korea': 'Flag_of_Korea_1882.svg',
        # 'Libya': 'Flag_of_the_Libyan_Jamahiriya_1977.svg',
        'Metropolitan France': 'Flag_of_France.svg',
        'Neutral Zone': 'Flag_of_the_United_Nations.svg',
        'New Hebrides': 'Flag_of_Anglo-French_Joint_Naval_Commission.svg',
        'Northern Ireland': 'Ulster_banner.svg',
        'Panama Canal Zone': 'Panama_Canal_Zone_Flag.png',
        'Peter I Island': 'Flag_of_Norway.svg',
        'Réunion': 'Drapeau_Reunion_APDR.png',
        'Saint Martin': 'Flag_of_Saint-Martin_(local).svg',
        'Siam': 'State_Flag_of_Thailand_(1916).svg',
        'Svalbard and Jan Mayen': 'Flag_of_Norway.svg',
        'United States Miscellaneous Pacific Islands': 'Flag_of_the_United_States.svg',
        'Wallis and Futuna': 'Flag_of_Wallis_and_Futuna.svg',
        'Yugoslavia': 'Flag_of_SFR_Yugoslavia.svg'
    },
    'name': {
        # simplified names (ambiguous in wikipedia)
        'Cocos (Keeling) Islands': 'Cocos Islands',
        'Collectivity of Saint Martin': 'Saint Martin',
        'Federated States of Micronesia': 'Micronesia',
        'French Territory of the Afars and the Issas': 'French Afar and Issas',
        'Georgia (country)': 'Georgia',
        'Nagorno-Karabakh Republic': 'Nagorno-Karabakh',
        'People\'s Republic of China': 'China',
        'Republic of China': 'Taiwan',
        'Republic of Dahomey': 'Dahomey',
        'Republic of Ireland': 'Ireland',
        'Republic of Kosovo': 'Kosovo',
        'Republic of Macedonia': 'Macedonia',
        'Republic of Upper Volta': 'Upper Volta',
        'Sahrawi Arab Democratic Republic': 'Sahrawi',
        'Saudi-Iraqi neutral zone': 'Neutral Zone',
        'State of Palestine': 'Palestine',
        'Trust Territory of the Pacific Islands': 'Pacific Islands'
    },
    'url': {
        'Alderney': '',  # dependency of Guernsey
        'Herm': '',  # dependency of Guernsey
        'Sark': '',  # dependency of Guernsey
        'Azad_Kashmir': '',  # territory of Pakistan
        'Gilgit-Baltistan': '',  # territory of Pakistan
        'Coral_Sea_Islands_Territory': 'Coral_Sea_Islands',  # wrong in "List of sovereign states"
        'Kingdom_of_the_Netherlands': 'Netherlands',  # wrong in "List of sovereign states"
        'Saint-Barth%C3%A9lemy': 'Saint_Barth%C3%A9lemy',  # wrong in "List of sovereign states"
        'Saint_Martin': 'Collectivity_of_Saint_Martin',  # wrong in "List of sovereign states"
        'Caribbean_Netherlands': 'Bonaire,_Saint_Eustatius_and_Saba',  # wrong in "ISO 3166-1 alpha-2"
        'Ceuta': 'Ceuta_and_Melilla',  # wrong in "ISO 3166-1 alpha-2"
        'Palestinian_territories': 'State_of_Palestine',  # wrong in "ISO 3166-1 alpha-2"
        'Saudi%E2%80%93Iraqi_neutral_zone': 'Saudi-Iraqi_neutral_zone',  # wrong in "ISO 3166-1 alpha-2"
        'Western_Sahara': 'Sahrawi_Arab_Democratic_Republic',  # wrong in "ISO 3166-1 alpha-2"
        'Johnston_Atoll': 'Johnston_Island',  # wrong in "ISO 3166-3"
        'Midway_Atoll': 'Midway_Islands',  # wrong in "ISO 3166-3"
        '%C3%85land_Islands': 'Åland',  # wrong in all
        # NOTE(review): this key uses a space where every other key uses an
        # underscore -- confirm whether 'East_Timor' was intended.
        'East Timor': 'Timor-Leste',  # wrong in all
        'Cocos_(Keeling)_Islands': 'Cocos_Islands',  # wrong in all
        'French_Southern_and_Antarctic_Lands': 'French_Southern_and_Antarctic_Territories',  # wrong in all
        'Saint_Helena,_Ascension_and_Tristan_da_Cunha': 'Saint_Helena',  # wrong in all
        'The_Bahamas': 'Bahamas',  # wrong in all
        'The_Gambia': 'Gambia'  # wrong in all
    },
    'urls': [
        # not in any list
        'Adélie_Land', 'Antártica', 'Argentine_Antarctica',  # Antarctic claims
        'England', 'Northern_Ireland', 'Scotland', 'Wales',  # Sports
        'Korea', 'Siam', 'West_Germany',  # IMDb
        'East Timor',
        'French_Southern_Territories',
        'Peter_I_Island',
        'South_Sudan',
        'UK'
    ]
}
"""
geo = {
# disambiguation
'url': {
'Acheng': 'Acheng_District',
'Guatemala': 'Guatemala_City',
'Jilin': 'Jilin_City'
}
}
"""
# Base URL of the English Wikipedia; page names are appended to it to build
# the URLs that are fetched and returned by the functions below.
wiki = 'http://en.wikipedia.org/wiki/'
def fix_html(html):
    """Return *html* with a few hard-coded Wikipedia quirks patched.

    Three hidden sort-key span prefixes are removed and links to
    ``/wiki/Taiwan`` are rewritten to ``/wiki/Republic_of_China``. These are
    literal substitutions, used in lieu of a better regexp.
    """
    substitutions = (
        ('<span style="display:none" class="sortkey">Reunion !</span><span class="sorttext">', ''),
        ('<span style="display:none" class="sortkey">Us Miscellaneous Pacific Islands !</span><span class="sorttext">', ''),
        ('<span style="display:none" class="sortkey">Ussr !</span><span class="sorttext">', ''),
        ('"/wiki/Taiwan"', '"/wiki/Republic_of_China"'),
    )
    patched = html
    for needle, replacement in substitutions:
        patched = patched.replace(needle, replacement)
    return patched
def get_cities():
    """Scrape every city page and write the result to ``../json/_cities.json``.

    Returns the list of city dicts in the order the URLs were visited; the
    copy written to the JSON file is sorted by descending population, with
    cities lacking a population placed last.
    """
    # Both log files receive a ``None`` message before scraping starts
    # (presumably this resets them -- confirm against geo.write_log).
    for log_path in ('../log/no_latlng.log', '../log/no_pop.log'):
        write_log(log_path, None)

    def by_population(city):
        # Negated so that an ascending sort puts the largest city first; a
        # missing (or zero) population yields 0 and therefore sorts last.
        population = city['population']
        return -population if population else 0

    cities = [get_city_data(city_url) for city_url in get_city_urls()]
    write_json('../json/_cities.json', sorted(cities, key=by_population), True)
    return cities
def _parse_geohack_params(params):
    """Convert the start of a geohack ``params`` value to decimal degrees.

    *params* is expected to look like ``40_42_46_N_74_0_21_W``: degrees and
    optional minutes and seconds, a hemisphere letter, then the same for the
    longitude, ending in ``W`` or ``E``.

    Returns a ``(latitude, longitude)`` tuple, negative for south / west.
    Raises ValueError if a component is not a number or the string does not
    split into exactly a latitude half and a longitude half.
    """
    lat = 'S' if 'S' in params else 'N'
    lng = 'W' if 'W' in params else 'E'
    # Drop the trailing W/E, then split on the latitude hemisphere marker.
    halves = params[:-1].split('_' + lat + '_')
    if len(halves) != 2:
        raise ValueError('unexpected coordinate format: ' + params)
    values = []
    for half in halves:
        total = 0
        for position, part in enumerate(half.split('_')):
            if not part:
                continue
            value = float(part)
            # position 0 = degrees, 1 = minutes, 2 = seconds; anything
            # further is ignored.
            if position < 3:
                total += value / 60 ** position
        values.append(total)
    latitude = -values[0] if lat == 'S' else values[0]
    longitude = -values[1] if lng == 'W' else values[1]
    return latitude, longitude


def get_city_data(url):
    """Scrape name, coordinates and population from a city's Wikipedia page.

    Returns a dict with the keys ``wikipediaURL``, ``name``, ``latitude``,
    ``longitude`` and ``population``. Coordinates and population are ``None``
    when they cannot be extracted; such pages are recorded in
    ``../log/no_latlng.log`` and ``../log/no_pop.log`` respectively.

    Raises IndexError if the page has no ``firstHeading`` element.
    """
    data = {'wikipediaURL': url}
    html = read_url(url)
    # name
    results = re.compile(r'<h1 id="firstHeading" class="firstHeading">(.*?)</h1>', re.DOTALL).findall(html)
    if not results:
        raise IndexError('no page heading found in ' + url)
    data['name'] = results[0]
    # latitude, longitude
    data['latitude'] = None
    data['longitude'] = None
    results = re.compile(r'geohack.php\?.*?&amp;params=(.*?[WE])').findall(html)
    coordinates = None
    if results:
        try:
            coordinates = _parse_geohack_params(results[0])
        except ValueError:
            # An unparsable coordinate string is treated like a missing one
            # so that a single odd page does not abort the whole run.
            coordinates = None
    if coordinates is not None:
        data['latitude'], data['longitude'] = coordinates
        # Single-argument print behaves the same on Python 2 and 3.
        print('%s %s %s' % (results[0], data['latitude'], data['longitude']))
    else:
        write_log('../log/no_latlng.log', data['wikipediaURL'])
    # population
    # (a fallback on the phrase "population of ..." used to follow this
    # pattern in the source but was disabled)
    results = re.compile(r'Population.*?<td>.*?(\d+,\d+[\d,]+).*?</td>', re.DOTALL).findall(html)
    data['population'] = None
    if results:
        data['population'] = int(results[0].replace(',', ''))
        print('%s %s' % (results[0], data['population']))
    else:
        write_log('../log/no_pop.log', data['wikipediaURL'])
    return data
def get_city_urls():
    """Return the sorted, de-duplicated list of city page URLs to scrape.

    Page names are collected from the list of capital cities, the list of
    countries with multiple capitals, and every ``/cityname:_...`` sub-page
    of the list of towns and cities with 100,000 or more inhabitants. Names
    present in ``geo['url']`` are replaced by their disambiguated target, and
    each resulting name is prefixed with the Wikipedia base URL.
    """
    names = []
    for list_page in ('List_of_capital_cities', 'List_of_countries_with_multiple_capitals'):
        names += re.findall('<td><a href="/wiki/(.*?)"', read_url(wiki + list_page))
    page = 'List_of_towns_and_cities_with_100,000_or_more_inhabitants'
    index_html = read_url(wiki + page)
    for suffix in re.findall('(/cityname:_.*?)"', index_html):
        names += re.findall('<li><a href="/wiki/(.*?)"', read_url(wiki + page + suffix))
    overrides = geo['url']
    # De-duplicate AFTER applying the overrides: two different names can map
    # to the same target page, which would otherwise be scraped twice.
    unique = set(overrides[name] if name in overrides else name for name in names)
    # A real list (not a lazy map object) on both Python 2 and 3.
    return [wiki + name for name in sorted(unique)]
def get_countries():
    """Return the scraped data dict for every country URL.

    The dicts appear in the order in which get_country_urls() yields the URLs.
    """
    return [get_country_data(country_url) for country_url in get_country_urls()]
def get_country_data(url):
    """Scrape name, code and flag image URL from a country's Wikipedia page.

    Returns a dict with the keys ``wikipediaURL``, ``name``, ``code`` and
    ``flagURL``.

    The name is taken from the "Redirected from" link if there is one,
    otherwise from the page heading, and is then mapped through
    ``geo['wikipedia_name']``. The code comes from ``geo['code']`` when the
    name is listed there; otherwise it is looked up, in this order, in the
    ISO 3166-3 page, the ISO 3166-1 alpha-2 page, an ``ISO_3166-2:`` link on
    the country page and finally a ``/wiki/.xx`` link on the country page.

    Raises IndexError if the name, the code or the flag cannot be found.
    """
    data = {'wikipediaURL': url}
    html = read_url(url)

    # name
    results = re.compile(r'Redirected from <a.*?>(.*?)</a>', re.DOTALL).findall(html)
    if not results:
        results = re.compile(r'<h1 id="firstHeading" class="firstHeading">(.*?)</h1>', re.DOTALL).findall(html)
    if not results:
        raise IndexError('no country name found in ' + url)
    name = results[0]
    if name in geo['wikipedia_name']:
        name = geo['wikipedia_name'][name]
    data['name'] = name

    # code
    if data['name'] in geo['code']:
        data['code'] = geo['code'][data['name']]
    else:
        html_ = fix_html(read_url(wiki + 'ISO_3166-3'))
        url_ = url.replace(wiki, '')
        # Undo the URL override: the ISO pages link to the original page
        # name, not to the one it was replaced with.
        for k, v in geo['wikipedia_url'].items():
            if v == url_:
                url_ = k
                break
        # The page name is data, not a pattern: escape it so that names such
        # as "Georgia_(country)" match literally instead of opening a group.
        link = '<a href="/wiki/' + re.escape(url_) + '"'
        results = re.compile(r'<td id="([A-Z]{4})">' + link, re.DOTALL).findall(html_)
        if not results:
            html_ = fix_html(read_url(wiki + 'ISO_3166-1_alpha-2'))
            results = re.compile(r'<tt>([A-Z]{2})</tt></td>\n<td>' + link, re.DOTALL).findall(html_)
        if not results:
            results = re.compile(r'"/wiki/ISO_3166-2:(.*?)"', re.DOTALL).findall(html)
        if not results:
            results = re.compile(r'"/wiki/\.(\w{2})"', re.DOTALL).findall(html)
        if not results:
            raise IndexError('no country code found for ' + url)
        data['code'] = results[0].upper()

    # flag_url
    if data['name'] in geo['flag']:
        flag_url = wiki + 'File:' + geo['flag'][data['name']]
    else:
        results = re.compile(r'style="width:58%; vertical-align:middle;"><a href="/wiki/(.*?)"', re.DOTALL).findall(html)
        if not results:
            results = re.compile(r'align="center" style="vertical-align:middle;"><a href="/wiki/(.*?)"', re.DOTALL).findall(html)
        if not results:
            results = re.compile(r'href="/wiki/(File:Flag.*?)"', re.DOTALL).findall(html)
        if not results:
            raise IndexError('no flag image found for ' + url)
        flag_url = wiki + results[0]
    data['flagURL'] = get_country_flag_url(data['code'], flag_url)
    return data
def get_country_flag_url(code, url):
    """Return the URL of the full-size image shown on a ``File:`` page.

    *url* is the address of the Wikipedia file page. *code* is accepted for
    interface compatibility but is not used.

    Raises IndexError if the page contains no ``fullImageLink`` anchor.
    """
    html = read_url(url)
    results = re.compile(r'<div class="fullImageLink" id="file"><a href="(.*?)"', re.DOTALL).findall(html)
    if not results:
        raise IndexError('no full-size image link found in ' + url)
    href = results[0]
    # The link is normally scheme-less, so 'http:' is prepended; leave it
    # alone if it already carries a scheme rather than building 'http:http://...'.
    if href.startswith(('http:', 'https:')):
        return href
    return 'http:' + href
def get_country_urls():
    """Return the sorted, de-duplicated list of country page URLs to scrape.

    Starts from ``geo['wikipedia_urls']`` and adds the page names linked from
    the list of sovereign states, the ISO 3166-1 alpha-2 page (only the part
    before its "Indeterminate reservations" section) and the ISO 3166-3 page.
    Names present in ``geo['wikipedia_url']`` are replaced by their override;
    names overridden to the empty string are dropped.
    """
    # Copy the seed list: ``+=`` extends a list in place, and doing that on
    # the shared geo table would make it grow with every call.
    urls = list(geo['wikipedia_urls'])
    html = read_url(wiki + 'List_of_sovereign_states')
    urls += re.compile(r'>&#160;</span><a href="/wiki/(.*?)"', re.DOTALL).findall(html)
    html = fix_html(read_url(wiki + 'ISO_3166-1_alpha-2').split('Edit section: Indeterminate reservations')[0])
    urls += re.compile(r'<tt>[A-Z]{2}</tt></td>\n<td><a href="/wiki/(.*?)"', re.DOTALL).findall(html)
    html = fix_html(read_url(wiki + 'ISO_3166-3'))
    urls += re.compile(r'<td id="[A-Z]{4}">.*?<a href="/wiki/(.*?)".*?>', re.DOTALL).findall(html)
    overrides = geo['wikipedia_url']
    unique = set(overrides[name] if name in overrides else name for name in urls)  # make unique
    # discard() instead of remove(): no error when nothing mapped to ''.
    unique.discard('')
    # A real list (not a lazy map object) on both Python 2 and 3.
    return [wiki + name for name in sorted(unique)]