cinematools/films_by_country.py
2013-07-01 12:06:47 +02:00

52 lines
1.5 KiB
Python
Executable file

#!/usr/bin/python
import ox.web.imdb
import re
import json
import sys
from optparse import OptionParser
'''
Usage examples:
    python films_by_country.py in idsofindia.json
    python films_by_country.py tr idsofturkey.json
'''
def reset_url(url):
    """Re-read ``url`` through ``ox.web.imdb.read_url`` with ``timeout=0``.

    The response is discarded; the function returns ``None``.

    NOTE(review): ``timeout=0`` presumably makes ox's cache treat any stored
    copy as expired so the page is fetched again -- confirm against the ox
    cache documentation.
    """
    ox.web.imdb.read_url(url, timeout=0)
# Compiled once at import time instead of on every page of the crawl.
# The lazy group in NEXT_LINK_RE can still start at an earlier anchor on the
# page, so find_next_url() keeps only the text after the last 'href="'.
NEXT_LINK_RE = re.compile(r'<a href="(.*?)">Next&nbsp;&raquo;</a>')
RESULTS_TABLE_RE = re.compile(r'<table class="results">(.*?)</table>', re.DOTALL)
FILM_LINK_RE = re.compile(r'href="/title/tt(\d{7})/" title="(.*?)"')


def find_next_url(data, base_url):
    """Return the absolute URL of the "Next" results page, or None.

    data     -- HTML of the current results page
    base_url -- scheme and host prepended to the relative link
    """
    links = NEXT_LINK_RE.findall(data)
    if not links:
        return None
    return '%s%s' % (base_url, links[0].split('href="')[-1])


def extract_films(data):
    """Return the (id, title) pairs found in the first results table.

    Each id is the seven digits following "tt" in a /title/ link.
    Returns None when the page contains no results table at all, which lets
    the caller tell "no table" apart from "table without films".
    """
    tables = RESULTS_TABLE_RE.findall(data)
    if not tables:
        return None
    return FILM_LINK_RE.findall(tables[0])


def save_films(films, path):
    """Write ``films`` to ``path`` as JSON indented by two spaces."""
    with open(path, 'w') as f:
        json.dump(films, f, indent=2)


def main():
    """Collect all films listed for a country and save them as JSON.

    Expects two positional arguments: the country value for the search URL
    and the output file. Follows the "Next" links page by page, printing the
    next URL and the running film count for every page that has a results
    table. The HTML of the final page is written to last.html.
    """
    usage = "usage: %prog [options] country output.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
    # NOTE(review): opts.reset is parsed but never acted on, and reset_url()
    # is never called -- decide whether --reset should be wired up or removed.
    (opts, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    country, output = args
    base_url = 'http://akas.imdb.com'
    url = '%s/search/title?countries=%s&sort=year' % (base_url, country)

    films = []
    data = ox.web.imdb.read_url(url)
    while True:
        next_url = find_next_url(data, base_url)
        page_films = extract_films(data)
        if page_films is not None:
            films += page_films
            # Single-argument print() calls behave the same on Python 2 and 3.
            print(next_url)
            print('%d films' % len(films))
        # Periodic checkpoint so a failed fetch does not lose everything.
        # The original opened the undefined name `filename` here (NameError);
        # the path given on the command line is `output`.
        if len(films) % 1000 == 0:
            save_films(films, output)
        if next_url is None:
            # Keep the final page around for inspection.
            with open('last.html', 'w') as f:
                f.write(data)
            break
        data = ox.web.imdb.read_url(next_url)

    save_films(films, output)


if __name__ == '__main__':
    main()