2013-07-01 10:06:47 +00:00
|
|
|
#!/usr/bin/python
|
|
|
|
import ox.web.imdb
|
|
|
|
import re
|
|
|
|
import json
|
|
|
|
import sys
|
2013-08-08 08:08:17 +00:00
|
|
|
import codecs
|
2013-07-01 10:06:47 +00:00
|
|
|
from optparse import OptionParser
|
|
|
|
|
|
|
|
'''
|
|
|
|
python allofcountry.py in idsofindia.json
|
|
|
|
python allofcountry.py tr idsofturkey.json
|
|
|
|
'''
|
|
|
|
|
|
|
|
def reset_url(url):
    """Re-request ``url`` through ``ox.web.imdb.read_url`` with ``timeout=0``.

    The response is discarded; the call is made only for its side effect.

    NOTE(review): presumably ``timeout=0`` makes ox skip its cached copy and
    fetch the page again -- confirm against the ox cache documentation.
    """
    ox.web.imdb.read_url(url, timeout=0)
|
|
|
|
|
|
|
|
def _save_films(films, filename):
    """Write ``films`` to ``filename`` as a UTF-8 encoded JSON list.

    Each ``(id, title)`` pair becomes ``{"imdbId": id, "title": title}``; the
    title is passed through ``ox.decode_html`` before it is written.
    """
    with codecs.open(filename, 'w', encoding='utf-8') as fd:
        json.dump([{
            'imdbId': film[0],
            'title': ox.decode_html(film[1])
        } for film in films], fd, indent=1, ensure_ascii=False)


if __name__ == '__main__':
    # Command line: two positional arguments (country code, output file) and
    # an optional -r/--reset URL.
    usage = "usage: %prog [options] countrycode output.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
    (opts, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    films = []
    country, filename = args

    if opts.reset:
        reset_url(opts.reset)

    base_url = 'http://akas.imdb.com'
    url = '%s/search/title?countries=%s&sort=year' % (base_url, country)

    # Compile the patterns once instead of on every page of the listing.
    # NOTE(review): the first pattern contains a non-ASCII character; under
    # Python 2 the file needs a coding declaration for it -- confirm.
    next_link_re = re.compile('<a href="(.*?)">Next »</a>')
    results_re = re.compile('<table class="results">(.*?)</table>', re.DOTALL)
    film_re = re.compile(r'href="/title/tt(\d{7})/" title="(.*?)"')

    data = ox.web.imdb.read_url(url)
    next_url = True
    while next_url:
        # findall() returns a list; it is empty when the page has no "Next"
        # link, which ends the loop after this page is processed.
        next_url = next_link_re.findall(data)
        if next_url:
            # The non-greedy group can still start at an earlier anchor, so
            # keep only what follows the last 'href="' in the capture.
            next_url = '%s%s' % (base_url, next_url[0].split('href="')[-1])

        results = results_re.findall(data)
        if results:
            films += film_re.findall(results[0])
            # NOTE(review): the original indentation was lost; these two
            # progress prints are assumed to belong to this branch -- confirm.
            print(next_url)
            print('%d films' % len(films))

        if next_url:
            data = ox.web.imdb.read_url(next_url)
        else:
            # Keep the last fetched page on disk for inspection.
            with open('last.html', 'w') as last_page:
                last_page.write(data)

        # Checkpoint the collected ids whenever the count is a multiple of 1000.
        if len(films) % 1000 == 0:
            _save_films(films, filename)

    # Final write with everything that was collected.
    _save_films(films, filename)
|