#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Collect the IMDb ids and titles of all films of one country into a JSON file.

python allofcountry.py in idsofindia.json
python allofcountry.py tr idsofturkey.json
'''
import json
import re
import sys
from optparse import OptionParser

import ox.web.imdb

# NOTE(review): the next-page and results patterns look like they lost their
# HTML markup when this file was extracted: the next-page match is later split
# on 'href="', which the pattern as written can never contain, and the results
# pattern had a raw line break inside its quotes (written here as \n).
# Both are reproduced as found -- restore them from the original script
# before relying on the pagination.
_NEXT_RE = re.compile('Next »')
_RESULTS_RE = re.compile('(.*?)\n', re.DOTALL)
# Captures (7-digit title id, title) pairs from the result links.
_FILM_RE = re.compile(r'href="/title/tt(\d{7})/" title="(.*?)"')


def reset_url(url):
    '''Fetch url through ox.web.imdb.read_url with timeout=0.

    NOTE(review): presumably timeout=0 forces a cached copy to be
    refreshed -- confirm against the ox cache documentation.
    '''
    ox.web.imdb.read_url(url, timeout=0)


def _dump_films(films, path):
    '''Write the list of (id, title) pairs collected so far to path as JSON.'''
    with open(path, 'w') as f:
        json.dump(films, f, indent=2)


if __name__ == '__main__':
    usage = "usage: %prog [options] country output.json"
    parser = OptionParser(usage=usage)
    # NOTE(review): --reset is parsed but never read below, and reset_url()
    # is never called; presumably the two were meant to be wired together.
    parser.add_option('-r', '--reset', dest='reset', default=None,
                      help="reset given url")
    (opts, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    films = []
    country, output = args
    base_url = 'http://akas.imdb.com'
    url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
    data = ox.web.imdb.read_url(url)

    # next_url is True for the first pass, then the absolute url of the
    # following page, or an empty list once no next-page link is found.
    next_url = True
    while next_url:
        next_url = _NEXT_RE.findall(data)
        if next_url:
            next_url = '%s%s' % (base_url, next_url[0].split('href="')[-1])
        results = _RESULTS_RE.findall(data)
        if results:
            films += _FILM_RE.findall(results[0])
            # Progress output; these forms print the same text on Python 2
            # and Python 3.
            print(next_url)
            print('%d films' % len(films))
        if next_url:
            data = ox.web.imdb.read_url(next_url)
        else:
            # Keep the last page that was fetched for inspection.
            with open('last.html', 'w') as f:
                f.write(data)
        # Checkpoint whenever the running total is a multiple of 1000.
        # The original wrote to an undefined name `filename` here and after
        # the loop; the target is the `output` argument.
        if len(films) % 1000 == 0:
            _dump_films(films, output)
    _dump_films(films, output)