cinematools/films_by_country.py

#!/usr/bin/python
import ox.web.imdb
import re
import json
import sys
import codecs
from optparse import OptionParser

'''
Usage examples:
python films_by_country.py in idsofindia.json
python films_by_country.py tr idsofturkey.json
'''

def reset_url(url):
    """Request ``url`` via ``ox.web.imdb.read_url`` with ``timeout=0``.

    The response is discarded; the function returns ``None``.

    NOTE(review): presumably ``timeout=0`` makes ox bypass its cached copy
    and download the page again -- confirm against ox's cache module.
    """
    ox.web.imdb.read_url(url, timeout=0)

# Patterns are compiled once at import time instead of on every loop
# iteration; raw strings keep regex escapes such as \d from being treated
# as string escapes.
NEXT_LINK_RE = re.compile(r'<a href="(.*?)">Next&nbsp;&raquo;</a>')
RESULTS_TABLE_RE = re.compile(r'<table class="results">(.*?)</table>', re.DOTALL)
# Seven *or more* digits: a fixed-width match would silently skip any title
# whose id is longer than seven digits.
FILM_RE = re.compile(r'href="/title/tt(\d{7,})/" title="(.*?)"')

BASE_URL = 'http://akas.imdb.com'
# Write an intermediate copy of the output after this many new films.
CHECKPOINT_EVERY = 1000


def next_page_url(html, base_url=BASE_URL):
    """Return the absolute URL behind the "Next" link in ``html``.

    Returns ``None`` when the page has no "Next" link.
    """
    matches = NEXT_LINK_RE.findall(html)
    if not matches:
        return None
    # The lazy group can start at an earlier '<a href="' on the same line,
    # so keep only what follows the last 'href="' inside the match.
    return '%s%s' % (base_url, matches[0].split('href="')[-1])


def extract_films(html):
    """Return ``(imdb_id, raw_title)`` tuples from the results table.

    Only the first ``<table class="results">`` in ``html`` is scanned; an
    empty list is returned when there is no such table.
    """
    tables = RESULTS_TABLE_RE.findall(html)
    if not tables:
        return []
    return FILM_RE.findall(tables[0])


def save_films(filename, films):
    """Write ``films`` to ``filename`` as a UTF-8 JSON list.

    Each ``(imdb_id, raw_title)`` tuple becomes a dict with the keys
    ``imdbId`` and ``title``; the title is passed through
    ``ox.decode_html``.
    """
    # Build the records before opening the file so that a failure while
    # decoding does not leave a truncated output file behind.
    records = [{
        'imdbId': imdb_id,
        'title': ox.decode_html(title)
    } for imdb_id, title in films]
    with codecs.open(filename, 'w', encoding='utf-8') as fd:
        json.dump(records, fd, indent=1, ensure_ascii=False)


def main():
    """Collect all films of one country from the search pages into JSON.

    Command line: ``[options] countrycode output.json``. Follows the "Next"
    links starting from the country search URL, writes the output file
    every ``CHECKPOINT_EVERY`` new films and once more at the end, and
    stores the last fetched page in ``last.html`` in the current directory.
    Exits with status 1 after printing help when the argument count is wrong.
    """
    usage = "usage: %prog [options] countrycode output.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
    (opts, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    country, filename = args

    if opts.reset:
        reset_url(opts.reset)

    films = []
    saved = 0  # number of films contained in the last checkpoint
    url = '%s/search/title?countries=%s&sort=year' % (BASE_URL, country)
    data = ox.web.imdb.read_url(url)
    while True:
        films += extract_films(data)
        next_url = next_page_url(data, BASE_URL)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(next_url)
        print('%d films' % len(films))
        if not next_url:
            # Keep the final page around for inspection.
            with open('last.html', 'w') as last_page:
                last_page.write(data)
            break
        # Checkpoint on growth since the last save (not on an exact multiple
        # of 1000, which is missed as soon as one page is short), and do it
        # before the next fetch so a network failure keeps the progress.
        if len(films) - saved >= CHECKPOINT_EVERY:
            save_films(filename, films)
            saved = len(films)
        data = ox.web.imdb.read_url(next_url)

    save_films(filename, films)


if __name__ == '__main__':
    main()
cinematools 2013-07-01 10:06:47 +00:00			`#!/usr/bin/python`
			`import ox.web.imdb`
			`import re`
			`import json`
			`import sys`
script to add metadata to json; general cleanup 2013-08-08 08:08:17 +00:00			`import codecs`
cinematools 2013-07-01 10:06:47 +00:00			`from optparse import OptionParser`

			`'''`
			`python allofcountry.py in idsofindia.json`
			`python allofcountry.py tr idsofturkey.json`
			`'''`

			`def reset_url(url):`
			`x = ox.web.imdb.read_url(url, timeout=0)`

			`if __name__ == '__main__':`
script to add metadata to json; general cleanup 2013-08-08 08:08:17 +00:00			`usage = "usage: %prog [options] countrycode output.json"`
cinematools 2013-07-01 10:06:47 +00:00			`parser = OptionParser(usage=usage)`
			`parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")`
			`(opts, args) = parser.parse_args()`
			`if len(args) != 2:`
			`parser.print_help()`
			`sys.exit(1)`

			`films = []`
script to add metadata to json; general cleanup 2013-08-08 08:08:17 +00:00			`country, filename = args`

			`if opts.reset:`
			`reset_url(opts.reset)`
cinematools 2013-07-01 10:06:47 +00:00
			`base_url = 'http://akas.imdb.com'`
			`url = '%s/search/title?countries=%s&sort=year' % (base_url, country)`
			`data = ox.web.imdb.read_url(url)`
			`n = True`
			`while n:`
			`n = re.compile('<a href="(.*?)">Next »</a>').findall(data)`
			`if n:`
			`n = '%s%s' % (base_url, n[0].split('href="')[-1])`

			`results = re.compile('<table class="results">(.*?)</table>', re.DOTALL).findall(data)`
			`if results:`
			`films += re.compile('href="/title/tt(\d{7})/" title="(.*?)"').findall(results[0])`
			`print n`
			`print len(films), 'films'`
			`if n:`
			`data = ox.web.imdb.read_url(n)`
			`else:`
			`with open('last.html', 'w') as f:`
			`f.write(data)`
			`if len(films) % 1000 == 0:`
script to add metadata to json; general cleanup 2013-08-08 08:08:17 +00:00			`with codecs.open(filename, 'w', encoding='utf-8') as fd:`
			`json.dump([{`
			`'imdbId': f[0],`
			`'title': ox.decode_html(f[1])`
			`} for f in films], fd, indent=1, ensure_ascii=False)`
cinematools 2013-07-01 10:06:47 +00:00
script to add metadata to json; general cleanup 2013-08-08 08:08:17 +00:00			`with codecs.open(filename, 'w', encoding='utf-8') as fd:`
			`json.dump([{`
			`'imdbId': f[0],`
			`'title': ox.decode_html(f[1])`
			`} for f in films], fd, indent=1, ensure_ascii=False)`