cinematools/films_by_country.py

#!/usr/bin/python3
import ox.web.imdb
import re
import json
import sys
import codecs

from datetime import datetime
from optparse import OptionParser

import lxml.html

'''
Usage examples (arguments: country code, then output JSON file):

python allofcountry.py in idsofindia.json
python allofcountry.py tr idsofturkey.json
'''

def reset_url(url):
    """Re-request *url* through ``ox.web.imdb.read_url`` with ``timeout=0``.

    The response is discarded; the call is made only for its side effect.
    NOTE(review): ``timeout=0`` presumably makes ox bypass/refresh its cached
    copy of the page -- confirm against ``ox.cache.read_url``.
    """
    ox.web.imdb.read_url(url, timeout=0)


def write(films, filename):
    """Dump the collected films to *filename* as a JSON list.

    ``films`` maps an IMDb id to an indexable ``(title, year)`` pair. Every
    entry becomes one object with the keys ``imdbId``, ``title`` and ``year``.
    The file is written as UTF-8 and non-ASCII characters are kept verbatim
    (``ensure_ascii=False``).
    """
    records = [
        {
            'imdbId': imdb_id,
            'title': info[0],
            'year': info[1],
        }
        for imdb_id, info in films.items()
    ]

    with codecs.open(filename, 'w', encoding='utf-8') as out:
        json.dump(records, out, indent=1, ensure_ascii=False)


# Both patterns are compiled once instead of on every page / list item.
# Marker of the "Next »" pagination link in the raw result-page HTML.
NEXT_PAGE_RE = re.compile('Next &#187;</a>', re.DOTALL)
# Seven *or more* digits: newer IMDb ids have eight digits and would be
# silently truncated to a wrong id by a fixed ``\d{7}``.
IMDB_ID_RE = re.compile(r'title/tt(\d{7,})')


def extract_imdb_id(href):
    """Return the digits of the first ``title/tt<digits>`` id in *href*.

    Raises ``IndexError`` when *href* contains no such id.
    """
    return IMDB_ID_RE.findall(href)[0]


def parse_year(text, fallback):
    """Extract the release year from the text of a 'lister-item-year' element.

    Takes the content of the last parenthesised group, cut at the first
    en dash or space, so that e.g. ``'(I) (2015)'``, ``'(2010–2015)'`` and
    ``'(2012 TV Movie)'`` yield 2015, 2010 and 2012. Returns *fallback* when
    nothing is left after stripping.

    Raises ``ValueError`` when the remaining token is not an integer.
    """
    token = text.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
    return int(token) if token else fallback


def main():
    """Collect all titles of one country from the IMDb title search.

    Command line: ``[options] countrycode output.json``. The search is
    queried one release year at a time, starting at 1880 and stopping before
    the current year. Every result page of a year is parsed and each title is
    stored once, keyed by IMDb id. The collected films are written to the
    output file after roughly every 1000 newly added titles and once more at
    the end. The HTML of the last page of each year is saved to ``last.html``.
    """
    usage = "usage: %prog [options] countrycode output.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
    (opts, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    films = {}
    country, filename = args

    if opts.reset:
        reset_url(opts.reset)

    base_url = 'http://www.imdb.com'
    year = 1880

    # Number of films added since the last checkpoint write.
    added = 0

    while year < datetime.now().year:
        print('<<', year)
        url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)

        data = ox.web.imdb.read_url(url, unicode=True)
        # ``data`` already holds page 1, so follow-up requests start at
        # page 2 (requesting ``page=1`` would fetch the same page again).
        page = 1
        more_pages = True
        while more_pages:
            next_url = None
            if NEXT_PAGE_RE.search(data):
                page += 1
                next_url = '%s&page=%s' % (url, page)

            doc = lxml.html.fromstring(data)
            articles = doc.find_class('article')
            if not articles:
                # No result container on this page: nothing to parse, and
                # paging for this year stops here.
                next_url = None
            else:
                for header in articles[0].find_class('lister-item-header'):
                    a = header.xpath('.//a')[0]
                    imdb_id = extract_imdb_id(a.attrib['href'])
                    title = a.text_content()
                    year_text = ''
                    try:
                        year_text = header.find_class('lister-item-year')[0].text_content()
                        # Fall back to the year being queried when the item
                        # carries no year of its own.
                        y = parse_year(year_text, year)
                    except Exception:
                        # Print some context for debugging, then fail loudly.
                        print(next_url)
                        print(year_text)
                        raise
                    if imdb_id not in films:
                        films[imdb_id] = (title, y)
                        added += 1

            print(len(films), 'films')
            if next_url:
                data = ox.web.imdb.read_url(next_url, unicode=True)
            else:
                # Keep the last page of the year around for inspection;
                # explicit UTF-8 so the write does not depend on the locale.
                with open('last.html', 'w', encoding='utf-8') as f:
                    f.write(data)
            if added > 1000:
                # Periodic checkpoint so a crash does not lose everything.
                added = 0
                write(films, filename)
            more_pages = next_url is not None
        print('>> year', year)
        year += 1

    write(films, filename)


if __name__ == '__main__':
    main()