cinematools/films_by_country.py

117 lines
3.6 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python3
import ox.web.imdb
import re
import json
import sys
import codecs
from datetime import datetime
from optparse import OptionParser
import lxml.html
'''
Usage examples:
python films_by_country.py in idsofindia.json
python films_by_country.py tr idsofturkey.json
'''
def reset_url(url):
    """Request *url* through ``ox.web.imdb.read_url`` with ``timeout=0``.

    The response body is discarded; the call is made only for its effect.

    NOTE(review): ``timeout=0`` presumably makes ox skip its cached copy and
    store a fresh one -- confirm against the ox cache implementation.
    """
    ox.web.imdb.read_url(url, timeout=0)
def write(films, filename):
    """Serialise *films* to *filename* as a UTF-8 encoded JSON list.

    films: mapping of IMDb id -> sequence whose first item is the title and
        whose second item is the year.
    filename: path of the JSON file to (over)write.

    Every entry becomes an object with the keys ``imdbId``, ``title`` and
    ``year``, in the iteration order of *films*.
    """
    records = [
        {
            'imdbId': imdb_id,
            'title': entry[0],
            'year': entry[1],
        }
        for imdb_id, entry in films.items()
    ]
    # ensure_ascii=False keeps non-ASCII titles readable in the output file.
    with codecs.open(filename, 'w', encoding='utf-8') as out:
        json.dump(records, out, indent=1, ensure_ascii=False)
# Numeric part of an IMDb title id inside a link. Seven digits is only the
# minimum: IMDb also issues longer ids, and matching exactly seven would
# silently truncate those to the id of a different title.
IMDB_ID_RE = re.compile(r'title/tt(\d{7,})')

# Separator between the two years of a range such as "2010–2015":
# an en dash (U+2013, written as an escape so it cannot be lost or confused
# with a hyphen) or a plain ASCII hyphen.
YEAR_RANGE_SEPARATOR_RE = re.compile('[\u2013-]')

# Text of the "next page" link in a search result page.
NEXT_PAGE_MARKER = 'Next &#187;</a>'


def extract_imdb_id(href):
    """Return the digits of the IMDb title id contained in *href*.

    Raises ValueError when *href* contains no ``title/tt<digits>`` part.
    """
    match = IMDB_ID_RE.search(href)
    if match is None:
        raise ValueError('no IMDb title id in %r' % (href,))
    return match.group(1)


def parse_year(text, default):
    """Return the (first) year shown in a ``lister-item-year`` label.

    text: label text such as ``"(2010)"``, ``"(I) (2010)"``,
        ``"(2010–2015)"`` or ``"(2010 TV Movie)"``.
    default: value returned when the label holds no year at all.

    Raises ValueError when the label holds something that is not a number.
    """
    # Only the last parenthesised group carries the year ("(I) (2010)").
    inner = text.rsplit('(', 1)[-1].split(')')[0]
    # For a range keep the start year; then drop any trailing words.
    first = YEAR_RANGE_SEPARATOR_RE.split(inner, maxsplit=1)[0]
    first = first.split(' ')[0].strip()
    return int(first) if first else default


def main():
    """Collect id, title and year of all films of one country from IMDb.

    Reads ``countrycode`` and ``output.json`` from the command line, walks
    the IMDb title search year by year and page by page, and writes the
    collected films to the output file (with intermediate checkpoints).
    """
    usage = "usage: %prog [options] countrycode output.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
    (opts, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    films = {}
    country, filename = args

    if opts.reset:
        reset_url(opts.reset)

    base_url = 'http://www.imdb.com'
    # Number of films added since the last checkpoint write.
    added = 0

    # NOTE(review): as before, the current year itself is not scraped
    # (the range stops before it) -- confirm that this is intended.
    for year in range(1880, datetime.now().year):
        print('<<', year)
        url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
        data = ox.web.imdb.read_url(url, unicode=True)
        page = 1
        while True:
            next_url = None
            if NEXT_PAGE_MARKER in data:
                # `url` itself is page 1, so the first follow-up request
                # must ask for page 2 (the old code re-fetched page 1).
                page += 1
                next_url = '%s&page=%s' % (url, page)

            doc = lxml.html.fromstring(data)
            articles = doc.find_class('article')
            if articles:
                headers = articles[0].find_class('lister-item-header')
            else:
                # No result container on this page: nothing to parse and no
                # point in following a "next" link.
                headers = []
                next_url = None

            for header in headers:
                link = header.xpath('.//a')[0]
                imdb_id = extract_imdb_id(link.attrib['href'])
                title = link.text_content()
                year_label = None
                try:
                    year_label = header.find_class('lister-item-year')[0].text_content()
                    film_year = parse_year(year_label, year)
                except Exception:
                    # Show where parsing failed, then let the error propagate.
                    print(next_url)
                    print(year_label)
                    raise
                if imdb_id not in films:
                    films[imdb_id] = (title, film_year)
                    added += 1

            print(len(films), 'films')
            if next_url:
                data = ox.web.imdb.read_url(next_url, unicode=True)
            else:
                # Keep the last page of the year around for debugging.
                # Explicit encoding: the page is unicode text and the locale
                # default may not be able to represent it.
                with open('last.html', 'w', encoding='utf-8') as f:
                    f.write(data)

            # Checkpoint so that a crash does not lose everything.
            if added > 1000:
                added = 0
                write(films, filename)

            if not next_url:
                break
        print('>> year', year)

    write(films, filename)


if __name__ == '__main__':
    main()