#!/usr/bin/python3
|
||
import ox.web.imdb
|
||
import re
|
||
import json
|
||
import sys
|
||
import codecs
|
||
|
||
from datetime import datetime
|
||
from optparse import OptionParser
|
||
|
||
import lxml.html
|
||
|
||
'''
python allofcountry.py in idsofindia.json
python allofcountry.py tr idsofturkey.json
'''
|
||
|
||
def reset_url(url):
    """Re-fetch *url*, bypassing ox's cache, purely for the side effect.

    NOTE(review): timeout=0 presumably tells ox.web.imdb.read_url to
    ignore the cached copy and hit the network — confirm against the ox
    library. The response body is deliberately discarded.
    """
    # The original bound the result to an unused local `x`; drop it.
    ox.web.imdb.read_url(url, timeout=0)
|
||
|
||
|
||
def write(films, filename):
    """Serialize collected films to *filename* as a UTF-8 JSON list.

    films: mapping of IMDb id -> (title, year) tuple.
    filename: path of the JSON file to create or overwrite.
    """
    # `imdb_id` avoids shadowing the builtin id(); the builtin open()
    # with an encoding replaces the legacy codecs.open().
    data = [
        {
            'imdbId': imdb_id,
            'title': film[0],
            'year': film[1],
        }
        for imdb_id, film in films.items()
    ]
    with open(filename, 'w', encoding='utf-8') as fd:
        json.dump(data, fd, indent=1, ensure_ascii=False)
|
||
|
||
|
||
if __name__ == '__main__':
    usage = "usage: %prog [options] countrycode output.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
    (opts, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    films = {}
    country, filename = args

    if opts.reset:
        reset_url(opts.reset)

    base_url = 'http://www.imdb.com'
    # Crawl one release year at a time so each IMDb search stays within
    # the pagination limits of the /search/title endpoint.
    year = 1880
    added = 0

    # Raw string (the original 'title/tt(\d{7})' triggers an
    # invalid-escape warning) and widened to 8 digits: newer IMDb ids
    # have 8 digits and \d{7} would silently truncate them.
    # Compiled once here instead of per result row.
    imdb_id_re = re.compile(r'title/tt(\d{7,8})')

    while year < datetime.now().year:
        print('<<', year)
        url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)

        data = ox.web.imdb.read_url(url, unicode=True)
        n = True
        page = 1
        while n:
            # IMDb renders a "Next »" link while more result pages exist;
            # `n` doubles as the next page's URL (or None/[] to stop).
            n = re.compile('Next »</a>', re.DOTALL).findall(data)
            if n:
                n = '%s&page=%s' % (url, page)
                page += 1
            doc = lxml.html.fromstring(data)
            article = doc.find_class('article')
            if article:
                article = article[0]
                for header in article.find_class('lister-item-header'):
                    a = header.xpath('.//a')[0]
                    # `imdb_id` instead of `id` — don't shadow the builtin.
                    imdb_id = imdb_id_re.findall(a.attrib['href'])[0]
                    title = a.text_content()
                    try:
                        y = header.find_class('lister-item-year')[0].text_content()
                        # Strip "(1999)" / "(1999–2001)" / "(1999 TV Movie)"
                        # down to the leading year.
                        y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
                        # Fall back to the year being crawled when the
                        # header carries no usable year.
                        y = int(y) if y else year
                    except Exception:
                        # Dump context before re-raising so the offending
                        # page/row can be inspected.
                        print(n)
                        print(header.find_class('lister-item-year')[0].text_content())
                        raise
                    if imdb_id not in films:
                        films[imdb_id] = (title, y)
                        added += 1
            else:
                # No result container on this page: stop paginating.
                # (The original still iterated over the empty list with
                # article.find_class(...), which raised AttributeError.)
                n = None
            print(len(films), 'films')
            if n:
                data = ox.web.imdb.read_url(n, unicode=True)
            else:
                # Keep the last fetched page around for debugging.
                with open('last.html', 'w') as f:
                    f.write(data)
            # Checkpoint periodically so a long crawl can be resumed from
            # partial output if it dies.
            if added > 1000:
                added = 0
                write(films, filename)
        print('>> year', year)
        year += 1

    write(films, filename)
|