update imdb parser

commit e8a78709f7
Author: j
Date: 2025-01-28 18:47:11 +05:30
3 changed files with 121 additions and 111 deletions

View file

@@ -2,5 +2,6 @@
 COUNTRY=in
 NAME=India
-python films_by_country.py $COUNTRY films_${COUNTRY}.json
-python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json
+python3 films_by_country.py $COUNTRY films_${COUNTRY}.json
+python3 add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json
+python3 add_to_site.py <sitename> films_${COUNTRY}_metadata.json
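
For orientation, the same three-stage pipeline as a Python sketch (a hypothetical driver, not part of this commit; it assumes the scripts keep the argument order shown above):

# hypothetical driver mirroring the shell pipeline above
import subprocess

country, name = 'in', 'India'
ids = 'films_%s.json' % country
meta = 'films_%s_metadata.json' % country

subprocess.run(['python3', 'films_by_country.py', country, ids], check=True)
subprocess.run(['python3', 'add_metadata.py', name, ids, meta], check=True)
# final stage skipped here: <sitename> is left as a placeholder in the script above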

View file

@@ -1,41 +1,115 @@
 #!/usr/bin/python3
-import ox.web.imdb
-import re
-import json
-import sys
-import codecs
-from datetime import datetime
+from datetime import datetime, timedelta
 from optparse import OptionParser
+import codecs
+import json
+import re
+import sys
+
 import lxml.html
+from ox.web.imdb import cache, read_url
+import ox.geo

-'''
-python allofcountry.py in idsofindia.json
-python allofcountry.py tr idsofturkey.json
-'''
+QUERY = '''
+query advancedSearch {
+  advancedTitleSearch(
+    first: 1000, sort: {sortBy: RELEASE_DATE, sortOrder: ASC}
+    constraints: {
+      releaseDateConstraint: {releaseDateRange: {start: "%s" end: "%s"}}
+      originCountryConstraint: {anyCountries: ["%s"]}
+      titleTypeConstraint: {anyTitleTypeIds: ["movie", "video", "tvMovie", "short"]}
+    }
+  ) {
+    edges {
+      node {
+        title {
+          id
+          originalTitleText {
+            text
+          }
+          titleText {
+            text
+          }
+          titleType {
+            text
+          }
+          releaseYear {
+            year
+            endYear
+          }
+          countriesOfOrigin {
+            countries {
+              id
+            }
+          }
+        }
+      }
+    }
+  }
+}
+'''

 def reset_url(url):
     x = ox.web.imdb.read_url(url, timeout=0)

+url = 'https://caching.graphql.imdb.com/'
+headers = cache.DEFAULT_HEADERS.copy()
+headers.update({
+    'Accept': 'application/graphql+json, application/json',
+    'Origin': 'https://www.imdb.com',
+    'Referer': 'https://www.imdb.com',
+    'x-imdb-user-country': 'US',
+    'x-imdb-user-language': 'en-US',
+    'content-type': 'application/json',
+    'Accept-Language': 'en,en-US;q=0.5'
+})

-def write(films, filename):
-    data = []
-    for id, film in films.items():
-        data.append({
-            'imdbId': id,
-            'title': film[0],
-            'year': film[1],
-        })
-    with codecs.open(filename, 'w', encoding='utf-8') as fd:
-        json.dump(data, fd, indent=1, ensure_ascii=False)
+def get_year(year, country):
+    items = []
+    start = datetime(year, 1, 1)
+    while start.year == year:
+        query = QUERY % (start.strftime('%Y-%m-%d'), start.strftime('%Y-%m-%d'), country.upper())
+        response = json.loads(read_url(url, data=json.dumps({
+            "query": query
+        }), headers=headers))
+        edges = response['data']['advancedTitleSearch']['edges']
+        if len(edges) == 1000:
+            # the API returns at most 1000 edges per request; re-query the
+            # same day in DESC order and merge what the first pass missed
+            query = query.replace('sortOrder: ASC', 'sortOrder: DESC')
+            response = json.loads(read_url(url, data=json.dumps({
+                "query": query
+            }), headers=headers))
+            existing = [n["node"]["title"]["id"] for n in edges]
+            for edge in response['data']['advancedTitleSearch']['edges']:
+                if edge["node"]["title"]["id"] not in existing:
+                    edges.append(edge)
+        print(start.date(), len(edges))
+        for row in edges:
+            title = row["node"]['title']
+            if title and title.get('countriesOfOrigin') and \
+                    title.get('countriesOfOrigin', {}).get('countries'):
+                countries = [c['id'].upper() for c in title['countriesOfOrigin']['countries']]
+            else:
+                print("WTF", row)
+                countries = []
+            if country.upper() in countries:
+                items.append({
+                    "imdbId": title["id"][2:],
+                    "title": title["titleText"]["text"],
+                    "type": title["titleType"]["text"],
+                    "country": [ox.geo.get_country_name(c) for c in countries],
+                    "year": year
+                })
+        start = start + timedelta(days=1)
+    items = {item["imdbId"]: item for item in items}
+    return list(items.values())

 if __name__ == '__main__':
     usage = "usage: %prog [options] countrycode output.json"
     parser = OptionParser(usage=usage)
     parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
+    parser.add_option('-y', '--year', dest='year', default=1880, help="start from year")
+    parser.add_option('-e', '--end', dest='end', default=None, help="end at year")
     (opts, args) = parser.parse_args()
     if len(args) != 2:
         parser.print_help()
@@ -43,75 +117,18 @@ if __name__ == '__main__':
-    films = {}
     country, filename = args
-    current_year = datetime.now().strftime('Y')
+    country = country.upper()
     if opts.reset:
         reset_url(opts.reset)
+    year = int(opts.year)
+    end_year = datetime.now().year
+    if opts.end:
+        end_year = int(opts.end) + 1
-    base_url = 'http://www.imdb.com'
-    #url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
-    year = 1880
-    added = 0
-    while year < datetime.now().year:
-        print('<<', year)
-        url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
-        data = ox.web.imdb.read_url(url, unicode=True)
-        n = True
-        page = 1
-        while n:
-            n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
-            if n:
-                n = '%s&page=%s' % (url, page)
-                page += 1
-            doc = lxml.html.fromstring(data)
-            article = doc.find_class('article')
-            if article:
-                article = article[0]
-            else:
-                n = None
-            for header in article.find_class('lister-item-header'):
-                a = header.xpath('.//a')[0]
-                id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-                title = a.text_content()
-                try:
-                    fully = y = header.find_class('lister-item-year')[0].text_content()
-                    y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
-                    if not y:
-                        y = year
-                    else:
-                        y = int(y)
-                except:
-                    print(n)
-                    print(header.find_class('lister-item-year')[0].text_content())
-                    raise
-                if id not in films:
-                    films[id] = (title, y)
-                    added += 1
-            '''
-            for a in article.xpath('.//a'):
-                if '/title/tt' in a.attrib['href']:
-                    img = a.xpath('.//img')
-                    if img:
-                        id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-                        if id not in films:
-                            title = img[0].attrib['alt']
-                            title = ox.decode_html(title)
-                            films[id] = title
-                            added += 1
-            '''
-            print(len(films), 'films')
-            if n:
-                data = ox.web.imdb.read_url(n, unicode=True)
-            else:
-                with open('last.html', 'w') as f:
-                    f.write(data)
-            if added > 1000:
-                added = 0
-                write(films, filename)
-        print('>> year', year)
-        year += 1
-    write(films, filename)
+    films = []
+    for year in range(year, end_year):
+        more = get_year(year, country)
+        print('>>', year, len(more))
+        films += more
+        with open(filename, "w") as fd:
+            json.dump(films, fd, indent=1, ensure_ascii=False)
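
The rewrite drops the HTML scraping of /search/title in favour of POSTing the advancedTitleSearch GraphQL query to caching.graphql.imdb.com, one day at a time. A self-contained sketch of that call, using only the standard library in place of ox.web.imdb's cached read_url (header set copied from the diff; whether the endpoint accepts a trimmed-down header set is untested):

import json
import urllib.request

def graphql(query):
    # POST the query as JSON, as the script does via read_url(url, data=...)
    req = urllib.request.Request(
        'https://caching.graphql.imdb.com/',
        data=json.dumps({'query': query}).encode(),
        headers={
            'content-type': 'application/json',
            'Accept': 'application/graphql+json, application/json',
            'Origin': 'https://www.imdb.com',
            'Referer': 'https://www.imdb.com',
            'x-imdb-user-country': 'US',
            'x-imdb-user-language': 'en-US',
        })
    with urllib.request.urlopen(req) as r:
        return json.load(r)

# e.g. one day of Indian titles:
# response = graphql(QUERY % ('1950-01-01', '1950-01-01', 'IN'))

The other moving part is the 1000-edge cap: a day that returns exactly 1000 results is fetched again with sortOrder flipped to DESC, and the two lists are merged by title id, so up to 2000 titles per day survive (beyond that, titles in the middle of the day are still lost). The merge step in isolation, with toy edges in place of API responses:

def merge_pages(asc_edges, desc_edges):
    # keep every ASC edge, then append DESC edges not already seen
    seen = {e['node']['title']['id'] for e in asc_edges}
    return asc_edges + [e for e in desc_edges if e['node']['title']['id'] not in seen]

asc = [{'node': {'title': {'id': 'tt0000001'}}}]
desc = [{'node': {'title': {'id': 'tt0000002'}}},
        {'node': {'title': {'id': 'tt0000001'}}}]
assert [e['node']['title']['id'] for e in merge_pages(asc, desc)] == ['tt0000001', 'tt0000002']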

View file

@@ -1,28 +1,20 @@
 import json
 import sys
+from collections import Counter

 if len(sys.argv) != 2:
-    print "usage: %s idsofcountry.json" % sys.argv[0]
+    print("usage: %s idsofcountry.json" % sys.argv[0])
     sys.exit(1)

 idsofcountry = sys.argv[1]
 data = json.load(open(idsofcountry))

-mini_series = filter(lambda x: 'Mini-Series' in x['title'], data)
-tv_series = filter(lambda x: 'TV Series' in x['title'], data)
-tv_movies = filter(lambda x: 'TV Movie' in x['title'], data)
-tv_special = filter(lambda x: 'TV Special' in x['title'], data)
-documentary = filter(lambda x: 'Documentary' in x['title'], data)
-#cinema = set(data) - set(mini_series) - set(tv_series) - set(tv_movies)
+years = Counter()
+for film in data:
+    years[film['year']] += 1

-print len(tv_special), 'TV Specials'
-print len(tv_series), 'TV Series'
-print len(tv_movies), 'TV Movies'
-print len(mini_series), 'Mini-Series'
-print len(documentary), 'Documentaries'
-#print len(cinema), 'Cinema'
-print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special) - len(documentary), 'Movies'
-print len(data), 'total'
+for year in sorted(years):
+    print(year, years[year])
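
Two things made the old tallies obsolete: under Python 3, filter() returns a lazy iterator, so every print len(filter(...)) would raise a TypeError; and the new films_by_country.py output stores the title type in a dedicated 'type' field instead of embedding it in the 'title' string. If the per-type breakdown is still wanted, a Counter over that field does it; a sketch against the new record shape (toy data, field names as in the diff above):

from collections import Counter

films = [{'imdbId': '0000001', 'title': 'Foo', 'type': 'TV Movie', 'year': 1990},
         {'imdbId': '0000002', 'title': 'Bar', 'type': 'Movie', 'year': 1991}]
types = Counter(film['type'] for film in films)
for label, count in types.most_common():
    print(count, label)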