#!/usr/bin/python3
from datetime import datetime, timedelta
from optparse import OptionParser
import codecs
import json
import re
import sys

from ox.web.imdb import cache, read_url
import ox.geo

# GraphQL query for IMDb's advancedTitleSearch endpoint.
# %s placeholders: release-date range start, range end (both YYYY-MM-DD),
# and the upper-cased ISO country code. `first: 1000` is the server-side
# cap per request — see the ASC/DESC merge in get_year() below.
QUERY = '''
query advancedSearch{
  advancedTitleSearch(
    first: 1000, sort: {sortBy: RELEASE_DATE, sortOrder: ASC}
    constraints: {
      releaseDateConstraint: {releaseDateRange: {start: "%s" end: "%s"}}
      originCountryConstraint: {anyCountries: ["%s"]}
      titleTypeConstraint: {anyTitleTypeIds: ["movie", "video", "tvMovie", "short"]}
    }
  ) {
    edges {
      node{
        title {
          id
          originalTitleText {
            text
          }
          titleText {
            text
          }
          titleType {
            text
          }
          releaseYear {
            year
            endYear
          }
          countriesOfOrigin {
            countries {
              id
            }
          }
        }
      }
    }
  }
}
'''

url = 'https://caching.graphql.imdb.com/'
# Reuse ox's default browser-like headers and add what the IMDb GraphQL
# cache endpoint requires to answer without a session.
headers = cache.DEFAULT_HEADERS.copy()
headers.update({
    'Accept': 'application/graphql+json, application/json',
    'Origin': 'https://www.imdb.com',
    'Referer': 'https://www.imdb.com',
    'x-imdb-user-country': 'US',
    'x-imdb-user-language': 'en-US',
    'content-type': 'application/json',
    'Accept-Language': 'en,en-US;q=0.5'
})


def get_year(year, country):
    """Collect all titles originating in `country` released during `year`.

    Queries the IMDb GraphQL endpoint one day at a time (the endpoint
    returns at most 1000 edges per request). Returns a list of dicts with
    imdbId / title / type / country / year, deduplicated by imdbId.
    """
    items = []
    start = datetime(year, 1, 1)
    while start.year == year:
        day = start.strftime('%Y-%m-%d')
        query = QUERY % (day, day, country.upper())
        response = json.loads(read_url(url, data=json.dumps({
            "query": query
        }), headers=headers))
        edges = response['data']['advancedTitleSearch']['edges']
        if len(edges) == 1000:
            # Hit the server-side cap: re-run the same day sorted DESC and
            # merge, so up to 2000 titles per day can be recovered.
            # FIX: the original built the retry URL as `url + '?' + params`
            # with `params` never defined -> NameError on any busy day.
            # The endpoint takes the query in the POST body, like above.
            query = query.replace('sortOrder: ASC', 'sortOrder: DESC')
            response = json.loads(read_url(url, data=json.dumps({
                "query": query
            }), headers=headers))
            # set, not list: membership test runs once per extra edge
            existing = {n["node"]["title"]["id"] for n in edges}
            for edge in response['data']['advancedTitleSearch']['edges']:
                if edge["node"]["title"]["id"] not in existing:
                    edges.append(edge)
        print(start.date(), len(edges))
        for row in edges:
            title = row["node"]['title']
            if title and title.get('countriesOfOrigin') and \
                    title.get('countriesOfOrigin', {}).get('countries'):
                countries = [c['id'].upper() for c in title['countriesOfOrigin']['countries']]
            else:
                print("WTF", row)
                countries = []
            if country.upper() in countries:
                items.append({
                    # strip the "tt" prefix from the GraphQL id
                    "imdbId": title["id"][2:],
                    "title": title["titleText"]["text"],
                    "type": title["titleType"]["text"],
                    "country": [ox.geo.get_country_name(c) for c in countries],
                    "year": year
                })
        start = start + timedelta(days=1)
    # dict keyed by imdbId deduplicates titles that span several days
    items = {item["imdbId"]: item for item in items}
    return list(items.values())


if __name__ == '__main__':
    usage = "usage: %prog [options] countrycode output.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-y', '--year', dest='year', default=1880, help="start from year")
    parser.add_option('-e', '--end', dest='end', default=None, help="end at year")
    (opts, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    country, filename = args
    country = country.upper()

    year = int(opts.year)
    # range() end is exclusive, hence the +1 when an explicit end is given
    end_year = datetime.now().year
    if opts.end:
        end_year = int(opts.end) + 1

    films = []
    for year in range(year, end_year):
        print('<<', year)
        more = get_year(year, country)
        print('>>', year, len(more))
        films += more
        # rewrite after every year so an interrupted run keeps its progress
        with open(filename, "w") as fd:
            json.dump(films, fd, indent=1, ensure_ascii=False)
import json
import sys
from collections import Counter

# Print a films-per-year histogram for a films_by_country.py output file.
if len(sys.argv) != 2:
    print("usage: %s idsofcountry.json" % sys.argv[0])
    sys.exit(1)

idsofcountry = sys.argv[1]
data = json.load(open(idsofcountry))

# Counter accepts any iterable of keys, so feed it the years directly
years = Counter(film['year'] for film in data)

for year in sorted(years):
    print(year, years[year])