update imdb parser

commit e8a78709f7
Author: j
Date: 2025-01-28 18:47:11 +05:30
3 changed files with 121 additions and 111 deletions

View file

@@ -2,5 +2,6 @@
 COUNTRY=in
 NAME=India
-python films_by_country.py $COUNTRY films_${COUNTRY}.json
-python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json
+python3 films_by_country.py $COUNTRY films_${COUNTRY}.json
+python3 add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json
+python3 add_to_site.py <sitename> films_${COUNTRY}_metadata.json
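
For orientation, the same three-stage pipeline as a Python sketch (a hypothetical driver, not part of this commit; it assumes the scripts keep the argument order shown above):

# hypothetical driver mirroring the shell pipeline above
import subprocess

country, name = 'in', 'India'
ids = 'films_%s.json' % country
meta = 'films_%s_metadata.json' % country

subprocess.run(['python3', 'films_by_country.py', country, ids], check=True)
subprocess.run(['python3', 'add_metadata.py', name, ids, meta], check=True)
# final stage skipped here: <sitename> is left as a placeholder in the script above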

View file

@@ -1,41 +1,115 @@
 #!/usr/bin/python3
-import ox.web.imdb
-import re
-import json
-import sys
-import codecs
-from datetime import datetime
+from datetime import datetime, timedelta
 from optparse import OptionParser
+import codecs
+import json
+import re
+import sys
+
 import lxml.html
+from ox.web.imdb import cache, read_url
+import ox.geo

-'''
-python allofcountry.py in idsofindia.json
-python allofcountry.py tr idsofturkey.json
-'''
+QUERY = '''
+query advancedSearch {
+  advancedTitleSearch(
+    first: 1000, sort: {sortBy: RELEASE_DATE, sortOrder: ASC}
+    constraints: {
+      releaseDateConstraint: {releaseDateRange: {start: "%s" end: "%s"}}
+      originCountryConstraint: {anyCountries: ["%s"]}
+      titleTypeConstraint: {anyTitleTypeIds: ["movie", "video", "tvMovie", "short"]}
+    }
+  ) {
+    edges {
+      node {
+        title {
+          id
+          originalTitleText {
+            text
+          }
+          titleText {
+            text
+          }
+          titleType {
+            text
+          }
+          releaseYear {
+            year
+            endYear
+          }
+          countriesOfOrigin {
+            countries {
+              id
+            }
+          }
+        }
+      }
+    }
+  }
+}
+'''

 def reset_url(url):
     x = ox.web.imdb.read_url(url, timeout=0)

+url = 'https://caching.graphql.imdb.com/'
+headers = cache.DEFAULT_HEADERS.copy()
+headers.update({
+    'Accept': 'application/graphql+json, application/json',
+    'Origin': 'https://www.imdb.com',
+    'Referer': 'https://www.imdb.com',
+    'x-imdb-user-country': 'US',
+    'x-imdb-user-language': 'en-US',
+    'content-type': 'application/json',
+    'Accept-Language': 'en,en-US;q=0.5'
+})

-def write(films, filename):
-    data = []
-    for id, film in films.items():
-        data.append({
-            'imdbId': id,
-            'title': film[0],
-            'year': film[1],
-        })
-    with codecs.open(filename, 'w', encoding='utf-8') as fd:
-        json.dump(data, fd, indent=1, ensure_ascii=False)
+def get_year(year, country):
+    items = []
+    start = datetime(year, 1, 1)
+    while start.year == year:
+        query = QUERY % (start.strftime('%Y-%m-%d'), start.strftime('%Y-%m-%d'), country.upper())
+        response = json.loads(read_url(url, data=json.dumps({
+            "query": query
+        }), headers=headers))
+        edges = response['data']['advancedTitleSearch']['edges']
+        if len(edges) == 1000:
+            # the API returns at most 1000 edges per request; re-query the
+            # same day in DESC order and merge what the first pass missed
+            query = query.replace('sortOrder: ASC', 'sortOrder: DESC')
+            response = json.loads(read_url(url, data=json.dumps({
+                "query": query
+            }), headers=headers))
+            existing = [n["node"]["title"]["id"] for n in edges]
+            for edge in response['data']['advancedTitleSearch']['edges']:
+                if edge["node"]["title"]["id"] not in existing:
+                    edges.append(edge)
+        print(start.date(), len(edges))
+        for row in edges:
+            title = row["node"]['title']
+            if title and title.get('countriesOfOrigin') and \
+                    title.get('countriesOfOrigin', {}).get('countries'):
+                countries = [c['id'].upper() for c in title['countriesOfOrigin']['countries']]
+            else:
+                print("WTF", row)
+                countries = []
+            if country.upper() in countries:
+                items.append({
+                    "imdbId": title["id"][2:],
+                    "title": title["titleText"]["text"],
+                    "type": title["titleType"]["text"],
+                    "country": [ox.geo.get_country_name(c) for c in countries],
+                    "year": year
+                })
+        start = start + timedelta(days=1)
+    items = {item["imdbId"]: item for item in items}
+    return list(items.values())

 if __name__ == '__main__':
     usage = "usage: %prog [options] countrycode output.json"
     parser = OptionParser(usage=usage)
     parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
+    parser.add_option('-y', '--year', dest='year', default=1880, help="start from year")
+    parser.add_option('-e', '--end', dest='end', default=None, help="end at year")
     (opts, args) = parser.parse_args()
     if len(args) != 2:
         parser.print_help()
@@ -43,75 +117,18 @@ if __name__ == '__main__':
-    films = {}
     country, filename = args
-    current_year = datetime.now().strftime('Y')
+    country = country.upper()
     if opts.reset:
         reset_url(opts.reset)
+    year = int(opts.year)
+    end_year = datetime.now().year
+    if opts.end:
+        end_year = int(opts.end) + 1
-    base_url = 'http://www.imdb.com'
-    #url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
-    year = 1880
-    added = 0
-    while year < datetime.now().year:
-        print('<<', year)
-        url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
-        data = ox.web.imdb.read_url(url, unicode=True)
-        n = True
-        page = 1
-        while n:
-            n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
-            if n:
-                n = '%s&page=%s' % (url, page)
-                page += 1
-            doc = lxml.html.fromstring(data)
-            article = doc.find_class('article')
-            if article:
-                article = article[0]
-            else:
-                n = None
-            for header in article.find_class('lister-item-header'):
-                a = header.xpath('.//a')[0]
-                id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-                title = a.text_content()
-                try:
-                    fully = y = header.find_class('lister-item-year')[0].text_content()
-                    y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
-                    if not y:
-                        y = year
-                    else:
-                        y = int(y)
-                except:
-                    print(n)
-                    print(header.find_class('lister-item-year')[0].text_content())
-                    raise
-                if id not in films:
-                    films[id] = (title, y)
-                    added += 1
-            '''
-            for a in article.xpath('.//a'):
-                if '/title/tt' in a.attrib['href']:
-                    img = a.xpath('.//img')
-                    if img:
-                        id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-                        if id not in films:
-                            title = img[0].attrib['alt']
-                            title = ox.decode_html(title)
-                            films[id] = title
-                            added += 1
-            '''
-            print(len(films), 'films')
-            if n:
-                data = ox.web.imdb.read_url(n, unicode=True)
-            else:
-                with open('last.html', 'w') as f:
-                    f.write(data)
-            if added > 1000:
-                added = 0
-                write(films, filename)
-        print('>> year', year)
-        year += 1
-    write(films, filename)
+    films = []
+    for year in range(year, end_year):
+        more = get_year(year, country)
+        print('>>', year, len(more))
+        films += more
+        with open(filename, "w") as fd:
+            json.dump(films, fd, indent=1, ensure_ascii=False)
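
The rewrite drops the HTML scraping of /search/title in favour of POSTing the advancedTitleSearch GraphQL query to caching.graphql.imdb.com, one day at a time. A self-contained sketch of that call, using only the standard library in place of ox.web.imdb's cached read_url (header set copied from the diff; whether the endpoint accepts a trimmed-down header set is untested):

import json
import urllib.request

def graphql(query):
    # POST the query as JSON, as the script does via read_url(url, data=...)
    req = urllib.request.Request(
        'https://caching.graphql.imdb.com/',
        data=json.dumps({'query': query}).encode(),
        headers={
            'content-type': 'application/json',
            'Accept': 'application/graphql+json, application/json',
            'Origin': 'https://www.imdb.com',
            'Referer': 'https://www.imdb.com',
            'x-imdb-user-country': 'US',
            'x-imdb-user-language': 'en-US',
        })
    with urllib.request.urlopen(req) as r:
        return json.load(r)

# e.g. one day of Indian titles:
# response = graphql(QUERY % ('1950-01-01', '1950-01-01', 'IN'))

The other moving part is the 1000-edge cap: a day that returns exactly 1000 results is fetched again with sortOrder flipped to DESC, and the two lists are merged by title id, so up to 2000 titles per day survive (beyond that, titles in the middle of the day are still lost). The merge step in isolation, with toy edges in place of API responses:

def merge_pages(asc_edges, desc_edges):
    # keep every ASC edge, then append DESC edges not already seen
    seen = {e['node']['title']['id'] for e in asc_edges}
    return asc_edges + [e for e in desc_edges if e['node']['title']['id'] not in seen]

asc = [{'node': {'title': {'id': 'tt0000001'}}}]
desc = [{'node': {'title': {'id': 'tt0000002'}}},
        {'node': {'title': {'id': 'tt0000001'}}}]
assert [e['node']['title']['id'] for e in merge_pages(asc, desc)] == ['tt0000001', 'tt0000002']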

View file

@@ -1,28 +1,20 @@
 import json
 import sys
+from collections import Counter

 if len(sys.argv) != 2:
-    print "usage: %s idsofcountry.json" % sys.argv[0]
+    print("usage: %s idsofcountry.json" % sys.argv[0])
     sys.exit(1)

 idsofcountry = sys.argv[1]
 data = json.load(open(idsofcountry))

-mini_series = filter(lambda x: 'Mini-Series' in x['title'], data)
-tv_series = filter(lambda x: 'TV Series' in x['title'], data)
-tv_movies = filter(lambda x: 'TV Movie' in x['title'], data)
-tv_special = filter(lambda x: 'TV Special' in x['title'], data)
-documentary = filter(lambda x: 'Documentary' in x['title'], data)
-#cinema = set(data) - set(mini_series) - set(tv_series) - set(tv_movies)
+years = Counter()
+for film in data:
+    years[film['year']] += 1

-print len(tv_special), 'TV Specials'
-print len(tv_series), 'TV Series'
-print len(tv_movies), 'TV Movies'
-print len(mini_series), 'Mini-Series'
-print len(documentary), 'Documentaries'
-#print len(cinema), 'Cinema'
-print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special) - len(documentary), 'Movies'
-print len(data), 'total'
+for year in sorted(years):
+    print(year, years[year])
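
Two things made the old tallies obsolete: under Python 3, filter() returns a lazy iterator, so every print len(filter(...)) would raise a TypeError; and the new films_by_country.py output stores the title type in a dedicated 'type' field instead of embedding it in the 'title' string. If the per-type breakdown is still wanted, a Counter over that field does it; a sketch against the new record shape (toy data, field names as in the diff above):

from collections import Counter

films = [{'imdbId': '0000001', 'title': 'Foo', 'type': 'TV Movie', 'year': 1990},
         {'imdbId': '0000002', 'title': 'Bar', 'type': 'Movie', 'year': 1991}]
types = Counter(film['type'] for film in films)
for label, count in types.most_common():
    print(count, label)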