get all connections

2023-07-07 14:50:14 +05:30 · 2023-07-07 14:50:14 +05:30 · 677b61877e
commit 677b61877e
parent 773d288f55
2 changed files with 62 additions and 2 deletions
--- a/ox/cache.py
+++ b/ox/cache.py
@ -104,6 +104,9 @@ def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, un
        if USE_REQUESTS:
            if headers is None:
                headers = DEFAULT_HEADERS.copy()
            if data:
                r = requests_session.post(url, data=data, headers=headers)
            else:
                r = requests_session.get(url, headers=headers)
            for key in r.headers:
                url_headers[key.lower()] = r.headers[key]
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@ -122,6 +122,7 @@ def tech_spec(metadata):
 def movie_connections(metadata):
    connections = {}
    if 'props' not in metadata:
        return connections
@ -428,6 +429,7 @@ class Imdb(SiteParser):
    def __init__(self, id, timeout=-1):
        # http://www.imdb.com/help/show_leaf?titlelanguagedisplay
        self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
        self._id = id
        if timeout != 0:
            self._cache = {}
            url = self.baseUrl + 'releaseinfo'
@ -576,7 +578,9 @@ class Imdb(SiteParser):
            except:
                pass
-        self['connections'] = movie_connections(self.get_page_data('movieconnections'))
+        #self['connections'] = movie_connections(self.get_page_data('movieconnections'))
        self['connections'] = self._get_connections()
        spec = tech_spec(self.get_page_data('technical'))
        for key in spec:
            if not self.get(key):
@ -682,6 +686,59 @@ class Imdb(SiteParser):
                self['episodeDirector'] = self['director']
            self['director'] = self['creator']
    def _get_connections(self):
        query = '''query {
    title(id: "tt%s") {
        id
        titleText {
           text
        }
        connections(first: 5000) {
            edges {
                node {
                    associatedTitle {
                        id
                        titleText {
                            text
                        }
                    }
                    category {
                        text
                    }
                    text
                }
            }
        }
    }
 }
 ''' % self._id
        url = 'https://caching.graphql.imdb.com/'
        headers = cache.DEFAULT_HEADERS.copy()
        headers.update({
             'Accept': 'application/graphql+json, application/json',
             'Origin': 'https://www.imdb.com',
             'Referer': 'https://www.imdb.com',
             'x-imdb-user-country': 'US',
             'x-imdb-user-language': 'en-US',
             'content-type': 'application/json',
             'Accept-Language': 'en,en-US;q=0.5'
        })
        #response = requests.post(url, json=
        response = json.loads(read_url(url, data=json.dumps({
            "query": query
        }), headers=headers))
        connections = {}
        for c in response['data']['title']['connections']['edges']:
            cat = c['node']['category']['text']
            if cat not in connections:
                connections[cat] = []
            connections[cat].append({
                'id': c['node']['associatedTitle']['id'][2:],
                'title': c['node']['associatedTitle']['titleText']['text'],
                'description': c['node'].get('text')
            })
        return connections
 class ImdbCombined(Imdb):
    def __init__(self, id, timeout=-1):