get all connections

j 2023-07-07 14:50:14 +05:30
parent 773d288f55
commit 677b61877e
2 changed files with 62 additions and 2 deletions

@@ -104,7 +104,10 @@ def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, un
     if USE_REQUESTS:
         if headers is None:
             headers = DEFAULT_HEADERS.copy()
-        r = requests_session.get(url, headers=headers)
+        if data:
+            r = requests_session.post(url, data=data, headers=headers)
+        else:
+            r = requests_session.get(url, headers=headers)
         for key in r.headers:
             url_headers[key.lower()] = r.headers[key]
         result = r.content
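
With this change, read_url() issues a POST whenever a data payload is passed and falls back to GET otherwise. A minimal usage sketch, assuming the python-ox layout where read_url is importable from ox.cache; the URL and payload below are only examples:

    import json
    from ox.cache import read_url  # assumption: read_url is exposed from ox.cache

    # Hypothetical GraphQL body; because data is set, read_url now sends a POST.
    payload = {'query': '{ title(id: "tt0133093") { id } }'}
    headers = {'content-type': 'application/json'}
    raw = read_url('https://caching.graphql.imdb.com/',
                   data=json.dumps(payload), headers=headers)
    response = json.loads(raw)  # read_url returns the raw response body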

@@ -122,6 +122,7 @@ def tech_spec(metadata):
 def movie_connections(metadata):
     connections = {}
     if 'props' not in metadata:
         return connections
@@ -428,6 +429,7 @@ class Imdb(SiteParser):
     def __init__(self, id, timeout=-1):
         # http://www.imdb.com/help/show_leaf?titlelanguagedisplay
         self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
+        self._id = id
         if timeout != 0:
             self._cache = {}
             url = self.baseUrl + 'releaseinfo'
@@ -576,7 +578,9 @@ class Imdb(SiteParser):
             except:
                 pass
-        self['connections'] = movie_connections(self.get_page_data('movieconnections'))
+        #self['connections'] = movie_connections(self.get_page_data('movieconnections'))
+        self['connections'] = self._get_connections()
         spec = tech_spec(self.get_page_data('technical'))
         for key in spec:
             if not self.get(key):
@@ -682,6 +686,59 @@ class Imdb(SiteParser):
             self['episodeDirector'] = self['director']
             self['director'] = self['creator']
 
+    def _get_connections(self):
+        query = '''query {
+            title(id: "tt%s") {
+                id
+                titleText {
+                    text
+                }
+                connections(first: 5000) {
+                    edges {
+                        node {
+                            associatedTitle {
+                                id
+                                titleText {
+                                    text
+                                }
+                            }
+                            category {
+                                text
+                            }
+                            text
+                        }
+                    }
+                }
+            }
+        }
+        ''' % self._id
+        url = 'https://caching.graphql.imdb.com/'
+        headers = cache.DEFAULT_HEADERS.copy()
+        headers.update({
+            'Accept': 'application/graphql+json, application/json',
+            'Origin': 'https://www.imdb.com',
+            'Referer': 'https://www.imdb.com',
+            'x-imdb-user-country': 'US',
+            'x-imdb-user-language': 'en-US',
+            'content-type': 'application/json',
+            'Accept-Language': 'en,en-US;q=0.5'
+        })
+        #response = requests.post(url, json=
+        response = json.loads(read_url(url, data=json.dumps({
+            "query": query
+        }), headers=headers))
+        connections = {}
+        for c in response['data']['title']['connections']['edges']:
+            cat = c['node']['category']['text']
+            if cat not in connections:
+                connections[cat] = []
+            connections[cat].append({
+                'id': c['node']['associatedTitle']['id'][2:],
+                'title': c['node']['associatedTitle']['titleText']['text'],
+                'description': c['node'].get('text')
+            })
+        return connections
+
 class ImdbCombined(Imdb):
     def __init__(self, id, timeout=-1):
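
For orientation, a sketch of how the new connections data might be consumed; the ox.web.imdb import path and the title id '0133093' are assumptions used only for illustration:

    from ox.web.imdb import Imdb  # assumption: the class shown in this diff

    movie = Imdb('0133093')  # the constructor prepends 'tt' itself, so pass the bare id
    connections = movie.get('connections', {})
    # _get_connections() groups related titles by IMDb category text, e.g.
    # {'Follows': [{'id': '...', 'title': '...', 'description': '...'}], ...}
    for category, titles in connections.items():
        print(category)
        for t in titles[:3]:
            print('  tt%s  %s  (%s)' % (t['id'], t['title'], t['description']))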