update imdb parser

commit e8a78709f7
Author: j
Date: 2025-01-28 18:47:11 +05:30
3 changed files with 121 additions and 111 deletions


@@ -2,5 +2,6 @@
 COUNTRY=in
 NAME=India
-python films_by_country.py $COUNTRY films_${COUNTRY}.json
-python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json
+python3 films_by_country.py $COUNTRY films_${COUNTRY}.json
+python3 add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json
+python3 add_to_site.py <sitename> films_${COUNTRY}_metadata.json
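The three commands form a pipeline: films_by_country.py collects IMDb ids for one country, add_metadata.py enriches them, and the new add_to_site.py step uploads the result. As a sketch, the same pipeline driven from Python for a second country; "tr"/"Turkey" are taken from the old script's usage examples, and <sitename> stays a site-specific placeholder:

#!/usr/bin/python3
# Hypothetical driver mirroring the shell pipeline above.
import subprocess

country, name = 'tr', 'Turkey'  # from the old script's usage examples
ids = 'films_%s.json' % country
metadata = 'films_%s_metadata.json' % country

subprocess.run(['python3', 'films_by_country.py', country, ids], check=True)
subprocess.run(['python3', 'add_metadata.py', name, ids, metadata], check=True)
subprocess.run(['python3', 'add_to_site.py', '<sitename>', metadata], check=True)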

films_by_country.py

@@ -1,41 +1,115 @@
 #!/usr/bin/python3
-import ox.web.imdb
-import re
-import json
-import sys
-import codecs
-from datetime import datetime
+from datetime import datetime, timedelta
 from optparse import OptionParser
+import codecs
+import json
+import re
+import sys
 
-import lxml.html
+from ox.web.imdb import cache, read_url
 import ox.geo
 
-'''
-python allofcountry.py in idsofindia.json
-python allofcountry.py tr idsofturkey.json
-'''
+QUERY = '''
+query advancedSearch {
+    advancedTitleSearch(
+        first: 1000, sort: {sortBy: RELEASE_DATE, sortOrder: ASC}
+        constraints: {
+            releaseDateConstraint: {releaseDateRange: {start: "%s" end: "%s"}}
+            originCountryConstraint: {anyCountries: ["%s"]}
+            titleTypeConstraint: {anyTitleTypeIds: ["movie", "video", "tvMovie", "short"]}
+        }
+    ) {
+        edges {
+            node {
+                title {
+                    id
+                    originalTitleText {
+                        text
+                    }
+                    titleText {
+                        text
+                    }
+                    titleType {
+                        text
+                    }
+                    releaseYear {
+                        year
+                        endYear
+                    }
+                    countriesOfOrigin {
+                        countries {
+                            id
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+'''
 
-def reset_url(url):
-    x = ox.web.imdb.read_url(url, timeout=0)
+url = 'https://caching.graphql.imdb.com/'
+headers = cache.DEFAULT_HEADERS.copy()
+headers.update({
+    'Accept': 'application/graphql+json, application/json',
+    'Origin': 'https://www.imdb.com',
+    'Referer': 'https://www.imdb.com',
+    'x-imdb-user-country': 'US',
+    'x-imdb-user-language': 'en-US',
+    'content-type': 'application/json',
+    'Accept-Language': 'en,en-US;q=0.5'
+})
 
-def write(films, filename):
-    data = []
-    for id, film in films.items():
-        data.append({
-            'imdbId': id,
-            'title': film[0],
-            'year': film[1],
-        })
-    with codecs.open(filename, 'w', encoding='utf-8') as fd:
-        json.dump(data, fd, indent=1, ensure_ascii=False)
+def get_year(year, country):
+    # Collect every title originating from `country` released in `year`,
+    # querying one release date at a time.
+    items = []
+    start = datetime(year, 1, 1)
+    while start.year == year:
+        query = QUERY % (start.strftime('%Y-%m-%d'), start.strftime('%Y-%m-%d'), country.upper())
+        response = json.loads(read_url(url, data=json.dumps({
+            "query": query
+        }), headers=headers))
+        edges = response['data']['advancedTitleSearch']['edges']
+        if len(edges) == 1000:
+            # The API returns at most 1000 results per request; repeat the
+            # query sorted descending and merge, covering up to 2000 titles
+            # for one day.
+            query = query.replace('sortOrder: ASC', 'sortOrder: DESC')
+            response = json.loads(read_url(url, data=json.dumps({
+                "query": query
+            }), headers=headers))
+            existing = [n["node"]["title"]["id"] for n in edges]
+            for edge in response['data']['advancedTitleSearch']['edges']:
+                if edge["node"]["title"]["id"] not in existing:
+                    edges.append(edge)
+        print(start.date(), len(edges))
+        for row in edges:
+            title = row["node"]['title']
+            if title and title.get('countriesOfOrigin') and \
+                    title.get('countriesOfOrigin', {}).get('countries'):
+                countries = [c['id'].upper() for c in title['countriesOfOrigin']['countries']]
+            else:
+                print("WTF", row)
+                countries = []
+            if country.upper() in countries:
+                items.append({
+                    "imdbId": title["id"][2:],
+                    "title": title["titleText"]["text"],
+                    "type": title["titleType"]["text"],
+                    "country": [ox.geo.get_country_name(c) for c in countries],
+                    "year": year
+                })
+        start = start + timedelta(days=1)
+    # Deduplicate by IMDb id.
+    items = {item["imdbId"]: item for item in items}
+    return list(items.values())
 
 if __name__ == '__main__':
     usage = "usage: %prog [options] countrycode output.json"
     parser = OptionParser(usage=usage)
-    parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
+    parser.add_option('-y', '--year', dest='year', default=1880, help="start from year")
+    parser.add_option('-e', '--end', dest='end', default=None, help="end at year")
     (opts, args) = parser.parse_args()
     if len(args) != 2:
         parser.print_help()
@@ -43,75 +117,18 @@ if __name__ == '__main__':
 
     films = {}
     country, filename = args
-    current_year = datetime.now().strftime('Y')
-    if opts.reset:
-        reset_url(opts.reset)
+    country = country.upper()
+    year = int(opts.year)
+    end_year = datetime.now().year
+    if opts.end:
+        end_year = int(opts.end) + 1
 
-    base_url = 'http://www.imdb.com'
-    #url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
-    year = 1880
-    added = 0
-    while year < datetime.now().year:
+    films = []
+    for year in range(year, end_year):
         print('<<', year)
-        url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
-
-        data = ox.web.imdb.read_url(url, unicode=True)
-        n = True
-        page = 1
-        while n:
-            n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
-            if n:
-                n = '%s&page=%s' % (url, page)
-                page += 1
-            doc = lxml.html.fromstring(data)
-            article = doc.find_class('article')
-            if article:
-                article = article[0]
-            else:
-                n = None
-            for header in article.find_class('lister-item-header'):
-                a = header.xpath('.//a')[0]
-                id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-                title = a.text_content()
-                try:
-                    fully = y = header.find_class('lister-item-year')[0].text_content()
-                    y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
-                    if not y:
-                        y = year
-                    else:
-                        y = int(y)
-                except:
-                    print(n)
-                    print(header.find_class('lister-item-year')[0].text_content())
-                    raise
-                if id not in films:
-                    films[id] = (title, y)
-                    added += 1
-            '''
-            for a in article.xpath('.//a'):
-                if '/title/tt' in a.attrib['href']:
-                    img = a.xpath('.//img')
-                    if img:
-                        id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-                        if id not in films:
-                            title = img[0].attrib['alt']
-                            title = ox.decode_html(title)
-                            films[id] = title
-                            added += 1
-            '''
-            print(len(films), 'films')
-            if n:
-                data = ox.web.imdb.read_url(n, unicode=True)
-            else:
-                with open('last.html', 'w') as f:
-                    f.write(data)
-        if added > 1000:
-            added = 0
-            write(films, filename)
-        print('>> year', year)
-        year += 1
-    write(films, filename)
+        more = get_year(year, country)
+        print('>>', year, len(more))
+        films += more
+        with open(filename, "w") as fd:
+            json.dump(films, fd, indent=1, ensure_ascii=False)
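The parser now posts a GraphQL query to IMDb's caching endpoint instead of scraping search result pages. A minimal standalone sketch of a single such request: endpoint, headers and field names are taken from the script above, but the use of requests (rather than ox.web.imdb.read_url), the date, the country and the trimmed query are illustrative assumptions:

#!/usr/bin/python3
# Sketch: one advancedTitleSearch request against IMDb's GraphQL endpoint.
import json
import requests  # assumption: the script itself uses ox.web.imdb.read_url

URL = 'https://caching.graphql.imdb.com/'
HEADERS = {
    'Accept': 'application/graphql+json, application/json',
    'Origin': 'https://www.imdb.com',
    'Referer': 'https://www.imdb.com',
    'content-type': 'application/json',
}
QUERY = '''
query advancedSearch {
    advancedTitleSearch(
        first: 50, sort: {sortBy: RELEASE_DATE, sortOrder: ASC}
        constraints: {
            releaseDateConstraint: {releaseDateRange: {start: "1950-01-01" end: "1950-01-01"}}
            originCountryConstraint: {anyCountries: ["IN"]}
            titleTypeConstraint: {anyTitleTypeIds: ["movie"]}
        }
    ) {
        edges { node { title { id titleText { text } } } }
    }
}
'''

response = requests.post(URL, data=json.dumps({'query': QUERY}), headers=HEADERS)
for edge in response.json()['data']['advancedTitleSearch']['edges']:
    title = edge['node']['title']
    print(title['id'], title['titleText']['text'])

Querying one release date at a time keeps each response under the 1000-result cap, and the ASC/DESC re-query in get_year() doubles that window to 2000 titles for unusually crowded dates.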


@@ -1,28 +1,20 @@
 import json
 import sys
+from collections import Counter
 
 if len(sys.argv) != 2:
-    print "usage: %s idsofcountry.json" % sys.argv[0]
+    print("usage: %s idsofcountry.json" % sys.argv[0])
     sys.exit(1)
 
 idsofcountry = sys.argv[1]
 data = json.load(open(idsofcountry))
 
-mini_series = filter(lambda x: 'Mini-Series' in x['title'], data)
-tv_series = filter(lambda x: 'TV Series' in x['title'], data)
-tv_movies = filter(lambda x: 'TV Movie' in x['title'], data)
-tv_special = filter(lambda x: 'TV Special' in x['title'], data)
-documentary = filter(lambda x: 'Documentary' in x['title'], data)
-#cinema = set(data) - set(mini_series) - set(tv_series) - set(tv_movies)
+years = Counter()
 
-print len(tv_special), 'TV Specials'
-print len(tv_series), 'TV Series'
-print len(tv_movies), 'TV Movies'
-print len(mini_series), 'Mini-Series'
-print len(documentary), 'Documentaries'
-#print len(cinema), 'Cinema'
-print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special) - len(documentary), 'Movies'
-print len(data), 'total'
+for film in data:
+    years[film['year']] += 1
+
+for year in sorted(years):
+    print(year, years[year])
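The rewritten script prints a films-per-year histogram instead of the old per-type totals (which matched on substrings of the title field). The tallying is just collections.Counter; a minimal sketch with made-up records:

from collections import Counter

data = [{'year': 1931}, {'year': 1932}, {'year': 1932}]  # made-up records
years = Counter(film['year'] for film in data)
for year in sorted(years):
    print(year, years[year])  # prints: 1931 1, then 1932 2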