update imdb parser
This commit is contained in:
parent
b876eef0d0
commit
e8a78709f7
3 changed files with 121 additions and 111 deletions
|
|
@ -2,5 +2,6 @@
|
||||||
|
|
||||||
COUNTRY=in
|
COUNTRY=in
|
||||||
NAME=India
|
NAME=India
|
||||||
python films_by_country.py $COUNTRY films_${COUNTRY}.json
|
python3 films_by_country.py $COUNTRY films_${COUNTRY}.json
|
||||||
python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json
|
python3 add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json
|
||||||
|
python3 add_to_site.py <sitename> films_${COUNTRY}_metadata.json
|
||||||
|
|
|
||||||
|
|
@ -1,41 +1,115 @@
|
||||||
#!/usr/bin/python3
|
#!/usr/bin/python3
|
||||||
import ox.web.imdb
|
from datetime import datetime, timedelta
|
||||||
import re
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
import codecs
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
from optparse import OptionParser
|
from optparse import OptionParser
|
||||||
|
import codecs
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
import lxml.html
|
from ox.web.imdb import cache, read_url
|
||||||
|
import ox.geo
|
||||||
|
|
||||||
'''
|
QUERY = '''
|
||||||
python allofcountry.py in idsofindia.json
|
query advancedSearch{
|
||||||
python allofcountry.py tr idsofturkey.json
|
advancedTitleSearch(
|
||||||
|
first: 1000, sort: {sortBy: RELEASE_DATE, sortOrder: ASC}
|
||||||
|
constraints: {
|
||||||
|
releaseDateConstraint: {releaseDateRange: {start: "%s" end: "%s"}}
|
||||||
|
originCountryConstraint: {anyCountries: ["%s"]}
|
||||||
|
titleTypeConstraint: {anyTitleTypeIds: ["movie", "video", "tvMovie", "short"]}
|
||||||
|
}
|
||||||
|
) {
|
||||||
|
edges {
|
||||||
|
node{
|
||||||
|
title {
|
||||||
|
id
|
||||||
|
originalTitleText {
|
||||||
|
text
|
||||||
|
}
|
||||||
|
titleText {
|
||||||
|
text
|
||||||
|
}
|
||||||
|
titleType {
|
||||||
|
text
|
||||||
|
}
|
||||||
|
releaseYear {
|
||||||
|
year
|
||||||
|
endYear
|
||||||
|
}
|
||||||
|
countriesOfOrigin {
|
||||||
|
countries {
|
||||||
|
id
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def reset_url(url):
|
url = 'https://caching.graphql.imdb.com/'
|
||||||
x = ox.web.imdb.read_url(url, timeout=0)
|
headers = cache.DEFAULT_HEADERS.copy()
|
||||||
|
headers.update({
|
||||||
|
'Accept': 'application/graphql+json, application/json',
|
||||||
|
'Origin': 'https://www.imdb.com',
|
||||||
|
'Referer': 'https://www.imdb.com',
|
||||||
|
'x-imdb-user-country': 'US',
|
||||||
|
'x-imdb-user-language': 'en-US',
|
||||||
|
'content-type': 'application/json',
|
||||||
|
'Accept-Language': 'en,en-US;q=0.5'
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
def write(films, filename):
|
def get_year(year, country):
|
||||||
data = []
|
items = []
|
||||||
for id, film in films.items():
|
start = datetime(year, 1, 1)
|
||||||
data.append({
|
while start.year == year:
|
||||||
'imdbId': id,
|
query = QUERY % (start.strftime('%Y-%m-%d'), start.strftime('%Y-%m-%d'), country.upper())
|
||||||
'title': film[0],
|
|
||||||
'year': film[1],
|
|
||||||
})
|
|
||||||
|
|
||||||
with codecs.open(filename, 'w', encoding='utf-8') as fd:
|
response = json.loads(read_url(url, data=json.dumps({
|
||||||
json.dump(data, fd, indent=1, ensure_ascii=False)
|
"query": query
|
||||||
|
}), headers=headers))
|
||||||
|
edges = response['data']['advancedTitleSearch']['edges']
|
||||||
|
base_query = query
|
||||||
|
if len(response['data']['advancedTitleSearch']['edges']) == 1000:
|
||||||
|
query = query.replace('sortOrder: ASC', 'sortOrder: DESC')
|
||||||
|
response = json.loads(read_url(url + '?' + params, data=json.dumps({
|
||||||
|
"query": query
|
||||||
|
}), headers=headers))
|
||||||
|
print(response)
|
||||||
|
existing = [n["node"]["title"]["id"] for n in edges]
|
||||||
|
for edge in response['data']['advancedTitleSearch']['edges']:
|
||||||
|
if edge["node"]["title"]["id"] not in existing:
|
||||||
|
edges.append(edge)
|
||||||
|
print(start.date(), len(edges))
|
||||||
|
for row in edges:
|
||||||
|
title = row["node"]['title']
|
||||||
|
if title and title.get('countriesOfOrigin') and \
|
||||||
|
title.get('countriesOfOrigin', {}).get('countries'):
|
||||||
|
countries = [c['id'].upper() for c in title['countriesOfOrigin']['countries']]
|
||||||
|
else:
|
||||||
|
print("WTF", row)
|
||||||
|
countries = []
|
||||||
|
if country.upper() in countries:
|
||||||
|
items.append({
|
||||||
|
"imdbId": title["id"][2:],
|
||||||
|
"title": title["titleText"]["text"],
|
||||||
|
"type": title["titleType"]["text"],
|
||||||
|
"country": [ox.geo.get_country_name(c) for c in countries],
|
||||||
|
"year": year
|
||||||
|
})
|
||||||
|
start = start + timedelta(days=1)
|
||||||
|
items = {item["imdbId"]: item for item in items}
|
||||||
|
return list(items.values())
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
usage = "usage: %prog [options] countrycode output.json"
|
usage = "usage: %prog [options] countrycode output.json"
|
||||||
parser = OptionParser(usage=usage)
|
parser = OptionParser(usage=usage)
|
||||||
parser.add_option('-r', '--reset', dest='reset', default=None, help="reset given url")
|
parser.add_option('-y', '--year', dest='year', default=1880, help="start from year")
|
||||||
|
parser.add_option('-e', '--end', dest='end', default=None, help="end at year")
|
||||||
(opts, args) = parser.parse_args()
|
(opts, args) = parser.parse_args()
|
||||||
if len(args) != 2:
|
if len(args) != 2:
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
|
|
@ -43,75 +117,18 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
films = {}
|
films = {}
|
||||||
country, filename = args
|
country, filename = args
|
||||||
current_year = datetime.now().strftime('Y')
|
country = country.upper()
|
||||||
|
|
||||||
if opts.reset:
|
year = int(opts.year)
|
||||||
reset_url(opts.reset)
|
end_year = datetime.now().year
|
||||||
|
if opts.end:
|
||||||
|
end_year = int(opts.end) + 1
|
||||||
|
|
||||||
base_url = 'http://www.imdb.com'
|
films = []
|
||||||
#url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
|
for year in range(year, end_year):
|
||||||
year = 1880
|
|
||||||
|
|
||||||
added = 0
|
|
||||||
|
|
||||||
while year < datetime.now().year:
|
|
||||||
print('<<', year)
|
print('<<', year)
|
||||||
url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
|
more = get_year(year, country)
|
||||||
|
print('>>', year, len(more))
|
||||||
data = ox.web.imdb.read_url(url, unicode=True)
|
films += more
|
||||||
n = True
|
with open(filename, "w") as fd:
|
||||||
page = 1
|
json.dump(films, fd, indent=1, ensure_ascii=False)
|
||||||
while n:
|
|
||||||
n = re.compile('Next »</a>', re.DOTALL).findall(data)
|
|
||||||
if n:
|
|
||||||
n = '%s&page=%s' % (url, page)
|
|
||||||
page += 1
|
|
||||||
doc = lxml.html.fromstring(data)
|
|
||||||
article = doc.find_class('article')
|
|
||||||
if article:
|
|
||||||
article = article[0]
|
|
||||||
else:
|
|
||||||
n = None
|
|
||||||
for header in article.find_class('lister-item-header'):
|
|
||||||
a = header.xpath('.//a')[0]
|
|
||||||
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
|
|
||||||
title = a.text_content()
|
|
||||||
try:
|
|
||||||
fully = y = header.find_class('lister-item-year')[0].text_content()
|
|
||||||
y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
|
|
||||||
if not y:
|
|
||||||
y = year
|
|
||||||
else:
|
|
||||||
y = int(y)
|
|
||||||
except:
|
|
||||||
print(n)
|
|
||||||
print(header.find_class('lister-item-year')[0].text_content())
|
|
||||||
raise
|
|
||||||
if id not in films:
|
|
||||||
films[id] = (title, y)
|
|
||||||
added += 1
|
|
||||||
'''
|
|
||||||
for a in article.xpath('.//a'):
|
|
||||||
if '/title/tt' in a.attrib['href']:
|
|
||||||
img = a.xpath('.//img')
|
|
||||||
if img:
|
|
||||||
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
|
|
||||||
if id not in films:
|
|
||||||
title = img[0].attrib['alt']
|
|
||||||
title = ox.decode_html(title)
|
|
||||||
films[id] = title
|
|
||||||
added += 1
|
|
||||||
'''
|
|
||||||
print(len(films), 'films')
|
|
||||||
if n:
|
|
||||||
data = ox.web.imdb.read_url(n, unicode=True)
|
|
||||||
else:
|
|
||||||
with open('last.html', 'w') as f:
|
|
||||||
f.write(data)
|
|
||||||
if added > 1000:
|
|
||||||
added = 0
|
|
||||||
write(films, filename)
|
|
||||||
print('>> year', year)
|
|
||||||
year += 1
|
|
||||||
|
|
||||||
write(films, filename)
|
|
||||||
|
|
|
||||||
24
stats.py
24
stats.py
|
|
@ -1,28 +1,20 @@
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) != 2:
|
||||||
print "usage: %s idsofcountry.json" % sys.argv[0]
|
print("usage: %s idsofcountry.json" % sys.argv[0])
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
idsofcountry = sys.argv[1]
|
idsofcountry = sys.argv[1]
|
||||||
data = json.load(open(idsofcountry))
|
data = json.load(open(idsofcountry))
|
||||||
|
|
||||||
mini_series = filter(lambda x: 'Mini-Series' in x['title'], data)
|
years = Counter()
|
||||||
tv_series = filter(lambda x: 'TV Series' in x['title'], data)
|
|
||||||
tv_movies = filter(lambda x: 'TV Movie' in x['title'], data)
|
for film in data:
|
||||||
tv_special = filter(lambda x: 'TV Special' in x['title'], data)
|
years[film['year']] += 1
|
||||||
documentary = filter(lambda x: 'Documentary' in x['title'], data)
|
|
||||||
#cinema = set(data) - set(mini_series) - set(tv_series) - set(tv_movies)
|
|
||||||
|
|
||||||
|
|
||||||
print len(tv_special), 'TV Specials'
|
for year in sorted(years):
|
||||||
print len(tv_series), 'TV Series'
|
print(year, years[year])
|
||||||
print len(tv_movies), 'TV Movies'
|
|
||||||
print len(mini_series), 'Mini-Series'
|
|
||||||
print len(documentary), 'Documentaries'
|
|
||||||
#print len(cinema), 'Cinema'
|
|
||||||
print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special) - len(documentary), 'Movies'
|
|
||||||
print len(data), 'total'
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue