update parser

This commit is contained in:
j 2018-04-02 20:27:36 +05:30
parent f6ed20d87b
commit 322d63f234
3 changed files with 86 additions and 42 deletions

View file

@ -1,5 +1,6 @@
# collection tools to create *cine.ma sites
COUNTRY=in
python films_by_country.py $COUNTRY films_$COUNTRY.json
python add_metadata.py $COUNTRY films_$COUNTRY.json films_$COUNTRY_metadata.json
NAME=India
python films_by_country.py $COUNTRY films_${COUNTRY}.json
python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json

View file

@ -1,15 +1,17 @@
#!/usr/bin/python
#!/usr/bin/python3
from optparse import OptionParser
import json
import codecs
import sys
import os
from datetime import datetime
import ox
def add_metadata(films, country, output):
meta = []
api = ox.API('https://indiancine.ma/api/')
current_year = datetime.now().year
if os.path.exists(output):
with open(output) as fd:
@ -40,16 +42,23 @@ def add_metadata(films, country, output):
'originalTitle', 'year'
]
extra = api.getMetadata(id=info['imdbId'], keys=keys)['data']
print info
print extra
print(info)
print(extra)
if not extra:
save()
print 'lets try again'
print('lets try again')
extra = api.getMetadata(id=info['imdbId'], keys=keys)['data']
print extra
if 'isSeries' in extra or ('country' in extra and not country in extra['country']):
print(extra)
y = extra.get('year')
if y:
y = int(y)
if '(????)' in info.get('title', '') or not y or y >= current_year:
info['delete'] = True
print 'deleting', info['imdbId'], info.get('title')
print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year'))
continue
if 'isSeries' in extra or ('country' in extra and country not in extra['country']):
info['delete'] = True
print('deleting', info['imdbId'], info.get('title'))
continue
if 'originalTitle' in extra:
info['alternativeTitles'] = [[info['title'], '']]
@ -59,13 +68,14 @@ def add_metadata(films, country, output):
for key in extra:
if key not in info:
info[key] = extra[key]
print info['imdbId'], info['title']
print(info['imdbId'], info['title'])
meta.append(info)
if len(meta) % 100 == 0:
save()
save()
return meta
if __name__ == '__main__':
usage = "usage: %prog [options] country films.json films_with_metadata.json"
parser = OptionParser(usage=usage)

View file

@ -1,11 +1,15 @@
#!/usr/bin/python
#!/usr/bin/python3
import ox.web.imdb
import re
import json
import sys
import codecs
from datetime import datetime
from optparse import OptionParser
import lxml.html
'''
python allofcountry.py in idsofindia.json
python allofcountry.py tr idsofturkey.json
@ -14,6 +18,19 @@ python allofcountry.py tr idsofturkey.json
def reset_url(url):
    """Re-fetch *url* through ox's reader, refreshing any cached copy.

    NOTE(review): timeout=0 presumably tells ox.web.imdb.read_url to
    bypass/refresh its cache — confirm against the ox library docs.
    """
    # The original bound the result to an unused local `x`; the call is
    # made purely for its side effect, so the binding is dropped.
    ox.web.imdb.read_url(url, timeout=0)
def write(films, filename):
    """Dump the {imdb_id: title} mapping to *filename* as a JSON list.

    Each mapping entry becomes an object of the form
    {"imdbId": <id>, "title": <title>}. Output is UTF-8 encoded with
    non-ASCII characters left unescaped (ensure_ascii=False) and an
    indent of 1, matching the project's other JSON dumps.
    """
    records = [
        {'imdbId': imdb_id, 'title': title}
        for imdb_id, title in films.items()
    ]
    with codecs.open(filename, 'w', encoding='utf-8') as out:
        json.dump(records, out, indent=1, ensure_ascii=False)
if __name__ == '__main__':
usage = "usage: %prog [options] countrycode output.json"
parser = OptionParser(usage=usage)
@ -23,40 +40,56 @@ if __name__ == '__main__':
parser.print_help()
sys.exit(1)
films = []
films = {}
country, filename = args
current_year = datetime.now().strftime('Y')
if opts.reset:
reset_url(opts.reset)
base_url = 'http://akas.imdb.com'
url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
data = ox.web.imdb.read_url(url)
n = True
while n:
n = re.compile('<a href="(.*?)">Next&nbsp;&raquo;</a>').findall(data)
if n:
n = '%s%s' % (base_url, n[0].split('href="')[-1])
base_url = 'http://www.imdb.com'
#url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
year = 1880
results = re.compile('<table class="results">(.*?)</table>', re.DOTALL).findall(data)
if results:
films += re.compile('href="/title/tt(\d{7})/" title="(.*?)"').findall(results[0])
print n
print len(films), 'films'
if n:
data = ox.web.imdb.read_url(n)
else:
with open('last.html', 'w') as f:
f.write(data)
if len(films) % 1000 == 0:
with codecs.open(filename, 'w', encoding='utf-8') as fd:
json.dump([{
'imdbId': f[0],
'title': ox.decode_html(f[1])
} for f in films], fd, indent=1, ensure_ascii=False)
added = 0
with codecs.open(filename, 'w', encoding='utf-8') as fd:
json.dump([{
'imdbId': f[0],
'title': ox.decode_html(f[1])
} for f in films], fd, indent=1, ensure_ascii=False)
while year < datetime.now().year:
url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
data = ox.web.imdb.read_url(url, unicode=True)
n = True
page = 1
while n:
n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
if n:
n = '%s&page=%s' % (url, page)
page += 1
doc = lxml.html.fromstring(data)
article = doc.find_class('article')
if article:
article = article[0]
else:
n = None
for a in article.xpath('.//a'):
if '/title/tt' in a.attrib['href']:
img = a.xpath('.//img')
if img:
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
if id not in films:
title = img[0].attrib['alt']
title = ox.decode_html(title)
films[id] = title
added += 1
print(len(films), 'films')
if n:
data = ox.web.imdb.read_url(n, unicode=True)
else:
with open('last.html', 'w') as f:
f.write(data)
if added > 1000:
added = 0
write(films, filename)
year += 1
print('>> year', year)
write(films, filename)