update parser

This commit is contained in:
j 2018-04-02 20:27:36 +05:30
parent f6ed20d87b
commit 322d63f234
3 changed files with 86 additions and 42 deletions

View file

@ -1,5 +1,6 @@
# collection tools to create *cine.ma sites # collection tools to create *cine.ma sites
COUNTRY=in COUNTRY=in
python films_by_country.py $COUNTRY films_$COUNTRY.json NAME=India
python add_metadata.py $COUNTRY films_$COUNTRY.json films_$COUNTRY_metadata.json python films_by_country.py $COUNTRY films_${COUNTRY}.json
python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json

View file

@ -1,15 +1,17 @@
#!/usr/bin/python #!/usr/bin/python3
from optparse import OptionParser from optparse import OptionParser
import json import json
import codecs import codecs
import sys import sys
import os import os
from datetime import datetime
import ox import ox
def add_metadata(films, country, output): def add_metadata(films, country, output):
meta = [] meta = []
api = ox.API('https://indiancine.ma/api/') api = ox.API('https://indiancine.ma/api/')
current_year = datetime.now().year
if os.path.exists(output): if os.path.exists(output):
with open(output) as fd: with open(output) as fd:
@ -40,16 +42,23 @@ def add_metadata(films, country, output):
'originalTitle', 'year' 'originalTitle', 'year'
] ]
extra = api.getMetadata(id=info['imdbId'], keys=keys)['data'] extra = api.getMetadata(id=info['imdbId'], keys=keys)['data']
print info print(info)
print extra print(extra)
if not extra: if not extra:
save() save()
print 'lets try again' print('lets try again')
extra = api.getMetadata(id=info['imdbId'], keys=keys)['data'] extra = api.getMetadata(id=info['imdbId'], keys=keys)['data']
print extra print(extra)
if 'isSeries' in extra or ('country' in extra and not country in extra['country']): y = extra.get('year')
if y:
y = int(y)
if '(????)' in info.get('title', '') or not y or y >= current_year:
info['delete'] = True info['delete'] = True
print 'deleting', info['imdbId'], info.get('title') print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year'))
continue
if 'isSeries' in extra or ('country' in extra and country not in extra['country']):
info['delete'] = True
print('deleting', info['imdbId'], info.get('title'))
continue continue
if 'originalTitle' in extra: if 'originalTitle' in extra:
info['alternativeTitles'] = [[info['title'], '']] info['alternativeTitles'] = [[info['title'], '']]
@ -59,13 +68,14 @@ def add_metadata(films, country, output):
for key in extra: for key in extra:
if key not in info: if key not in info:
info[key] = extra[key] info[key] = extra[key]
print info['imdbId'], info['title'] print(info['imdbId'], info['title'])
meta.append(info) meta.append(info)
if len(meta) % 100 == 0: if len(meta) % 100 == 0:
save() save()
save() save()
return meta return meta
if __name__ == '__main__': if __name__ == '__main__':
usage = "usage: %prog [options] country films.json films_with_metadata.json" usage = "usage: %prog [options] country films.json films_with_metadata.json"
parser = OptionParser(usage=usage) parser = OptionParser(usage=usage)

View file

@ -1,11 +1,15 @@
#!/usr/bin/python #!/usr/bin/python3
import ox.web.imdb import ox.web.imdb
import re import re
import json import json
import sys import sys
import codecs import codecs
from datetime import datetime
from optparse import OptionParser from optparse import OptionParser
import lxml.html
''' '''
python allofcountry.py in idsofindia.json python allofcountry.py in idsofindia.json
python allofcountry.py tr idsofturkey.json python allofcountry.py tr idsofturkey.json
@ -14,6 +18,19 @@ python allofcountry.py tr idsofturkey.json
def reset_url(url):
    """Fetch ``url`` via ``ox.web.imdb.read_url`` with ``timeout=0``.

    The response is discarded; the call is made only for its effect.

    Args:
        url: Address passed straight through to ``read_url``.

    NOTE(review): presumably ``timeout=0`` makes python-ox skip its cached
    copy and refresh it - confirm against the ox cache implementation.
    """
    # The return value was previously bound to an unused local; drop it.
    ox.web.imdb.read_url(url, timeout=0)
def write(films, filename):
    """Serialize the collected films to ``filename`` as a JSON list.

    Each entry of ``films`` becomes ``{"imdbId": <key>, "title": <value>}``,
    emitted in the mapping's iteration order. The file is created or
    overwritten, encoded as UTF-8, with non-ASCII characters written as-is.

    Args:
        films: Mapping of IMDb id to title.
        filename: Path of the JSON file to write.
    """
    # Named 'imdb_id' rather than 'id' so the builtin id() is not shadowed.
    data = [
        {'imdbId': imdb_id, 'title': title}
        for imdb_id, title in films.items()
    ]
    # Builtin open() replaces the deprecated codecs.open(). newline=''
    # turns off newline translation, so the bytes on disk are the same as
    # what codecs.open() (which writes in binary mode) produced.
    with open(filename, 'w', encoding='utf-8', newline='') as fd:
        json.dump(data, fd, indent=1, ensure_ascii=False)
if __name__ == '__main__': if __name__ == '__main__':
usage = "usage: %prog [options] countrycode output.json" usage = "usage: %prog [options] countrycode output.json"
parser = OptionParser(usage=usage) parser = OptionParser(usage=usage)
@ -23,40 +40,56 @@ if __name__ == '__main__':
parser.print_help() parser.print_help()
sys.exit(1) sys.exit(1)
films = [] films = {}
country, filename = args country, filename = args
current_year = datetime.now().strftime('Y')
if opts.reset: if opts.reset:
reset_url(opts.reset) reset_url(opts.reset)
base_url = 'http://akas.imdb.com'
url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
data = ox.web.imdb.read_url(url)
n = True
while n:
n = re.compile('<a href="(.*?)">Next&nbsp;&raquo;</a>').findall(data)
if n:
n = '%s%s' % (base_url, n[0].split('href="')[-1])
results = re.compile('<table class="results">(.*?)</table>', re.DOTALL).findall(data) base_url = 'http://www.imdb.com'
if results: #url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
films += re.compile('href="/title/tt(\d{7})/" title="(.*?)"').findall(results[0]) year = 1880
print n
print len(films), 'films'
if n:
data = ox.web.imdb.read_url(n)
else:
with open('last.html', 'w') as f:
f.write(data)
if len(films) % 1000 == 0:
with codecs.open(filename, 'w', encoding='utf-8') as fd:
json.dump([{
'imdbId': f[0],
'title': ox.decode_html(f[1])
} for f in films], fd, indent=1, ensure_ascii=False)
with codecs.open(filename, 'w', encoding='utf-8') as fd: added = 0
json.dump([{
'imdbId': f[0], while year < datetime.now().year:
'title': ox.decode_html(f[1]) url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
} for f in films], fd, indent=1, ensure_ascii=False)
data = ox.web.imdb.read_url(url, unicode=True)
n = True
page = 1
while n:
n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
if n:
n = '%s&page=%s' % (url, page)
page += 1
doc = lxml.html.fromstring(data)
article = doc.find_class('article')
if article:
article = article[0]
else:
n = None
for a in article.xpath('.//a'):
if '/title/tt' in a.attrib['href']:
img = a.xpath('.//img')
if img:
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
if id not in films:
title = img[0].attrib['alt']
title = ox.decode_html(title)
films[id] = title
added += 1
print(len(films), 'films')
if n:
data = ox.web.imdb.read_url(n, unicode=True)
else:
with open('last.html', 'w') as f:
f.write(data)
if added > 1000:
added = 0
write(films, filename)
year += 1
print('>> year', year)
write(films, filename)