Compare commits

...

3 commits

Author  SHA1        Message        Date
j       322d63f234  update parser  2018-04-02 20:28:42 +05:30
j       f6ed20d87b  cleanup        2018-04-02 20:28:42 +05:30
j       ccb54122cc  stats          2018-04-02 20:28:42 +05:30
6 changed files with 100 additions and 45 deletions

README (1 deletion)
View file

@@ -1 +0,0 @@
-collection tools to create *cine.ma sites

README.md (new file, 6 additions)
View file

@@ -0,0 +1,6 @@
+# collection tools to create *cine.ma sites
+
+COUNTRY=in
+NAME=India
+python films_by_country.py $COUNTRY films_${COUNTRY}.json
+python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json
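To make the README's two-step pipeline concrete: assuming films_by_country.py writes the same list of {'imdbId': ..., 'title': ...} records as the write() helper added further down in this compare, reading the intermediate file from the first step looks roughly like the sketch below (the filename follows the README; the sample record shape is illustrative):

    import json

    # films_in.json: assumed output of the collection step, a JSON list of
    # records shaped like {'imdbId': '0000000', 'title': 'Some Film (1913)'}.
    with open('films_in.json') as fd:
        films = json.load(fd)

    print(len(films), 'films collected')
    if films:
        print(films[0]['imdbId'], films[0]['title'])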

add_metadata.py
View file

@@ -1,15 +1,17 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 from optparse import OptionParser
 import json
 import codecs
 import sys
 import os
+from datetime import datetime
 import ox

 def add_metadata(films, country, output):
     meta = []
     api = ox.API('https://indiancine.ma/api/')
+    current_year = datetime.now().year
     if os.path.exists(output):
         with open(output) as fd:
@@ -25,13 +27,12 @@ def add_metadata(films, country, output):
         if info['imdbId'] in known_ids:
             continue
         skip = False
-        for key in ('Mini-Series', 'TV Series', 'TV Movie', 'TV Special'):
+        for key in ('Mini-Series', 'TV Series', 'TV Movie', 'TV Special', 'Video Game'):
             if key in info['title']:
                 skip = True
         if skip:
             continue
-        extra = api.getMetadata(id=info['imdbId'], keys=[
+        keys = [
             'language', 'productionCompany', 'director',
             'runtime', 'alternativeTitles',
             'color', 'sound',
@@ -39,12 +40,25 @@ def add_metadata(films, country, output):
             'isSeries',
             'title',
             'originalTitle', 'year'
-        ])['data']
-        print info
-        print extra
-        if 'isSeries' in extra or ('country' in extra and not country in extra['country']):
+        ]
+        extra = api.getMetadata(id=info['imdbId'], keys=keys)['data']
+        print(info)
+        print(extra)
+        if not extra:
+            save()
+            print('lets try again')
+            extra = api.getMetadata(id=info['imdbId'], keys=keys)['data']
+            print(extra)
+        y = extra.get('year')
+        if y:
+            y = int(y)
+        if '(????)' in info.get('title', '') or not y or y >= current_year:
             info['delete'] = True
-            print 'deleting', info['imdbId'], info.get('title')
+            print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year'))
+            continue
+        if 'isSeries' in extra or ('country' in extra and country not in extra['country']):
+            info['delete'] = True
+            print('deleting', info['imdbId'], info.get('title'))
             continue
         if 'originalTitle' in extra:
             info['alternativeTitles'] = [[info['title'], '']]
@@ -54,13 +68,14 @@ def add_metadata(films, country, output):
         for key in extra:
             if key not in info:
                 info[key] = extra[key]
-        print info['imdbId'], info['title']
+        print(info['imdbId'], info['title'])
         meta.append(info)
         if len(meta) % 100 == 0:
             save()
     save()
     return meta

 if __name__ == '__main__':
     usage = "usage: %prog [options] country films.json films_with_metadata.json"
     parser = OptionParser(usage=usage)
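Following the usage string and the README, a run plus a quick sanity check of the enriched output might look like the sketch below; the filenames are the README's examples, and the listed fields simply mirror the keys requested from the indiancine.ma API in this diff:

    # python3 add_metadata.py India films_in.json films_in_metadata.json
    import json

    # Assuming the output is plain JSON: besides 'imdbId' and 'title', each record
    # may now carry language, productionCompany, director, runtime,
    # alternativeTitles, color, sound, isSeries, originalTitle and year.
    with open('films_in_metadata.json') as fd:
        meta = json.load(fd)

    print(len(meta), 'records with metadata')
    if meta:
        print(sorted(meta[0].keys()))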

allofcountry.py
View file

@@ -1,11 +1,15 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 import ox.web.imdb
 import re
 import json
 import sys
 import codecs
+from datetime import datetime
 from optparse import OptionParser
+import lxml.html

 '''
 python allofcountry.py in idsofindia.json
 python allofcountry.py tr idsofturkey.json
@@ -14,6 +18,19 @@ python allofcountry.py tr idsofturkey.json
 def reset_url(url):
     x = ox.web.imdb.read_url(url, timeout=0)

+def write(films, filename):
+    data = []
+    for id, title in films.items():
+        data.append({
+            'imdbId': id,
+            'title': title
+        })
+    with codecs.open(filename, 'w', encoding='utf-8') as fd:
+        json.dump(data, fd, indent=1, ensure_ascii=False)
+
 if __name__ == '__main__':
     usage = "usage: %prog [options] countrycode output.json"
     parser = OptionParser(usage=usage)
@@ -23,40 +40,56 @@ if __name__ == '__main__':
         parser.print_help()
         sys.exit(1)

-    films = []
+    films = {}
     country, filename = args
+    current_year = datetime.now().strftime('Y')
     if opts.reset:
         reset_url(opts.reset)
-    base_url = 'http://akas.imdb.com'
-    url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
-    data = ox.web.imdb.read_url(url)
-    n = True
-    while n:
-        n = re.compile('<a href="(.*?)">Next&nbsp;&raquo;</a>').findall(data)
-        if n:
-            n = '%s%s' % (base_url, n[0].split('href="')[-1])
-        results = re.compile('<table class="results">(.*?)</table>', re.DOTALL).findall(data)
-        if results:
-            films += re.compile('href="/title/tt(\d{7})/" title="(.*?)"').findall(results[0])
-        print n
-        print len(films), 'films'
-        if n:
-            data = ox.web.imdb.read_url(n)
-        else:
-            with open('last.html', 'w') as f:
-                f.write(data)
-        if len(films) % 1000 == 0:
-            with codecs.open(filename, 'w', encoding='utf-8') as fd:
-                json.dump([{
-                    'imdbId': f[0],
-                    'title': ox.decode_html(f[1])
-                } for f in films], fd, indent=1, ensure_ascii=False)
-    with codecs.open(filename, 'w', encoding='utf-8') as fd:
-        json.dump([{
-            'imdbId': f[0],
-            'title': ox.decode_html(f[1])
-        } for f in films], fd, indent=1, ensure_ascii=False)
+    base_url = 'http://www.imdb.com'
+    #url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
+    year = 1880
+    added = 0
+    while year < datetime.now().year:
+        url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
+        data = ox.web.imdb.read_url(url, unicode=True)
+        n = True
+        page = 1
+        while n:
+            n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
+            if n:
+                n = '%s&page=%s' % (url, page)
+                page += 1
+            doc = lxml.html.fromstring(data)
+            article = doc.find_class('article')
+            if article:
+                article = article[0]
+            else:
+                n = None
+            for a in article.xpath('.//a'):
+                if '/title/tt' in a.attrib['href']:
+                    img = a.xpath('.//img')
+                    if img:
+                        id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
+                        if id not in films:
+                            title = img[0].attrib['alt']
+                            title = ox.decode_html(title)
+                            films[id] = title
+                            added += 1
+            print(len(films), 'films')
+            if n:
+                data = ox.web.imdb.read_url(n, unicode=True)
+            else:
+                with open('last.html', 'w') as f:
+                    f.write(data)
+            if added > 1000:
+                added = 0
+                write(films, filename)
+        year += 1
+        print('>> year', year)
+    write(films, filename)

View file

@@ -21,7 +21,7 @@ def load(data_json):
     reset_table(archive.models.Volume._meta.db_table)
     reset_table(models.Item._meta.db_table)
     transaction.commit_unless_managed()
-    os.system('rm -r /srv/pandora/data/files')
+    os.system('rm -r /srv/pandora/data/media')
     os.system('rm -r /srv/pandora/data/items')
     films = json.load(open(data_json))

View file

@@ -12,6 +12,7 @@ mini_series = filter(lambda x: 'Mini-Series' in x['title'], data)
 tv_series = filter(lambda x: 'TV Series' in x['title'], data)
 tv_movies = filter(lambda x: 'TV Movie' in x['title'], data)
 tv_special = filter(lambda x: 'TV Special' in x['title'], data)
+documentary = filter(lambda x: 'Documentary' in x['title'], data)

 #cinema = set(data) - set(mini_series) - set(tv_series) - set(tv_movies)
@@ -19,8 +20,9 @@ print len(tv_special), 'TV Specials'
 print len(tv_series), 'TV Series'
 print len(tv_movies), 'TV Movies'
 print len(mini_series), 'Mini-Series'
+print len(documentary), 'Documentaries'
 #print len(cinema), 'Cinema'
-print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special), 'Movies'
+print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special) - len(documentary), 'Movies'
 print len(data), 'total'