Compare commits

No commits in common. "322d63f2348e15bd3095d9fbef09d1d70c697065" and "ba93143980fa51959a7dbfdcd2c5af704e3585e6" have entirely different histories.

6 changed files with 45 additions and 100 deletions

README (new file)

@@ -0,0 +1 @@
+collection tools to create *cine.ma sites

@@ -1,6 +0,0 @@
-# collection tools to create *cine.ma sites
-COUNTRY=in
-NAME=India
-python films_by_country.py $COUNTRY films_${COUNTRY}.json
-python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json

@@ -1,17 +1,15 @@
-#!/usr/bin/python3
+#!/usr/bin/python
 from optparse import OptionParser
 import json
 import codecs
 import sys
 import os
-from datetime import datetime
 import ox
 def add_metadata(films, country, output):
     meta = []
     api = ox.API('https://indiancine.ma/api/')
-    current_year = datetime.now().year
     if os.path.exists(output):
         with open(output) as fd:
@@ -27,12 +25,13 @@ def add_metadata(films, country, output):
         if info['imdbId'] in known_ids:
             continue
         skip = False
-        for key in ('Mini-Series', 'TV Series', 'TV Movie', 'TV Special', 'Video Game'):
+        for key in ('Mini-Series', 'TV Series', 'TV Movie', 'TV Special'):
             if key in info['title']:
                 skip = True
         if skip:
             continue
-        keys = [
+        extra = api.getMetadata(id=info['imdbId'], keys=[
             'language', 'productionCompany', 'director',
             'runtime', 'alternativeTitles',
             'color', 'sound',
@@ -40,25 +39,12 @@ def add_metadata(films, country, output):
             'isSeries',
             'title',
             'originalTitle', 'year'
-        ]
-        extra = api.getMetadata(id=info['imdbId'], keys=keys)['data']
-        print(info)
-        print(extra)
-        if not extra:
-            save()
-            print('lets try again')
-            extra = api.getMetadata(id=info['imdbId'], keys=keys)['data']
-            print(extra)
-        y = extra.get('year')
-        if y:
-            y = int(y)
-        if '(????)' in info.get('title', '') or not y or y >= current_year:
-            info['delete'] = True
-            print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year'))
-            continue
-        if 'isSeries' in extra or ('country' in extra and country not in extra['country']):
-            info['delete'] = True
-            print('deleting', info['imdbId'], info.get('title'))
+        ])['data']
+        print info
+        print extra
+        if 'isSeries' in extra or ('country' in extra and not country in extra['country']):
+            info['delete'] = True
+            print 'deleting', info['imdbId'], info.get('title')
             continue
         if 'originalTitle' in extra:
             info['alternativeTitles'] = [[info['title'], '']]
@@ -68,14 +54,13 @@ def add_metadata(films, country, output):
         for key in extra:
             if key not in info:
                 info[key] = extra[key]
-        print(info['imdbId'], info['title'])
+        print info['imdbId'], info['title']
         meta.append(info)
         if len(meta) % 100 == 0:
             save()
     save()
     return meta
 if __name__ == '__main__':
     usage = "usage: %prog [options] country films.json films_with_metadata.json"
     parser = OptionParser(usage=usage)

@@ -1,15 +1,11 @@
-#!/usr/bin/python3
+#!/usr/bin/python
 import ox.web.imdb
 import re
 import json
 import sys
 import codecs
-from datetime import datetime
 from optparse import OptionParser
-import lxml.html
 '''
 python allofcountry.py in idsofindia.json
 python allofcountry.py tr idsofturkey.json
@@ -18,19 +14,6 @@ python allofcountry.py tr idsofturkey.json
 def reset_url(url):
     x = ox.web.imdb.read_url(url, timeout=0)
-def write(films, filename):
-    data = []
-    for id, title in films.items():
-        data.append({
-            'imdbId': id,
-            'title': title
-        })
-    with codecs.open(filename, 'w', encoding='utf-8') as fd:
-        json.dump(data, fd, indent=1, ensure_ascii=False)
 if __name__ == '__main__':
     usage = "usage: %prog [options] countrycode output.json"
     parser = OptionParser(usage=usage)
@@ -40,56 +23,40 @@ if __name__ == '__main__':
         parser.print_help()
         sys.exit(1)
-    films = {}
+    films = []
     country, filename = args
-    current_year = datetime.now().strftime('Y')
     if opts.reset:
         reset_url(opts.reset)
-    base_url = 'http://www.imdb.com'
-    #url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
-    year = 1880
-    added = 0
-    while year < datetime.now().year:
-        url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
-        data = ox.web.imdb.read_url(url, unicode=True)
-        n = True
-        page = 1
-        while n:
-            n = re.compile('Next &#187;</a>', re.DOTALL).findall(data)
-            if n:
-                n = '%s&page=%s' % (url, page)
-                page += 1
-            doc = lxml.html.fromstring(data)
-            article = doc.find_class('article')
-            if article:
-                article = article[0]
-            else:
-                n = None
-            for a in article.xpath('.//a'):
-                if '/title/tt' in a.attrib['href']:
-                    img = a.xpath('.//img')
-                    if img:
-                        id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-                        if id not in films:
-                            title = img[0].attrib['alt']
-                            title = ox.decode_html(title)
-                            films[id] = title
-                            added += 1
-            print(len(films), 'films')
-            if n:
-                data = ox.web.imdb.read_url(n, unicode=True)
-            else:
-                with open('last.html', 'w') as f:
-                    f.write(data)
-            if added > 1000:
-                added = 0
-                write(films, filename)
-        year += 1
-        print('>> year', year)
-    write(films, filename)
+    base_url = 'http://akas.imdb.com'
+    url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
+    data = ox.web.imdb.read_url(url)
+    n = True
+    while n:
+        n = re.compile('<a href="(.*?)">Next&nbsp;&raquo;</a>').findall(data)
+        if n:
+            n = '%s%s' % (base_url, n[0].split('href="')[-1])
+        results = re.compile('<table class="results">(.*?)</table>', re.DOTALL).findall(data)
+        if results:
+            films += re.compile('href="/title/tt(\d{7})/" title="(.*?)"').findall(results[0])
+        print n
+        print len(films), 'films'
+        if n:
+            data = ox.web.imdb.read_url(n)
+        else:
+            with open('last.html', 'w') as f:
+                f.write(data)
+        if len(films) % 1000 == 0:
+            with codecs.open(filename, 'w', encoding='utf-8') as fd:
+                json.dump([{
+                    'imdbId': f[0],
+                    'title': ox.decode_html(f[1])
+                } for f in films], fd, indent=1, ensure_ascii=False)
+    with codecs.open(filename, 'w', encoding='utf-8') as fd:
+        json.dump([{
+            'imdbId': f[0],
+            'title': ox.decode_html(f[1])
+        } for f in films], fd, indent=1, ensure_ascii=False)

@@ -21,7 +21,7 @@ def load(data_json):
     reset_table(archive.models.Volume._meta.db_table)
     reset_table(models.Item._meta.db_table)
     transaction.commit_unless_managed()
-    os.system('rm -r /srv/pandora/data/media')
+    os.system('rm -r /srv/pandora/data/files')
     os.system('rm -r /srv/pandora/data/items')
     films = json.load(open(data_json))

@@ -12,7 +12,6 @@ mini_series = filter(lambda x: 'Mini-Series' in x['title'], data)
 tv_series = filter(lambda x: 'TV Series' in x['title'], data)
 tv_movies = filter(lambda x: 'TV Movie' in x['title'], data)
 tv_special = filter(lambda x: 'TV Special' in x['title'], data)
-documentary = filter(lambda x: 'Documentary' in x['title'], data)
 #cinema = set(data) - set(mini_series) - set(tv_series) - set(tv_movies)
@@ -20,9 +19,8 @@ print len(tv_special), 'TV Specials'
 print len(tv_series), 'TV Series'
 print len(tv_movies), 'TV Movies'
 print len(mini_series), 'Mini-Series'
-print len(documentary), 'Documentaries'
 #print len(cinema), 'Cinema'
-print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special) - len(documentary), 'Movies'
+print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special), 'Movies'
 print len(data), 'total'