Compare commits
3 commits
ba93143980
...
322d63f234
| Author | SHA1 | Date | |
|---|---|---|---|
| 322d63f234 | |||
| f6ed20d87b | |||
| ccb54122cc |
6 changed files with 100 additions and 45 deletions
1
README
1
README
|
|
@ -1 +0,0 @@
|
|||
collection tools to create *cine.ma sites
|
||||
6
README.md
Normal file
6
README.md
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
# Collection of tools for creating *cine.ma sites
|
||||
|
||||
COUNTRY=in
|
||||
NAME=India
|
||||
python films_by_country.py $COUNTRY films_${COUNTRY}.json
|
||||
python add_metadata.py $NAME films_${COUNTRY}.json films_${COUNTRY}_metadata.json
|
||||
|
|
@ -1,15 +1,17 @@
|
|||
#!/usr/bin/python
|
||||
#!/usr/bin/python3
|
||||
from optparse import OptionParser
|
||||
import json
|
||||
import codecs
|
||||
import sys
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
import ox
|
||||
|
||||
def add_metadata(films, country, output):
|
||||
meta = []
|
||||
api = ox.API('https://indiancine.ma/api/')
|
||||
current_year = datetime.now().year
|
||||
|
||||
if os.path.exists(output):
|
||||
with open(output) as fd:
|
||||
|
|
@ -25,13 +27,12 @@ def add_metadata(films, country, output):
|
|||
if info['imdbId'] in known_ids:
|
||||
continue
|
||||
skip = False
|
||||
for key in ('Mini-Series', 'TV Series', 'TV Movie', 'TV Special'):
|
||||
for key in ('Mini-Series', 'TV Series', 'TV Movie', 'TV Special', 'Video Game'):
|
||||
if key in info['title']:
|
||||
skip = True
|
||||
if skip:
|
||||
continue
|
||||
|
||||
extra = api.getMetadata(id=info['imdbId'], keys=[
|
||||
keys = [
|
||||
'language', 'productionCompany', 'director',
|
||||
'runtime', 'alternativeTitles',
|
||||
'color', 'sound',
|
||||
|
|
@ -39,12 +40,25 @@ def add_metadata(films, country, output):
|
|||
'isSeries',
|
||||
'title',
|
||||
'originalTitle', 'year'
|
||||
])['data']
|
||||
print info
|
||||
print extra
|
||||
if 'isSeries' in extra or ('country' in extra and not country in extra['country']):
|
||||
]
|
||||
extra = api.getMetadata(id=info['imdbId'], keys=keys)['data']
|
||||
print(info)
|
||||
print(extra)
|
||||
if not extra:
|
||||
save()
|
||||
print('lets try again')
|
||||
extra = api.getMetadata(id=info['imdbId'], keys=keys)['data']
|
||||
print(extra)
|
||||
y = extra.get('year')
|
||||
if y:
|
||||
y = int(y)
|
||||
if '(????)' in info.get('title', '') or not y or y >= current_year:
|
||||
info['delete'] = True
|
||||
print 'deleting', info['imdbId'], info.get('title')
|
||||
print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year'))
|
||||
continue
|
||||
if 'isSeries' in extra or ('country' in extra and country not in extra['country']):
|
||||
info['delete'] = True
|
||||
print('deleting', info['imdbId'], info.get('title'))
|
||||
continue
|
||||
if 'originalTitle' in extra:
|
||||
info['alternativeTitles'] = [[info['title'], '']]
|
||||
|
|
@ -54,13 +68,14 @@ def add_metadata(films, country, output):
|
|||
for key in extra:
|
||||
if key not in info:
|
||||
info[key] = extra[key]
|
||||
print info['imdbId'], info['title']
|
||||
print(info['imdbId'], info['title'])
|
||||
meta.append(info)
|
||||
if len(meta) % 100 == 0:
|
||||
save()
|
||||
save()
|
||||
return meta
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
usage = "usage: %prog [options] country films.json films_with_metadata.json"
|
||||
parser = OptionParser(usage=usage)
|
||||
|
|
|
|||
|
|
@ -1,11 +1,15 @@
|
|||
#!/usr/bin/python
|
||||
#!/usr/bin/python3
|
||||
import ox.web.imdb
|
||||
import re
|
||||
import json
|
||||
import sys
|
||||
import codecs
|
||||
|
||||
from datetime import datetime
|
||||
from optparse import OptionParser
|
||||
|
||||
import lxml.html
|
||||
|
||||
'''
|
||||
python allofcountry.py in idsofindia.json
|
||||
python allofcountry.py tr idsofturkey.json
|
||||
|
|
@ -14,6 +18,19 @@ python allofcountry.py tr idsofturkey.json
|
|||
def reset_url(url):
    """Request ``url`` via ``ox.web.imdb.read_url`` with ``timeout=0``.

    The response is discarded and nothing is returned; the call is made
    purely for its effect on the ox side.

    NOTE(review): presumably ``timeout=0`` makes ox ignore or refresh its
    cached copy of the page -- confirm against the ox cache documentation.
    """
    ox.web.imdb.read_url(url, timeout=0)
|
||||
|
||||
|
||||
def write(films, filename):
    """Dump the collected films to ``filename`` as a JSON list.

    films: mapping of IMDb id -> title.
    filename: path of the JSON file to create or overwrite.

    Each entry is written as ``{"imdbId": ..., "title": ...}`` in the
    mapping's iteration order. The file is UTF-8 encoded and non-ASCII
    characters are stored as-is instead of ``\\uXXXX`` escapes.
    """
    data = [
        {'imdbId': imdb_id, 'title': title}
        for imdb_id, title in films.items()
    ]

    with codecs.open(filename, 'w', encoding='utf-8') as fd:
        json.dump(data, fd, indent=1, ensure_ascii=False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
usage = "usage: %prog [options] countrycode output.json"
|
||||
parser = OptionParser(usage=usage)
|
||||
|
|
@ -23,40 +40,56 @@ if __name__ == '__main__':
|
|||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
films = []
|
||||
films = {}
|
||||
country, filename = args
|
||||
current_year = datetime.now().strftime('Y')
|
||||
|
||||
if opts.reset:
|
||||
reset_url(opts.reset)
|
||||
|
||||
base_url = 'http://akas.imdb.com'
|
||||
url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
|
||||
data = ox.web.imdb.read_url(url)
|
||||
n = True
|
||||
while n:
|
||||
n = re.compile('<a href="(.*?)">Next »</a>').findall(data)
|
||||
if n:
|
||||
n = '%s%s' % (base_url, n[0].split('href="')[-1])
|
||||
|
||||
results = re.compile('<table class="results">(.*?)</table>', re.DOTALL).findall(data)
|
||||
if results:
|
||||
films += re.compile('href="/title/tt(\d{7})/" title="(.*?)"').findall(results[0])
|
||||
print n
|
||||
print len(films), 'films'
|
||||
if n:
|
||||
data = ox.web.imdb.read_url(n)
|
||||
else:
|
||||
with open('last.html', 'w') as f:
|
||||
f.write(data)
|
||||
if len(films) % 1000 == 0:
|
||||
with codecs.open(filename, 'w', encoding='utf-8') as fd:
|
||||
json.dump([{
|
||||
'imdbId': f[0],
|
||||
'title': ox.decode_html(f[1])
|
||||
} for f in films], fd, indent=1, ensure_ascii=False)
|
||||
base_url = 'http://www.imdb.com'
|
||||
#url = '%s/search/title?countries=%s&sort=year' % (base_url, country)
|
||||
year = 1880
|
||||
|
||||
with codecs.open(filename, 'w', encoding='utf-8') as fd:
|
||||
json.dump([{
|
||||
'imdbId': f[0],
|
||||
'title': ox.decode_html(f[1])
|
||||
} for f in films], fd, indent=1, ensure_ascii=False)
|
||||
added = 0
|
||||
|
||||
while year < datetime.now().year:
|
||||
url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)
|
||||
|
||||
data = ox.web.imdb.read_url(url, unicode=True)
|
||||
n = True
|
||||
page = 1
|
||||
while n:
|
||||
n = re.compile('Next »</a>', re.DOTALL).findall(data)
|
||||
if n:
|
||||
n = '%s&page=%s' % (url, page)
|
||||
page += 1
|
||||
doc = lxml.html.fromstring(data)
|
||||
article = doc.find_class('article')
|
||||
if article:
|
||||
article = article[0]
|
||||
else:
|
||||
n = None
|
||||
for a in article.xpath('.//a'):
|
||||
if '/title/tt' in a.attrib['href']:
|
||||
img = a.xpath('.//img')
|
||||
if img:
|
||||
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
|
||||
if id not in films:
|
||||
title = img[0].attrib['alt']
|
||||
title = ox.decode_html(title)
|
||||
films[id] = title
|
||||
added += 1
|
||||
print(len(films), 'films')
|
||||
if n:
|
||||
data = ox.web.imdb.read_url(n, unicode=True)
|
||||
else:
|
||||
with open('last.html', 'w') as f:
|
||||
f.write(data)
|
||||
if added > 1000:
|
||||
added = 0
|
||||
write(films, filename)
|
||||
year += 1
|
||||
print('>> year', year)
|
||||
|
||||
write(films, filename)
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ def load(data_json):
|
|||
reset_table(archive.models.Volume._meta.db_table)
|
||||
reset_table(models.Item._meta.db_table)
|
||||
transaction.commit_unless_managed()
|
||||
os.system('rm -r /srv/pandora/data/files')
|
||||
os.system('rm -r /srv/pandora/data/media')
|
||||
os.system('rm -r /srv/pandora/data/items')
|
||||
|
||||
films = json.load(open(data_json))
|
||||
|
|
|
|||
4
stats.py
4
stats.py
|
|
@ -12,6 +12,7 @@ mini_series = filter(lambda x: 'Mini-Series' in x['title'], data)
|
|||
tv_series = filter(lambda x: 'TV Series' in x['title'], data)
|
||||
tv_movies = filter(lambda x: 'TV Movie' in x['title'], data)
|
||||
tv_special = filter(lambda x: 'TV Special' in x['title'], data)
|
||||
documentary = filter(lambda x: 'Documentary' in x['title'], data)
|
||||
#cinema = set(data) - set(mini_series) - set(tv_series) - set(tv_movies)
|
||||
|
||||
|
||||
|
|
@ -19,8 +20,9 @@ print len(tv_special), 'TV Specials'
|
|||
print len(tv_series), 'TV Series'
|
||||
print len(tv_movies), 'TV Movies'
|
||||
print len(mini_series), 'Mini-Series'
|
||||
print len(documentary), 'Documentaries'
|
||||
#print len(cinema), 'Cinema'
|
||||
print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special), 'Movies'
|
||||
print len(data) - len(mini_series) - len(tv_movies) - len(tv_series) - len(tv_special) - len(documentary), 'Movies'
|
||||
print len(data), 'total'
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue