Add resolution/pages to documents; use poppler-utils (pdfinfo) to read the page count of PDFs.

This commit is contained in:
j 2014-01-04 12:29:11 +00:00
parent 5705c9e74d
commit ff13a30639
6 changed files with 198 additions and 10 deletions

2
README
View file

@ -30,7 +30,7 @@ To run pan.do/ra you need to install and setup:
python-gst0.10 gstreamer0.10-plugins-good gstreamer0.10-plugins-bad \
postgresql postgresql-contrib rabbitmq-server \
ffmpeg2theora libav-tools libavcodec-extra-53 \
python-ox oxframe imagemagick
python-ox oxframe imagemagick poppler-utils
* Prepare Environment

View file

@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-
import datetime
from south.db import db
from south.v2 import SchemaMigration
from django.db import models
class Migration(SchemaMigration):
    """Schema migration adding ``pages``, ``width`` and ``height`` to Document."""

    def forwards(self, orm):
        """Add the three integer columns to the ``document_document`` table."""
        # Every column is an IntegerField created with default=-1 and
        # keep_default=False; the order is pages, width, height.
        for column in ('pages', 'width', 'height'):
            field = self.gf('django.db.models.fields.IntegerField')(default=-1)
            db.add_column('document_document', column, field,
                          keep_default=False)

    def backwards(self, orm):
        """Drop the columns that ``forwards`` added, in the same order."""
        for column in ('pages', 'width', 'height'):
            db.delete_column('document_document', column)

    # Frozen model definitions used by South to build the ``orm`` argument.
    models = {
        'auth.group': {
            'Meta': {'object_name': 'Group'},
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '255'}),
            'permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'})
        },
        'auth.permission': {
            'Meta': {'ordering': "('content_type__app_label', 'content_type__model', 'codename')", 'unique_together': "(('content_type', 'codename'),)", 'object_name': 'Permission'},
            'codename': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
            'content_type': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['contenttypes.ContentType']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'name': ('django.db.models.fields.CharField', [], {'max_length': '50'})
        },
        'auth.user': {
            'Meta': {'object_name': 'User'},
            'date_joined': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
            'email': ('django.db.models.fields.EmailField', [], {'max_length': '255', 'blank': 'True'}),
            'first_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
            'groups': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Group']", 'symmetrical': 'False', 'blank': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'is_active': ('django.db.models.fields.BooleanField', [], {'default': 'True'}),
            'is_staff': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'is_superuser': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'last_login': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}),
            'last_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}),
            'password': ('django.db.models.fields.CharField', [], {'max_length': '255'}),
            'user_permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': "orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'}),
            'username': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '255'})
        },
        'contenttypes.contenttype': {
            'Meta': {'ordering': "('name',)", 'unique_together': "(('app_label', 'model'),)", 'object_name': 'ContentType', 'db_table': "'django_content_type'"},
            'app_label': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'model': ('django.db.models.fields.CharField', [], {'max_length': '100'}),
            'name': ('django.db.models.fields.CharField', [], {'max_length': '100'})
        },
        'document.document': {
            'Meta': {'unique_together': "(('user', 'name', 'extension'),)", 'object_name': 'Document'},
            'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
            'description': ('django.db.models.fields.TextField', [], {'default': "''"}),
            'description_sort': ('django.db.models.fields.CharField', [], {'max_length': '512'}),
            'extension': ('django.db.models.fields.CharField', [], {'max_length': '255'}),
            'file': ('django.db.models.fields.files.FileField', [], {'default': 'None', 'max_length': '100', 'null': 'True', 'blank': 'True'}),
            'height': ('django.db.models.fields.IntegerField', [], {'default': '-1'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'items': ('django.db.models.fields.related.ManyToManyField', [], {'related_name': "'documents'", 'symmetrical': 'False', 'through': "orm['document.ItemProperties']", 'to': "orm['item.Item']"}),
            'matches': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'modified': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
            'name': ('django.db.models.fields.CharField', [], {'max_length': '255'}),
            'name_sort': ('django.db.models.fields.CharField', [], {'max_length': '255'}),
            'oshash': ('django.db.models.fields.CharField', [], {'max_length': '16', 'unique': 'True', 'null': 'True'}),
            'pages': ('django.db.models.fields.IntegerField', [], {'default': '-1'}),
            'ratio': ('django.db.models.fields.FloatField', [], {'default': '1'}),
            'size': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'uploading': ('django.db.models.fields.BooleanField', [], {'default': 'False'}),
            'user': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'files'", 'to': "orm['auth.User']"}),
            'width': ('django.db.models.fields.IntegerField', [], {'default': '-1'})
        },
        'document.itemproperties': {
            'Meta': {'unique_together': "(('item', 'document'),)", 'object_name': 'ItemProperties'},
            'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
            'description': ('django.db.models.fields.TextField', [], {'default': "''"}),
            'document': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'descriptions'", 'to': "orm['document.Document']"}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'index': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'item': ('django.db.models.fields.related.ForeignKey', [], {'to': "orm['item.Item']"}),
            'modified': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'})
        },
        'item.item': {
            'Meta': {'object_name': 'Item'},
            'created': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
            'data': ('ox.django.fields.DictField', [], {'default': '{}'}),
            'external_data': ('ox.django.fields.DictField', [], {'default': '{}'}),
            'groups': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "'items'", 'blank': 'True', 'to': "orm['auth.Group']"}),
            'icon': ('django.db.models.fields.files.ImageField', [], {'default': 'None', 'max_length': '100', 'blank': 'True'}),
            'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
            'itemId': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '128', 'blank': 'True'}),
            'json': ('ox.django.fields.DictField', [], {'default': '{}'}),
            'level': ('django.db.models.fields.IntegerField', [], {'db_index': 'True'}),
            'modified': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'blank': 'True'}),
            'oxdbId': ('django.db.models.fields.CharField', [], {'max_length': '42', 'unique': 'True', 'null': 'True', 'blank': 'True'}),
            'poster': ('django.db.models.fields.files.ImageField', [], {'default': 'None', 'max_length': '100', 'blank': 'True'}),
            'poster_frame': ('django.db.models.fields.FloatField', [], {'default': '-1'}),
            'poster_height': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'poster_source': ('django.db.models.fields.TextField', [], {'blank': 'True'}),
            'poster_width': ('django.db.models.fields.IntegerField', [], {'default': '0'}),
            'rendered': ('django.db.models.fields.BooleanField', [], {'default': 'False', 'db_index': 'True'}),
            'stream_aspect': ('django.db.models.fields.FloatField', [], {'default': '1.3333333333333333'}),
            'stream_info': ('ox.django.fields.DictField', [], {'default': '{}'}),
            'torrent': ('django.db.models.fields.files.FileField', [], {'default': 'None', 'max_length': '1000', 'blank': 'True'}),
            'user': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'items'", 'null': 'True', 'to': "orm['auth.User']"})
        }
    }

    complete_apps = ['document']

View file

@ -18,6 +18,7 @@ from item.models import Item
from archive.extract import resize_image
import managers
import utils
class Document(models.Model):
@ -34,6 +35,9 @@ class Document(models.Model):
size = models.IntegerField(default=0)
matches = models.IntegerField(default=0)
ratio = models.FloatField(default=1)
pages = models.IntegerField(default=-1)
width = models.IntegerField(default=-1)
height = models.IntegerField(default=-1)
description = models.TextField(default="")
oshash = models.CharField(max_length=16, unique=True, null=True)
@ -53,11 +57,13 @@ class Document(models.Model):
self.size = self.file.size
if self.extension == 'pdf' and not os.path.exists(self.thumbnail()):
self.make_thumbnail()
self.get_info()
self.name_sort = ox.sort_string(self.name or u'')[:255].lower()
self.description_sort = ox.sort_string(self.description or u'')[:512].lower()
super(Document, self).save(*args, **kwargs)
self.update_matches()
def __unicode__(self):
return self.get_id()
@ -67,6 +73,7 @@ class Document(models.Model):
if created:
p.index = ItemProperties.objects.filter(item=item).aggregate(Max('index'))['index__max'] + 1
p.save()
p.document.update_matches()
def remove(self, item):
ItemProperties.objects.filter(item=item, document=self).delete()
@ -110,6 +117,10 @@ class Document(models.Model):
p.description = ox.sanitize_html(data['description'])
p.save()
@property
def resolution(self):
    """Return the stored dimensions as a two-element ``[width, height]`` list."""
    width, height = self.width, self.height
    return [width, height]
def json(self, keys=None, user=None, item=None):
if not keys:
keys=[
@ -123,6 +134,10 @@ class Document(models.Model):
'ratio',
'user'
]
if self.extension == 'pdf':
keys.append('pages')
else:
keys.append('resolution')
response = {}
_map = {
}
@ -162,6 +177,7 @@ class Document(models.Model):
f.write(chunk.read())
if done:
self.uploading = False
self.get_info()
self.get_ratio()
self.oshash = ox.oshash(self.file.path)
self.save()
@ -178,6 +194,8 @@ class Document(models.Model):
else:
path = src
if os.path.exists(src) and not os.path.exists(path):
image_size = max(self.width, self.height)
if image_size == -1:
image_size = max(*Image.open(src).size)
if size > image_size:
path = src
@ -193,12 +211,25 @@ class Document(models.Model):
p = subprocess.Popen(cmd)
p.wait()
def get_info(self):
    """Fill in the page count (PDFs) or the pixel dimensions (other files).

    For a document with extension ``'pdf'`` whose ``pages`` is still -1,
    ``pages`` is read with ``utils.pdfpages`` and ``width``/``height`` are
    both set to -1. For any other extension whose ``width`` is still -1,
    ``pages`` is set to -1 and ``width``/``height`` are taken from the size
    reported by ``Image.open`` for the file. Values that are already set
    are left untouched. Nothing is saved here; the caller persists the
    changed fields.
    """
    if self.extension == 'pdf':
        if self.pages == -1:
            # Both dimensions must be -1 for PDFs. Other code in this class
            # checks ``self.width > 0`` and ``max(width, height) == -1`` to
            # decide whether a stored resolution exists, so a positive
            # width here would be mistaken for real pixel dimensions.
            self.width = -1
            self.height = -1
            self.pages = utils.pdfpages(self.file.path)
    elif self.width == -1:
        self.pages = -1
        self.width, self.height = Image.open(self.file.path).size
def get_ratio(self):
if self.extension == 'pdf':
self.make_thumbnail()
image = self.thumbnail()
else:
image = self.file.path
if self.width > 0:
size = self.resolution
else:
try:
size = Image.open(image).size
except:
@ -213,7 +244,7 @@ class Document(models.Model):
url = unquote(urls[0])
if url != urls[0]:
urls.append(url)
matches = 0
matches = self.items.count()
for url in urls:
matches += annotation.models.Annotation.objects.filter(value__contains=url).count()
matches += item.models.Item.objects.filter(data__contains=url).count()

19
pandora/document/utils.py Normal file
View file

@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import subprocess
def pdfpages(pdf):
    """Return the page count that ``pdfinfo`` reports for *pdf* as an int.

    Falls back to 0 when the parsed output has no ``pages`` entry.
    """
    info = pdfinfo(pdf)
    page_count = info.get('pages', '0')
    return int(page_count)
def pdfinfo(pdf):
    """Run the ``pdfinfo`` command on *pdf* and return its output as a dict.

    Each line of standard output is split at the first colon: the
    lower-cased, stripped text before it becomes the key and the stripped
    remainder becomes the value, so values may themselves contain colons.
    Lines whose key is empty are skipped. Standard error is captured and
    discarded, and the exit status is not checked, so a run that prints
    nothing yields an empty dict.
    """
    cmd = ['pdfinfo', pdf]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    # communicate() returns bytes on Python 3, where splitting on a str
    # separator raises TypeError. On Python 2 the output is already str
    # and is left untouched.
    if not isinstance(stdout, str):
        stdout = stdout.decode('utf-8', 'replace')
    data = {}
    for line in stdout.strip().split('\n'):
        key, _, value = line.partition(':')
        key = key.lower().strip()
        if key:
            data[key] = value.strip()
    return data

View file

@ -86,6 +86,10 @@ def _order_query(qs, sort):
'name': 'name_sort',
'description': 'description_sort',
}.get(e['key'], e['key'])
if key == 'resolution':
order_by.append('%swidth'%operator)
order_by.append('%sheight'%operator)
else:
order = '%s%s' % (operator, key)
order_by.append(order)
if order_by:

View file

@ -46,6 +46,7 @@ apt-get install -y \
libav-tools \
ffmpeg2theora \
imagemagick \
poppler-utils \
ipython \
postfix \
postgresql \