From 6524ceea8a776e04ab1b0ae91508f7b051b70b9a Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Mon, 26 Dec 2011 20:00:30 +0530 Subject: [PATCH] annotation cleanup --- README | 3 ++- pandora/annotation/models.py | 25 ++++++++++++------------ pandora/annotation/utils.py | 38 +++++++++--------------------------- pandora/item/utils.py | 2 +- pandora/padma.jsonc | 2 +- pandora/user/models.py | 1 + requirements.txt | 1 + 7 files changed, 27 insertions(+), 45 deletions(-) diff --git a/README b/README index 477656960..2cd6efeea 100644 --- a/README +++ b/README @@ -12,7 +12,8 @@ python, bazaar, pip and virtualenv and several other python modules: apt-get install bzr git subversion mercurial \ python-setuptools python-pip python-virtualenv ipython \ python-dev python-imaging python-numpy python-psycopg2 \ - python-geoip postgresql rabbitmq-server + python-geoip python-html5lib python-lxml \ + postgresql rabbitmq-server apt-get install oxframe oxtimeline * Pan.do/ra diff --git a/pandora/annotation/models.py b/pandora/annotation/models.py index 009c03ffe..9c5088ade 100644 --- a/pandora/annotation/models.py +++ b/pandora/annotation/models.py @@ -34,6 +34,7 @@ class Annotation(models.Model): layer = models.CharField(max_length=255, db_index=True) value = models.TextField() + findvalue = models.TextField() sortvalue = models.CharField(max_length=1000, null=True, blank=True, db_index=True) def editable(self, user): @@ -44,22 +45,25 @@ class Annotation(models.Model): return True return False - def html(self): - if self.layer == 'string': - return utils.html_parser(self.value) - else: - return self.value - def set_public_id(self): if self.id: public_id = Annotation.objects.filter(item=self.item, id__lt=self.id).count() + 1 self.public_id = "%s/%s" % (self.item.itemId, ox.toAZ(public_id)) Annotation.objects.filter(id=self.id).update(public_id=self.public_id) + def get_layer(self): + for layer in settings.CONFIG['layers']: + if layer['id'] == self.layer: + return layer + 
return {} + def save(self, *args, **kwargs): set_public_id = not self.id or not self.public_id + layer = self.get_layer() if self.value: - sortvalue = ox.stripTags(self.value).strip() + self.value = utils.cleanup_value(self.value, layer.get('type')) + self.findvalue = ox.stripTags(self.value).strip() + sortvalue = self.findvalue sortvalue = sort_string(sortvalue) if sortvalue: self.sortvalue = sortvalue[:1000] @@ -69,12 +73,7 @@ class Annotation(models.Model): self.sortvalue = None #no clip or update clip - def get_layer(id): - for l in settings.CONFIG['layers']: - if l['id'] == id: - return l - return {} - private = get_layer(self.layer).get('private', False) + private = layer.get('private', False) if not private: if not self.clip or self.start != self.clip.start or self.end != self.clip.end: self.clip, created = Clip.get_or_create(self.item, self.start, self.end) diff --git a/pandora/annotation/utils.py b/pandora/annotation/utils.py index 79ace73df..95328d7a0 100644 --- a/pandora/annotation/utils.py +++ b/pandora/annotation/utils.py @@ -2,37 +2,17 @@ # ci:si:et:sw=4:sts=4:ts=4 import re import ox +import html5lib -def html_parser(text, nofollow=True): - text = text.replace('', '__i__').replace('', '__/i__') - text = text.replace('', '__b__').replace('', '__/b__') - #truns links into wiki links, make sure to only take http links - text = re.sub('(.*?)', '[\\1 \\2]', text) - text = ox.escape(text) - text = text.replace('__i__', '').replace('__/i__', '') - text = text.replace('__b__', '').replace('__/b__', '') - if nofollow: - nofollow_rel = ' rel="nofollow"' +def cleanup_value(value, layer_type): + #FIXME: what about other types? location etc + if layer_type == 'text': + value = sanitize_fragment(value) else: - nofollow_rel = '' + value = ox.stripTags(value) + return value - links = re.compile('(\[(http.*?) 
(.*?)\])').findall(text) - for t, link, txt in links: - link = link.replace('http', '__LINK__').replace('.', '__DOT__') - ll = '%s' % (link, nofollow_rel, txt) - text = text.replace(t, ll) - links = re.compile('(\[(http.*?)\])').findall(text) - for t, link in links: - link = link.replace('http', '__LINK__').replace('.', '__DOT__') - ll = '%s' % (link, nofollow_rel, link) - text = text.replace(t, ll) +def sanitize_fragment(html): + return html5lib.parseFragment(html).toxml() - text = ox.urlize(text, nofollow=nofollow) - - #inpage links - text = re.sub('\[(/.+?) (.+?)\]', '\\2', text) - - text = text.replace('__LINK__', 'http').replace('__DOT__', '.') - text = text.replace("\n", '
') - return text diff --git a/pandora/item/utils.py b/pandora/item/utils.py index f72a1bbfd..c7c851b46 100644 --- a/pandora/item/utils.py +++ b/pandora/item/utils.py @@ -44,7 +44,7 @@ def sort_string(string): #pad numbered titles string = re.sub('(\d+)', lambda x: '%010d' % int(x.group(0)), string) - return unicodedata.normalize('NFKD', string) + return unicodedata.normalize('NFKD', string).lower() def sort_title(title): diff --git a/pandora/padma.jsonc b/pandora/padma.jsonc index afccd3a3f..aa33c76fc 100644 --- a/pandora/padma.jsonc +++ b/pandora/padma.jsonc @@ -415,7 +415,7 @@ "id": "keywords", "title": "Keywords", "overlap": true, - "type": "text" + "type": "string" }, { "id": "descriptions", diff --git a/pandora/user/models.py b/pandora/user/models.py index 99a5d4bb4..11dcac9fb 100644 --- a/pandora/user/models.py +++ b/pandora/user/models.py @@ -187,6 +187,7 @@ def user_post_save(sender, instance, **kwargs): profile, new = UserProfile.objects.get_or_create(user=instance) if new and instance.is_superuser: profile.level = len(settings.CONFIG['userLevels']) - 1 + profile.newsletter = settings.CONFIG['user']['newsletter'] profile.save() SessionData.objects.filter(user=instance).update(level=profile.level, username=instance.username) diff --git a/requirements.txt b/requirements.txt index b37bc72b5..66b3ab6eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ django-celery>2.1.1 -e git://github.com/bit/django-extensions.git#egg=django_extensions -e git+git://github.com/dcramer/django-devserver#egg=django_devserver gunicorn +html5lib