From 6524ceea8a776e04ab1b0ae91508f7b051b70b9a Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Mon, 26 Dec 2011 20:00:30 +0530
Subject: [PATCH] annotation cleanup

---
 README                       |  3 ++-
 pandora/annotation/models.py | 25 ++++++++++++------------
 pandora/annotation/utils.py  | 38 +++++++++---------------------------
 pandora/item/utils.py        |  2 +-
 pandora/padma.jsonc          |  2 +-
 pandora/user/models.py       |  1 +
 requirements.txt             |  1 +
 7 files changed, 27 insertions(+), 45 deletions(-)
diff --git a/README b/README
index 477656960..2cd6efeea 100644
--- a/README
+++ b/README
@@ -12,7 +12,8 @@ python, bazaar, pip and virtualenv and several other python modules:
     apt-get install bzr git subversion mercurial \
             python-setuptools python-pip python-virtualenv ipython \
             python-dev python-imaging python-numpy python-psycopg2 \
-            python-geoip postgresql rabbitmq-server
+            python-geoip python-html5lib python-lxml \
+            postgresql rabbitmq-server
     apt-get install oxframe oxtimeline 
 
 * Pan.do/ra
diff --git a/pandora/annotation/models.py b/pandora/annotation/models.py
index 009c03ffe..9c5088ade 100644
--- a/pandora/annotation/models.py
+++ b/pandora/annotation/models.py
@@ -34,6 +34,7 @@ class Annotation(models.Model):
 
     layer = models.CharField(max_length=255, db_index=True)
     value = models.TextField()
+    findvalue = models.TextField()
     sortvalue = models.CharField(max_length=1000, null=True, blank=True, db_index=True)
 
     def editable(self, user):
@@ -44,22 +45,25 @@ class Annotation(models.Model):
                 return True
         return False
 
-    def html(self):
-        if self.layer == 'string':
-            return utils.html_parser(self.value)
-        else:
-            return self.value
-    
     def set_public_id(self):
         if self.id:
             public_id = Annotation.objects.filter(item=self.item, id__lt=self.id).count() + 1
             self.public_id = "%s/%s" % (self.item.itemId, ox.toAZ(public_id))
             Annotation.objects.filter(id=self.id).update(public_id=self.public_id)
 
+    def get_layer(self):
+        # Config entry whose 'id' equals self.layer; {} when none matches.
+        matches = (entry for entry in settings.CONFIG['layers']
+                   if entry['id'] == self.layer)
+        return next(matches, {})
+
     def save(self, *args, **kwargs):
         set_public_id = not self.id or not self.public_id
+        layer = self.get_layer()  # config dict for self.layer; {} if unknown
         if self.value:
-            sortvalue = ox.stripTags(self.value).strip()
+            self.value = utils.cleanup_value(self.value, layer.get('type'))
+            self.findvalue = ox.stripTags(self.value).strip()
+            sortvalue = self.findvalue
             sortvalue = sort_string(sortvalue)
             if sortvalue:
                 self.sortvalue = sortvalue[:1000]
@@ -69,12 +73,7 @@ class Annotation(models.Model):
             self.sortvalue = None
 
         #no clip or update clip
-        def get_layer(id):
-            for l in settings.CONFIG['layers']:
-                if l['id'] == id:
-                    return l
-            return {}
-        private = get_layer(self.layer).get('private', False)
+        private = layer.get('private', False)
         if not private:
             if not self.clip or self.start != self.clip.start or self.end != self.clip.end:
                 self.clip, created = Clip.get_or_create(self.item, self.start, self.end)
diff --git a/pandora/annotation/utils.py b/pandora/annotation/utils.py
index 79ace73df..95328d7a0 100644
--- a/pandora/annotation/utils.py
+++ b/pandora/annotation/utils.py
@@ -2,37 +2,17 @@
 # ci:si:et:sw=4:sts=4:ts=4
 import re
 import ox
+import html5lib
 
 
-def html_parser(text, nofollow=True):
-    text = text.replace('<i>', '__i__').replace('</i>', '__/i__')
-    text = text.replace('<b>', '__b__').replace('</b>', '__/b__')
-    #truns links into wiki links, make sure to only take http links
-    text = re.sub('<a .*?href="(http.*?)".*?>(.*?)</a>', '[\\1 \\2]', text)
-    text = ox.escape(text)
-    text = text.replace('__i__', '<i>').replace('__/i__', '</i>')
-    text = text.replace('__b__', '<b>').replace('__/b__', '</b>')
-    if nofollow:
-        nofollow_rel = ' rel="nofollow"'
+def cleanup_value(value, layer_type):
+    # FIXME: only 'text' is special-cased; other types (location etc.) fall through to ox.stripTags
+    if layer_type == 'text':
+        value = sanitize_fragment(value)
     else:
-        nofollow_rel = ''
+        value = ox.stripTags(value)
+    return value
 
-    links = re.compile('(\[(http.*?) (.*?)\])').findall(text)
-    for t, link, txt in links:
-        link = link.replace('http', '__LINK__').replace('.', '__DOT__')
-        ll = '<a href="%s"%s>%s</a>' % (link, nofollow_rel, txt)
-        text = text.replace(t, ll)
-    links = re.compile('(\[(http.*?)\])').findall(text)
-    for t, link in links:
-        link = link.replace('http', '__LINK__').replace('.', '__DOT__')
-        ll = '<a href="%s"%s>%s</a>' % (link, nofollow_rel, link)
-        text = text.replace(t, ll)
+def sanitize_fragment(html):  # NOTE(review): parse + re-serialize only, no sanitizer passed -- confirm
+    return html5lib.parseFragment(html).toxml()
 
-    text = ox.urlize(text, nofollow=nofollow)
-
-    #inpage links
-    text = re.sub('\[(/.+?) (.+?)\]', '<a href="\\1">\\2</a>', text)
-
-    text = text.replace('__LINK__', 'http').replace('__DOT__', '.')
-    text = text.replace("\n", '<br />')
-    return text
diff --git a/pandora/item/utils.py b/pandora/item/utils.py
index f72a1bbfd..c7c851b46 100644
--- a/pandora/item/utils.py
+++ b/pandora/item/utils.py
@@ -44,7 +44,7 @@ def sort_string(string):
 
     #pad numbered titles
     string = re.sub('(\d+)', lambda x: '%010d' % int(x.group(0)), string)
-    return unicodedata.normalize('NFKD', string)
+    return unicodedata.normalize('NFKD', string).lower()
 
 
 def sort_title(title):
diff --git a/pandora/padma.jsonc b/pandora/padma.jsonc
index afccd3a3f..aa33c76fc 100644
--- a/pandora/padma.jsonc
+++ b/pandora/padma.jsonc
@@ -415,7 +415,7 @@
             "id": "keywords",
             "title": "Keywords",
             "overlap": true,
-            "type": "text"
+            "type": "string"
         },
         {
             "id": "descriptions",
diff --git a/pandora/user/models.py b/pandora/user/models.py
index 99a5d4bb4..11dcac9fb 100644
--- a/pandora/user/models.py
+++ b/pandora/user/models.py
@@ -187,6 +187,7 @@ def user_post_save(sender, instance, **kwargs):
     profile, new = UserProfile.objects.get_or_create(user=instance)
     if new and instance.is_superuser:
         profile.level = len(settings.CONFIG['userLevels']) - 1
+        profile.newsletter = settings.CONFIG['user']['newsletter']
         profile.save()
     SessionData.objects.filter(user=instance).update(level=profile.level,
                                                      username=instance.username)
diff --git a/requirements.txt b/requirements.txt
index b37bc72b5..66b3ab6eb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,4 @@ django-celery>2.1.1
 -e git://github.com/bit/django-extensions.git#egg=django_extensions
 -e git+git://github.com/dcramer/django-devserver#egg=django_devserver
 gunicorn
+html5lib