# -*- coding: utf-8 -*-
# ci:si:et:sw=4:sts=4:ts=4


def html_parser(text, nofollow=True):
    """Convert lightly marked-up annotation text into an HTML fragment.

    Processing order (the order matters):

    1. ``<i>``/``<b>`` tags are swapped for neutral ``__i__``-style markers
       and ``<a href="http...">label</a>`` anchors are rewritten as wiki
       links ``[url label]`` so they survive the ``escape`` call.
    2. The whole text is passed through ``escape``.
    3. The markers are turned back into ``<i>``/``<b>`` tags and wiki links
       ``[http... label]`` / ``[http...]`` become anchors again.
    4. ``urlize`` is applied to the remaining text.
    5. In-page wiki links ``[/path label]`` become anchors.
    6. Newlines become ``<br />``.

    NOTE(review): the tag literals (``<i>``, ``<a href=...>``, ``<br />`` ...)
    were empty strings in the reviewed source, which made the function raise
    (bad group reference / wrong number of format arguments). They are
    restored here from what the surrounding logic requires -- confirm
    against the upstream file.

    NOTE(review): ``escape`` and ``urlize`` are not imported anywhere in the
    visible file; presumably they are ``django.utils.html.escape`` and
    ``django.utils.html.urlize`` -- confirm and add the import at module
    level.

    :param text: source text, possibly containing ``<i>``, ``<b>``, ``<a>``
        tags and ``[url label]`` wiki links.
    :param nofollow: when true, generated external anchors carry
        ``rel="nofollow"``; the flag is also forwarded to ``urlize``.
    :returns: the converted text.
    """
    # Local import: the module has no import block of its own.
    import re

    def _mask(url):
        # Hide 'http' and dots so the later urlize() pass does not see the
        # href of an anchor we already built as a fresh bare URL.
        return url.replace('http', '__LINK__').replace('.', '__DOT__')

    # Protect the allowed inline tags from escape().
    text = text.replace('<i>', '__i__').replace('</i>', '__/i__')
    text = text.replace('<b>', '__b__').replace('</b>', '__/b__')
    # Turn anchors into wiki links; only http links are taken.
    text = re.sub(r'<a .*?href="(http.*?)".*?>(.*?)</a>', r'[\1 \2]', text)
    text = escape(text)
    text = text.replace('__i__', '<i>').replace('__/i__', '</i>')
    text = text.replace('__b__', '<b>').replace('__/b__', '</b>')

    nofollow_rel = ' rel="nofollow"' if nofollow else ''

    # Wiki links with a label: [http://example.com label]
    for whole, link, label in re.findall(r'(\[(http.*?) (.*?)\])', text):
        anchor = '<a href="%s"%s>%s</a>' % (_mask(link), nofollow_rel, label)
        text = text.replace(whole, anchor)
    # Wiki links without a label: [http://example.com]; the (masked) URL
    # doubles as the link text and is unmasked together with the href below.
    for whole, link in re.findall(r'(\[(http.*?)\])', text):
        masked = _mask(link)
        anchor = '<a href="%s"%s>%s</a>' % (masked, nofollow_rel, masked)
        text = text.replace(whole, anchor)

    text = urlize(text, nofollow=nofollow)

    # In-page links: [/some/path label]
    text = re.sub(r'\[(/.+?) (.+?)\]', r'<a href="\1">\2</a>', text)

    # Undo the masking now that urlize() has run.
    text = text.replace('__LINK__', 'http').replace('__DOT__', '.')
    text = text.replace("\n", '<br />')
    return text