# -*- coding: utf-8 -*-
# ci:si:et:sw=4:sts=4:ts=4
import re

import ox


def _mask_link(url):
    """Hide 'http' and '.' in *url* behind placeholder tokens."""
    return url.replace('http', '__LINK__').replace('.', '__DOT__')


def html_parser(text, nofollow=True):
    """Convert lightly marked-up text into HTML.

    Processing order:

    1. ``<i>``/``<b>`` tags are swapped for placeholders and ``<a>`` tags
       with an http(s) ``href`` are rewritten as wiki links
       (``[url label]``), then the text is passed through ``ox.escape``.
    2. The ``<i>``/``<b>`` placeholders are turned back into tags.
    3. Wiki links ``[http... label]`` and ``[http...]`` become anchors.
    4. The text is passed through ``ox.urlize``.
    5. In-page wiki links ``[/path label]`` become anchors.
    6. Newlines become ``<br />``.

    :param text: input string.
    :param nofollow: when true, generated anchors get ``rel="nofollow"``
        and the flag is forwarded to ``ox.urlize``.
    :returns: the converted string.
    """
    # NOTE(review): the tag literals in this function were reconstructed
    # from the placeholder names, the format arguments and the regex group
    # references; confirm the exact spellings (anchor pattern, '<br />')
    # against the upstream version of this file.

    # Swap the allowed inline tags for placeholders before ox.escape() runs.
    text = text.replace('<i>', '__i__').replace('</i>', '__/i__')
    text = text.replace('<b>', '__b__').replace('</b>', '__/b__')
    # Turn links into wiki links, make sure to only take http links.
    text = re.sub(r'<a .*?href="(http.*?)".*?>(.*?)</a>', r'[\1 \2]', text)
    text = ox.escape(text)
    text = text.replace('__i__', '<i>').replace('__/i__', '</i>')
    text = text.replace('__b__', '<b>').replace('__/b__', '</b>')

    nofollow_rel = ' rel="nofollow"' if nofollow else ''

    # Labelled wiki links: [http://example.com label]
    # URLs are masked until after ox.urlize() has run -- presumably so that
    # it does not linkify URLs that already sit inside an anchor.
    for markup, link, label in re.findall(r'(\[(http.*?) (.*?)\])', text):
        anchor = '<a href="%s"%s>%s</a>' % (
            _mask_link(link), nofollow_rel, label)
        text = text.replace(markup, anchor)

    # Bare wiki links: [http://example.com] -- the URL doubles as the label.
    for markup, link in re.findall(r'(\[(http.*?)\])', text):
        masked = _mask_link(link)
        anchor = '<a href="%s"%s>%s</a>' % (masked, nofollow_rel, masked)
        text = text.replace(markup, anchor)

    text = ox.urlize(text, nofollow=nofollow)

    # In-page links: [/path label]
    text = re.sub(r'\[(/.+?) (.+?)\]', r'<a href="\1">\2</a>', text)

    # Undo the URL masking now that ox.urlize() is done.
    text = text.replace('__LINK__', 'http').replace('__DOT__', '.')
    text = text.replace("\n", '<br />')
    return text