# -*- coding: utf-8 -*-
# ci:si:et:sw=4:sts=4:ts=4
import re
import ox
def _mask_url(link):
    """Hide the scheme and dots of *link* behind placeholder tokens."""
    return link.replace('http', '__LINK__').replace('.', '__DOT__')


def html_parser(text, nofollow=True):
    """Turn lightly marked-up text into escaped HTML.

    Processing steps, in order:

    1. ``<i>``/``<b>`` tags are swapped for placeholder tokens and
       ``<a href="http...">label</a>`` anchors are rewritten as wiki-style
       ``[url label]`` links.
    2. The text is passed through ``ox.escape``; afterwards the ``<i>``/``<b>``
       tags are restored from their placeholders.
    3. ``[http... label]`` and ``[http...]`` wiki links become anchors.
    4. ``ox.urlize`` is applied to the text.
    5. ``[/path label]`` wiki links become in-page anchors.
    6. Newlines are replaced by ``<br />``.

    :param text: input string, possibly containing ``<i>``, ``<b>``, ``<a>``
        tags and ``[url label]`` wiki links.
    :param nofollow: if true, anchors built from ``[http...]`` links carry
        ``rel="nofollow"``; the flag is also forwarded to ``ox.urlize``.
        In-page (``[/path label]``) anchors never get the attribute.
    :returns: the converted string.
    """
    # Replace the allowed inline tags with placeholders before escaping.
    text = text.replace('<i>', '__i__').replace('</i>', '__/i__')
    text = text.replace('<b>', '__b__').replace('</b>', '__/b__')
    # Turn links into wiki links; only hrefs starting with http are taken.
    text = re.sub(r'<a .*?href="(http.*?)".*?>(.*?)</a>', r'[\1 \2]', text)
    text = ox.escape(text)
    text = text.replace('__i__', '<i>').replace('__/i__', '</i>')
    text = text.replace('__b__', '<b>').replace('__/b__', '</b>')

    nofollow_rel = ' rel="nofollow"' if nofollow else ''

    # Wiki links with a label: [http://example.com label]
    # URLs are masked here and unmasked after ox.urlize runs -- presumably so
    # that urlize does not link the href values a second time.
    for whole, link, label in re.findall(r'(\[(http.*?) (.*?)\])', text):
        anchor = '<a href="%s"%s>%s</a>' % (_mask_url(link), nofollow_rel, label)
        text = text.replace(whole, anchor)

    # Wiki links without a label: [http://example.com]; the URL is the label.
    for whole, link in re.findall(r'(\[(http.*?)\])', text):
        masked = _mask_url(link)
        anchor = '<a href="%s"%s>%s</a>' % (masked, nofollow_rel, masked)
        text = text.replace(whole, anchor)

    text = ox.urlize(text, nofollow=nofollow)

    # In-page links: [/path label]
    text = re.sub(r'\[(/.+?) (.+?)\]', r'<a href="\1">\2</a>', text)

    text = text.replace('__LINK__', 'http').replace('__DOT__', '.')
    # NOTE(review): confirm '<br />' is the intended line-break spelling.
    text = text.replace("\n", '<br />')
    return text