fix lxml unicode handling

2013-07-04 20:32:54 +02:00 · 2013-07-04 20:32:54 +02:00 · ad7e21e7a8
commit ad7e21e7a8
parent b1d248c4df
1 changed file with 2 additions and 2 deletions
--- a/ox/web/twitter.py
+++ b/ox/web/twitter.py
@@ -11,13 +11,13 @@ from ox.cache import read_url

 def find(query, timeout=60):
    url = 'https://twitter.com/search/' + quote(query)
-    data = ox.cache.read_url(url, timeout=timeout)
+    data = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
    doc = lxml.html.document_fromstring(data)
    tweets = []
    for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
        t = lxml.html.tostring(e)
        text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
-        html = lxml.html.tostring(text).strip()
+        html = lxml.html.tostring(text, encoding='unicode').strip()
        text = ox.decode_html(ox.strip_tags(html)).strip()
        user = re.compile('data-name="(.*?)"').findall(t)[0]
        user = ox.decode_html(ox.strip_tags(user)).strip()