python-ox/ox/web/twitter.py

34 lines
1.2 KiB
Python
Raw Normal View History

2013-01-31 14:18:07 +00:00
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from datetime import datetime
from urllib import quote
import lxml.html
2013-01-31 14:37:11 +00:00
import ox
2013-01-31 14:18:07 +00:00
from ox.cache import read_url
def find(query, timeout=60):
    """Search Twitter for *query* and return the tweets on the result page.

    The search page at ``https://twitter.com/search/<query>`` is fetched
    through ``ox.cache.read_url`` and every ``div`` whose class contains
    ``original-tweet`` is turned into one dict.

    query   -- search string; text that is not already a byte string is
               UTF-8 encoded before being percent-quoted into the URL
    timeout -- forwarded unchanged to ``ox.cache.read_url``

    Returns a list of dicts with the keys:
        'id'      -- value of the ``data-tweet-id`` attribute (string of digits)
        'user-id' -- value of the ``data-user-id`` attribute (string of digits)
        'name'    -- value of the ``data-screen-name`` attribute
        'time'    -- naive ``datetime`` built from ``data-time`` with
                     ``datetime.fromtimestamp`` (i.e. local time)
        'user'    -- value of the ``data-name`` attribute, tags stripped and
                     HTML entities decoded
        'text'    -- tweet text, tags stripped and HTML entities decoded
        'html'    -- serialized markup of the tweet text paragraph

    Raises IndexError if a tweet block lacks the text paragraph or one of
    the expected ``data-*`` attributes.
    """
    # quote() chokes on non-ASCII unicode under Python 2, so percent-encode
    # the UTF-8 bytes instead of the text object.
    if not isinstance(query, bytes):
        query = query.encode('utf-8')
    url = 'https://twitter.com/search/' + quote(query)
    data = ox.cache.read_url(url, timeout=timeout)
    doc = lxml.html.document_fromstring(data)

    # Compiled once, outside the loop. Raw strings keep `\d` a regex escape
    # rather than an (invalid) string escape.
    re_user = re.compile(r'data-name="(.*?)"')
    re_tweet_id = re.compile(r'data-tweet-id="(\d+)"')
    re_user_id = re.compile(r'data-user-id="(\d+)"')
    re_screen_name = re.compile(r'data-screen-name="(.*?)"')
    re_time = re.compile(r'data-time="(\d+)"')

    tweets = []
    for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
        # The data-* attributes are read from the serialized markup of the
        # whole tweet block; the first match in document order wins.
        t = lxml.html.tostring(e)
        text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
        html = lxml.html.tostring(text).strip()
        text = ox.decode_html(ox.strip_tags(html)).strip()
        user = re_user.findall(t)[0]
        user = ox.decode_html(ox.strip_tags(user)).strip()
        tweets.append({
            'id': re_tweet_id.findall(t)[0],
            'user-id': re_user_id.findall(t)[0],
            'name': re_screen_name.findall(t)[0],
            'time': datetime.fromtimestamp(int(re_time.findall(t)[0])),
            'user': user,
            'text': text,
            'html': html,
        })
    return tweets