2013-01-31 14:18:07 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
|
|
import re
|
|
|
|
from datetime import datetime
|
2023-07-27 11:07:13 +00:00
|
|
|
from urllib.parse import quote
|
2013-01-31 14:18:07 +00:00
|
|
|
|
|
|
|
import lxml.html
|
2013-01-31 14:37:11 +00:00
|
|
|
import ox
|
2013-01-31 14:18:07 +00:00
|
|
|
from ox.cache import read_url
|
|
|
|
|
2013-08-01 13:14:06 +00:00
|
|
|
def find(query=None, user=None, timeout=60):
    """Scrape tweets from a Twitter user page or a Twitter search page.

    When ``user`` is truthy the page ``https://twitter.com/<user>`` is
    fetched; otherwise ``https://twitter.com/search/<query>`` is fetched.
    Both values are URL-quoted before being appended to the URL.

    :param query: search string, used only when ``user`` is falsy.
    :param user: account name whose page should be fetched.
    :param timeout: passed through to ``ox.cache.read_url``.
    :returns: a list of dicts, one per ``div`` whose class contains
        ``original-tweet``, with the keys ``id``, ``user-id``, ``name``,
        ``time``, ``user``, ``text`` and ``html``. ``time`` is a naive
        ``datetime`` built with ``datetime.fromtimestamp`` (local time).
    :raises TypeError: if neither ``user`` nor ``query`` is given.
    :raises IndexError: if a tweet element lacks the tweet-text paragraph
        or one of the expected ``data-*`` attributes.
    """
    if user:
        url = 'https://twitter.com/' + quote(user)
    else:
        if query is None:
            # quote(None) would raise a TypeError with an unhelpful message;
            # keep the exception type but say what is actually missing.
            raise TypeError('find() requires either query or user')
        url = 'https://twitter.com/search/' + quote(query)
    data = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
    doc = lxml.html.document_fromstring(data)

    # Compile the attribute patterns once instead of on every iteration.
    display_name_re = re.compile(r'data-name="(.*?)"')
    tweet_id_re = re.compile(r'data-tweet-id="(\d+)"')
    user_id_re = re.compile(r'data-user-id="(\d+)"')
    screen_name_re = re.compile(r'data-screen-name="(.*?)"')
    time_re = re.compile(r'data-time="(\d+)"')

    tweets = []
    for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
        # Serialize to str: without encoding='unicode' lxml returns bytes,
        # and the str patterns above cannot be applied to bytes.
        markup = lxml.html.tostring(e, encoding='unicode')
        text_node = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
        html = lxml.html.tostring(text_node, encoding='unicode').strip()
        text = ox.decode_html(ox.strip_tags(html)).strip()
        # Use a separate local so the ``user`` argument is not clobbered.
        display_name = display_name_re.findall(markup)[0]
        display_name = ox.decode_html(ox.strip_tags(display_name)).strip()
        tweets.append({
            'id': tweet_id_re.findall(markup)[0],
            'user-id': user_id_re.findall(markup)[0],
            'name': screen_name_re.findall(markup)[0],
            'time': datetime.fromtimestamp(int(time_re.findall(markup)[0])),
            'user': display_name,
            'text': text,
            'html': html,
        })
    return tweets
|