Add timeout as an option to twitter.find; also return each tweet's HTML
This commit is contained in:
parent
0d9bba8865
commit
b1d248c4df
1 changed file with 5 additions and 3 deletions
|
@ -9,15 +9,16 @@ import ox
|
||||||
from ox.cache import read_url
|
from ox.cache import read_url
|
||||||
|
|
||||||
|
|
||||||
def find(query):
|
def find(query, timeout=60):
|
||||||
url = 'https://twitter.com/search/' + quote(query)
|
url = 'https://twitter.com/search/' + quote(query)
|
||||||
data = ox.cache.read_url(url, timeout=60)
|
data = ox.cache.read_url(url, timeout=timeout)
|
||||||
doc = lxml.html.document_fromstring(data)
|
doc = lxml.html.document_fromstring(data)
|
||||||
tweets = []
|
tweets = []
|
||||||
for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
|
for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
|
||||||
t = lxml.html.tostring(e)
|
t = lxml.html.tostring(e)
|
||||||
text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
|
text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
|
||||||
text = ox.decode_html(ox.strip_tags(lxml.html.tostring(text))).strip()
|
html = lxml.html.tostring(text).strip()
|
||||||
|
text = ox.decode_html(ox.strip_tags(html)).strip()
|
||||||
user = re.compile('data-name="(.*?)"').findall(t)[0]
|
user = re.compile('data-name="(.*?)"').findall(t)[0]
|
||||||
user = ox.decode_html(ox.strip_tags(user)).strip()
|
user = ox.decode_html(ox.strip_tags(user)).strip()
|
||||||
tweets.append({
|
tweets.append({
|
||||||
|
@ -27,5 +28,6 @@ def find(query):
|
||||||
'time': datetime.fromtimestamp(int(re.compile('data-time="(\d+)"').findall(t)[0])),
|
'time': datetime.fromtimestamp(int(re.compile('data-time="(\d+)"').findall(t)[0])),
|
||||||
'user': user,
|
'user': user,
|
||||||
'text': text,
|
'text': text,
|
||||||
|
'html': html,
|
||||||
})
|
})
|
||||||
return tweets
|
return tweets
|
||||||
|
|
Loading…
Reference in a new issue