python-ox/ox/web/twitter.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from datetime import datetime
from urllib.parse import quote

import lxml.html
import ox
from ox.cache import read_url

# Patterns applied to the serialized markup of a single tweet element.
# Compiled once at import time instead of on every loop iteration.
_TWEET_ID_RE = re.compile(r'data-tweet-id="(\d+)"')
_USER_ID_RE = re.compile(r'data-user-id="(\d+)"')
_SCREEN_NAME_RE = re.compile(r'data-screen-name="(.*?)"')
_DISPLAY_NAME_RE = re.compile(r'data-name="(.*?)"')
_TIME_RE = re.compile(r'data-time="(\d+)"')


def find(query=None, user=None, timeout=60):
    """Scrape tweets from a twitter.com user page or search page.

    If ``user`` is truthy, ``https://twitter.com/<user>`` is fetched;
    otherwise ``https://twitter.com/search/<query>`` is fetched. The page is
    read through ``ox.cache.read_url`` and decoded as UTF-8, then every
    ``div`` whose class contains ``original-tweet`` is parsed.

    Args:
        query: Search string, used only when ``user`` is falsy.
        user: Twitter user name; takes precedence over ``query``.
        timeout: Passed through unchanged to ``ox.cache.read_url``.

    Returns:
        A list of dicts, one per tweet, with the keys ``id``, ``user-id``,
        ``name`` (screen name), ``time`` (naive ``datetime`` built with
        ``datetime.fromtimestamp``, i.e. local time), ``user`` (display
        name), ``text`` (plain text) and ``html`` (markup of the tweet
        paragraph).

    Raises:
        TypeError: If neither ``query`` nor ``user`` is given.
        IndexError: If a tweet element lacks the text paragraph or one of
            the expected ``data-*`` attributes.
    """
    if user:
        url = 'https://twitter.com/' + quote(user)
    elif query is not None:
        url = 'https://twitter.com/search/' + quote(query)
    else:
        # quote(None) would raise an opaque TypeError; keep the exception
        # type but say what is actually wrong.
        raise TypeError('find() requires either query or user')
    data = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
    doc = lxml.html.document_fromstring(data)
    tweets = []
    for element in doc.xpath("//div[contains(@class, 'original-tweet')]"):
        # Serialize to str, not bytes: the patterns above are str patterns
        # and cannot be applied to a bytes object on Python 3.
        markup = lxml.html.tostring(element, encoding='unicode')
        paragraph = element.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
        html = lxml.html.tostring(paragraph, encoding='unicode').strip()
        text = ox.decode_html(ox.strip_tags(html)).strip()
        # Separate local name so the ``user`` parameter is not shadowed.
        display_name = _DISPLAY_NAME_RE.findall(markup)[0]
        display_name = ox.decode_html(ox.strip_tags(display_name)).strip()
        timestamp = int(_TIME_RE.findall(markup)[0])
        tweets.append({
            'id': _TWEET_ID_RE.findall(markup)[0],
            'user-id': _USER_ID_RE.findall(markup)[0],
            'name': _SCREEN_NAME_RE.findall(markup)[0],
            'time': datetime.fromtimestamp(timestamp),
            'user': display_name,
            'text': text,
            'html': html,
        })
    return tweets
add ox.web.twitter 2013-01-31 14:18:07 +00:00			`# -- coding: utf-8 --`
			`# vi:si:et:sw=4:sts=4:ts=4`
			`import re`
			`from datetime import datetime`
drop six and python2 support 2023-07-27 11:07:13 +00:00			`from urllib.parse import quote`
add ox.web.twitter 2013-01-31 14:18:07 +00:00
			`import lxml.html`
fix import 2013-01-31 14:37:11 +00:00			`import ox`
add ox.web.twitter 2013-01-31 14:18:07 +00:00			`from ox.cache import read_url`

add option to get tweets from one user 2013-08-01 13:14:06 +00:00			`def find(query=None, user=None, timeout=60):`
			`if user:`
			`url = 'https://twitter.com/' + quote(user)`
			`else:`
			`url = 'https://twitter.com/search/' + quote(query)`
fix lxml unicode handling 2013-07-04 18:32:54 +00:00			`data = ox.cache.read_url(url, timeout=timeout).decode('utf-8')`
add ox.web.twitter 2013-01-31 14:18:07 +00:00			`doc = lxml.html.document_fromstring(data)`
			`tweets = []`
			`for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):`
			`t = lxml.html.tostring(e)`
			`text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]`
fix lxml unicode handling 2013-07-04 18:32:54 +00:00			`html = lxml.html.tostring(text, encoding='unicode').strip()`
add timeout as option to twitter.find, also return html 2013-07-04 10:22:56 +00:00			`text = ox.decode_html(ox.strip_tags(html)).strip()`
add ox.web.twitter 2013-01-31 14:18:07 +00:00			`user = re.compile('data-name="(.*?)"').findall(t)[0]`
			`user = ox.decode_html(ox.strip_tags(user)).strip()`
			`tweets.append({`
escape strings 2024-09-11 21:52:01 +00:00			`'id': re.compile(r'data-tweet-id="(\d+)"').findall(t)[0],`
			`'user-id': re.compile(r'data-user-id="(\d+)"').findall(t)[0],`
			`'name': re.compile(r'data-screen-name="(.*?)"').findall(t)[0],`
			`'time': datetime.fromtimestamp(int(re.compile(r'data-time="(\d+)"').findall(t)[0])),`
add ox.web.twitter 2013-01-31 14:18:07 +00:00			`'user': user,`
			`'text': text,`
add timeout as option to twitter.find, also return html 2013-07-04 10:22:56 +00:00			`'html': html,`
add ox.web.twitter 2013-01-31 14:18:07 +00:00			`})`
			`return tweets`