python-ox/ox/web/twitter.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from datetime import datetime
from urllib import quote

import lxml.html
import ox
from ox.cache import read_url


def find(query, timeout=60):
    """Search Twitter for *query* and return the tweets on the result page.

    The search page is fetched through ``ox.cache.read_url`` and parsed with
    lxml; every ``div`` whose class contains ``original-tweet`` yields one
    entry in the result.

    Args:
        query: search string; it is URL-quoted before being appended to the
            search URL.
        timeout: passed through unchanged to ``ox.cache.read_url``.

    Returns:
        A list of dicts, one per tweet element, with the keys:

        * ``id`` -- digits captured from ``data-tweet-id``
        * ``user-id`` -- digits captured from ``data-user-id``
        * ``name`` -- value captured from ``data-screen-name``
        * ``time`` -- naive ``datetime`` built from ``data-time`` with
          ``datetime.fromtimestamp``
        * ``user`` -- value captured from ``data-name``, passed through
          ``ox.strip_tags`` and ``ox.decode_html``
        * ``text`` -- the tweet text element, passed through
          ``ox.strip_tags`` and ``ox.decode_html``
        * ``html`` -- serialized markup of the tweet text element

    Raises:
        IndexError: if a tweet element has no ``js-tweet-text`` paragraph or
            lacks one of the expected ``data-*`` attributes.
    """
    # Compile the attribute patterns once rather than once per tweet. Raw
    # strings keep ``\d`` from being interpreted as a string escape.
    name_re = re.compile(r'data-name="(.*?)"')
    tweet_id_re = re.compile(r'data-tweet-id="(\d+)"')
    user_id_re = re.compile(r'data-user-id="(\d+)"')
    screen_name_re = re.compile(r'data-screen-name="(.*?)"')
    time_re = re.compile(r'data-time="(\d+)"')

    url = 'https://twitter.com/search/' + quote(query)
    data = ox.cache.read_url(url, timeout=timeout)
    doc = lxml.html.document_fromstring(data)
    tweets = []
    for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
        # The data-* attributes are matched against the serialized markup of
        # the whole tweet element, so the first occurrence anywhere inside
        # the element is used.
        t = lxml.html.tostring(e)
        text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
        html = lxml.html.tostring(text).strip()
        text = ox.decode_html(ox.strip_tags(html)).strip()
        user = name_re.findall(t)[0]
        user = ox.decode_html(ox.strip_tags(user)).strip()
        tweets.append({
            'id': tweet_id_re.findall(t)[0],
            'user-id': user_id_re.findall(t)[0],
            'name': screen_name_re.findall(t)[0],
            'time': datetime.fromtimestamp(int(time_re.findall(t)[0])),
            'user': user,
            'text': text,
            'html': html,
        })
    return tweets
add ox.web.twitter 2013-01-31 14:18:07 +00:00			`# -*- coding: utf-8 -*-`
			`# vi:si:et:sw=4:sts=4:ts=4`
			`import re`
			`from datetime import datetime`
			`from urllib import quote`

			`import lxml.html`
fix import 2013-01-31 14:37:11 +00:00			`import ox`
add ox.web.twitter 2013-01-31 14:18:07 +00:00			`from ox.cache import read_url`


add timeout as option to twitter.find, also return html 2013-07-04 10:22:56 +00:00			`def find(query, timeout=60):`
add ox.web.twitter 2013-01-31 14:18:07 +00:00			`url = 'https://twitter.com/search/' + quote(query)`
add timeout as option to twitter.find, also return html 2013-07-04 10:22:56 +00:00			`data = ox.cache.read_url(url, timeout=timeout)`
add ox.web.twitter 2013-01-31 14:18:07 +00:00			`doc = lxml.html.document_fromstring(data)`
			`tweets = []`
			`for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):`
			`t = lxml.html.tostring(e)`
			`text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]`
add timeout as option to twitter.find, also return html 2013-07-04 10:22:56 +00:00			`html = lxml.html.tostring(text).strip()`
			`text = ox.decode_html(ox.strip_tags(html)).strip()`
add ox.web.twitter 2013-01-31 14:18:07 +00:00			`user = re.compile('data-name="(.*?)"').findall(t)[0]`
			`user = ox.decode_html(ox.strip_tags(user)).strip()`
			`tweets.append({`
			`'id': re.compile('data-tweet-id="(\d+)"').findall(t)[0],`
			`'user-id': re.compile('data-user-id="(\d+)"').findall(t)[0],`
			`'name': re.compile('data-screen-name="(.*?)"').findall(t)[0],`
			`'time': datetime.fromtimestamp(int(re.compile('data-time="(\d+)"').findall(t)[0])),`
			`'user': user,`
			`'text': text,`
add timeout as option to twitter.find, also return html 2013-07-04 10:22:56 +00:00			`'html': html,`
add ox.web.twitter 2013-01-31 14:18:07 +00:00			`})`
			`return tweets`