add scrapeit
This commit is contained in:
commit
ca2a42e773
18 changed files with 1864 additions and 0 deletions
14
scrapeit/__init__.py
Normal file
14
scrapeit/__init__.py
Normal file
|
@ -0,0 +1,14 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
# encoding: utf-8
|
||||
|
||||
import btjunkie
|
||||
import google
|
||||
import imdb
|
||||
import mininova
|
||||
import thepiratebay
|
||||
import torrent
|
||||
import rottentomatoes
|
||||
|
||||
|
||||
__version__ = '1.0.0'
|
32
scrapeit/btjunkie.py
Normal file
32
scrapeit/btjunkie.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
from urllib import quote
|
||||
import re
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
from utils import read_url, stripTags
|
||||
from btutils import torrentsWeLike
|
||||
|
||||
|
||||
def search(query):
|
||||
'''search for torrents on btjunkie
|
||||
'''
|
||||
url = "http://btjunkie.org/search?q=%s&c=6&t=0&o=52&m=0&l=1" % quote(query)
|
||||
page = read_url(url)
|
||||
soup = BeautifulSoup(page)
|
||||
torrents = soup.findAll('a', {'class': 'BlckUnd'})
|
||||
torrents = filter(torrentsWeLike, torrents)
|
||||
torrent_links = []
|
||||
for t in torrents:
|
||||
tlink = "http://btjunkie.org%s.torrent" % t.attrMap['href']
|
||||
tlink = tlink.replace('do=stat', 'do=download')
|
||||
torrent_links.append(tlink)
|
||||
return torrent_links
|
||||
|
||||
def searchByImdb(imdb):
|
||||
'''search for torrents by imdb, not supported on btjunkie right now
|
||||
'''
|
||||
return []
|
25
scrapeit/btutils.py
Normal file
25
scrapeit/btutils.py
Normal file
|
@ -0,0 +1,25 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
from utils import stripTags
|
||||
|
||||
|
||||
def torrentsWeLike(link):
|
||||
'''check if torrent title looks like something we want to see,
|
||||
dvdrip / no cam / no dubbed versions
|
||||
'''
|
||||
text = stripTags(unicode(link)).lower()
|
||||
#no cams / telesyncs or other stuff
|
||||
for word in ('cam', 'telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'vcd', 'ts-screener'):
|
||||
if word in text:
|
||||
return False
|
||||
#no dubbed versions
|
||||
for word in ('italian', 'german', 'spanish', 'french'):
|
||||
if word in text:
|
||||
return False
|
||||
#only dvdrips or dvdscrs
|
||||
for word in ('dvdrip', 'dvdscr', 'dvd screener'):
|
||||
if word in text:
|
||||
return True
|
||||
return False
|
115
scrapeit/djangohtml.py
Normal file
115
scrapeit/djangohtml.py
Normal file
|
@ -0,0 +1,115 @@
|
|||
"HTML utilities suitable for global use."
|
||||
|
||||
import re, string
|
||||
|
||||
# Configuration for urlize() function
|
||||
LEADING_PUNCTUATION = ['(', '<', '<']
|
||||
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '>']
|
||||
|
||||
# list of possible strings used for bullets in bulleted lists
|
||||
DOTS = ['·', '*', '\xe2\x80\xa2', '•', '•', '•']
|
||||
|
||||
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
|
||||
word_split_re = re.compile(r'(\s+)')
|
||||
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
|
||||
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
|
||||
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
|
||||
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
|
||||
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
|
||||
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
|
||||
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
|
||||
trailing_empty_content_re = re.compile(r'(?:<p>(?: |\s|<br \/>)*?</p>\s*)+\Z')
|
||||
del x # Temporary variable
|
||||
|
||||
def escape(html):
|
||||
"Returns the given HTML with ampersands, quotes and carets encoded"
|
||||
if not isinstance(html, basestring):
|
||||
html = str(html)
|
||||
return html.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''')
|
||||
|
||||
def linebreaks(value):
|
||||
"Converts newlines into <p> and <br />s"
|
||||
value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
|
||||
paras = re.split('\n{2,}', value)
|
||||
paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
|
||||
return '\n\n'.join(paras)
|
||||
|
||||
def strip_tags(value):
|
||||
"Returns the given HTML with all tags stripped"
|
||||
return re.sub(r'<[^>]*?>', '', value)
|
||||
|
||||
def strip_spaces_between_tags(value):
|
||||
"Returns the given HTML with spaces between tags normalized to a single space"
|
||||
return re.sub(r'>\s+<', '> <', value)
|
||||
|
||||
def strip_entities(value):
|
||||
"Returns the given HTML with all entities (&something;) stripped"
|
||||
return re.sub(r'&(?:\w+|#\d);', '', value)
|
||||
|
||||
def fix_ampersands(value):
|
||||
"Returns the given HTML with all unencoded ampersands encoded correctly"
|
||||
return unencoded_ampersands_re.sub('&', value)
|
||||
|
||||
def urlize(text, trim_url_limit=None, nofollow=False):
|
||||
"""
|
||||
Converts any URLs in text into clickable links. Works on http://, https:// and
|
||||
www. links. Links can have trailing punctuation (periods, commas, close-parens)
|
||||
and leading punctuation (opening parens) and it'll still do the right thing.
|
||||
|
||||
If trim_url_limit is not None, the URLs in link text will be limited to
|
||||
trim_url_limit characters.
|
||||
|
||||
If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
|
||||
"""
|
||||
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
|
||||
words = word_split_re.split(text)
|
||||
nofollow_attr = nofollow and ' rel="nofollow"' or ''
|
||||
for i, word in enumerate(words):
|
||||
match = punctuation_re.match(word)
|
||||
if match:
|
||||
lead, middle, trail = match.groups()
|
||||
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
|
||||
len(middle) > 0 and middle[0] in string.letters + string.digits and \
|
||||
(middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
|
||||
middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(anchor))
|
||||
if middle.startswith('http://') or middle.startswith('https://'):
|
||||
middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
|
||||
if '@' in middle and not middle.startswith('www.') and not ':' in middle \
|
||||
and simple_email_re.match(middle):
|
||||
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
|
||||
if lead + middle + trail != word:
|
||||
words[i] = lead + middle + trail
|
||||
return ''.join(words)
|
||||
|
||||
def clean_html(text):
|
||||
"""
|
||||
Cleans the given HTML. Specifically, it does the following:
|
||||
* Converts <b> and <i> to <strong> and <em>.
|
||||
* Encodes all ampersands correctly.
|
||||
* Removes all "target" attributes from <a> tags.
|
||||
* Removes extraneous HTML, such as presentational tags that open and
|
||||
immediately close and <br clear="all">.
|
||||
* Converts hard-coded bullets into HTML unordered lists.
|
||||
* Removes stuff like "<p> </p>", but only if it's at the
|
||||
bottom of the text.
|
||||
"""
|
||||
from djangotext import normalize_newlines
|
||||
text = normalize_newlines(text)
|
||||
text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
|
||||
text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
|
||||
text = fix_ampersands(text)
|
||||
# Remove all target="" attributes from <a> tags.
|
||||
text = link_target_attribute_re.sub('\\1', text)
|
||||
# Trim stupid HTML such as <br clear="all">.
|
||||
text = html_gunk_re.sub('', text)
|
||||
# Convert hard-coded bullets into HTML unordered lists.
|
||||
def replace_p_tags(match):
|
||||
s = match.group().replace('</p>', '</li>')
|
||||
for d in DOTS:
|
||||
s = s.replace('<p>%s' % d, '<li>')
|
||||
return '<ul>\n%s\n</ul>' % s
|
||||
text = hard_coded_bullets_re.sub(replace_p_tags, text)
|
||||
# Remove stuff like "<p> </p>", but only if it's at the bottom of the text.
|
||||
text = trailing_empty_content_re.sub('', text)
|
||||
return text
|
||||
|
111
scrapeit/djangotext.py
Normal file
111
scrapeit/djangotext.py
Normal file
|
@ -0,0 +1,111 @@
|
|||
import re
|
||||
|
||||
# Capitalizes the first letter of a string.
|
||||
capfirst = lambda x: x and x[0].upper() + x[1:]
|
||||
|
||||
def wrap(text, width):
|
||||
"""
|
||||
A word-wrap function that preserves existing line breaks and most spaces in
|
||||
the text. Expects that existing line breaks are posix newlines (\n).
|
||||
See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
|
||||
"""
|
||||
return reduce(lambda line, word, width=width: '%s%s%s' %
|
||||
(line,
|
||||
' \n'[(len(line[line.rfind('\n')+1:])
|
||||
+ len(word.split('\n',1)[0]
|
||||
) >= width)],
|
||||
word),
|
||||
text.split(' ')
|
||||
)
|
||||
|
||||
def truncate_words(s, num):
|
||||
"Truncates a string after a certain number of words."
|
||||
length = int(num)
|
||||
words = s.split()
|
||||
if len(words) > length:
|
||||
words = words[:length]
|
||||
if not words[-1].endswith('...'):
|
||||
words.append('...')
|
||||
return ' '.join(words)
|
||||
|
||||
def get_valid_filename(s):
|
||||
"""
|
||||
Returns the given string converted to a string that can be used for a clean
|
||||
filename. Specifically, leading and trailing spaces are removed; other
|
||||
spaces are converted to underscores; and all non-filename-safe characters
|
||||
are removed.
|
||||
>>> get_valid_filename("john's portrait in 2004.jpg")
|
||||
'johns_portrait_in_2004.jpg'
|
||||
"""
|
||||
s = s.strip().replace(' ', '_')
|
||||
return re.sub(r'[^-A-Za-z0-9_.]', '', s)
|
||||
|
||||
def get_text_list(list_, last_word='or'):
|
||||
"""
|
||||
>>> get_text_list(['a', 'b', 'c', 'd'])
|
||||
'a, b, c or d'
|
||||
>>> get_text_list(['a', 'b', 'c'], 'and')
|
||||
'a, b and c'
|
||||
>>> get_text_list(['a', 'b'], 'and')
|
||||
'a and b'
|
||||
>>> get_text_list(['a'])
|
||||
'a'
|
||||
>>> get_text_list([])
|
||||
''
|
||||
"""
|
||||
if len(list_) == 0: return ''
|
||||
if len(list_) == 1: return list_[0]
|
||||
return '%s %s %s' % (', '.join([str(i) for i in list_][:-1]), last_word, list_[-1])
|
||||
|
||||
def normalize_newlines(text):
|
||||
return re.sub(r'\r\n|\r|\n', '\n', text)
|
||||
|
||||
def recapitalize(text):
|
||||
"Recapitalizes text, placing caps after end-of-sentence punctuation."
|
||||
# capwords = ()
|
||||
text = text.lower()
|
||||
capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
|
||||
text = capsRE.sub(lambda x: x.group(1).upper(), text)
|
||||
# for capword in capwords:
|
||||
# capwordRE = re.compile(r'\b%s\b' % capword, re.I)
|
||||
# text = capwordRE.sub(capword, text)
|
||||
return text
|
||||
|
||||
def phone2numeric(phone):
|
||||
"Converts a phone number with letters into its numeric equivalent."
|
||||
letters = re.compile(r'[A-PR-Y]', re.I)
|
||||
char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
|
||||
'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
|
||||
'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
|
||||
's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
|
||||
'y': '9', 'x': '9'}.get(m.group(0).lower())
|
||||
return letters.sub(char2number, phone)
|
||||
|
||||
# From http://www.xhaus.com/alan/python/httpcomp.html#gzip
|
||||
# Used with permission.
|
||||
def compress_string(s):
|
||||
import cStringIO, gzip
|
||||
zbuf = cStringIO.StringIO()
|
||||
zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
|
||||
zfile.write(s)
|
||||
zfile.close()
|
||||
return zbuf.getvalue()
|
||||
|
||||
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
|
||||
def smart_split(text):
|
||||
"""
|
||||
Generator that splits a string by spaces, leaving quoted phrases together.
|
||||
Supports both single and double quotes, and supports escaping quotes with
|
||||
backslashes. In the output, strings will keep their initial and trailing
|
||||
quote marks.
|
||||
>>> list(smart_split('This is "a person\'s" test.'))
|
||||
['This', 'is', '"a person\'s"', 'test.']
|
||||
"""
|
||||
for bit in smart_split_re.finditer(text):
|
||||
bit = bit.group(0)
|
||||
if bit[0] == '"':
|
||||
yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
|
||||
elif bit[0] == "'":
|
||||
yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
|
||||
else:
|
||||
yield bit
|
68
scrapeit/epguides.py
Normal file
68
scrapeit/epguides.py
Normal file
|
@ -0,0 +1,68 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
import re
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
from google import google
|
||||
from utils import read_url, read_url_utf8, stripTags
|
||||
import tvcom
|
||||
import imdb
|
||||
|
||||
def epguidesUrl(title):
|
||||
'''
|
||||
Search Epguide Url for Show via Show Title.
|
||||
Use Google to search the url, this is also done on Epguide.
|
||||
'''
|
||||
for (name, url, desc) in google('allintitle: site:epguides.com %s' % title, 1):
|
||||
if url.startswith('http://epguides.com'):
|
||||
if re.search(title, name):
|
||||
return url
|
||||
return None
|
||||
|
||||
def getShowImdb(title):
|
||||
imdbid = None
|
||||
url = epguidesUrl(title)
|
||||
if url:
|
||||
data = read_url(url)
|
||||
soup = BeautifulSoup(data)
|
||||
links = soup('a', {'href': re.compile('imdb.com/title/tt')})
|
||||
if links:
|
||||
link = links[0].get('href')
|
||||
imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(link)[0])
|
||||
if not imdbid:
|
||||
imdbid = imdb.guess(title)
|
||||
return imdbid
|
||||
|
||||
def getEpisodeData(title, episode, show_url = None):
|
||||
'''
|
||||
Collect information about an episode.
|
||||
|
||||
Returns dict with title, show, description and episode
|
||||
'''
|
||||
episodeData = {
|
||||
'title': u'',
|
||||
'show': title,
|
||||
'description': u'',
|
||||
'episode': episode,
|
||||
}
|
||||
description = u''
|
||||
data = u''
|
||||
if not show_url:
|
||||
show_url = epguidesUrl(title)
|
||||
if show_url:
|
||||
data = read_url_utf8(show_url)
|
||||
else:
|
||||
return imdb.getEpisodeData(title, episode)
|
||||
estring = u'' +episode.replace('S','').replace('E','-').replace('0',' ').strip()
|
||||
for line in data.split('\n'):
|
||||
a = line.split(estring)
|
||||
if len(a) == 2:
|
||||
soup = BeautifulSoup(line)
|
||||
episodeData['title'] = soup('a')[0].contents[0]
|
||||
tvcom_url = soup('a')[0].get('href')
|
||||
episodeData['description'] = tvcom.getEpisodeData(tvcom_url)['description']
|
||||
break
|
||||
return episodeData
|
375
scrapeit/google.py
Normal file
375
scrapeit/google.py
Normal file
|
@ -0,0 +1,375 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
"""
|
||||
Query Web search engines.
|
||||
|
||||
This module works by filtering the HTML returned by the search engine and thus tends to break when
|
||||
search engines modify their HTML output.
|
||||
|
||||
Public domain, Connelly Barnes 2005-2007. Compatible with Python 2.3-2.5.
|
||||
|
||||
See L{examples} for a quick start. See L{description} for the full
|
||||
explanation, precautions, and legal disclaimers.
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
import time
|
||||
import urllib
|
||||
import urllib2
|
||||
import weakref
|
||||
import threading
|
||||
import Queue
|
||||
|
||||
from utils import read_url
|
||||
|
||||
__version__ = '1.0.2'
|
||||
|
||||
# Default headers for HTTP requests.
|
||||
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5)'}
|
||||
|
||||
# Default maximum number of results.
|
||||
DEFAULT_MAX_RESULTS = 10
|
||||
|
||||
# Function names for supported search engines.
|
||||
SEARCH_ENGINES = ['ask', 'dmoz', 'excite', 'google', 'msn', 'yahoo']
|
||||
|
||||
__all__ = SEARCH_ENGINES + ['examples', 'description']
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Functions
|
||||
# --------------------------------------------------------------------
|
||||
|
||||
def quote_plus(s):
|
||||
"""
|
||||
A variant of urllib.quote_plus which handles ASCII and Unicode.
|
||||
"""
|
||||
return urllib.quote_plus(s.encode('utf-8'))
|
||||
|
||||
|
||||
def fix_url(url):
|
||||
"""
|
||||
Given url str, trim redirect stuff and return actual URL.
|
||||
|
||||
Currently this just returns the URL unmodified.
|
||||
"""
|
||||
# if url.lower().find('http%3a//') > 0:
|
||||
# return 'http://' + url[url.lower().rindex('http%3a//')+9:]
|
||||
# if url.find('http://') > 0:
|
||||
# return url[url.rindex('http://'):]
|
||||
return url
|
||||
|
||||
|
||||
def get_search_page_links(page, results_per_page, begin, end, link_re):
|
||||
"""
|
||||
Given str contents of search result page, return list of links.
|
||||
|
||||
Returns list of (name, url, desc) str tuples. See make_searcher()
|
||||
for a description of results_per_page and link_re.
|
||||
"""
|
||||
if begin is not None and begin in page:
|
||||
page = page[page.index(begin):]
|
||||
if end is not None and end in page:
|
||||
page = page[:page.index(end)]
|
||||
ans = []
|
||||
for match in re.compile(link_re, re.DOTALL).finditer(page):
|
||||
(name, url, desc) = match.group('name', 'url', 'desc')
|
||||
url = fix_url(url)
|
||||
ans += [(html_to_text(name), url, html_to_text(desc))]
|
||||
return ans
|
||||
|
||||
|
||||
def html_to_text(s):
|
||||
"""
|
||||
Given an HTML formatted str, convert it to a text str.
|
||||
"""
|
||||
s = re.sub(r'<.*?>', '', s)
|
||||
s = s.replace('\r', ' ')
|
||||
s = s.replace('\n', ' ')
|
||||
s = s.replace('\t', ' ')
|
||||
s = s.replace('&', '&')
|
||||
s = s.replace('<', '<')
|
||||
s = s.replace('>', '>')
|
||||
s = s.replace('"', '"')
|
||||
s = s.replace('·', '\xb7')
|
||||
for i in range(256):
|
||||
s = s.replace('&#%d;' % i, chr(i))
|
||||
while s.replace(' ', ' ') != s:
|
||||
s = s.replace(' ', ' ')
|
||||
return s.strip()
|
||||
|
||||
|
||||
def nonblocking(f, blocking_return=None, sleep_time=0.01):
|
||||
"""
|
||||
Wrap a callable which returns an iter so that it no longer blocks.
|
||||
|
||||
The wrapped iterator returns blocking_return while callable f is
|
||||
blocking. The callable f is called in a background thread. If the
|
||||
wrapped iterator is deleted, then the iterator returned by f is
|
||||
deleted also and the background thread is terminated.
|
||||
"""
|
||||
def g(*args, **kwargs):
|
||||
f_iter = f(*args, **kwargs)
|
||||
g_iter = None
|
||||
def run():
|
||||
while True:
|
||||
g_obj = g_iter()
|
||||
if g_obj is None:
|
||||
return
|
||||
if g_obj.q.qsize() == 0:
|
||||
try:
|
||||
f_next = f_iter.next()
|
||||
except Exception, e:
|
||||
g_obj.exc = e
|
||||
return
|
||||
g_obj.q.put(f_next)
|
||||
else:
|
||||
del g_obj
|
||||
time.sleep(sleep_time)
|
||||
class Iter:
|
||||
def __init__(self):
|
||||
self.q = Queue.Queue()
|
||||
self.exc = None
|
||||
self.thread = threading.Thread(target=run)
|
||||
self.thread.setDaemon(True)
|
||||
def next(self):
|
||||
if self.exc is not None:
|
||||
raise self.exc
|
||||
try:
|
||||
return self.q.get_nowait()
|
||||
except Queue.Empty:
|
||||
return blocking_return
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
obj = Iter()
|
||||
g_iter = weakref.ref(obj)
|
||||
obj.thread.start()
|
||||
try:
|
||||
return obj
|
||||
finally:
|
||||
del obj
|
||||
return g
|
||||
|
||||
|
||||
def make_searcher(query_url, results_per_page, page_url, page_mode,
|
||||
begin, end, link_re):
|
||||
"""
|
||||
Return a search function for the given search engine.
|
||||
|
||||
Here query_url is the URL for the initial search, with %(q)s for
|
||||
the query string, results_per_page is the number of search results
|
||||
per page, page_url is the URL for the 2nd and subsequent pages of
|
||||
search results, with %(q)s for the query string and %(n)s for the
|
||||
page "number." Here page_mode controls the actual value for the
|
||||
page "number:"
|
||||
|
||||
- page_mode='page0': Use 0-based index of the page.
|
||||
- page_mode='page1': Use 1-based index of the page.
|
||||
- page_mode='offset0': Use 0-based index of the search result,
|
||||
which is a multiple of results_per_page.
|
||||
- page_mode='offset1': Use 1-based index of the search result
|
||||
(one plus a multiple of results_per_page).
|
||||
|
||||
If begin is not None, then only text after the first occurrence of
|
||||
begin will be used in the search results page. If end is not None,
|
||||
then only text before the first occurrence of end will be used.
|
||||
|
||||
Finally, link_re is a regex string (see module re) which matches
|
||||
three named groups: 'name', 'url', and 'desc'. These correspond to
|
||||
the name, URL and description of each search result. The regex is
|
||||
applied in re.DOTALL mode.
|
||||
|
||||
Returns a search() function which has the same interface as
|
||||
described in the module docstring.
|
||||
"""
|
||||
def search_blocking(query, max_results):
|
||||
last_links = None
|
||||
page_num = 0
|
||||
# done = False
|
||||
q = Queue.Queue()
|
||||
for i in range(max_results):
|
||||
if q.qsize() == 0:
|
||||
if page_num == 0:
|
||||
page = read_url(query_url % {'q': quote_plus(query)})
|
||||
else:
|
||||
# if done:
|
||||
# break
|
||||
if page_mode == 'page0':
|
||||
n = page_num
|
||||
elif page_mode == 'page1':
|
||||
n = page_num + 1
|
||||
elif page_mode == 'offset0':
|
||||
n = page_num * results_per_page
|
||||
elif page_mode == 'offset1':
|
||||
n = page_num * results_per_page + 1
|
||||
else:
|
||||
raise ValueError('unknown page mode')
|
||||
page = read_url(page_url % {'n': n, 'q': quote_plus(query)})
|
||||
page_num += 1
|
||||
links = get_search_page_links(page, results_per_page, begin, end, link_re)
|
||||
if len(links) == 0 or links == last_links:
|
||||
break
|
||||
# if len(links) < results_per_page:
|
||||
# done = True
|
||||
last_links = links
|
||||
for link in links:
|
||||
q.put(link)
|
||||
yield q.get()
|
||||
|
||||
search_nonblocking = nonblocking(search_blocking)
|
||||
|
||||
def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):
|
||||
"""
|
||||
See docstring for web_search module.
|
||||
"""
|
||||
if blocking:
|
||||
return search_blocking(query, max_results)
|
||||
else:
|
||||
return search_nonblocking(query, max_results)
|
||||
|
||||
return search
|
||||
|
||||
|
||||
def examples():
|
||||
"""
|
||||
Examples of the web_search module.
|
||||
|
||||
Example 1:
|
||||
|
||||
>>> from web_search import google
|
||||
>>> for (name, url, desc) in google('python', 20):
|
||||
... print name, url
|
||||
...
|
||||
(First 20 results for Google search of "python").
|
||||
|
||||
Example 2:
|
||||
|
||||
>>> from web_search import dmoz
|
||||
>>> list(dmoz('abc', 10))
|
||||
[('ABC.com', 'http://www.abc.com', "What's on ABC..."), ...]
|
||||
|
||||
"""
|
||||
print examples.__doc__
|
||||
|
||||
|
||||
def description():
|
||||
"""
|
||||
Full explanation and precautions for web_search module.
|
||||
|
||||
The search functions in this module follow a common interface::
|
||||
|
||||
search(query, max_results=10, blocking=True) =>
|
||||
iterator of (name, url, description) search results.
|
||||
|
||||
Here query is the query string, max_results gives the maximum number
|
||||
of search results, and the items in the returned iterator are string
|
||||
3-tuples containing the Website name, URL, and description for each
|
||||
search result.
|
||||
|
||||
If blocking=False, then an iterator is returned which does not block
|
||||
execution: the iterator yields None when the next search result is
|
||||
not yet available (a background thread is created).
|
||||
|
||||
Supported search engines are 'ask', 'dmoz', 'excite', 'google', 'msn',
|
||||
'yahoo'. This module is not associated with or endorsed by any of
|
||||
these search engine corporations.
|
||||
|
||||
Be warned that if searches are made too frequently, or max_results is
|
||||
large and you enumerate all search results, then you will be a drain
|
||||
on the search engine's bandwidth, and the search engine organization
|
||||
may respond by banning your IP address or IP address range.
|
||||
|
||||
This software has been placed in the public domain with the
|
||||
following legal notice::
|
||||
|
||||
http://oregonstate.edu/~barnesc/documents/public_domain.txt
|
||||
|
||||
"""
|
||||
print description.__doc__
|
||||
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Search engines
|
||||
# --------------------------------------------------------------------
|
||||
|
||||
ask = make_searcher('http://www.ask.com/web?q=%(q)s', 10,
|
||||
'http://www.ask.com/web?page=%(n)d&q=%(q)s', 'page1',
|
||||
None, None,
|
||||
r'<a .*? class="L4" href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
|
||||
r'.*?</div>(?P<desc>.*?)</div>')
|
||||
|
||||
dmoz = make_searcher('http://search.dmoz.org/cgi-bin/search?search=%(q)s', 20,
|
||||
'http://search.dmoz.org/cgi-bin/search?start=%(n)d&search=%(q)s', 'offset1',
|
||||
None, None,
|
||||
r'<li><a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
|
||||
r'.*? - (?P<desc>.*?)<br>')
|
||||
|
||||
excite = make_searcher('http://msxml.excite.com/info.xcite/search/web/%(q)s', 20,
|
||||
'http://msxml.excite.com/info.xcite/search/web/%(q)s/%(n)d', 'offset1',
|
||||
None, None,
|
||||
r'<div class="listingmain" style=""><a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
|
||||
r'(?P<desc>.*?)</span>')
|
||||
|
||||
google = make_searcher('http://www.google.com/search?q=%(q)s', 10,
|
||||
'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0',
|
||||
None, None,
|
||||
r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' +
|
||||
r'.*?(?:<br>|<table.*?>)' +
|
||||
r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)')
|
||||
|
||||
msn = make_searcher('http://search.msn.com/results.aspx?q=%(q)s', 10,
|
||||
'http://search.msn.com/results.aspx?q=%(q)s&first=%(n)d', 'offset1',
|
||||
'<h2>Results</h2>', '<div id="ads_rightC">',
|
||||
r'<h3><a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
|
||||
r'(?P<desc>.*?)<li class="first">')
|
||||
|
||||
yahoo = make_searcher('http://search.yahoo.com/search?p=%(q)s', 10,
|
||||
'http://search.yahoo.com/search?p=%(q)s&b=%(n)d', 'offset1',
|
||||
None, None,
|
||||
'<li><div><a class=yschttl.*?href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
|
||||
r'.*?<div class=yschabstr>(?P<desc>.*?)</div>')
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Unit tests
|
||||
# --------------------------------------------------------------------
|
||||
|
||||
def test_engine(search):
|
||||
"""
|
||||
Test a search engine function returned by make_searcher().
|
||||
"""
|
||||
for query in ['abc', 'microsoft', 'love', 'pweropieiw', 'addfdae']:
|
||||
popular = query in ['abc', 'microsoft', 'love', 'run']
|
||||
for n in [6, 17, 31]:
|
||||
n1 = len(list(search(query, n)))
|
||||
if popular:
|
||||
assert n1 == n
|
||||
else:
|
||||
assert n1 <= n
|
||||
n2 = 0
|
||||
for item in search(query, n, False):
|
||||
if item is not None:
|
||||
n2 += 1
|
||||
else:
|
||||
time.sleep(0.01)
|
||||
if popular:
|
||||
assert n2 == n
|
||||
else:
|
||||
assert n2 <= n
|
||||
|
||||
|
||||
def test():
|
||||
"""
|
||||
Unit test main routine.
|
||||
"""
|
||||
import inspect
|
||||
print 'Testing:'
|
||||
for name in SEARCH_ENGINES:
|
||||
print ' ' + (name + ':').ljust(20),
|
||||
test_engine(getattr(inspect.getmodule(test), name))
|
||||
print 'OK'
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test()
|
34
scrapeit/googlemovie.py
Normal file
34
scrapeit/googlemovie.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
import re
|
||||
from urllib import quote
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
from utils import read_url, read_url_utf8, stripTags
|
||||
|
||||
def getGoogleMovieId(title):
|
||||
url = 'http://google.com/movies?q=%s&btnG=Search+Movies' % quote(title)
|
||||
data = read_url(url)
|
||||
cids = re.compile('reviews\?cid=(.*?)&').findall(data)
|
||||
if cids:
|
||||
return cids[0]
|
||||
return ''
|
||||
|
||||
def getGoogleMovieData(title, year = None, cid = None):
|
||||
gdata = {
|
||||
'title': title,
|
||||
'year': year,
|
||||
'cid': cid,
|
||||
'rating': '',
|
||||
}
|
||||
if not cid:
|
||||
cid = getGoogleMovieId("%s (%s)" % (title, year))
|
||||
if cid:
|
||||
gdata['cid'] = cid
|
||||
data = read_url('http://www.google.com/movies/reviews?cid=%s' % cid)
|
||||
gdata['rating'] = re.compile('font size=.3><b><nobr>(.*?) / 5').findall(data)[0]
|
||||
gdata['reviews'] = re.compile('Based on (.*?) reviews').findall(data)[0]
|
||||
gdata['year'] = re.compile("<title>.*?\((.*?)\).*?</title").findall(data)[0]
|
||||
return gdata
|
441
scrapeit/imdb.py
Normal file
441
scrapeit/imdb.py
Normal file
|
@ -0,0 +1,441 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
import urllib2
|
||||
from urllib import quote
|
||||
import re, time
|
||||
import os
|
||||
|
||||
from elementtree.ElementTree import parse, tostring
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
from google import google
|
||||
from utils import stripTags, read_url_utf8, htmldecode
|
||||
|
||||
import utils
|
||||
|
||||
def read_url(url):
|
||||
base = "/var/cache/scrapeit/cache/"
|
||||
path = os.path.join(base, url.replace('http://',''))
|
||||
if path.endswith('/'):
|
||||
path = "%sindex.html" % path
|
||||
if os.path.isdir(path):
|
||||
path = "%s/index.html" % path
|
||||
if os.path.exists(path):
|
||||
f = open(path)
|
||||
data = f.read()
|
||||
f.close()
|
||||
return data
|
||||
else:
|
||||
data = utils.read_url(url)
|
||||
folder = os.path.dirname(path)
|
||||
if not os.path.exists(folder):
|
||||
os.makedirs(folder)
|
||||
f = open(path, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
return data
|
||||
|
||||
def _get_data(url):
|
||||
data = None
|
||||
try:
|
||||
data = read_url(url)
|
||||
except:
|
||||
print "error reading data from", url
|
||||
return data
|
||||
|
||||
def get_image(url):
|
||||
return read_url(url)
|
||||
|
||||
def _castList(data, regexp):
|
||||
soup = re.compile(regexp).findall(data)
|
||||
if soup:
|
||||
soup = BeautifulSoup(soup[0])
|
||||
names = []
|
||||
for i in soup('a', {'href': re.compile('/name/nm')}):
|
||||
if i.string:
|
||||
cast = stripTags(i.string)
|
||||
if cast not in names:
|
||||
names.append(cast)
|
||||
return names
|
||||
return []
|
||||
|
||||
def _getTerm(data, regexp):
|
||||
term = ''
|
||||
try:
|
||||
reg = re.compile(regexp, re.IGNORECASE)
|
||||
m = reg.search(data)
|
||||
if m:
|
||||
term = stripTags(m.group(1)).strip()
|
||||
except:
|
||||
print "waring, parsing failed for", regexp
|
||||
return term.encode('utf8')
|
||||
|
||||
|
||||
class IMDb:
|
||||
def __init__(self, imdb):
|
||||
self.imdb = imdb
|
||||
self.pageSource = None
|
||||
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
|
||||
|
||||
self.businessSource = None
|
||||
self.businessUrl = "%sbusiness" % self.pageUrl
|
||||
self.connectionsSource = None
|
||||
self.connectionsUrl = "%smovieconnections" % self.pageUrl
|
||||
self.creditsSource = None
|
||||
self.creditsUrl = "%sfullcredits" % self.pageUrl
|
||||
self.episodesSource = None
|
||||
self.episodesUrl = "%sepisodes" % self.pageUrl
|
||||
self.keywordSource = None
|
||||
self.keywordUrl = "%skeywords" % self.pageUrl
|
||||
self.plotSource = None
|
||||
self.plotUrl = "%splotsummary" % self.pageUrl
|
||||
self.releaseinfoSource = None
|
||||
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
|
||||
self.triviaSource = None
|
||||
self.triviaUrl = "%strivia" % self.pageUrl
|
||||
|
||||
def getPage(self, forcereload = False):
|
||||
if forcereload or not self.pageSource:
|
||||
self.pageSource = read_url(self.pageUrl)
|
||||
return self.pageSource
|
||||
|
||||
def parse_raw_value(self, key, value):
|
||||
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
|
||||
value = stripTags(value).strip()
|
||||
if key == 'runtime':
|
||||
parsed_value = _getTerm(value, '(.*?) min')
|
||||
parsed_value = _getTerm(parsed_value, '([0-9]+)')
|
||||
if not parsed_value:
|
||||
parsed_value = _getTerm(value, '(.*?) sec')
|
||||
parsed_value = _getTerm(parsed_value, '([0-9]+)')
|
||||
if not parsed_value:
|
||||
parsed_value = 0
|
||||
else:
|
||||
parsed_value = int(parsed_value)
|
||||
else:
|
||||
parsed_value = int(parsed_value) * 60
|
||||
elif key in ('country', 'language'):
|
||||
parsed_value = value.split(' / ')
|
||||
elif key == 'genre':
|
||||
parsed_value = value.replace('more', '').strip().split(' / ')
|
||||
elif key == 'tagline':
|
||||
parsed_value = value.replace('more', '').strip()
|
||||
elif key == 'plot_outline':
|
||||
parsed_value = value.replace('(view trailer)', '').strip()
|
||||
if parsed_value.endswith('more'):
|
||||
parsed_value = parsed_value[:-4].strip()
|
||||
elif key == 'tv_series':
|
||||
m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
|
||||
if m:
|
||||
parsed_value = m[0][0]
|
||||
else:
|
||||
parsed_value = ''
|
||||
else:
|
||||
print value
|
||||
parsed_value = value
|
||||
return parsed_value
|
||||
|
||||
def parse(self):
|
||||
data = self.getPage()
|
||||
IMDbDict ={}
|
||||
#Poster
|
||||
IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
|
||||
if not IMDbDict['poster']:
|
||||
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
|
||||
#Title, Year
|
||||
title = u''
|
||||
year = u''
|
||||
flat_data = data.replace('\n', '').replace('\r', '')
|
||||
html_title = re.compile('<strong class="title">(.*?) <small>\(<a href="/Sections/Years/(.*?)">').findall(flat_data)
|
||||
if html_title:
|
||||
title = html_title[0][0]
|
||||
IMDbDict['year'] = html_title[0][1]
|
||||
IMDbDict['title'] = stripTags(title).strip()
|
||||
else:
|
||||
title = _getTerm(data, '<title>(.*?)</title>').split('(')
|
||||
year = title[-1].split(')')[0].strip()
|
||||
title = title[0].strip().decode('utf-8')
|
||||
IMDbDict['title'] = title
|
||||
IMDbDict['year'] = year
|
||||
IMDbDict['title'] = htmldecode(IMDbDict['title'])
|
||||
if IMDbDict['title'][0] == '"' and IMDbDict['title'][-1] == '"':
|
||||
IMDbDict['title'] = IMDbDict['title'][1:-1]
|
||||
|
||||
#Votes
|
||||
m = re.compile('<b>(.*?)/10</b> \(<a href="ratings">(.*?) votes</a>\)', re.IGNORECASE).search(data)
|
||||
if m:
|
||||
IMDbDict['rating'] = int(float(m.group(1)) * 1000)
|
||||
IMDbDict['votes'] = int(m.group(2).replace(',', ''))
|
||||
else:
|
||||
IMDbDict['rating'] = -1
|
||||
IMDbDict['votes'] = -1
|
||||
|
||||
data = data.replace('\n',' ')
|
||||
#some values
|
||||
keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series')
|
||||
for key in keys:
|
||||
IMDbDict[key] = ''
|
||||
IMDbDict['runtime'] = 0
|
||||
soup = BeautifulSoup(data)
|
||||
for info in soup('div', {'class': 'info'}):
|
||||
key = str(info).split('</h5>')[0].split('<h5>')
|
||||
if len(key) > 1:
|
||||
raw_value = str(info).split('</h5>')[1]
|
||||
key = key[1][:-1].lower().replace(' ', '_')
|
||||
if key in keys:
|
||||
IMDbDict[key] = self.parse_raw_value(key, raw_value)
|
||||
|
||||
#is episode
|
||||
IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
|
||||
|
||||
IMDbDict['episodes'] = self.parseEpisodes()
|
||||
IMDbDict['credits'] = self.parseCredits()
|
||||
IMDbDict['plot'] = self.parsePlot()
|
||||
IMDbDict['keywords'] = self.parseKeywords()
|
||||
|
||||
IMDbDict['trivia'] = self.parseTrivia()
|
||||
IMDbDict['connections'] = self.parseConnections()
|
||||
IMDbDict['release_date'] = self.parseReleaseinfo()
|
||||
IMDbDict['business'] = self.parseBusiness()
|
||||
self.IMDbDict = IMDbDict
|
||||
return self.IMDbDict
|
||||
|
||||
def getCredits(self, forcereload = False):
|
||||
if forcereload or not self.creditsSource:
|
||||
self.creditsSource = read_url(self.creditsUrl)
|
||||
return self.creditsSource
|
||||
|
||||
def parseCredits(self):
|
||||
data = self.getCredits()
|
||||
credits = {}
|
||||
credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
|
||||
credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>')
|
||||
credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>')
|
||||
#credits['cast'] = _castList(data, 'Cast</b>.*?(<tr.*?)</table>')
|
||||
credits['cast'] = []
|
||||
soup = re.compile('Cast</b>.*?(<tr.*?)</table>').findall(data)
|
||||
soup = BeautifulSoup(data)
|
||||
cast = soup('table', {'class': 'cast'})
|
||||
if cast:
|
||||
cast = str(cast[0])
|
||||
names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
|
||||
for name in names:
|
||||
real_name = name[0]
|
||||
role_name = name[1]
|
||||
if role_name:
|
||||
role_name = role_name.split('(')[0].replace('/ ...','').strip()
|
||||
credits['cast'].append((stripTags(real_name), stripTags(role_name)))
|
||||
self.credits = credits
|
||||
return self.credits
|
||||
|
||||
def getPlot(self, forcereload = False):
|
||||
if forcereload or not self.plotSource:
|
||||
self.plotSource = read_url(self.plotUrl)
|
||||
return self.plotSource
|
||||
|
||||
def parsePlot(self):
|
||||
soup = BeautifulSoup(self.getPlot())
|
||||
plot = soup('p', {'class':'plotpar'})
|
||||
if plot:
|
||||
plot = str(plot[0]).split('<i>')[0]
|
||||
else:
|
||||
plot = u''
|
||||
plot = stripTags(plot).strip()
|
||||
self.plot = plot
|
||||
return plot
|
||||
|
||||
def getEpisodes(self, forcereload = False):
|
||||
if forcereload or not self.episodesSource:
|
||||
self.episodesSource = read_url(self.episodesUrl)
|
||||
return self.episodesSource
|
||||
|
||||
def parseEpisodes(self):
|
||||
episodes = {}
|
||||
cdata = self.getEpisodes().replace('\r\n',' ')
|
||||
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?</b><br>(.*?)<br/>'''
|
||||
#regexp = r'''Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></span><br>.*?<br>(.*?)</td>'''
|
||||
reg = re.compile(regexp, re.IGNORECASE)
|
||||
m = reg.findall(cdata)
|
||||
for match in m:
|
||||
try:
|
||||
episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
|
||||
episodes[episode] = {}
|
||||
episodes[episode]['imdb'] = match[2]
|
||||
episodes[episode]['title'] = match[3].strip()
|
||||
description = htmldecode(match[4])
|
||||
description = stripTags(description.split('Next US airings:')[0])
|
||||
episodes[episode]['description'] = description
|
||||
except:
|
||||
import traceback
|
||||
print traceback.print_exc()
|
||||
pass
|
||||
self.episodes = episodes
|
||||
return self.episodes
|
||||
|
||||
def getKeywords(self, forcereload = False):
|
||||
if forcereload or not self.keywordSource:
|
||||
self.keywordSource = read_url(self.keywordUrl)
|
||||
return self.keywordSource
|
||||
|
||||
def parseKeywords(self):
|
||||
soup = BeautifulSoup(self.getKeywords())
|
||||
keywords = []
|
||||
for key in soup('a', {'href': re.compile('/keyword')}):
|
||||
keywords.append(htmldecode(key.string))
|
||||
self.keywords = keywords
|
||||
return self.keywords
|
||||
|
||||
def getTrivia(self, forcereload = False):
|
||||
if forcereload or not self.triviaSource:
|
||||
self.triviaSource = read_url(self.triviaUrl)
|
||||
return self.triviaSource
|
||||
|
||||
def parseTrivia(self):
|
||||
trivia = []
|
||||
soup = BeautifulSoup(self.getTrivia())
|
||||
triviaList = []
|
||||
for i in soup('ul', {'class': "trivia"}):
|
||||
for t in i('li'):
|
||||
t = str(t).replace('<br />', '').strip()
|
||||
if t.startswith('<li>') and t.endswith('</li>'):
|
||||
t = t[4:-5].strip()
|
||||
trivia.append(t)
|
||||
self.trivia = trivia
|
||||
return self.trivia
|
||||
|
||||
def getConnections(self, forcereload = False):
|
||||
if forcereload or not self.connectionsSource:
|
||||
self.connectionsSource = read_url(self.connectionsUrl)
|
||||
return self.connectionsSource
|
||||
|
||||
def parseConnections(self):
|
||||
connections = {}
|
||||
soup = BeautifulSoup(self.getConnections())
|
||||
content = soup('div', {'id': 'tn15content'})[0]
|
||||
blocks = str(content).split('<h5>')[1:]
|
||||
for c in blocks:
|
||||
connection = c.split('</h5>')[0]
|
||||
cs = BeautifulSoup(c)
|
||||
if connection:
|
||||
#relation -> list of imdb ids
|
||||
connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
|
||||
return connections
|
||||
|
||||
def getReleaseinfo(self, forcereload = False):
|
||||
if forcereload or not self.releaseinfoSource:
|
||||
self.releaseinfoSource = read_url(self.releaseinfoUrl)
|
||||
return self.releaseinfoSource
|
||||
|
||||
def parseReleaseinfo(self):
|
||||
soup = BeautifulSoup(self.getReleaseinfo())
|
||||
for row in soup('table',{'border': '0', 'cellpadding':'2'})[0]('tr'):
|
||||
d = row('td', {'align':'right'})
|
||||
if d:
|
||||
try:
|
||||
possible_date = stripTags(str(d[0])).strip()
|
||||
rdate = time.strptime(possible_date, "%d %B %Y")
|
||||
rdate = time.strftime('%Y-%m-%d', rdate)
|
||||
return rdate
|
||||
except:
|
||||
pass
|
||||
return None
|
||||
|
||||
def getBusiness(self, forcereload = False):
|
||||
if forcereload or not self.businessSource:
|
||||
self.businessSource = read_url(self.businessUrl)
|
||||
return self.businessSource
|
||||
|
||||
def parseBusiness(self):
|
||||
soup = BeautifulSoup(self.getBusiness())
|
||||
business = {'budget': 0, 'gross': 0, 'profit': 0}
|
||||
content = soup('div', {'id': 'tn15content'})[0]
|
||||
blocks = str(content).split('<h5>')[1:]
|
||||
for c in blocks:
|
||||
cs = BeautifulSoup(c)
|
||||
line = c.split('</h5>')
|
||||
if line:
|
||||
title = line[0]
|
||||
line = line[1]
|
||||
if title in ['Budget', 'Gross']:
|
||||
values = re.compile('\$(.*?) ').findall(line)
|
||||
values = [int(value.replace(',','')) for value in values]
|
||||
if values:
|
||||
business[title.lower()] = max(values)
|
||||
if business['budget'] and business['gross']:
|
||||
business['profit'] = business['gross'] - business['budget']
|
||||
return business
|
||||
|
||||
def guess(title, director=''):
|
||||
#FIXME: proper file -> title
|
||||
title = title.split('-')[0]
|
||||
title = title.split('(')[0]
|
||||
title = title.split('.')[0]
|
||||
title = title.strip()
|
||||
imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
|
||||
return_url = ''
|
||||
|
||||
#lest first try google
|
||||
#i.e. site:imdb.com Michael Stevens Sin
|
||||
if director:
|
||||
search = 'site:imdb.com %s "%s"' % (director, title)
|
||||
else:
|
||||
search = 'site:imdb.com "%s"' % title
|
||||
for (name, url, desc) in google(search, 1):
|
||||
if url.startswith('http://www.imdb.com/title/tt'):
|
||||
return url[28:35]
|
||||
|
||||
req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
|
||||
u = urllib2.urlopen(req)
|
||||
data = u.read()
|
||||
return_url = u.url
|
||||
u.close()
|
||||
|
||||
if return_url.startswith('http://www.imdb.com/title/tt'):
|
||||
return return_url[28:35]
|
||||
if data:
|
||||
imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
|
||||
if imdb_id:
|
||||
return imdb_id
|
||||
|
||||
imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
|
||||
req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
|
||||
u = urllib2.urlopen(req)
|
||||
data = u.read()
|
||||
return_url = u.url
|
||||
u.close()
|
||||
if return_url.startswith('http://www.imdb.com/title/tt'):
|
||||
return return_url[28:35]
|
||||
|
||||
return None
|
||||
|
||||
def getEpisodeData(title, episode, show_url = None):
|
||||
'''
|
||||
Collect information about an episode.
|
||||
|
||||
Returns dict with title, show, description and episode
|
||||
'''
|
||||
episodeData = {
|
||||
'title': u'',
|
||||
'show': title,
|
||||
'description': u'',
|
||||
'episode': episode,
|
||||
}
|
||||
description = u''
|
||||
if not show_url:
|
||||
imdbid = guess(title)
|
||||
else:
|
||||
imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(link)[0])
|
||||
if imdbid:
|
||||
i = IMDb(imdbid).parse()
|
||||
episodeData['title'] = i['episodes'][episode]['title']
|
||||
episodeData['description'] = i['episodes'][episode]['description']
|
||||
episodeData['imdb'] = i['episodes'][episode]['imdb']
|
||||
return episodeData
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
#print parse(sys.argv[1])
|
||||
print "imdb:", guess(sys.argv[1])
|
40
scrapeit/mininova.py
Normal file
40
scrapeit/mininova.py
Normal file
|
@ -0,0 +1,40 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
import re
|
||||
import socket
|
||||
from urllib import quote
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
from utils import read_url, read_url_utf8
|
||||
from btutils import torrentsWeLike
|
||||
|
||||
socket.setdefaulttimeout(10.0)
|
||||
|
||||
def search(query):
|
||||
'''search for torrents on mininova
|
||||
'''
|
||||
torrents = []
|
||||
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
|
||||
page = read_url(url)
|
||||
soup = BeautifulSoup(page)
|
||||
for row in soup('tr'):
|
||||
links = row('a', {'href':re.compile('/tor')})
|
||||
if links and torrentsWeLike(links[0]):
|
||||
torrent_url = "http://www.mininova.org%s" % links[0].get('href').replace('/tor', '/get')
|
||||
torrents.append(torrent_url)
|
||||
return torrents
|
||||
|
||||
def searchByImdb(imdb):
|
||||
'''search for torrents on mininova by imdb
|
||||
'''
|
||||
torrents = []
|
||||
page = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdb)
|
||||
soup = BeautifulSoup(page)
|
||||
for row in soup('tr'):
|
||||
links = row('a', {'href':re.compile('/get')})
|
||||
if links:
|
||||
torrent_url = "http://www.mininova.org%s" % links[0].get('href')
|
||||
torrents.append(torrent_url)
|
||||
return torrents
|
37
scrapeit/rottentomatoes.py
Normal file
37
scrapeit/rottentomatoes.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
from urllib import quote
|
||||
import re
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
from utils import read_url
|
||||
|
||||
|
||||
def getRottenTomatoes(rating = 70):
|
||||
'''
|
||||
Get movie TITLES
|
||||
rated ABOVE 70 or value passed as first argument
|
||||
from RottenTomatoes
|
||||
'''
|
||||
movies = []
|
||||
offset = 0
|
||||
titles = ['1']
|
||||
while titles:
|
||||
url = "http://www.rottentomatoes.com/movies/browser.php?movietype=1&genre=&tomatometer=&avgrating=%s&numreviews=10&mpaa=&x=40&y=5&start_index=%s" % (rating, offset)
|
||||
page = read_url(url)
|
||||
soup = BeautifulSoup(page)
|
||||
titles = [link.contents[0] for link in soup.findAll('a', {'class': 'movie-link'})]
|
||||
data = str(soup)
|
||||
ratings = re.compile('<span class="bold">(.*?) %</span>').findall(data)
|
||||
|
||||
ratings = ratings[len(ratings)- len(titles):]
|
||||
|
||||
for title in titles:
|
||||
movies.append({'title': title, 'rating': ratings[titles.index(title)], 'torrent': ''})
|
||||
|
||||
offset += 10
|
||||
return movies
|
||||
|
16
scrapeit/scrapetorrent.py
Normal file
16
scrapeit/scrapetorrent.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
from urllib import quote
|
||||
import re
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
|
||||
def search(query):
|
||||
'''search for torrents on scrapetorrent
|
||||
'''
|
||||
torrents = []
|
||||
return torrents
|
||||
|
104
scrapeit/thepiratebay.py
Normal file
104
scrapeit/thepiratebay.py
Normal file
|
@ -0,0 +1,104 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
import re
|
||||
import socket
|
||||
from urllib import quote
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
from google import google
|
||||
from utils import read_url, read_url_utf8
|
||||
|
||||
|
||||
socket.setdefaulttimeout(10.0)
|
||||
|
||||
season_episode = re.compile("S..E..", re.IGNORECASE)
|
||||
|
||||
def shows(name = None):
|
||||
data = read_url_utf8('http://thepiratebay.org/tv/all')
|
||||
shows = re.compile('<dt><a href="/tv/(.*?)/">(.*?)</a></dt>').findall(data)
|
||||
if not name:
|
||||
return shows
|
||||
for show in shows:
|
||||
id = show[0]
|
||||
if name == show[1]:
|
||||
return id
|
||||
return ''
|
||||
|
||||
def findMatch(data, reg):
|
||||
m = re.compile(reg).findall(data)
|
||||
if m:
|
||||
return m[0]
|
||||
return u''
|
||||
|
||||
def get_info(url):
|
||||
url = url.strip()
|
||||
if url.startswith('/'):
|
||||
url = 'http://thepiratebay.org' + url
|
||||
data = read_url(url)
|
||||
line = data.replace('\n', ' ')
|
||||
info = {}
|
||||
info['torrent'] = findMatch(data, '(http://.*?.torrent)"')
|
||||
info['files'] = findMatch(data, '<dd><a href="/details.php.id=.*?&fl#show">(.*?)</a></dd>')
|
||||
try:
|
||||
info['files'] = int(info['files'])
|
||||
except:
|
||||
info['files'] = 0
|
||||
info['spoken_language'] = findMatch(line, '<dt>Spoken language\(s\):</dt>.*?<dd>(.*?)</dd>')
|
||||
info['texted_language'] = findMatch(line, '<dt>Texted language\(s\):</dt>.*?<dd>(.*?)</dd>')
|
||||
return info
|
||||
|
||||
def get_episode_name(string):
|
||||
episode = ''
|
||||
ep = season_episode.findall(string)
|
||||
if ep:
|
||||
episode = ep[0].upper()
|
||||
return episode
|
||||
|
||||
def in_killwords(string):
|
||||
string = string.lower()
|
||||
match = False
|
||||
for w in ['swesub', 'mpeg']:
|
||||
if w in string:
|
||||
match = True
|
||||
return match
|
||||
|
||||
def get_episode(show_id, episode):
|
||||
if show_id <= 0:
|
||||
return ''
|
||||
tpbe = get_episodes(show_id)
|
||||
for e in tpbe:
|
||||
link =e[0]
|
||||
ep = get_episode_name(e[1])
|
||||
if ep == episode:
|
||||
info = get_info(link)
|
||||
if not in_killwords(info['torrent']) \
|
||||
and info['files'] > 0 and info['files'] < 10 \
|
||||
and (not info['texted_language'] or info['texted_language'] == info['spoken_language']):
|
||||
return info['torrent']
|
||||
return u''
|
||||
|
||||
def get_episodes(id):
|
||||
data = read_url("http://thepiratebay.org/tv/%s" % id)
|
||||
episodes = re.compile('<nobr><a href="(.*?)">(.*?)</a></nobr>').findall(data)
|
||||
return episodes
|
||||
|
||||
def search(query):
|
||||
torrents = []
|
||||
url = "http://thepiratebay.org/search.php?video=on&q=%s" % quote(query)
|
||||
page = read_url(url)
|
||||
soup = BeautifulSoup(page)
|
||||
for row in soup('tr'):
|
||||
torrentType = row.findAll('td', {'class': 'vertTh'})
|
||||
if torrentType:
|
||||
torrentType = torrentType[0]('a')[0].get('href').split('/')[-1]
|
||||
# 201 = Movies , 202 = Movie DVDR
|
||||
if torrentType in ['201']:
|
||||
torrent = row.findAll('a', {'href':re.compile('.torrent$')})[0].get('href')
|
||||
torrents.append(torrent)
|
||||
return torrents
|
||||
|
||||
def searchByImdb(imdb):
|
||||
return search("tt" + imdb)
|
18
scrapeit/torrent.py
Normal file
18
scrapeit/torrent.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
import mininova
|
||||
import btjunkie
|
||||
import thepiratebay
|
||||
|
||||
def search(query):
|
||||
'''meta function to search with the best known torrent search engine
|
||||
'''
|
||||
return btjunkie.search(query)
|
||||
|
||||
def searchByImdb(imdb):
|
||||
'''meta function to search by imdb with the best known torrent search engine
|
||||
'''
|
||||
return mininova.searchByImdb(imdb)
|
||||
|
34
scrapeit/tvcom.py
Normal file
34
scrapeit/tvcom.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
import re
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
from utils import read_url_utf8, stripTags
|
||||
|
||||
def getEpisodeData(url):
|
||||
''' prases informatin on tvcom episode pages
|
||||
returns dict with title, show, description, score
|
||||
'''
|
||||
tvcom = {
|
||||
'description': u''
|
||||
}
|
||||
data = read_url_utf8(url).replace('\n',' ')
|
||||
regexp = r'''<div id="main-col">.*?<div>(.*?)<div class="ta-r mt-10 f-bold">'''
|
||||
reg = re.compile(regexp, re.IGNORECASE)
|
||||
m = reg.findall(data)
|
||||
for match in m:
|
||||
description = match.strip()
|
||||
description = stripTags(description).replace('Watch Video','')
|
||||
tvcom['description'] = description.strip()
|
||||
soup = BeautifulSoup(data)
|
||||
#optional data
|
||||
try:
|
||||
tvcom['show'] = soup('h1')[0].contents[0]
|
||||
tvcom['title'] = soup('h1')[1].contents[0]
|
||||
tvcom['score'] = soup("span", {'class':"f-28 f-bold mt-10 mb-10 f-FF9 db lh-18"})[0].contents[0]
|
||||
except:
|
||||
pass
|
||||
return tvcom
|
219
scrapeit/tvrss.py
Executable file
219
scrapeit/tvrss.py
Executable file
|
@ -0,0 +1,219 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
from os.path import *
|
||||
import sys
|
||||
import datetime
|
||||
import time
|
||||
import re
|
||||
from urllib2 import urlopen
|
||||
import Image
|
||||
import StringIO
|
||||
|
||||
import feedparser
|
||||
|
||||
from utils import read_url
|
||||
|
||||
|
||||
hr_hdtv = re.compile('HR HDTV')
|
||||
hdtv = re.compile('HDTV')
|
||||
|
||||
def get_url(title):
|
||||
return title.replace(' ','_').replace('/', '_').lower()
|
||||
|
||||
def get_show(string):
|
||||
return string.split(';')[0].split(':')[1].strip()
|
||||
|
||||
def get_title(string):
|
||||
title = string.split(';')[1].split(':')[1].strip()
|
||||
if title != 'n/a':
|
||||
return title
|
||||
return ''
|
||||
|
||||
def get_season(string):
|
||||
try:
|
||||
season = int(string.split(';')[2].split(':')[1].strip())
|
||||
except:
|
||||
return None
|
||||
return season
|
||||
|
||||
def get_episode(string):
|
||||
try:
|
||||
episode = int(string.split(';')[3].split(':')[1].strip())
|
||||
except:
|
||||
return None
|
||||
return episode
|
||||
|
||||
def get_episodedate(string):
|
||||
s = string.split('Episode Date:')
|
||||
if len(s) == 2:
|
||||
return s[1].strip()
|
||||
return None
|
||||
|
||||
def choose_item(old, new):
|
||||
if old['link'] == new['link']:
|
||||
return False
|
||||
if not hdtv.search(old['title']):
|
||||
if hdtv.search(new['title']):
|
||||
display_item(new)
|
||||
log.debug("vs.")
|
||||
display_item(old)
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_imdbdata(imdbid):
|
||||
thumbnail = None
|
||||
description=''
|
||||
imdb = IMDb.parse(imdbid)
|
||||
if imdb:
|
||||
poster = imdb['poster']
|
||||
if poster != 'http://i.imdb.com/Heads/npa.gif':
|
||||
log.debug("getting poster %s" % poster)
|
||||
try:
|
||||
thumbnail = read_url(poster)
|
||||
im = Image.open(StringIO.StringIO(thumbnail))
|
||||
out = StringIO.StringIO()
|
||||
im.crop((0,0,100,100)).convert().save(out, 'JPEG')
|
||||
thumbnail = out.getvalue()
|
||||
except:
|
||||
thumbnail = None
|
||||
if imdb['summary']:
|
||||
description=imdb['summary']
|
||||
else:
|
||||
description=imdb['tagline']
|
||||
return (imdb, description, thumbnail)
|
||||
else:
|
||||
return(imdb, '', None)
|
||||
|
||||

def load():
  log.debug("getting new shows from tvrss...")
  feed = feedparser.parse('http://tvrss.net/feed/combined/')
  shows = {}
  for item in feed['entries']:
    show = get_show(item['description'])
    season = get_season(item['description'])
    episode = get_episode(item['description'])
    episodedate = get_episodedate(item['description'])
    estring = None
    if season and episode:
      estring = "S%02dE%02d" % (season, episode)
    elif episodedate:
      estring = episodedate
    if estring:
      if show and not hr_hdtv.search(item['title']):
        if shows.has_key(show):
          if shows[show].has_key(estring):
            if choose_item(shows[show][estring], item):
              shows[show][estring] = item
          else:
            shows[show][estring] = item
        else:
          shows[show] = {}
          shows[show][estring] = item
  for show in shows:
    imdb = None
    try:
      model.ShowsBlacklist.byShowUrl(get_url(show))
      log.debug("ignoring blacklisted show %s" % show)
      continue
    except:
      pass
    s = None
    try:
      s = model.Shows.byUrl(get_url(show))
    except SQLObjectNotFound:
      try:
        alias = model.ShowsAlias.byAlias(get_url(show))
        s = alias.show
      except SQLObjectNotFound:
        s = None
    if not s:
      log.debug("about to add %s" % show)
      thumbnail = None
      description = ''
      ur = '-'
      imdbid = None  # avoid a NameError below if IMDb.guess() raises
      try:
        imdbid = IMDb.guess(show)
        if imdbid:
          imdb, description, thumbnail = get_imdbdata(imdbid)
          if imdb:
            ur = imdb['rating']
      except:
        import traceback
        traceback.print_exc()
      s = model.Shows(
        title = show,
        url = get_url(show),
        description = description,
        imdb = imdbid,
        imdbUserRating = ur
      )
      s.thumbnail = thumbnail
      meta = metacritic.scrapeMetacritic(s.title, s.metacriticUrl)
      if meta:
        s.metacriticUrl = meta['url']
        s.metacriticScore = "%s" % meta['score']
        for review in meta['critics']:
          model.addReview(s, review)
      model.hub.commit()
      log.debug('added %s' % show)
    for episode in shows[show]:
      episode_title = get_title(shows[show][episode]['description'])
      episode_description = ''
      episode_imdb = ''
      q = model.Episodes.select(AND(
        model.Episodes.q.showID == s.id,
        model.Episodes.q.episode == episode))
      if q.count() == 0:
        if not imdb:
          try:
            imdbid = IMDb.guess(show)
            if imdbid:
              imdb = IMDb.parse(imdbid)
          except:
            pass
        if imdb and imdb['episodes'].has_key(episode):
          episode_title = imdb['episodes'][episode]['title']
          episode_description = imdb['episodes'][episode]['description']
          episode_imdb = imdb['episodes'][episode]['imdb']
        if not episode_description or not episode_title:
          tvcom_data = tvcom.get(show, episode)
          if not episode_description:
            episode_description = tvcom_data['description']
          if not episode_title:
            episode_title = tvcom_data['title']
        e = model.Episodes(
          showID = s.id,
          title = episode_title,
          episode = episode,
          torrent = shows[show][episode]['enclosures'][0]['href'],
          description = episode_description,
          imdb = episode_imdb,
          thumbnail = None,
          pubDate = datetime.datetime.fromtimestamp(time.mktime(shows[show][episode]['updated_parsed']))
        )
        s.lastUpdate = datetime.datetime.now()
        model.hub.commit()
        log.debug("from tvrss add %s %s" % (episode, show))
  log.debug("updating tvrss done.")

if __name__ == '__main__':
  # first look on the command line for a desired config file,
  # if it's not on the command line, then
  # look for setup.py in this directory. If it's not there, this script is
  # probably installed
  if len(sys.argv) > 1:
    turbogears.update_config(configfile=sys.argv[1],
      modulename="btvcr.config")
  elif exists(join(dirname(__file__), "setup.py")):
    turbogears.update_config(configfile="dev.cfg",
      modulename="btvcr.config")
  else:
    turbogears.update_config(configfile="prod.cfg",
      modulename="btvcr.config")

  from btvcr.controllers import Root
  load()
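
For reference, a minimal sketch of the grouping key load() builds from the feed, using feedparser and the helpers defined above; it only prints a few entries and is not part of the module:

import feedparser

feed = feedparser.parse('http://tvrss.net/feed/combined/')
for item in feed['entries'][:5]:
  show = get_show(item['description'])
  season = get_season(item['description'])
  episode = get_episode(item['description'])
  if show and season and episode:
    print show, "S%02dE%02d" % (season, episode)   # the per-episode key used for deduplication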
150
scrapeit/utils.py
Normal file
@ -0,0 +1,150 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""
scrape tools
"""

import re
import time
import urllib
import urllib2

import djangohtml


# Default headers for HTTP requests.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5)'}

# --------------------------------------------------------------------
# Functions
# --------------------------------------------------------------------

def quote_plus(s):
  """
  A variant of urllib.quote_plus which handles ASCII and Unicode.
  """
  return urllib.quote_plus(s.encode('utf-8'))

def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read unicode contents of the given str URL, decoded with the charset
  announced in the response (latin-1 if none is announced).

  Here headers is a map of str -> str for HTTP request headers. If
  blocking is True, returns the page contents. If blocking is
  False, returns an iterator which gives None until a successful read,
  at which point the page contents is yielded.
  """
  req = urllib2.Request(url, None, headers)
  f = urllib2.urlopen(req)
  data = f.read()
  f.close()
  ctype = f.headers.getheader('content-type')
  charset = ctype.split('charset=')
  if len(charset) > 1:
    charset = charset[1]
  else:
    charset = 'latin-1'
  data = unicode(data, charset)
  return data

def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read str contents of given str URL.

  Here headers is a map of str -> str for HTTP request headers. If
  blocking is True, returns the str page contents. If blocking is
  False, returns an iterator which gives None until a successful read,
  at which point the str page contents is yielded.
  """
  req = urllib2.Request(url, None, headers)
  f = urllib2.urlopen(req)
  data = f.read()
  f.close()
  return data
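
A minimal usage sketch for the helpers above; the URL and query string are illustrative, not taken from the scrapers:

print quote_plus(u'fahrenheit 451')            # 'fahrenheit+451'
page = read_url('http://example.com/')         # raw bytes, fetched with DEFAULT_HEADERS
upage = read_url_utf8('http://example.com/')   # decoded using the response charset, latin-1 fallback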

def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Opens given str URL and returns the URL after redirection.
  """
  rurl = url
  try:
    req = urllib2.Request(url, None, headers)
    rurl = urllib2.urlopen(req).url
    rurl = rurl.replace('&src=rss', '')
  except:
    rurl = url
  return rurl

def fix_url(url):
  """
  Given url str, trim redirect stuff and return actual URL.

  Currently this just returns the URL unmodified.
  """
  # if url.lower().find('http%3a//') > 0:
  #   return 'http://' + url[url.lower().rindex('http%3a//')+9:]
  # if url.find('http://') > 0:
  #   return url[url.rindex('http://'):]
  return url

_html_entity_re = re.compile(r'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(?:([0-9]+)|[xX]([0-9a-fA-F]+)));?')
import htmlentitydefs

def html_entity_decode(s, encoding='utf-8'):
  r = []
  p = 0
  mo = _html_entity_re.search(s, p)
  while mo:
    r.append(s[p:mo.start()].decode(encoding))
    i = mo.lastindex
    e = mo.group(i)
    try:
      if i == 1:
        c = htmlentitydefs.name2codepoint[e]
      elif i == 2:
        c = int(e)
      elif i == 3:
        c = int(e, 16)
      else:
        assert 0
      r.append(unichr(c))
    except KeyError:
      r.append(mo.group(0))

    p = mo.end()
    mo = _html_entity_re.search(s, p)
  r.append(s[p:].decode(encoding))
  return u''.join(r)

def stripTags(s):
  return djangohtml.strip_tags(htmldecode(s))


from htmlentitydefs import name2codepoint

# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')

def htmldecode(text):
  """Decode HTML entities in the given text."""
  if type(text) != unicode:
    text = unicode(text)
  if type(text) is unicode:
    uchr = unichr
  else:
    uchr = lambda value: value > 255 and unichr(value) or chr(value)
  def entitydecode(match, uchr=uchr):
    entity = match.group(1)
    if entity.startswith('#x'):
      return uchr(int(entity[2:], 16))
    elif entity.startswith('#'):
      return uchr(int(entity[1:]))
    elif entity in name2codepoint:
      return uchr(name2codepoint[entity])
    else:
      return match.group(0)
  return charrefpat.sub(entitydecode, text)
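
A short, self-contained example of the entity decoding above; the input string is illustrative:

decoded = htmldecode(u'Fish &amp; Chips &#8211; Caf&eacute;')
assert decoded == u'Fish & Chips \u2013 Caf\xe9'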
31
setup.py
Normal file
@ -0,0 +1,31 @@
#!/usr/bin/env python
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
# encoding: utf-8
from setuptools import setup, find_packages

import os

setup(
  name="scrapeit",
  version="0.1",

  # uncomment the following lines if you fill them out in release.py
  description="collection of scrapers for various websites",
  author="bot",
  author_email="bot@mailb.org",
  #url=url,
  #download_url=download_url,
  #license=license,
  packages=find_packages(),
  zip_safe=False,
  keywords = [
  ],
  classifiers = [
    'Development Status :: 3 - Alpha',
    'Operating System :: OS Independent',
    'Programming Language :: Python',
    'Topic :: Software Development :: Libraries :: Python Modules',
  ],
)