commit ca2a42e773655644770e9df80097821132ad855e
Author: j <0x006A@0x2620.org>
Date: Thu Mar 1 15:11:35 2007 +0000
add scrapeit
diff --git a/scrapeit/__init__.py b/scrapeit/__init__.py
new file mode 100644
index 0000000..9193535
--- /dev/null
+++ b/scrapeit/__init__.py
@@ -0,0 +1,14 @@
+# -*- Mode: Python; -*-
+# vi:si:et:sw=2:sts=2:ts=2
+# encoding: utf-8
+
+import btjunkie
+import google
+import imdb
+import mininova
+import thepiratebay
+import torrent
+import rottentomatoes
+
+
+__version__ = '1.0.0'
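
The package is driven through its submodules; a minimal usage sketch (assuming Python 2, which this code targets, network access, and that the package's remaining modules such as utils are present; the query strings are made up):

    import scrapeit

    torrents = scrapeit.btjunkie.search('some movie dvdrip')
    hits = list(scrapeit.google.google('site:epguides.com some show', 5))
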
diff --git a/scrapeit/btjunkie.py b/scrapeit/btjunkie.py
new file mode 100644
index 0000000..d42cac4
--- /dev/null
+++ b/scrapeit/btjunkie.py
@@ -0,0 +1,32 @@
+# -*- Mode: Python; -*-
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=2:sts=2:ts=2
+
+from urllib import quote
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from utils import read_url, stripTags
+from btutils import torrentsWeLike
+
+
+def search(query):
+  '''search for torrents on btjunkie
+  '''
+  url = "http://btjunkie.org/search?q=%s&c=6&t=0&o=52&m=0&l=1" % quote(query)
+  page = read_url(url)
+  soup = BeautifulSoup(page)
+  torrents = soup.findAll('a', {'class': 'BlckUnd'})
+  torrents = filter(torrentsWeLike, torrents)
+  torrent_links = []
+  for t in torrents:
+    tlink = "http://btjunkie.org%s.torrent" % t.attrMap['href']
+    tlink = tlink.replace('do=stat', 'do=download')
+    torrent_links.append(tlink)
+  return torrent_links
+
+def searchByImdb(imdb):
+  '''search for torrents by imdb, not supported on btjunkie right now
+  '''
+  return []
\ No newline at end of file
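
A minimal sketch of how search() is called (the query is made up; the returned list holds .torrent URLs already filtered through torrentsWeLike):

    from scrapeit import btjunkie

    for link in btjunkie.search('some movie title'):
        print link    # http://btjunkie.org/<id>.torrent style download links
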
diff --git a/scrapeit/btutils.py b/scrapeit/btutils.py
new file mode 100644
index 0000000..474ea7c
--- /dev/null
+++ b/scrapeit/btutils.py
@@ -0,0 +1,25 @@
+# -*- Mode: Python; -*-
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=2:sts=2:ts=2
+
+from utils import stripTags
+
+
+def torrentsWeLike(link):
+  '''check if torrent title looks like something we want to see,
+  dvdrip / no cam / no dubbed versions
+  '''
+  text = stripTags(unicode(link)).lower()
+  #no cams / telesyncs or other stuff
+  for word in ('cam', 'telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'vcd', 'ts-screener'):
+    if word in text:
+      return False
+  #no dubbed versions
+  for word in ('italian', 'german', 'spanish', 'french'):
+    if word in text:
+      return False
+  #only dvdrips or dvdscrs
+  for word in ('dvdrip', 'dvdscr', 'dvd screener'):
+    if word in text:
+      return True
+  return False
\ No newline at end of file
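
torrentsWeLike() looks only at the visible text of the link tag (via stripTags from the package's utils module, as imported above), so it can be exercised directly on parsed tags; the torrent names below are made up:

    from BeautifulSoup import BeautifulSoup
    from scrapeit.btutils import torrentsWeLike

    tags = BeautifulSoup('<a href="/t/1">Some.Movie.2007.DVDRip.XviD</a>'
                         '<a href="/t/2">Some.Movie.2007.TS-Screener.German</a>')('a')
    print [torrentsWeLike(t) for t in tags]    # [True, False]
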
diff --git a/scrapeit/djangohtml.py b/scrapeit/djangohtml.py
new file mode 100644
index 0000000..88fe5ec
--- /dev/null
+++ b/scrapeit/djangohtml.py
@@ -0,0 +1,115 @@
+"HTML utilities suitable for global use."
+
+import re, string
+
+# Configuration for urlize() function
+LEADING_PUNCTUATION  = ['(', '<', '&lt;']
+TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
+
+# list of possible strings used for bullets in bulleted lists
+DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
+
+unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
+word_split_re = re.compile(r'(\s+)')
+punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
+    ('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
+    '|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
+simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
+link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
+html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
+hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
+trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
+
+def linebreaks(value):
+    "Converts newlines into <p> and <br />s"
+    value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
+    paras = re.split('\n{2,}', value)
+    paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
+    return '\n\n'.join(paras)
+
+def strip_tags(value):
+ "Returns the given HTML with all tags stripped"
+ return re.sub(r'<[^>]*?>', '', value)
+
+def strip_spaces_between_tags(value):
+ "Returns the given HTML with spaces between tags normalized to a single space"
+ return re.sub(r'>\s+<', '> <', value)
+
+def strip_entities(value):
+ "Returns the given HTML with all entities (&something;) stripped"
+ return re.sub(r'&(?:\w+|#\d);', '', value)
+
+def fix_ampersands(value):
+    "Returns the given HTML with all unencoded ampersands encoded correctly"
+    return unencoded_ampersands_re.sub('&amp;', value)
+
+def urlize(text, trim_url_limit=None, nofollow=False):
+ """
+ Converts any URLs in text into clickable links. Works on http://, https:// and
+ www. links. Links can have trailing punctuation (periods, commas, close-parens)
+ and leading punctuation (opening parens) and it'll still do the right thing.
+
+ If trim_url_limit is not None, the URLs in link text will be limited to
+ trim_url_limit characters.
+
+ If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
+ """
+ trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
+ words = word_split_re.split(text)
+ nofollow_attr = nofollow and ' rel="nofollow"' or ''
+ for i, word in enumerate(words):
+ match = punctuation_re.match(word)
+ if match:
+ lead, middle, trail = match.groups()
+ if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
+ len(middle) > 0 and middle[0] in string.letters + string.digits and \
+ (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
+ middle = '%s' % (middle, nofollow_attr, trim_url(anchor))
+ if middle.startswith('http://') or middle.startswith('https://'):
+ middle = '%s' % (middle, nofollow_attr, trim_url(middle))
+ if '@' in middle and not middle.startswith('www.') and not ':' in middle \
+ and simple_email_re.match(middle):
+ middle = '%s' % (middle, middle)
+ if lead + middle + trail != word:
+ words[i] = lead + middle + trail
+ return ''.join(words)
+
+def clean_html(text):
+ """
+ Cleans the given HTML. Specifically, it does the following:
+ * Converts and to and .
+ * Encodes all ampersands correctly.
+ * Removes all "target" attributes from tags.
+ * Removes extraneous HTML, such as presentational tags that open and
+ immediately close and
.
+ * Converts hard-coded bullets into HTML unordered lists.
+ * Removes stuff like "
.
+ text = html_gunk_re.sub('', text)
+ # Convert hard-coded bullets into HTML unordered lists.
+ def replace_p_tags(match):
+ s = match.group().replace('
%s' % d, '
", but only if it's at the bottom of the text. + text = trailing_empty_content_re.sub('', text) + return text + diff --git a/scrapeit/djangotext.py b/scrapeit/djangotext.py new file mode 100644 index 0000000..33a88cf --- /dev/null +++ b/scrapeit/djangotext.py @@ -0,0 +1,111 @@ +import re + +# Capitalizes the first letter of a string. +capfirst = lambda x: x and x[0].upper() + x[1:] + +def wrap(text, width): + """ + A word-wrap function that preserves existing line breaks and most spaces in + the text. Expects that existing line breaks are posix newlines (\n). + See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061 + """ + return reduce(lambda line, word, width=width: '%s%s%s' % + (line, + ' \n'[(len(line[line.rfind('\n')+1:]) + + len(word.split('\n',1)[0] + ) >= width)], + word), + text.split(' ') + ) + +def truncate_words(s, num): + "Truncates a string after a certain number of words." + length = int(num) + words = s.split() + if len(words) > length: + words = words[:length] + if not words[-1].endswith('...'): + words.append('...') + return ' '.join(words) + +def get_valid_filename(s): + """ + Returns the given string converted to a string that can be used for a clean + filename. Specifically, leading and trailing spaces are removed; other + spaces are converted to underscores; and all non-filename-safe characters + are removed. + >>> get_valid_filename("john's portrait in 2004.jpg") + 'johns_portrait_in_2004.jpg' + """ + s = s.strip().replace(' ', '_') + return re.sub(r'[^-A-Za-z0-9_.]', '', s) + +def get_text_list(list_, last_word='or'): + """ + >>> get_text_list(['a', 'b', 'c', 'd']) + 'a, b, c or d' + >>> get_text_list(['a', 'b', 'c'], 'and') + 'a, b and c' + >>> get_text_list(['a', 'b'], 'and') + 'a and b' + >>> get_text_list(['a']) + 'a' + >>> get_text_list([]) + '' + """ + if len(list_) == 0: return '' + if len(list_) == 1: return list_[0] + return '%s %s %s' % (', '.join([str(i) for i in list_][:-1]), last_word, list_[-1]) + +def normalize_newlines(text): + return re.sub(r'\r\n|\r|\n', '\n', text) + +def recapitalize(text): + "Recapitalizes text, placing caps after end-of-sentence punctuation." +# capwords = () + text = text.lower() + capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])') + text = capsRE.sub(lambda x: x.group(1).upper(), text) +# for capword in capwords: +# capwordRE = re.compile(r'\b%s\b' % capword, re.I) +# text = capwordRE.sub(capword, text) + return text + +def phone2numeric(phone): + "Converts a phone number with letters into its numeric equivalent." + letters = re.compile(r'[A-PR-Y]', re.I) + char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3', + 'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5', + 'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7', + 's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8', + 'y': '9', 'x': '9'}.get(m.group(0).lower()) + return letters.sub(char2number, phone) + +# From http://www.xhaus.com/alan/python/httpcomp.html#gzip +# Used with permission. +def compress_string(s): + import cStringIO, gzip + zbuf = cStringIO.StringIO() + zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) + zfile.write(s) + zfile.close() + return zbuf.getvalue() + +smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)') +def smart_split(text): + """ + Generator that splits a string by spaces, leaving quoted phrases together. + Supports both single and double quotes, and supports escaping quotes with + backslashes. 
diff --git a/scrapeit/djangotext.py b/scrapeit/djangotext.py
new file mode 100644
index 0000000..33a88cf
--- /dev/null
+++ b/scrapeit/djangotext.py
@@ -0,0 +1,111 @@
+import re
+
+# Capitalizes the first letter of a string.
+capfirst = lambda x: x and x[0].upper() + x[1:]
+
+def wrap(text, width):
+    """
+    A word-wrap function that preserves existing line breaks and most spaces in
+    the text. Expects that existing line breaks are posix newlines (\n).
+    See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
+    """
+    return reduce(lambda line, word, width=width: '%s%s%s' %
+                  (line,
+                   ' \n'[(len(line[line.rfind('\n')+1:])
+                         + len(word.split('\n',1)[0]
+                              ) >= width)],
+                   word),
+                  text.split(' ')
+                 )
+
+def truncate_words(s, num):
+    "Truncates a string after a certain number of words."
+    length = int(num)
+    words = s.split()
+    if len(words) > length:
+        words = words[:length]
+        if not words[-1].endswith('...'):
+            words.append('...')
+    return ' '.join(words)
+
+def get_valid_filename(s):
+    """
+    Returns the given string converted to a string that can be used for a clean
+    filename. Specifically, leading and trailing spaces are removed; other
+    spaces are converted to underscores; and all non-filename-safe characters
+    are removed.
+    >>> get_valid_filename("john's portrait in 2004.jpg")
+    'johns_portrait_in_2004.jpg'
+    """
+    s = s.strip().replace(' ', '_')
+    return re.sub(r'[^-A-Za-z0-9_.]', '', s)
+
+def get_text_list(list_, last_word='or'):
+    """
+    >>> get_text_list(['a', 'b', 'c', 'd'])
+    'a, b, c or d'
+    >>> get_text_list(['a', 'b', 'c'], 'and')
+    'a, b and c'
+    >>> get_text_list(['a', 'b'], 'and')
+    'a and b'
+    >>> get_text_list(['a'])
+    'a'
+    >>> get_text_list([])
+    ''
+    """
+    if len(list_) == 0: return ''
+    if len(list_) == 1: return list_[0]
+    return '%s %s %s' % (', '.join([str(i) for i in list_][:-1]), last_word, list_[-1])
+
+def normalize_newlines(text):
+    return re.sub(r'\r\n|\r|\n', '\n', text)
+
+def recapitalize(text):
+    "Recapitalizes text, placing caps after end-of-sentence punctuation."
+#    capwords = ()
+    text = text.lower()
+    capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
+    text = capsRE.sub(lambda x: x.group(1).upper(), text)
+#    for capword in capwords:
+#        capwordRE = re.compile(r'\b%s\b' % capword, re.I)
+#        text = capwordRE.sub(capword, text)
+    return text
+
+def phone2numeric(phone):
+    "Converts a phone number with letters into its numeric equivalent."
+    letters = re.compile(r'[A-PR-Y]', re.I)
+    char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
+         'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
+         'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
+         's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
+         'y': '9', 'x': '9'}.get(m.group(0).lower())
+    return letters.sub(char2number, phone)
+
+# From http://www.xhaus.com/alan/python/httpcomp.html#gzip
+# Used with permission.
+def compress_string(s):
+    import cStringIO, gzip
+    zbuf = cStringIO.StringIO()
+    zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
+    zfile.write(s)
+    zfile.close()
+    return zbuf.getvalue()
+
+smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
+def smart_split(text):
+    """
+    Generator that splits a string by spaces, leaving quoted phrases together.
+    Supports both single and double quotes, and supports escaping quotes with
+    backslashes. In the output, strings will keep their initial and trailing
+    quote marks.
+    >>> list(smart_split('This is "a person\'s" test.'))
+    ['This', 'is', '"a person\'s"', 'test.']
+    """
+    for bit in smart_split_re.finditer(text):
+        bit = bit.group(0)
+        if bit[0] == '"':
+            yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
+        elif bit[0] == "'":
+            yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
+        else:
+            yield bit
diff --git a/scrapeit/epguides.py b/scrapeit/epguides.py
new file mode 100644
index 0000000..a290319
--- /dev/null
+++ b/scrapeit/epguides.py
@@ -0,0 +1,68 @@
+# -*- Mode: Python; -*-
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=2:sts=2:ts=2
+
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from google import google
+from utils import read_url, read_url_utf8, stripTags
+import tvcom
+import imdb
+
+def epguidesUrl(title):
+  '''
+  Search Epguide Url for Show via Show Title.
+  Use Google to search the url, this is also done on Epguide.
+  '''
+  for (name, url, desc) in google('allintitle: site:epguides.com %s' % title, 1):
+    if url.startswith('http://epguides.com'):
+      if re.search(title, name):
+        return url
+  return None
+
+def getShowImdb(title):
+  imdbid = None
+  url = epguidesUrl(title)
+  if url:
+    data = read_url(url)
+    soup = BeautifulSoup(data)
+    links = soup('a', {'href': re.compile('imdb.com/title/tt')})
+    if links:
+      link = links[0].get('href')
+      imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(link)[0])
+  if not imdbid:
+    imdbid = imdb.guess(title)
+  return imdbid
+
+def getEpisodeData(title, episode, show_url = None):
+  '''
+  Collect information about an episode.
+
+  Returns dict with title, show, description and episode
+  '''
+  episodeData = {
+    'title': u'',
+    'show': title,
+    'description': u'',
+    'episode': episode,
+  }
+  description = u''
+  data = u''
+  if not show_url:
+    show_url = epguidesUrl(title)
+  if show_url:
+    data = read_url_utf8(show_url)
+  else:
+    return imdb.getEpisodeData(title, episode)
+  estring = u'' + episode.replace('S','').replace('E','-').replace('0',' ').strip()
+  for line in data.split('\n'):
+    a = line.split(estring)
+    if len(a) == 2:
+      soup = BeautifulSoup(line)
+      episodeData['title'] = soup('a')[0].contents[0]
+      tvcom_url = soup('a')[0].get('href')
+      episodeData['description'] = tvcom.getEpisodeData(tvcom_url)['description']
+      break
+  return episodeData
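
A rough sketch of how the epguides helpers combine (show title and episode code are made up; when no epguides page is found, getEpisodeData falls back to the imdb module):

    from scrapeit import epguides

    show_url = epguides.epguidesUrl('Some Show')          # located via a Google query
    data = epguides.getEpisodeData('Some Show', 'S01E02', show_url)
    print data['title'], data['description']
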
diff --git a/scrapeit/google.py b/scrapeit/google.py
new file mode 100644
index 0000000..7af65b7
--- /dev/null
+++ b/scrapeit/google.py
@@ -0,0 +1,375 @@
+# -*- Mode: Python; -*-
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=2:sts=2:ts=2
+"""
+Query Web search engines.
+
+This module works by filtering the HTML returned by the search engine and thus tends to break when
+search engines modify their HTML output.
+
+Public domain, Connelly Barnes 2005-2007. Compatible with Python 2.3-2.5.
+
+See L{examples} for a quick start. See L{description} for the full
+explanation, precautions, and legal disclaimers.
+
+"""
+
+import re
+import time
+import urllib
+import urllib2
+import weakref
+import threading
+import Queue
+
+from utils import read_url
+
+__version__ = '1.0.2'
+
+# Default headers for HTTP requests.
+DEFAULT_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5)'}
+
+# Default maximum number of results.
+DEFAULT_MAX_RESULTS = 10
+
+# Function names for supported search engines.
+SEARCH_ENGINES = ['ask', 'dmoz', 'excite', 'google', 'msn', 'yahoo']
+
+__all__ = SEARCH_ENGINES + ['examples', 'description']
+
+# --------------------------------------------------------------------
+# Functions
+# --------------------------------------------------------------------
+
+def quote_plus(s):
+  """
+  A variant of urllib.quote_plus which handles ASCII and Unicode.
+  """
+  return urllib.quote_plus(s.encode('utf-8'))
+
+
+def fix_url(url):
+  """
+  Given url str, trim redirect stuff and return actual URL.
+
+  Currently this just returns the URL unmodified.
+  """
+#  if url.lower().find('http%3a//') > 0:
+#    return 'http://' + url[url.lower().rindex('http%3a//')+9:]
+#  if url.find('http://') > 0:
+#    return url[url.rindex('http://'):]
+  return url
+
+
+def get_search_page_links(page, results_per_page, begin, end, link_re):
+  """
+  Given str contents of search result page, return list of links.
+
+  Returns list of (name, url, desc) str tuples. See make_searcher()
+  for a description of results_per_page and link_re.
+  """
+  if begin is not None and begin in page:
+    page = page[page.index(begin):]
+  if end is not None and end in page:
+    page = page[:page.index(end)]
+  ans = []
+  for match in re.compile(link_re, re.DOTALL).finditer(page):
+    (name, url, desc) = match.group('name', 'url', 'desc')
+    url = fix_url(url)
+    ans += [(html_to_text(name), url, html_to_text(desc))]
+  return ans
+
+
+def html_to_text(s):
+  """
+  Given an HTML formatted str, convert it to a text str.
+  """
+  s = re.sub(r'<.*?>', '', s)
+  s = s.replace('\r', ' ')
+  s = s.replace('\n', ' ')
+  s = s.replace('\t', ' ')
+  s = s.replace('&amp;', '&')
+  s = s.replace('&lt;', '<')
+  s = s.replace('&gt;', '>')
+  s = s.replace('&quot;', '"')
+  s = s.replace('&middot;', '\xb7')
+  for i in range(256):
+    s = s.replace('&#%d;' % i, chr(i))
+  while s.replace('  ', ' ') != s:
+    s = s.replace('  ', ' ')
+  return s.strip()
+
+
+def nonblocking(f, blocking_return=None, sleep_time=0.01):
+  """
+  Wrap a callable which returns an iter so that it no longer blocks.
+
+  The wrapped iterator returns blocking_return while callable f is
+  blocking. The callable f is called in a background thread. If the
+  wrapped iterator is deleted, then the iterator returned by f is
+  deleted also and the background thread is terminated.
+  """
+  def g(*args, **kwargs):
+    f_iter = f(*args, **kwargs)
+    g_iter = None
+    def run():
+      while True:
+        g_obj = g_iter()
+        if g_obj is None:
+          return
+        if g_obj.q.qsize() == 0:
+          try:
+            f_next = f_iter.next()
+          except Exception, e:
+            g_obj.exc = e
+            return
+          g_obj.q.put(f_next)
+        else:
+          del g_obj
+          time.sleep(sleep_time)
+    class Iter:
+      def __init__(self):
+        self.q = Queue.Queue()
+        self.exc = None
+        self.thread = threading.Thread(target=run)
+        self.thread.setDaemon(True)
+      def next(self):
+        if self.exc is not None:
+          raise self.exc
+        try:
+          return self.q.get_nowait()
+        except Queue.Empty:
+          return blocking_return
+      def __iter__(self):
+        return self
+
+    obj = Iter()
+    g_iter = weakref.ref(obj)
+    obj.thread.start()
+    try:
+      return obj
+    finally:
+      del obj
+  return g
+
+
+def make_searcher(query_url, results_per_page, page_url, page_mode,
+                  begin, end, link_re):
+  """
+  Return a search function for the given search engine.
+
+  Here query_url is the URL for the initial search, with %(q)s for
+  the query string, results_per_page is the number of search results
+  per page, page_url is the URL for the 2nd and subsequent pages of
+  search results, with %(q)s for the query string and %(n)s for the
+  page "number."  Here page_mode controls the actual value for the
+  page "number:"
+
+   - page_mode='page0':   Use 0-based index of the page.
+   - page_mode='page1':   Use 1-based index of the page.
+   - page_mode='offset0': Use 0-based index of the search result,
+                          which is a multiple of results_per_page.
+   - page_mode='offset1': Use 1-based index of the search result
+                          (one plus a multiple of results_per_page).
+
+  If begin is not None, then only text after the first occurrence of
+  begin will be used in the search results page. If end is not None,
+  then only text before the first occurrence of end will be used.
+
+  Finally, link_re is a regex string (see module re) which matches
+  three named groups: 'name', 'url', and 'desc'. These correspond to
+  the name, URL and description of each search result. The regex is
+  applied in re.DOTALL mode.
+
+  Returns a search() function which has the same interface as
+  described in the module docstring.
+  """
+  def search_blocking(query, max_results):
+    last_links = None
+    page_num = 0
+#    done = False
+    q = Queue.Queue()
+    for i in range(max_results):
+      if q.qsize() == 0:
+        if page_num == 0:
+          page = read_url(query_url % {'q': quote_plus(query)})
+        else:
+#          if done:
+#            break
+          if page_mode == 'page0':
+            n = page_num
+          elif page_mode == 'page1':
+            n = page_num + 1
+          elif page_mode == 'offset0':
+            n = page_num * results_per_page
+          elif page_mode == 'offset1':
+            n = page_num * results_per_page + 1
+          else:
+            raise ValueError('unknown page mode')
+          page = read_url(page_url % {'n': n, 'q': quote_plus(query)})
+        page_num += 1
+        links = get_search_page_links(page, results_per_page, begin, end, link_re)
+        if len(links) == 0 or links == last_links:
+          break
+#        if len(links) < results_per_page:
+#          done = True
+        last_links = links
+        for link in links:
+          q.put(link)
+      yield q.get()
+
+  search_nonblocking = nonblocking(search_blocking)
+
+  def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):
+    """
+    See docstring for web_search module.
+    """
+    if blocking:
+      return search_blocking(query, max_results)
+    else:
+      return search_nonblocking(query, max_results)
+
+  return search
+
+
+def examples():
+  """
+  Examples of the web_search module.
+
+  Example 1:
+
+   >>> from web_search import google
+   >>> for (name, url, desc) in google('python', 20):
+   ...   print name, url
+   ...
+   (First 20 results for Google search of "python").
+
+  Example 2:
+
+   >>> from web_search import dmoz
+   >>> list(dmoz('abc', 10))
+   [('ABC.com', 'http://www.abc.com', "What's on ABC..."), ...]
+
+  """
+  print examples.__doc__
+
+
+def description():
+  """
+  Full explanation and precautions for web_search module.
+
+  The search functions in this module follow a common interface::
+
+    search(query, max_results=10, blocking=True) =>
+      iterator of (name, url, description) search results.
+
+  Here query is the query string, max_results gives the maximum number
+  of search results, and the items in the returned iterator are string
+  3-tuples containing the Website name, URL, and description for each
+  search result.
+
+  If blocking=False, then an iterator is returned which does not block
+  execution: the iterator yields None when the next search result is
+  not yet available (a background thread is created).
+
+  Supported search engines are 'ask', 'dmoz', 'excite', 'google', 'msn',
+  'yahoo'. This module is not associated with or endorsed by any of
+  these search engine corporations.
+
+  Be warned that if searches are made too frequently, or max_results is
+  large and you enumerate all search results, then you will be a drain
+  on the search engine's bandwidth, and the search engine organization
+  may respond by banning your IP address or IP address range.
+
+  This software has been placed in the public domain with the
+  following legal notice::
+
+    http://oregonstate.edu/~barnesc/documents/public_domain.txt
+
+  """
+  print description.__doc__
+
+
+# --------------------------------------------------------------------
+# Search engines
+# --------------------------------------------------------------------
+
+ask = make_searcher('http://www.ask.com/web?q=%(q)s', 10,
+                    'http://www.ask.com/web?page=%(n)d&q=%(q)s', 'page1',
+                    None, None,
+                    r'(?P
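
The searchers produced by make_searcher() are plain callables with the interface given in description(); a minimal sketch using the google searcher defined in the part of this file not shown above:

    from scrapeit.google import google

    # blocking use: iterate over (name, url, description) tuples
    for (name, url, desc) in google('web scraping python', 5):
        print name, url

    # non-blocking use: the iterator yields None until a result is ready
    pending = google('web scraping python', 5, blocking=False)
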