commit ca2a42e773655644770e9df80097821132ad855e Author: j <0x006A@0x2620.org> Date: Thu Mar 1 15:11:35 2007 +0000 add scrapeit diff --git a/scrapeit/__init__.py b/scrapeit/__init__.py new file mode 100644 index 0000000..9193535 --- /dev/null +++ b/scrapeit/__init__.py @@ -0,0 +1,14 @@ +# -*- Mode: Python; -*- +# vi:si:et:sw=2:sts=2:ts=2 +# encoding: utf-8 + +import btjunkie +import google +import imdb +import mininova +import thepiratebay +import torrent +import rottentomatoes + + +__version__ = '1.0.0' diff --git a/scrapeit/btjunkie.py b/scrapeit/btjunkie.py new file mode 100644 index 0000000..d42cac4 --- /dev/null +++ b/scrapeit/btjunkie.py @@ -0,0 +1,32 @@ +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 + +from urllib import quote +import re + +from BeautifulSoup import BeautifulSoup + +from utils import read_url, stripTags +from btutils import torrentsWeLike + + +def search(query): + '''search for torrents on btjunkie + ''' + url = "http://btjunkie.org/search?q=%s&c=6&t=0&o=52&m=0&l=1" % quote(query) + page = read_url(url) + soup = BeautifulSoup(page) + torrents = soup.findAll('a', {'class': 'BlckUnd'}) + torrents = filter(torrentsWeLike, torrents) + torrent_links = [] + for t in torrents: + tlink = "http://btjunkie.org%s.torrent" % t.attrMap['href'] + tlink = tlink.replace('do=stat', 'do=download') + torrent_links.append(tlink) + return torrent_links + +def searchByImdb(imdb): + '''search for torrents by imdb, not supported on btjunkie right now + ''' + return [] \ No newline at end of file diff --git a/scrapeit/btutils.py b/scrapeit/btutils.py new file mode 100644 index 0000000..474ea7c --- /dev/null +++ b/scrapeit/btutils.py @@ -0,0 +1,25 @@ +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 + +from utils import stripTags + + +def torrentsWeLike(link): + '''check if torrent title looks like something we want to see, + dvdrip / no cam / no dubbed versions + ''' + text = stripTags(unicode(link)).lower() + #no cams / telesyncs or other stuff + for word in ('cam', 'telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'vcd', 'ts-screener'): + if word in text: + return False + #no dubbed versions + for word in ('italian', 'german', 'spanish', 'french'): + if word in text: + return False + #only dvdrips or dvdscrs + for word in ('dvdrip', 'dvdscr', 'dvd screener'): + if word in text: + return True + return False \ No newline at end of file diff --git a/scrapeit/djangohtml.py b/scrapeit/djangohtml.py new file mode 100644 index 0000000..88fe5ec --- /dev/null +++ b/scrapeit/djangohtml.py @@ -0,0 +1,115 @@ +"HTML utilities suitable for global use." + +import re, string + +# Configuration for urlize() function +LEADING_PUNCTUATION = ['(', '<', '<'] +TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '>'] + +# list of possible strings used for bullets in bulleted lists +DOTS = ['·', '*', '\xe2\x80\xa2', '•', '•', '•'] + +unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)') +word_split_re = re.compile(r'(\s+)') +punctuation_re = re.compile('^(?P(?:%s)*)(?P.*?)(?P(?:%s)*)$' % \ + ('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]), + '|'.join([re.escape(x) for x in TRAILING_PUNCTUATION]))) +simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$') +link_target_attribute_re = re.compile(r'(]*?)target=[^\s>]+') +html_gunk_re = re.compile(r'(?:
<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
+hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
+trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
+del x # Temporary variable
+
+def escape(html):
+    "Returns the given HTML with ampersands, quotes and carets encoded"
+    if not isinstance(html, basestring):
+        html = str(html)
+    return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')
+
+def linebreaks(value):
+    "Converts newlines into <p> and <br />s"
+    value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
+    paras = re.split('\n{2,}', value)
+    paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
+    return '\n\n'.join(paras)
+
+def strip_tags(value):
+    "Returns the given HTML with all tags stripped"
+    return re.sub(r'<[^>]*?>', '', value)
+
+def strip_spaces_between_tags(value):
+    "Returns the given HTML with spaces between tags normalized to a single space"
+    return re.sub(r'>\s+<', '> <', value)
+
+def strip_entities(value):
+    "Returns the given HTML with all entities (&something;) stripped"
+    return re.sub(r'&(?:\w+|#\d);', '', value)
+
+def fix_ampersands(value):
+    "Returns the given HTML with all unencoded ampersands encoded correctly"
+    return unencoded_ampersands_re.sub('&amp;', value)
+
+def urlize(text, trim_url_limit=None, nofollow=False):
+    """
+    Converts any URLs in text into clickable links. Works on http://, https:// and
+    www. links. Links can have trailing punctuation (periods, commas, close-parens)
+    and leading punctuation (opening parens) and it'll still do the right thing.
+
+    If trim_url_limit is not None, the URLs in link text will be limited to
+    trim_url_limit characters.
+
+    If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
+    """
+    trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >= limit and '...' or '')) or x
+    words = word_split_re.split(text)
+    nofollow_attr = nofollow and ' rel="nofollow"' or ''
+    for i, word in enumerate(words):
+        match = punctuation_re.match(word)
+        if match:
+            lead, middle, trail = match.groups()
+            if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
+                    len(middle) > 0 and middle[0] in string.letters + string.digits and \
+                    (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
+                middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
+            if middle.startswith('http://') or middle.startswith('https://'):
+                middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
+            if '@' in middle and not middle.startswith('www.') and not ':' in middle \
+                    and simple_email_re.match(middle):
+                middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
+            if lead + middle + trail != word:
+                words[i] = lead + middle + trail
+    return ''.join(words)
+
+def clean_html(text):
+    """
+    Cleans the given HTML. Specifically, it does the following:
+        * Converts <b> and <i> to <strong> and <em>.
+        * Encodes all ampersands correctly.
+        * Removes all "target" attributes from <a> tags.
+        * Removes extraneous HTML, such as presentational tags that open and
+          immediately close and <br clear="all">.
+        * Converts hard-coded bullets into HTML unordered lists.
+        * Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
+          bottom of the text.
+    """
+    from djangotext import normalize_newlines
+    text = normalize_newlines(text)
+    text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
+    text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
+    text = fix_ampersands(text)
+    # Remove all target="" attributes from <a> tags.
+    text = link_target_attribute_re.sub('\\1', text)
+    # Trim stupid HTML such as <br clear="all">.
+    text = html_gunk_re.sub('', text)
+    # Convert hard-coded bullets into HTML unordered lists.
+    def replace_p_tags(match):
+        s = match.group().replace('</p>', '</li>')
+        for d in DOTS:
+            s = s.replace('<p>%s' % d, '<li>')
+        return '<ul>\n%s\n</ul>' % s
+    text = hard_coded_bullets_re.sub(replace_p_tags, text)
+    # Remove stuff like "<p>&nbsp;&nbsp;&nbsp;&nbsp;</p>
    ", but only if it's at the bottom of the text. + text = trailing_empty_content_re.sub('', text) + return text + diff --git a/scrapeit/djangotext.py b/scrapeit/djangotext.py new file mode 100644 index 0000000..33a88cf --- /dev/null +++ b/scrapeit/djangotext.py @@ -0,0 +1,111 @@ +import re + +# Capitalizes the first letter of a string. +capfirst = lambda x: x and x[0].upper() + x[1:] + +def wrap(text, width): + """ + A word-wrap function that preserves existing line breaks and most spaces in + the text. Expects that existing line breaks are posix newlines (\n). + See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061 + """ + return reduce(lambda line, word, width=width: '%s%s%s' % + (line, + ' \n'[(len(line[line.rfind('\n')+1:]) + + len(word.split('\n',1)[0] + ) >= width)], + word), + text.split(' ') + ) + +def truncate_words(s, num): + "Truncates a string after a certain number of words." + length = int(num) + words = s.split() + if len(words) > length: + words = words[:length] + if not words[-1].endswith('...'): + words.append('...') + return ' '.join(words) + +def get_valid_filename(s): + """ + Returns the given string converted to a string that can be used for a clean + filename. Specifically, leading and trailing spaces are removed; other + spaces are converted to underscores; and all non-filename-safe characters + are removed. + >>> get_valid_filename("john's portrait in 2004.jpg") + 'johns_portrait_in_2004.jpg' + """ + s = s.strip().replace(' ', '_') + return re.sub(r'[^-A-Za-z0-9_.]', '', s) + +def get_text_list(list_, last_word='or'): + """ + >>> get_text_list(['a', 'b', 'c', 'd']) + 'a, b, c or d' + >>> get_text_list(['a', 'b', 'c'], 'and') + 'a, b and c' + >>> get_text_list(['a', 'b'], 'and') + 'a and b' + >>> get_text_list(['a']) + 'a' + >>> get_text_list([]) + '' + """ + if len(list_) == 0: return '' + if len(list_) == 1: return list_[0] + return '%s %s %s' % (', '.join([str(i) for i in list_][:-1]), last_word, list_[-1]) + +def normalize_newlines(text): + return re.sub(r'\r\n|\r|\n', '\n', text) + +def recapitalize(text): + "Recapitalizes text, placing caps after end-of-sentence punctuation." +# capwords = () + text = text.lower() + capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])') + text = capsRE.sub(lambda x: x.group(1).upper(), text) +# for capword in capwords: +# capwordRE = re.compile(r'\b%s\b' % capword, re.I) +# text = capwordRE.sub(capword, text) + return text + +def phone2numeric(phone): + "Converts a phone number with letters into its numeric equivalent." + letters = re.compile(r'[A-PR-Y]', re.I) + char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3', + 'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5', + 'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7', + 's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8', + 'y': '9', 'x': '9'}.get(m.group(0).lower()) + return letters.sub(char2number, phone) + +# From http://www.xhaus.com/alan/python/httpcomp.html#gzip +# Used with permission. +def compress_string(s): + import cStringIO, gzip + zbuf = cStringIO.StringIO() + zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) + zfile.write(s) + zfile.close() + return zbuf.getvalue() + +smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)') +def smart_split(text): + """ + Generator that splits a string by spaces, leaving quoted phrases together. + Supports both single and double quotes, and supports escaping quotes with + backslashes. 
In the output, strings will keep their initial and trailing + quote marks. + >>> list(smart_split('This is "a person\'s" test.')) + ['This', 'is', '"a person\'s"', 'test.'] + """ + for bit in smart_split_re.finditer(text): + bit = bit.group(0) + if bit[0] == '"': + yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"' + elif bit[0] == "'": + yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'" + else: + yield bit diff --git a/scrapeit/epguides.py b/scrapeit/epguides.py new file mode 100644 index 0000000..a290319 --- /dev/null +++ b/scrapeit/epguides.py @@ -0,0 +1,68 @@ +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 + +import re + +from BeautifulSoup import BeautifulSoup + +from google import google +from utils import read_url, read_url_utf8, stripTags +import tvcom +import imdb + +def epguidesUrl(title): + ''' + Search Epguide Url for Show via Show Title. + Use Google to search the url, this is also done on Epguide. + ''' + for (name, url, desc) in google('allintitle: site:epguides.com %s' % title, 1): + if url.startswith('http://epguides.com'): + if re.search(title, name): + return url + return None + +def getShowImdb(title): + imdbid = None + url = epguidesUrl(title) + if url: + data = read_url(url) + soup = BeautifulSoup(data) + links = soup('a', {'href': re.compile('imdb.com/title/tt')}) + if links: + link = links[0].get('href') + imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(link)[0]) + if not imdbid: + imdbid = imdb.guess(title) + return imdbid + +def getEpisodeData(title, episode, show_url = None): + ''' + Collect information about an episode. + + Returns dict with title, show, description and episode + ''' + episodeData = { + 'title': u'', + 'show': title, + 'description': u'', + 'episode': episode, + } + description = u'' + data = u'' + if not show_url: + show_url = epguidesUrl(title) + if show_url: + data = read_url_utf8(show_url) + else: + return imdb.getEpisodeData(title, episode) + estring = u'' +episode.replace('S','').replace('E','-').replace('0',' ').strip() + for line in data.split('\n'): + a = line.split(estring) + if len(a) == 2: + soup = BeautifulSoup(line) + episodeData['title'] = soup('a')[0].contents[0] + tvcom_url = soup('a')[0].get('href') + episodeData['description'] = tvcom.getEpisodeData(tvcom_url)['description'] + break + return episodeData diff --git a/scrapeit/google.py b/scrapeit/google.py new file mode 100644 index 0000000..7af65b7 --- /dev/null +++ b/scrapeit/google.py @@ -0,0 +1,375 @@ +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 +""" +Query Web search engines. + +This module works by filtering the HTML returned by the search engine and thus tends to break when +search engines modify their HTML output. + +Public domain, Connelly Barnes 2005-2007. Compatible with Python 2.3-2.5. + +See L{examples} for a quick start. See L{description} for the full +explanation, precautions, and legal disclaimers. + +""" + +import re +import time +import urllib +import urllib2 +import weakref +import threading +import Queue + +from utils import read_url + +__version__ = '1.0.2' + +# Default headers for HTTP requests. +DEFAULT_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5)'} + +# Default maximum number of results. +DEFAULT_MAX_RESULTS = 10 + +# Function names for supported search engines. 
+SEARCH_ENGINES = ['ask', 'dmoz', 'excite', 'google', 'msn', 'yahoo'] + +__all__ = SEARCH_ENGINES + ['examples', 'description'] + +# -------------------------------------------------------------------- +# Functions +# -------------------------------------------------------------------- + +def quote_plus(s): + """ + A variant of urllib.quote_plus which handles ASCII and Unicode. + """ + return urllib.quote_plus(s.encode('utf-8')) + + +def fix_url(url): + """ + Given url str, trim redirect stuff and return actual URL. + + Currently this just returns the URL unmodified. + """ +# if url.lower().find('http%3a//') > 0: +# return 'http://' + url[url.lower().rindex('http%3a//')+9:] +# if url.find('http://') > 0: +# return url[url.rindex('http://'):] + return url + + +def get_search_page_links(page, results_per_page, begin, end, link_re): + """ + Given str contents of search result page, return list of links. + + Returns list of (name, url, desc) str tuples. See make_searcher() + for a description of results_per_page and link_re. + """ + if begin is not None and begin in page: + page = page[page.index(begin):] + if end is not None and end in page: + page = page[:page.index(end)] + ans = [] + for match in re.compile(link_re, re.DOTALL).finditer(page): + (name, url, desc) = match.group('name', 'url', 'desc') + url = fix_url(url) + ans += [(html_to_text(name), url, html_to_text(desc))] + return ans + + +def html_to_text(s): + """ + Given an HTML formatted str, convert it to a text str. + """ + s = re.sub(r'<.*?>', '', s) + s = s.replace('\r', ' ') + s = s.replace('\n', ' ') + s = s.replace('\t', ' ') + s = s.replace('&', '&') + s = s.replace('<', '<') + s = s.replace('>', '>') + s = s.replace('"', '"') + s = s.replace('·', '\xb7') + for i in range(256): + s = s.replace('&#%d;' % i, chr(i)) + while s.replace(' ', ' ') != s: + s = s.replace(' ', ' ') + return s.strip() + + +def nonblocking(f, blocking_return=None, sleep_time=0.01): + """ + Wrap a callable which returns an iter so that it no longer blocks. + + The wrapped iterator returns blocking_return while callable f is + blocking. The callable f is called in a background thread. If the + wrapped iterator is deleted, then the iterator returned by f is + deleted also and the background thread is terminated. + """ + def g(*args, **kwargs): + f_iter = f(*args, **kwargs) + g_iter = None + def run(): + while True: + g_obj = g_iter() + if g_obj is None: + return + if g_obj.q.qsize() == 0: + try: + f_next = f_iter.next() + except Exception, e: + g_obj.exc = e + return + g_obj.q.put(f_next) + else: + del g_obj + time.sleep(sleep_time) + class Iter: + def __init__(self): + self.q = Queue.Queue() + self.exc = None + self.thread = threading.Thread(target=run) + self.thread.setDaemon(True) + def next(self): + if self.exc is not None: + raise self.exc + try: + return self.q.get_nowait() + except Queue.Empty: + return blocking_return + def __iter__(self): + return self + + obj = Iter() + g_iter = weakref.ref(obj) + obj.thread.start() + try: + return obj + finally: + del obj + return g + + +def make_searcher(query_url, results_per_page, page_url, page_mode, + begin, end, link_re): + """ + Return a search function for the given search engine. + + Here query_url is the URL for the initial search, with %(q)s for + the query string, results_per_page is the number of search results + per page, page_url is the URL for the 2nd and subsequent pages of + search results, with %(q)s for the query string and %(n)s for the + page "number." 
Here page_mode controls the actual value for the + page "number:" + + - page_mode='page0': Use 0-based index of the page. + - page_mode='page1': Use 1-based index of the page. + - page_mode='offset0': Use 0-based index of the search result, + which is a multiple of results_per_page. + - page_mode='offset1': Use 1-based index of the search result + (one plus a multiple of results_per_page). + + If begin is not None, then only text after the first occurrence of + begin will be used in the search results page. If end is not None, + then only text before the first occurrence of end will be used. + + Finally, link_re is a regex string (see module re) which matches + three named groups: 'name', 'url', and 'desc'. These correspond to + the name, URL and description of each search result. The regex is + applied in re.DOTALL mode. + + Returns a search() function which has the same interface as + described in the module docstring. + """ + def search_blocking(query, max_results): + last_links = None + page_num = 0 +# done = False + q = Queue.Queue() + for i in range(max_results): + if q.qsize() == 0: + if page_num == 0: + page = read_url(query_url % {'q': quote_plus(query)}) + else: +# if done: +# break + if page_mode == 'page0': + n = page_num + elif page_mode == 'page1': + n = page_num + 1 + elif page_mode == 'offset0': + n = page_num * results_per_page + elif page_mode == 'offset1': + n = page_num * results_per_page + 1 + else: + raise ValueError('unknown page mode') + page = read_url(page_url % {'n': n, 'q': quote_plus(query)}) + page_num += 1 + links = get_search_page_links(page, results_per_page, begin, end, link_re) + if len(links) == 0 or links == last_links: + break +# if len(links) < results_per_page: +# done = True + last_links = links + for link in links: + q.put(link) + yield q.get() + + search_nonblocking = nonblocking(search_blocking) + + def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True): + """ + See docstring for web_search module. + """ + if blocking: + return search_blocking(query, max_results) + else: + return search_nonblocking(query, max_results) + + return search + + +def examples(): + """ + Examples of the web_search module. + + Example 1: + + >>> from web_search import google + >>> for (name, url, desc) in google('python', 20): + ... print name, url + ... + (First 20 results for Google search of "python"). + + Example 2: + + >>> from web_search import dmoz + >>> list(dmoz('abc', 10)) + [('ABC.com', 'http://www.abc.com', "What's on ABC..."), ...] + + """ + print examples.__doc__ + + +def description(): + """ + Full explanation and precautions for web_search module. + + The search functions in this module follow a common interface:: + + search(query, max_results=10, blocking=True) => + iterator of (name, url, description) search results. + + Here query is the query string, max_results gives the maximum number + of search results, and the items in the returned iterator are string + 3-tuples containing the Website name, URL, and description for each + search result. + + If blocking=False, then an iterator is returned which does not block + execution: the iterator yields None when the next search result is + not yet available (a background thread is created). + + Supported search engines are 'ask', 'dmoz', 'excite', 'google', 'msn', + 'yahoo'. This module is not associated with or endorsed by any of + these search engine corporations. 
+ + Be warned that if searches are made too frequently, or max_results is + large and you enumerate all search results, then you will be a drain + on the search engine's bandwidth, and the search engine organization + may respond by banning your IP address or IP address range. + + This software has been placed in the public domain with the + following legal notice:: + + http://oregonstate.edu/~barnesc/documents/public_domain.txt + + """ + print description.__doc__ + + +# -------------------------------------------------------------------- +# Search engines +# -------------------------------------------------------------------- + +ask = make_searcher('http://www.ask.com/web?q=%(q)s', 10, + 'http://www.ask.com/web?page=%(n)d&q=%(q)s', 'page1', + None, None, + r'
    (?P.*?)' + + r'.*?(?P.*?)') + +dmoz = make_searcher('http://search.dmoz.org/cgi-bin/search?search=%(q)s', 20, + 'http://search.dmoz.org/cgi-bin/search?start=%(n)d&search=%(q)s', 'offset1', + None, None, + r'
  • (?P.*?)' + + r'.*? - (?P.*?)
    ') + +excite = make_searcher('http://msxml.excite.com/info.xcite/search/web/%(q)s', 20, + 'http://msxml.excite.com/info.xcite/search/web/%(q)s/%(n)d', 'offset1', + None, None, + r'
    (?P.*?)' + + r'(?P.*?)') + +google = make_searcher('http://www.google.com/search?q=%(q)s', 10, + 'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0', + None, None, + r'(?P.*?)' + + r'.*?(?:
    |)' + + r'(?P.*?)' + '(?:|Results', '
    ', + r'

    (?P.*?)' + + r'(?P.*?)
  • ') + +yahoo = make_searcher('http://search.yahoo.com/search?p=%(q)s', 10, + 'http://search.yahoo.com/search?p=%(q)s&b=%(n)d', 'offset1', + None, None, + '
  • .*?)".*?>(?P.*?)' + + r'.*?
    (?P.*?)
    ') + +# -------------------------------------------------------------------- +# Unit tests +# -------------------------------------------------------------------- + +def test_engine(search): + """ + Test a search engine function returned by make_searcher(). + """ + for query in ['abc', 'microsoft', 'love', 'pweropieiw', 'addfdae']: + popular = query in ['abc', 'microsoft', 'love', 'run'] + for n in [6, 17, 31]: + n1 = len(list(search(query, n))) + if popular: + assert n1 == n + else: + assert n1 <= n + n2 = 0 + for item in search(query, n, False): + if item is not None: + n2 += 1 + else: + time.sleep(0.01) + if popular: + assert n2 == n + else: + assert n2 <= n + + +def test(): + """ + Unit test main routine. + """ + import inspect + print 'Testing:' + for name in SEARCH_ENGINES: + print ' ' + (name + ':').ljust(20), + test_engine(getattr(inspect.getmodule(test), name)) + print 'OK' + + +if __name__ == '__main__': + test() diff --git a/scrapeit/googlemovie.py b/scrapeit/googlemovie.py new file mode 100644 index 0000000..1905234 --- /dev/null +++ b/scrapeit/googlemovie.py @@ -0,0 +1,34 @@ +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 + +import re +from urllib import quote +from BeautifulSoup import BeautifulSoup + +from utils import read_url, read_url_utf8, stripTags + +def getGoogleMovieId(title): + url = 'http://google.com/movies?q=%s&btnG=Search+Movies' % quote(title) + data = read_url(url) + cids = re.compile('reviews\?cid=(.*?)&').findall(data) + if cids: + return cids[0] + return '' + +def getGoogleMovieData(title, year = None, cid = None): + gdata = { + 'title': title, + 'year': year, + 'cid': cid, + 'rating': '', + } + if not cid: + cid = getGoogleMovieId("%s (%s)" % (title, year)) + if cid: + gdata['cid'] = cid + data = read_url('http://www.google.com/movies/reviews?cid=%s' % cid) + gdata['rating'] = re.compile('font size=.3>(.*?) / 5').findall(data)[0] + gdata['reviews'] = re.compile('Based on (.*?) 
reviews').findall(data)[0] + gdata['year'] = re.compile(".*?\((.*?)\).*?</title").findall(data)[0] + return gdata \ No newline at end of file diff --git a/scrapeit/imdb.py b/scrapeit/imdb.py new file mode 100644 index 0000000..0b507cc --- /dev/null +++ b/scrapeit/imdb.py @@ -0,0 +1,441 @@ +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 + +import urllib2 +from urllib import quote +import re, time +import os + +from elementtree.ElementTree import parse, tostring +from BeautifulSoup import BeautifulSoup + +from google import google +from utils import stripTags, read_url_utf8, htmldecode + +import utils + +def read_url(url): + base = "/var/cache/scrapeit/cache/" + path = os.path.join(base, url.replace('http://','')) + if path.endswith('/'): + path = "%sindex.html" % path + if os.path.isdir(path): + path = "%s/index.html" % path + if os.path.exists(path): + f = open(path) + data = f.read() + f.close() + return data + else: + data = utils.read_url(url) + folder = os.path.dirname(path) + if not os.path.exists(folder): + os.makedirs(folder) + f = open(path, 'w') + f.write(data) + f.close() + return data + +def _get_data(url): + data = None + try: + data = read_url(url) + except: + print "error reading data from", url + return data + +def get_image(url): + return read_url(url) + +def _castList(data, regexp): + soup = re.compile(regexp).findall(data) + if soup: + soup = BeautifulSoup(soup[0]) + names = [] + for i in soup('a', {'href': re.compile('/name/nm')}): + if i.string: + cast = stripTags(i.string) + if cast not in names: + names.append(cast) + return names + return [] + +def _getTerm(data, regexp): + term = '' + try: + reg = re.compile(regexp, re.IGNORECASE) + m = reg.search(data) + if m: + term = stripTags(m.group(1)).strip() + except: + print "waring, parsing failed for", regexp + return term.encode('utf8') + + +class IMDb: + def __init__(self, imdb): + self.imdb = imdb + self.pageSource = None + self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb + + self.businessSource = None + self.businessUrl = "%sbusiness" % self.pageUrl + self.connectionsSource = None + self.connectionsUrl = "%smovieconnections" % self.pageUrl + self.creditsSource = None + self.creditsUrl = "%sfullcredits" % self.pageUrl + self.episodesSource = None + self.episodesUrl = "%sepisodes" % self.pageUrl + self.keywordSource = None + self.keywordUrl = "%skeywords" % self.pageUrl + self.plotSource = None + self.plotUrl = "%splotsummary" % self.pageUrl + self.releaseinfoSource = None + self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl + self.triviaSource = None + self.triviaUrl = "%strivia" % self.pageUrl + + def getPage(self, forcereload = False): + if forcereload or not self.pageSource: + self.pageSource = read_url(self.pageUrl) + return self.pageSource + + def parse_raw_value(self, key, value): + if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'): + value = stripTags(value).strip() + if key == 'runtime': + parsed_value = _getTerm(value, '(.*?) min') + parsed_value = _getTerm(parsed_value, '([0-9]+)') + if not parsed_value: + parsed_value = _getTerm(value, '(.*?) 
sec') + parsed_value = _getTerm(parsed_value, '([0-9]+)') + if not parsed_value: + parsed_value = 0 + else: + parsed_value = int(parsed_value) + else: + parsed_value = int(parsed_value) * 60 + elif key in ('country', 'language'): + parsed_value = value.split(' / ') + elif key == 'genre': + parsed_value = value.replace('more', '').strip().split(' / ') + elif key == 'tagline': + parsed_value = value.replace('more', '').strip() + elif key == 'plot_outline': + parsed_value = value.replace('(view trailer)', '').strip() + if parsed_value.endswith('more'): + parsed_value = parsed_value[:-4].strip() + elif key == 'tv_series': + m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value) + if m: + parsed_value = m[0][0] + else: + parsed_value = '' + else: + print value + parsed_value = value + return parsed_value + + def parse(self): + data = self.getPage() + IMDbDict ={} + #Poster + IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"') + if not IMDbDict['poster']: + IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif' + #Title, Year + title = u'' + year = u'' + flat_data = data.replace('\n', '').replace('\r', '') + html_title = re.compile('<strong class="title">(.*?) <small>\(<a href="/Sections/Years/(.*?)">').findall(flat_data) + if html_title: + title = html_title[0][0] + IMDbDict['year'] = html_title[0][1] + IMDbDict['title'] = stripTags(title).strip() + else: + title = _getTerm(data, '<title>(.*?)').split('(') + year = title[-1].split(')')[0].strip() + title = title[0].strip().decode('utf-8') + IMDbDict['title'] = title + IMDbDict['year'] = year + IMDbDict['title'] = htmldecode(IMDbDict['title']) + if IMDbDict['title'][0] == '"' and IMDbDict['title'][-1] == '"': + IMDbDict['title'] = IMDbDict['title'][1:-1] + + #Votes + m = re.compile('(.*?)/10 \((.*?) votes\)', re.IGNORECASE).search(data) + if m: + IMDbDict['rating'] = int(float(m.group(1)) * 1000) + IMDbDict['votes'] = int(m.group(2).replace(',', '')) + else: + IMDbDict['rating'] = -1 + IMDbDict['votes'] = -1 + + data = data.replace('\n',' ') + #some values + keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series') + for key in keys: + IMDbDict[key] = '' + IMDbDict['runtime'] = 0 + soup = BeautifulSoup(data) + for info in soup('div', {'class': 'info'}): + key = str(info).split('
  • ')[0].split('
    ') + if len(key) > 1: + raw_value = str(info).split('
    ')[1] + key = key[1][:-1].lower().replace(' ', '_') + if key in keys: + IMDbDict[key] = self.parse_raw_value(key, raw_value) + + #is episode + IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '') + + IMDbDict['episodes'] = self.parseEpisodes() + IMDbDict['credits'] = self.parseCredits() + IMDbDict['plot'] = self.parsePlot() + IMDbDict['keywords'] = self.parseKeywords() + + IMDbDict['trivia'] = self.parseTrivia() + IMDbDict['connections'] = self.parseConnections() + IMDbDict['release_date'] = self.parseReleaseinfo() + IMDbDict['business'] = self.parseBusiness() + self.IMDbDict = IMDbDict + return self.IMDbDict + + def getCredits(self, forcereload = False): + if forcereload or not self.creditsSource: + self.creditsSource = read_url(self.creditsUrl) + return self.creditsSource + + def parseCredits(self): + data = self.getCredits() + credits = {} + credits['director'] = _castList(data, 'Directed by.*?(.*?)') + credits['writer'] = _castList(data, 'Writing credits.*?(.*?)') + credits['producer'] = _castList(data, 'Produced by.*?(.*?)') + #credits['cast'] = _castList(data, 'Cast
    .*?(') + credits['cast'] = [] + soup = re.compile('Cast
    .*?(').findall(data) + soup = BeautifulSoup(data) + cast = soup('table', {'class': 'cast'}) + if cast: + cast = str(cast[0]) + names = re.compile('(.*?).*?(.*?)').findall(cast) + for name in names: + real_name = name[0] + role_name = name[1] + if role_name: + role_name = role_name.split('(')[0].replace('/ ...','').strip() + credits['cast'].append((stripTags(real_name), stripTags(role_name))) + self.credits = credits + return self.credits + + def getPlot(self, forcereload = False): + if forcereload or not self.plotSource: + self.plotSource = read_url(self.plotUrl) + return self.plotSource + + def parsePlot(self): + soup = BeautifulSoup(self.getPlot()) + plot = soup('p', {'class':'plotpar'}) + if plot: + plot = str(plot[0]).split('')[0] + else: + plot = u'' + plot = stripTags(plot).strip() + self.plot = plot + return plot + + def getEpisodes(self, forcereload = False): + if forcereload or not self.episodesSource: + self.episodesSource = read_url(self.episodesUrl) + return self.episodesSource + + def parseEpisodes(self): + episodes = {} + cdata = self.getEpisodes().replace('\r\n',' ') + regexp = r'''

    Season (.*?), Episode (.*?): (.*?)

    .*?
    (.*?)
    ''' + #regexp = r'''Season (.*?), Episode (.*?): (.*?)
    .*?
    (.*?)''' + reg = re.compile(regexp, re.IGNORECASE) + m = reg.findall(cdata) + for match in m: + try: + episode = "S%02dE%02d" % (int(match[0]), int(match[1])) + episodes[episode] = {} + episodes[episode]['imdb'] = match[2] + episodes[episode]['title'] = match[3].strip() + description = htmldecode(match[4]) + description = stripTags(description.split('Next US airings:')[0]) + episodes[episode]['description'] = description + except: + import traceback + print traceback.print_exc() + pass + self.episodes = episodes + return self.episodes + + def getKeywords(self, forcereload = False): + if forcereload or not self.keywordSource: + self.keywordSource = read_url(self.keywordUrl) + return self.keywordSource + + def parseKeywords(self): + soup = BeautifulSoup(self.getKeywords()) + keywords = [] + for key in soup('a', {'href': re.compile('/keyword')}): + keywords.append(htmldecode(key.string)) + self.keywords = keywords + return self.keywords + + def getTrivia(self, forcereload = False): + if forcereload or not self.triviaSource: + self.triviaSource = read_url(self.triviaUrl) + return self.triviaSource + + def parseTrivia(self): + trivia = [] + soup = BeautifulSoup(self.getTrivia()) + triviaList = [] + for i in soup('ul', {'class': "trivia"}): + for t in i('li'): + t = str(t).replace('
    ', '').strip() + if t.startswith('
  • ') and t.endswith('
  • '): + t = t[4:-5].strip() + trivia.append(t) + self.trivia = trivia + return self.trivia + + def getConnections(self, forcereload = False): + if forcereload or not self.connectionsSource: + self.connectionsSource = read_url(self.connectionsUrl) + return self.connectionsSource + + def parseConnections(self): + connections = {} + soup = BeautifulSoup(self.getConnections()) + content = soup('div', {'id': 'tn15content'})[0] + blocks = str(content).split('
    ')[1:] + for c in blocks: + connection = c.split('
    ')[0] + cs = BeautifulSoup(c) + if connection: + #relation -> list of imdb ids + connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})] + return connections + + def getReleaseinfo(self, forcereload = False): + if forcereload or not self.releaseinfoSource: + self.releaseinfoSource = read_url(self.releaseinfoUrl) + return self.releaseinfoSource + + def parseReleaseinfo(self): + soup = BeautifulSoup(self.getReleaseinfo()) + for row in soup('table',{'border': '0', 'cellpadding':'2'})[0]('tr'): + d = row('td', {'align':'right'}) + if d: + try: + possible_date = stripTags(str(d[0])).strip() + rdate = time.strptime(possible_date, "%d %B %Y") + rdate = time.strftime('%Y-%m-%d', rdate) + return rdate + except: + pass + return None + + def getBusiness(self, forcereload = False): + if forcereload or not self.businessSource: + self.businessSource = read_url(self.businessUrl) + return self.businessSource + + def parseBusiness(self): + soup = BeautifulSoup(self.getBusiness()) + business = {'budget': 0, 'gross': 0, 'profit': 0} + content = soup('div', {'id': 'tn15content'})[0] + blocks = str(content).split('
    ')[1:] + for c in blocks: + cs = BeautifulSoup(c) + line = c.split('
    ') + if line: + title = line[0] + line = line[1] + if title in ['Budget', 'Gross']: + values = re.compile('\$(.*?) ').findall(line) + values = [int(value.replace(',','')) for value in values] + if values: + business[title.lower()] = max(values) + if business['budget'] and business['gross']: + business['profit'] = business['gross'] - business['budget'] + return business + +def guess(title, director=''): + #FIXME: proper file -> title + title = title.split('-')[0] + title = title.split('(')[0] + title = title.split('.')[0] + title = title.strip() + imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8')) + return_url = '' + + #lest first try google + #i.e. site:imdb.com Michael Stevens Sin + if director: + search = 'site:imdb.com %s "%s"' % (director, title) + else: + search = 'site:imdb.com "%s"' % title + for (name, url, desc) in google(search, 1): + if url.startswith('http://www.imdb.com/title/tt'): + return url[28:35] + + req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS) + u = urllib2.urlopen(req) + data = u.read() + return_url = u.url + u.close() + + if return_url.startswith('http://www.imdb.com/title/tt'): + return return_url[28:35] + if data: + imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?
    1. .*?(.*?) %').findall(data) + + ratings = ratings[len(ratings)- len(titles):] + + for title in titles: + movies.append({'title': title, 'rating': ratings[titles.index(title)], 'torrent': ''}) + + offset += 10 + return movies + \ No newline at end of file diff --git a/scrapeit/scrapetorrent.py b/scrapeit/scrapetorrent.py new file mode 100644 index 0000000..4022b87 --- /dev/null +++ b/scrapeit/scrapetorrent.py @@ -0,0 +1,16 @@ +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 + +from urllib import quote +import re + +from BeautifulSoup import BeautifulSoup + + +def search(query): + '''search for torrents on scrapetorrent + ''' + torrents = [] + return torrents + diff --git a/scrapeit/thepiratebay.py b/scrapeit/thepiratebay.py new file mode 100644 index 0000000..1d42b51 --- /dev/null +++ b/scrapeit/thepiratebay.py @@ -0,0 +1,104 @@ +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 + +import re +import socket +from urllib import quote + +from BeautifulSoup import BeautifulSoup + +from google import google +from utils import read_url, read_url_utf8 + + +socket.setdefaulttimeout(10.0) + +season_episode = re.compile("S..E..", re.IGNORECASE) + +def shows(name = None): + data = read_url_utf8('http://thepiratebay.org/tv/all') + shows = re.compile('
      (.*?)
      ').findall(data) + if not name: + return shows + for show in shows: + id = show[0] + if name == show[1]: + return id + return '' + +def findMatch(data, reg): + m = re.compile(reg).findall(data) + if m: + return m[0] + return u'' + +def get_info(url): + url = url.strip() + if url.startswith('/'): + url = 'http://thepiratebay.org' + url + data = read_url(url) + line = data.replace('\n', ' ') + info = {} + info['torrent'] = findMatch(data, '(http://.*?.torrent)"') + info['files'] = findMatch(data, '
      (.*?)
      ') + try: + info['files'] = int(info['files']) + except: + info['files'] = 0 + info['spoken_language'] = findMatch(line, '
      Spoken language\(s\):
      .*?
      (.*?)
      ') + info['texted_language'] = findMatch(line, '
      Texted language\(s\):
      .*?
      (.*?)
      ') + return info + +def get_episode_name(string): + episode = '' + ep = season_episode.findall(string) + if ep: + episode = ep[0].upper() + return episode + +def in_killwords(string): + string = string.lower() + match = False + for w in ['swesub', 'mpeg']: + if w in string: + match = True + return match + +def get_episode(show_id, episode): + if show_id <= 0: + return '' + tpbe = get_episodes(show_id) + for e in tpbe: + link =e[0] + ep = get_episode_name(e[1]) + if ep == episode: + info = get_info(link) + if not in_killwords(info['torrent']) \ + and info['files'] > 0 and info['files'] < 10 \ + and (not info['texted_language'] or info['texted_language'] == info['spoken_language']): + return info['torrent'] + return u'' + +def get_episodes(id): + data = read_url("http://thepiratebay.org/tv/%s" % id) + episodes = re.compile('
      (.*?)').findall(data) + return episodes + +def search(query): + torrents = [] + url = "http://thepiratebay.org/search.php?video=on&q=%s" % quote(query) + page = read_url(url) + soup = BeautifulSoup(page) + for row in soup('tr'): + torrentType = row.findAll('td', {'class': 'vertTh'}) + if torrentType: + torrentType = torrentType[0]('a')[0].get('href').split('/')[-1] + # 201 = Movies , 202 = Movie DVDR + if torrentType in ['201']: + torrent = row.findAll('a', {'href':re.compile('.torrent$')})[0].get('href') + torrents.append(torrent) + return torrents + +def searchByImdb(imdb): + return search("tt" + imdb) diff --git a/scrapeit/torrent.py b/scrapeit/torrent.py new file mode 100644 index 0000000..0ede826 --- /dev/null +++ b/scrapeit/torrent.py @@ -0,0 +1,18 @@ +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 + +import mininova +import btjunkie +import thepiratebay + +def search(query): + '''meta function to search with the best known torrent search engine + ''' + return btjunkie.search(query) + +def searchByImdb(imdb): + '''meta function to search by imdb with the best known torrent search engine + ''' + return mininova.searchByImdb(imdb) + diff --git a/scrapeit/tvcom.py b/scrapeit/tvcom.py new file mode 100644 index 0000000..27bfe06 --- /dev/null +++ b/scrapeit/tvcom.py @@ -0,0 +1,34 @@ +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 + +import re + +from BeautifulSoup import BeautifulSoup + +from utils import read_url_utf8, stripTags + +def getEpisodeData(url): + ''' prases informatin on tvcom episode pages + returns dict with title, show, description, score + ''' + tvcom = { + 'description': u'' + } + data = read_url_utf8(url).replace('\n',' ') + regexp = r'''
      .*?
      (.*?)
      ''' + reg = re.compile(regexp, re.IGNORECASE) + m = reg.findall(data) + for match in m: + description = match.strip() + description = stripTags(description).replace('Watch Video','') + tvcom['description'] = description.strip() + soup = BeautifulSoup(data) + #optional data + try: + tvcom['show'] = soup('h1')[0].contents[0] + tvcom['title'] = soup('h1')[1].contents[0] + tvcom['score'] = soup("span", {'class':"f-28 f-bold mt-10 mb-10 f-FF9 db lh-18"})[0].contents[0] + except: + pass + return tvcom diff --git a/scrapeit/tvrss.py b/scrapeit/tvrss.py new file mode 100755 index 0000000..ccd4717 --- /dev/null +++ b/scrapeit/tvrss.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 + +from os.path import * +import sys +import datetime +import time +import re +from urllib2 import urlopen +import Image +import StringIO + +import feedparser + +from utils import read_url + + +hr_hdtv = re.compile('HR HDTV') +hdtv = re.compile('HDTV') + +def get_url(title): + return title.replace(' ','_').replace('/', '_').lower() + +def get_show(string): + return string.split(';')[0].split(':')[1].strip() + +def get_title(string): + title = string.split(';')[1].split(':')[1].strip() + if title != 'n/a': + return title + return '' + +def get_season(string): + try: + season = int(string.split(';')[2].split(':')[1].strip()) + except: + return None + return season + +def get_episode(string): + try: + episode = int(string.split(';')[3].split(':')[1].strip()) + except: + return None + return episode + +def get_episodedate(string): + s = string.split('Episode Date:') + if len(s) == 2: + return s[1].strip() + return None + +def choose_item(old, new): + if old['link'] == new['link']: + return False + if not hdtv.search(old['title']): + if hdtv.search(new['title']): + display_item(new) + log.debug("vs.") + display_item(old) + return True + return False + +def get_imdbdata(imdbid): + thumbnail = None + description='' + imdb = IMDb.parse(imdbid) + if imdb: + poster = imdb['poster'] + if poster != 'http://i.imdb.com/Heads/npa.gif': + log.debug("getting poster %s" % poster) + try: + thumbnail = read_url(poster) + im = Image.open(StringIO.StringIO(thumbnail)) + out = StringIO.StringIO() + im.crop((0,0,100,100)).convert().save(out, 'JPEG') + thumbnail = out.getvalue() + except: + thumbnail = None + if imdb['summary']: + description=imdb['summary'] + else: + description=imdb['tagline'] + return (imdb, description, thumbnail) + else: + return(imdb, '', None) + +def load(): + log.debug("getting new shows from tvrss...") + feed = feedparser.parse('http://tvrss.net/feed/combined/') + shows = {} + for item in feed['entries']: + show = get_show(item['description']) + season = get_season(item['description']) + episode = get_episode(item['description']) + episodedate = get_episodedate(item['description']) + estring = None + if season and episode: + estring = "S%02dE%02d" %(season, episode) + elif episodedate: + estring = episodedate + if estring: + if show and not hr_hdtv.search(item['title']): + if shows.has_key(show): + if shows[show].has_key(estring): + if choose_item(shows[show][estring], item): + shows[show][estring] = item + else: + shows[show][estring] = item + else: + shows[show] = {} + shows[show][estring] = item + for show in shows: + imdb = None + try: + model.ShowsBlacklist.byShowUrl(get_url(show)) + log.debug("ignoring blacklisted show %s" % show) + continue + except: + pass + s = None + try: + s = model.Shows.byUrl(get_url(show)) + except 
SQLObjectNotFound: + try: + alias = model.ShowsAlias.byAlias(get_url(show)) + s = alias.show + except SQLObjectNotFound: + s = None + if not s: + log.debug("about to add %s" % show) + thumbnail = None + description='' + ur = '-' + try: + imdbid = IMDb.guess(show) + if imdbid: + imdb, description, thumbnail = get_imdbdata(imdbid) + if imdb: + ur = imdb['rating'] + except: + import traceback + print ptraceback.print_exc() + pass + s= model.Shows( + title = show, + url = get_url(show), + description = description, + imdb = imdbid, + imdbUserRating = ur + ) + s.thumbnail = thumbnail + meta = metacritic.scrapeMetacritic(s.title, s.metacriticUrl) + if meta: + s.metacriticUrl = meta['url'] + s.metacriticScore = "%s" % meta['score'] + for review in meta['critics']: + model.addReview(s, review) + model.hub.commit() + log.debug('added %s' % show) + for episode in shows[show]: + episode_title = get_title(shows[show][episode]['description']) + episode_description = '' + episode_imdb = '' + q = model.Episodes.select(AND( + model.Episodes.q.showID == s.id, + model.Episodes.q.episode == episode)) + if q.count() == 0: + if not imdb: + try: + imdbid = IMDb.guess(show) + if imdbid: + imdb = IMDb.parse(imdbid) + except: + pass + if imdb and imdb['episodes'].has_key(episode): + episode_title = imdb['episodes'][episode]['title'] + episode_description = imdb['episodes'][episode]['description'] + episode_imdb = imdb['episodes'][episode]['imdb'] + if not episode_description or not episode_title: + tvcom_data = tvcom.get(show, episode) + if not episode_description: + episode_description = tvcom_data['description'] + if not episode_title: + episode_title = tvcom_data['title'] + e = model.Episodes( + showID = s.id, + title = episode_title, + episode = episode, + torrent = shows[show][episode]['enclosures'][0]['href'], + description = episode_description, + imdb = episode_imdb, + thumbnail = None, + pubDate = datetime.datetime.fromtimestamp(time.mktime(shows[show][episode]['updated_parsed'])) + ) + s.lastUpdate = datetime.datetime.now() + model.hub.commit() + log.debug("from tvrss add %s %s" %(episode, show)) + log.debug("updating tvrss done.") + +if __name__ == '__main__': + # first look on the command line for a desired config file, + # if it's not on the command line, then + # look for setup.py in this directory. If it's not there, this script is + # probably installed + if len(sys.argv) > 1: + turbogears.update_config(configfile=sys.argv[1], + modulename="btvcr.config") + elif exists(join(dirname(__file__), "setup.py")): + turbogears.update_config(configfile="dev.cfg", + modulename="btvcr.config") + else: + turbogears.update_config(configfile="prod.cfg", + modulename="btvcr.config") + + from btvcr.controllers import Root + load() diff --git a/scrapeit/utils.py b/scrapeit/utils.py new file mode 100644 index 0000000..0363971 --- /dev/null +++ b/scrapeit/utils.py @@ -0,0 +1,150 @@ +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 +""" +screape tools +""" + +import re +import time +import urllib +import urllib2 + +import djangohtml + + +# Default headers for HTTP requests. +DEFAULT_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5)'} + +# -------------------------------------------------------------------- +# Functions +# -------------------------------------------------------------------- + +def quote_plus(s): + """ + A variant of urllib.quote_plus which handles ASCII and Unicode. 
+ """ + return urllib.quote_plus(s.encode('utf-8')) + + +def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True): + """ + Read str contents of given str URL. + + Here headers is a map of str -> str for HTTP request headers. If + blocking is True, returns the str page contents. If blocking is + False, returns an iterator which gives None until a successful read, + at which point the str page contents is yielded. + """ + req = urllib2.Request(url, None, headers) + f = urllib2.urlopen(req) + data = f.read() + f.close() + ctype = f.headers.getheader('content-type') + charset = ctype.split('charset=') + if len(charset)>1: charset = charset[1] + else: charset = 'latin-1' + data = unicode(data, charset) + return data + +def read_url(url, headers=DEFAULT_HEADERS, blocking=True): + """ + Read str contents of given str URL. + + Here headers is a map of str -> str for HTTP request headers. If + blocking is True, returns the str page contents. If blocking is + False, returns an iterator which gives None until a successful read, + at which point the str page contents is yielded. + """ + req = urllib2.Request(url, None, headers) + f = urllib2.urlopen(req) + data = f.read() + f.close() + return data + +def get_url(url, headers=DEFAULT_HEADERS, blocking=True): + """ + opens given str URL and returns the url after redirection. + """ + rurl = url + try: + req = urllib2.Request(url, None, headers) + rurl = urllib2.urlopen(req).url + rurl = rurl.replace('&src=rss', '') + except: + rurl = url + return rurl + + +def fix_url(url): + """ + Given url str, trim redirect stuff and return actual URL. + + Currently this just returns the URL unmodified. + """ +# if url.lower().find('http%3a//') > 0: +# return 'http://' + url[url.lower().rindex('http%3a//')+9:] +# if url.find('http://') > 0: +# return url[url.rindex('http://'):] + return url + + +_html_entity_re = re.compile(r'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(?:([0-9]+)|[xX]([0-9a-fA-F]+)));?') +import htmlentitydefs + +def html_entity_decode(s, encoding = 'utf-8'): + r = [] + p = 0 + mo = _html_entity_re.search(s, p) + while mo: + r.append(s[p:mo.start()].decode(encoding)) + i = mo.lastindex + e = mo.group(i) + try: + if i == 1: + c = htmlentitydefs.name2codepoint[e] + elif i == 2: + c = int(e) + elif i == 3: + c = int(e, 16) + else: + assert 0 + r.append(unichr(c)) + except KeyError: + r.append(mo.group(0)) + + p = mo.end() + mo = _html_entity_re.search(s, p) + r.append(s[p:].decode(encoding)) + return u''.join(r) + +def stripTags(s): + return djangohtml.strip_tags(htmldecode(s)) + + +from htmlentitydefs import name2codepoint + +# This pattern matches a character entity reference (a decimal numeric +# references, a hexadecimal numeric reference, or a named reference). 
+charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?') + +def htmldecode(text): + """Decode HTML entities in the given text.""" + if type(text) != unicode: + text = unicode(text) + if type(text) is unicode: + uchr = unichr + else: + uchr = lambda value: value > 255 and unichr(value) or chr(value) + def entitydecode(match, uchr=uchr): + entity = match.group(1) + if entity.startswith('#x'): + return uchr(int(entity[2:], 16)) + elif entity.startswith('#'): + return uchr(int(entity[1:])) + elif entity in name2codepoint: + return uchr(name2codepoint[entity]) + else: + return match.group(0) + return charrefpat.sub(entitydecode, text) + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..5cf8020 --- /dev/null +++ b/setup.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# -*- Mode: Python; -*- +# vi:si:et:sw=2:sts=2:ts=2 +# encoding: utf-8 +from setuptools import setup, find_packages + +import os + +setup( + name="scrapeit", + version="0.1", + + # uncomment the following lines if you fill them out in release.py + description="collection of scrapers for various websites", + author="bot", + author_email="bot@mailb.org", + #url=url, + #download_url=download_url, + #license=license, + packages=find_packages(), + zip_safe=False, + keywords = [ + ], + classifiers = [ + 'Development Status :: 3 - Alpha', + 'Operating System :: OS Independent', + 'Programming Language :: Python', + 'Topic :: Software Development :: Libraries :: Python Modules', + ], + ) +
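
For reference, a minimal usage sketch of the scrapeit package introduced above. Module and function names follow the files in this diff; it assumes BeautifulSoup, elementtree and feedparser are installed, the scraped sites are still reachable, and the remote HTML still matches the regular expressions in the scrapers.

  # Python 2 sketch; run "python setup.py install" first
  from scrapeit import torrent, imdb, epguides

  # meta search: free-text queries go to btjunkie, imdb-id queries to mininova
  for link in torrent.search('some movie title dvdrip'):
      print link

  # guess an IMDb id for a title, then parse the title page into a dict
  imdb_id = imdb.guess('Some Movie Title')
  if imdb_id:
      info = imdb.IMDb(imdb_id).parse()
      print info['title'], info['year'], info['rating']

  # title and description for a TV episode, resolved via epguides.com and tv.com
  print epguides.getEpisodeData('Some Show', 'S01E02')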