commit ca2a42e773: add scrapeit

18 changed files with 1864 additions and 0 deletions

scrapeit/__init__.py (new file, 14 lines added)
@@ -0,0 +1,14 @@
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
# encoding: utf-8

import btjunkie
import google
import imdb
import mininova
import thepiratebay
import torrent
import rottentomatoes


__version__ = '1.0.0'

scrapeit/btjunkie.py (new file, 32 lines added)
@@ -0,0 +1,32 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

from urllib import quote
import re

from BeautifulSoup import BeautifulSoup

from utils import read_url, stripTags
from btutils import torrentsWeLike


def search(query):
  '''search for torrents on btjunkie
  '''
  url = "http://btjunkie.org/search?q=%s&c=6&t=0&o=52&m=0&l=1" % quote(query)
  page = read_url(url)
  soup = BeautifulSoup(page)
  torrents = soup.findAll('a', {'class': 'BlckUnd'})
  torrents = filter(torrentsWeLike, torrents)
  torrent_links = []
  for t in torrents:
    tlink = "http://btjunkie.org%s.torrent" % t.attrMap['href']
    tlink = tlink.replace('do=stat', 'do=download')
    torrent_links.append(tlink)
  return torrent_links

def searchByImdb(imdb):
  '''search for torrents by imdb, not supported on btjunkie right now
  '''
  return []

scrapeit/btutils.py (new file, 25 lines added)
@@ -0,0 +1,25 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

from utils import stripTags


def torrentsWeLike(link):
  '''check if torrent title looks like something we want to see,
     dvdrip / no cam / no dubbed versions
  '''
  text = stripTags(unicode(link)).lower()
  #no cams / telesyncs or other stuff
  for word in ('cam', 'telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'vcd', 'ts-screener'):
    if word in text:
      return False
  #no dubbed versions
  for word in ('italian', 'german', 'spanish', 'french'):
    if word in text:
      return False
  #only dvdrips or dvdscrs
  for word in ('dvdrip', 'dvdscr', 'dvd screener'):
    if word in text:
      return True
  return False

scrapeit/djangohtml.py (new file, 115 lines added)
@@ -0,0 +1,115 @@
"HTML utilities suitable for global use."

import re, string

# Configuration for urlize() function
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']

# list of possible strings used for bullets in bulleted lists
DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']

unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
word_split_re = re.compile(r'(\s+)')
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
    ('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
    '|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
del x # Temporary variable

def escape(html):
    "Returns the given HTML with ampersands, quotes and carets encoded"
    if not isinstance(html, basestring):
        html = str(html)
    return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')

def linebreaks(value):
    "Converts newlines into <p> and <br />s"
    value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
    paras = re.split('\n{2,}', value)
    paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
    return '\n\n'.join(paras)

def strip_tags(value):
    "Returns the given HTML with all tags stripped"
    return re.sub(r'<[^>]*?>', '', value)

def strip_spaces_between_tags(value):
    "Returns the given HTML with spaces between tags normalized to a single space"
    return re.sub(r'>\s+<', '> <', value)

def strip_entities(value):
    "Returns the given HTML with all entities (&something;) stripped"
    return re.sub(r'&(?:\w+|#\d);', '', value)

def fix_ampersands(value):
    "Returns the given HTML with all unencoded ampersands encoded correctly"
    return unencoded_ampersands_re.sub('&amp;', value)

def urlize(text, trim_url_limit=None, nofollow=False):
    """
    Converts any URLs in text into clickable links. Works on http://, https:// and
    www. links. Links can have trailing punctuation (periods, commas, close-parens)
    and leading punctuation (opening parens) and it'll still do the right thing.

    If trim_url_limit is not None, the URLs in link text will be limited to
    trim_url_limit characters.

    If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
    """
    trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >= limit and '...' or '')) or x
    words = word_split_re.split(text)
    nofollow_attr = nofollow and ' rel="nofollow"' or ''
    for i, word in enumerate(words):
        match = punctuation_re.match(word)
        if match:
            lead, middle, trail = match.groups()
            if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
                    len(middle) > 0 and middle[0] in string.letters + string.digits and \
                    (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
                middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
            if middle.startswith('http://') or middle.startswith('https://'):
                middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
            if '@' in middle and not middle.startswith('www.') and not ':' in middle \
                    and simple_email_re.match(middle):
                middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
            if lead + middle + trail != word:
                words[i] = lead + middle + trail
    return ''.join(words)

def clean_html(text):
    """
    Cleans the given HTML. Specifically, it does the following:
        * Converts <b> and <i> to <strong> and <em>.
        * Encodes all ampersands correctly.
        * Removes all "target" attributes from <a> tags.
        * Removes extraneous HTML, such as presentational tags that open and
          immediately close and <br clear="all">.
        * Converts hard-coded bullets into HTML unordered lists.
        * Removes stuff like "<p>&nbsp;</p>", but only if it's at the
          bottom of the text.
    """
    from djangotext import normalize_newlines
    text = normalize_newlines(text)
    text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
    text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
    text = fix_ampersands(text)
    # Remove all target="" attributes from <a> tags.
    text = link_target_attribute_re.sub('\\1', text)
    # Trim stupid HTML such as <br clear="all">.
    text = html_gunk_re.sub('', text)
    # Convert hard-coded bullets into HTML unordered lists.
    def replace_p_tags(match):
        s = match.group().replace('</p>', '</li>')
        for d in DOTS:
            s = s.replace('<p>%s' % d, '<li>')
        return '<ul>\n%s\n</ul>' % s
    text = hard_coded_bullets_re.sub(replace_p_tags, text)
    # Remove stuff like "<p>&nbsp;</p>", but only if it's at the bottom of the text.
    text = trailing_empty_content_re.sub('', text)
    return text
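
A quick usage sketch of the helpers above (not part of the commit; the sample strings are arbitrary, and the printed results are what the functions as written produce):

    from djangohtml import escape, urlize, clean_html

    print escape('Tom & Jerry <3')           # Tom &amp; Jerry &lt;3
    print urlize('see www.example.com now')  # see <a href="http://www.example.com">www.example.com</a> now
    print clean_html('<b>hi</b> & bye<p>&nbsp;</p>')  # <strong>hi</strong> &amp; bye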

scrapeit/djangotext.py (new file, 111 lines added)
@@ -0,0 +1,111 @@
import re

# Capitalizes the first letter of a string.
capfirst = lambda x: x and x[0].upper() + x[1:]

def wrap(text, width):
    """
    A word-wrap function that preserves existing line breaks and most spaces in
    the text. Expects that existing line breaks are posix newlines (\n).
    See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
    """
    return reduce(lambda line, word, width=width: '%s%s%s' %
                  (line,
                   ' \n'[(len(line[line.rfind('\n')+1:])
                         + len(word.split('\n',1)[0]
                              ) >= width)],
                   word),
                  text.split(' ')
                 )

def truncate_words(s, num):
    "Truncates a string after a certain number of words."
    length = int(num)
    words = s.split()
    if len(words) > length:
        words = words[:length]
        if not words[-1].endswith('...'):
            words.append('...')
    return ' '.join(words)

def get_valid_filename(s):
    """
    Returns the given string converted to a string that can be used for a clean
    filename. Specifically, leading and trailing spaces are removed; other
    spaces are converted to underscores; and all non-filename-safe characters
    are removed.
    >>> get_valid_filename("john's portrait in 2004.jpg")
    'johns_portrait_in_2004.jpg'
    """
    s = s.strip().replace(' ', '_')
    return re.sub(r'[^-A-Za-z0-9_.]', '', s)

def get_text_list(list_, last_word='or'):
    """
    >>> get_text_list(['a', 'b', 'c', 'd'])
    'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    'a and b'
    >>> get_text_list(['a'])
    'a'
    >>> get_text_list([])
    ''
    """
    if len(list_) == 0: return ''
    if len(list_) == 1: return list_[0]
    return '%s %s %s' % (', '.join([str(i) for i in list_][:-1]), last_word, list_[-1])

def normalize_newlines(text):
    return re.sub(r'\r\n|\r|\n', '\n', text)

def recapitalize(text):
    "Recapitalizes text, placing caps after end-of-sentence punctuation."
    # capwords = ()
    text = text.lower()
    capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
    text = capsRE.sub(lambda x: x.group(1).upper(), text)
    # for capword in capwords:
    #     capwordRE = re.compile(r'\b%s\b' % capword, re.I)
    #     text = capwordRE.sub(capword, text)
    return text

def phone2numeric(phone):
    "Converts a phone number with letters into its numeric equivalent."
    letters = re.compile(r'[A-PR-Y]', re.I)
    char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
         'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
         'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
         's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
         'y': '9', 'x': '9'}.get(m.group(0).lower())
    return letters.sub(char2number, phone)

# From http://www.xhaus.com/alan/python/httpcomp.html#gzip
# Used with permission.
def compress_string(s):
    import cStringIO, gzip
    zbuf = cStringIO.StringIO()
    zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()

smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smart_split(text):
    """
    Generator that splits a string by spaces, leaving quoted phrases together.
    Supports both single and double quotes, and supports escaping quotes with
    backslashes. In the output, strings will keep their initial and trailing
    quote marks.
    >>> list(smart_split('This is "a person\'s" test.'))
    ['This', 'is', '"a person\'s"', 'test.']
    """
    for bit in smart_split_re.finditer(text):
        bit = bit.group(0)
        if bit[0] == '"':
            yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
        elif bit[0] == "'":
            yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
        else:
            yield bit

scrapeit/epguides.py (new file, 68 lines added)
@@ -0,0 +1,68 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import re

from BeautifulSoup import BeautifulSoup

from google import google
from utils import read_url, read_url_utf8, stripTags
import tvcom
import imdb

def epguidesUrl(title):
  '''
  Search Epguide Url for Show via Show Title.
  Use Google to search the url, this is also done on Epguide.
  '''
  for (name, url, desc) in google('allintitle: site:epguides.com %s' % title, 1):
    if url.startswith('http://epguides.com'):
      if re.search(title, name):
        return url
  return None

def getShowImdb(title):
  imdbid = None
  url = epguidesUrl(title)
  if url:
    data = read_url(url)
    soup = BeautifulSoup(data)
    links = soup('a', {'href': re.compile('imdb.com/title/tt')})
    if links:
      link = links[0].get('href')
      imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(link)[0])
  if not imdbid:
    imdbid = imdb.guess(title)
  return imdbid

def getEpisodeData(title, episode, show_url = None):
  '''
  Collect information about an episode.

  Returns dict with title, show, description and episode
  '''
  episodeData = {
    'title': u'',
    'show': title,
    'description': u'',
    'episode': episode,
  }
  description = u''
  data = u''
  if not show_url:
    show_url = epguidesUrl(title)
  if show_url:
    data = read_url_utf8(show_url)
  else:
    return imdb.getEpisodeData(title, episode)
  estring = u'' + episode.replace('S','').replace('E','-').replace('0',' ').strip()
  for line in data.split('\n'):
    a = line.split(estring)
    if len(a) == 2:
      soup = BeautifulSoup(line)
      episodeData['title'] = soup('a')[0].contents[0]
      tvcom_url = soup('a')[0].get('href')
      episodeData['description'] = tvcom.getEpisodeData(tvcom_url)['description']
      break
  return episodeData

scrapeit/google.py (new file, 375 lines added)
@@ -0,0 +1,375 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""
Query Web search engines.

This module works by filtering the HTML returned by the search engine and thus tends to break when
search engines modify their HTML output.

Public domain, Connelly Barnes 2005-2007. Compatible with Python 2.3-2.5.

See L{examples} for a quick start. See L{description} for the full
explanation, precautions, and legal disclaimers.

"""

import re
import time
import urllib
import urllib2
import weakref
import threading
import Queue

from utils import read_url

__version__ = '1.0.2'

# Default headers for HTTP requests.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5)'}

# Default maximum number of results.
DEFAULT_MAX_RESULTS = 10

# Function names for supported search engines.
SEARCH_ENGINES = ['ask', 'dmoz', 'excite', 'google', 'msn', 'yahoo']

__all__ = SEARCH_ENGINES + ['examples', 'description']

# --------------------------------------------------------------------
# Functions
# --------------------------------------------------------------------

def quote_plus(s):
  """
  A variant of urllib.quote_plus which handles ASCII and Unicode.
  """
  return urllib.quote_plus(s.encode('utf-8'))


def fix_url(url):
  """
  Given url str, trim redirect stuff and return actual URL.

  Currently this just returns the URL unmodified.
  """
#  if url.lower().find('http%3a//') > 0:
#    return 'http://' + url[url.lower().rindex('http%3a//')+9:]
#  if url.find('http://') > 0:
#    return url[url.rindex('http://'):]
  return url


def get_search_page_links(page, results_per_page, begin, end, link_re):
  """
  Given str contents of search result page, return list of links.

  Returns list of (name, url, desc) str tuples. See make_searcher()
  for a description of results_per_page and link_re.
  """
  if begin is not None and begin in page:
    page = page[page.index(begin):]
  if end is not None and end in page:
    page = page[:page.index(end)]
  ans = []
  for match in re.compile(link_re, re.DOTALL).finditer(page):
    (name, url, desc) = match.group('name', 'url', 'desc')
    url = fix_url(url)
    ans += [(html_to_text(name), url, html_to_text(desc))]
  return ans


def html_to_text(s):
  """
  Given an HTML formatted str, convert it to a text str.
  """
  s = re.sub(r'<.*?>', '', s)
  s = s.replace('\r', ' ')
  s = s.replace('\n', ' ')
  s = s.replace('\t', ' ')
  s = s.replace('&amp;', '&')
  s = s.replace('&lt;', '<')
  s = s.replace('&gt;', '>')
  s = s.replace('&quot;', '"')
  s = s.replace('&middot;', '\xb7')
  for i in range(256):
    s = s.replace('&#%d;' % i, chr(i))
  while s.replace('  ', ' ') != s:
    s = s.replace('  ', ' ')
  return s.strip()


def nonblocking(f, blocking_return=None, sleep_time=0.01):
  """
  Wrap a callable which returns an iter so that it no longer blocks.

  The wrapped iterator returns blocking_return while callable f is
  blocking. The callable f is called in a background thread. If the
  wrapped iterator is deleted, then the iterator returned by f is
  deleted also and the background thread is terminated.
  """
  def g(*args, **kwargs):
    f_iter = f(*args, **kwargs)
    g_iter = None
    def run():
      while True:
        g_obj = g_iter()
        if g_obj is None:
          return
        if g_obj.q.qsize() == 0:
          try:
            f_next = f_iter.next()
          except Exception, e:
            g_obj.exc = e
            return
          g_obj.q.put(f_next)
        else:
          del g_obj
          time.sleep(sleep_time)
    class Iter:
      def __init__(self):
        self.q = Queue.Queue()
        self.exc = None
        self.thread = threading.Thread(target=run)
        self.thread.setDaemon(True)
      def next(self):
        if self.exc is not None:
          raise self.exc
        try:
          return self.q.get_nowait()
        except Queue.Empty:
          return blocking_return
      def __iter__(self):
        return self

    obj = Iter()
    g_iter = weakref.ref(obj)
    obj.thread.start()
    try:
      return obj
    finally:
      del obj
  return g


def make_searcher(query_url, results_per_page, page_url, page_mode,
                  begin, end, link_re):
  """
  Return a search function for the given search engine.

  Here query_url is the URL for the initial search, with %(q)s for
  the query string, results_per_page is the number of search results
  per page, page_url is the URL for the 2nd and subsequent pages of
  search results, with %(q)s for the query string and %(n)s for the
  page "number." Here page_mode controls the actual value for the
  page "number:"

   - page_mode='page0':   Use 0-based index of the page.
   - page_mode='page1':   Use 1-based index of the page.
   - page_mode='offset0': Use 0-based index of the search result,
                          which is a multiple of results_per_page.
   - page_mode='offset1': Use 1-based index of the search result
                          (one plus a multiple of results_per_page).

  If begin is not None, then only text after the first occurrence of
  begin will be used in the search results page. If end is not None,
  then only text before the first occurrence of end will be used.

  Finally, link_re is a regex string (see module re) which matches
  three named groups: 'name', 'url', and 'desc'. These correspond to
  the name, URL and description of each search result. The regex is
  applied in re.DOTALL mode.

  Returns a search() function which has the same interface as
  described in the module docstring.
  """
  def search_blocking(query, max_results):
    last_links = None
    page_num = 0
#    done = False
    q = Queue.Queue()
    for i in range(max_results):
      if q.qsize() == 0:
        if page_num == 0:
          page = read_url(query_url % {'q': quote_plus(query)})
        else:
#          if done:
#            break
          if page_mode == 'page0':
            n = page_num
          elif page_mode == 'page1':
            n = page_num + 1
          elif page_mode == 'offset0':
            n = page_num * results_per_page
          elif page_mode == 'offset1':
            n = page_num * results_per_page + 1
          else:
            raise ValueError('unknown page mode')
          page = read_url(page_url % {'n': n, 'q': quote_plus(query)})
        page_num += 1
        links = get_search_page_links(page, results_per_page, begin, end, link_re)
        if len(links) == 0 or links == last_links:
          break
#        if len(links) < results_per_page:
#          done = True
        last_links = links
        for link in links:
          q.put(link)
      yield q.get()

  search_nonblocking = nonblocking(search_blocking)

  def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):
    """
    See docstring for web_search module.
    """
    if blocking:
      return search_blocking(query, max_results)
    else:
      return search_nonblocking(query, max_results)

  return search


def examples():
  """
  Examples of the web_search module.

  Example 1:

   >>> from web_search import google
   >>> for (name, url, desc) in google('python', 20):
   ...   print name, url
   ...
   (First 20 results for Google search of "python").

  Example 2:

   >>> from web_search import dmoz
   >>> list(dmoz('abc', 10))
   [('ABC.com', 'http://www.abc.com', "What's on ABC..."), ...]

  """
  print examples.__doc__


def description():
  """
  Full explanation and precautions for web_search module.

  The search functions in this module follow a common interface::

    search(query, max_results=10, blocking=True) =>
      iterator of (name, url, description) search results.

  Here query is the query string, max_results gives the maximum number
  of search results, and the items in the returned iterator are string
  3-tuples containing the Website name, URL, and description for each
  search result.

  If blocking=False, then an iterator is returned which does not block
  execution: the iterator yields None when the next search result is
  not yet available (a background thread is created).

  Supported search engines are 'ask', 'dmoz', 'excite', 'google', 'msn',
  'yahoo'. This module is not associated with or endorsed by any of
  these search engine corporations.

  Be warned that if searches are made too frequently, or max_results is
  large and you enumerate all search results, then you will be a drain
  on the search engine's bandwidth, and the search engine organization
  may respond by banning your IP address or IP address range.

  This software has been placed in the public domain with the
  following legal notice::

    http://oregonstate.edu/~barnesc/documents/public_domain.txt

  """
  print description.__doc__


# --------------------------------------------------------------------
# Search engines
# --------------------------------------------------------------------

ask = make_searcher('http://www.ask.com/web?q=%(q)s', 10,
                    'http://www.ask.com/web?page=%(n)d&q=%(q)s', 'page1',
                    None, None,
                    r'<a .*? class="L4" href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
                    r'.*?</div>(?P<desc>.*?)</div>')

dmoz = make_searcher('http://search.dmoz.org/cgi-bin/search?search=%(q)s', 20,
                     'http://search.dmoz.org/cgi-bin/search?start=%(n)d&search=%(q)s', 'offset1',
                     None, None,
                     r'<li><a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
                     r'.*? - (?P<desc>.*?)<br>')

excite = make_searcher('http://msxml.excite.com/info.xcite/search/web/%(q)s', 20,
                       'http://msxml.excite.com/info.xcite/search/web/%(q)s/%(n)d', 'offset1',
                       None, None,
                       r'<div class="listingmain" style=""><a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
                       r'(?P<desc>.*?)</span>')

google = make_searcher('http://www.google.com/search?q=%(q)s', 10,
                       'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0',
                       None, None,
                       r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' +
                       r'.*?(?:<br>|<table.*?>)' +
                       r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)')

msn = make_searcher('http://search.msn.com/results.aspx?q=%(q)s', 10,
                    'http://search.msn.com/results.aspx?q=%(q)s&first=%(n)d', 'offset1',
                    '<h2>Results</h2>', '<div id="ads_rightC">',
                    r'<h3><a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
                    r'(?P<desc>.*?)<li class="first">')

yahoo = make_searcher('http://search.yahoo.com/search?p=%(q)s', 10,
                      'http://search.yahoo.com/search?p=%(q)s&b=%(n)d', 'offset1',
                      None, None,
                      '<li><div><a class=yschttl.*?href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
                      r'.*?<div class=yschabstr>(?P<desc>.*?)</div>')

# --------------------------------------------------------------------
# Unit tests
# --------------------------------------------------------------------

def test_engine(search):
  """
  Test a search engine function returned by make_searcher().
  """
  for query in ['abc', 'microsoft', 'love', 'pweropieiw', 'addfdae']:
    popular = query in ['abc', 'microsoft', 'love', 'run']
    for n in [6, 17, 31]:
      n1 = len(list(search(query, n)))
      if popular:
        assert n1 == n
      else:
        assert n1 <= n
      n2 = 0
      for item in search(query, n, False):
        if item is not None:
          n2 += 1
        else:
          time.sleep(0.01)
      if popular:
        assert n2 == n
      else:
        assert n2 <= n


def test():
  """
  Unit test main routine.
  """
  import inspect
  print 'Testing:'
  for name in SEARCH_ENGINES:
    print ' ' + (name + ':').ljust(20),
    test_engine(getattr(inspect.getmodule(test), name))
  print 'OK'


if __name__ == '__main__':
  test()
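
For orientation, a quick usage sketch of the searchers defined above (not part of the commit; it assumes the package is importable as scrapeit and that the search engine's result HTML still matches the regexes):

    from scrapeit.google import google

    # blocking: iterate the first 5 results
    for (name, url, desc) in google('python', 5):
      print name, url

    # non-blocking: the iterator yields None while the background thread is still fetching
    results = google('python', 5, blocking=False)
    item = results.next()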

scrapeit/googlemovie.py (new file, 34 lines added)
@@ -0,0 +1,34 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import re
from urllib import quote
from BeautifulSoup import BeautifulSoup

from utils import read_url, read_url_utf8, stripTags

def getGoogleMovieId(title):
  url = 'http://google.com/movies?q=%s&btnG=Search+Movies' % quote(title)
  data = read_url(url)
  cids = re.compile('reviews\?cid=(.*?)&').findall(data)
  if cids:
    return cids[0]
  return ''

def getGoogleMovieData(title, year = None, cid = None):
  gdata = {
    'title': title,
    'year': year,
    'cid': cid,
    'rating': '',
  }
  if not cid:
    cid = getGoogleMovieId("%s (%s)" % (title, year))
  if cid:
    gdata['cid'] = cid
    data = read_url('http://www.google.com/movies/reviews?cid=%s' % cid)
    gdata['rating'] = re.compile('font size=.3><b><nobr>(.*?) / 5').findall(data)[0]
    gdata['reviews'] = re.compile('Based on (.*?) reviews').findall(data)[0]
    gdata['year'] = re.compile("<title>.*?\((.*?)\).*?</title").findall(data)[0]
  return gdata

scrapeit/imdb.py (new file, 441 lines added)
@@ -0,0 +1,441 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import urllib2
from urllib import quote
import re, time
import os

from elementtree.ElementTree import parse, tostring
from BeautifulSoup import BeautifulSoup

from google import google
from utils import stripTags, read_url_utf8, htmldecode

import utils

def read_url(url):
  base = "/var/cache/scrapeit/cache/"
  path = os.path.join(base, url.replace('http://',''))
  if path.endswith('/'):
    path = "%sindex.html" % path
  if os.path.isdir(path):
    path = "%s/index.html" % path
  if os.path.exists(path):
    f = open(path)
    data = f.read()
    f.close()
    return data
  else:
    data = utils.read_url(url)
    folder = os.path.dirname(path)
    if not os.path.exists(folder):
      os.makedirs(folder)
    f = open(path, 'w')
    f.write(data)
    f.close()
    return data

def _get_data(url):
  data = None
  try:
    data = read_url(url)
  except:
    print "error reading data from", url
  return data

def get_image(url):
  return read_url(url)

def _castList(data, regexp):
  soup = re.compile(regexp).findall(data)
  if soup:
    soup = BeautifulSoup(soup[0])
    names = []
    for i in soup('a', {'href': re.compile('/name/nm')}):
      if i.string:
        cast = stripTags(i.string)
        if cast not in names:
          names.append(cast)
    return names
  return []

def _getTerm(data, regexp):
  term = ''
  try:
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.search(data)
    if m:
      term = stripTags(m.group(1)).strip()
  except:
    print "warning, parsing failed for", regexp
  return term.encode('utf8')


class IMDb:
  def __init__(self, imdb):
    self.imdb = imdb
    self.pageSource = None
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb

    self.businessSource = None
    self.businessUrl = "%sbusiness" % self.pageUrl
    self.connectionsSource = None
    self.connectionsUrl = "%smovieconnections" % self.pageUrl
    self.creditsSource = None
    self.creditsUrl = "%sfullcredits" % self.pageUrl
    self.episodesSource = None
    self.episodesUrl = "%sepisodes" % self.pageUrl
    self.keywordSource = None
    self.keywordUrl = "%skeywords" % self.pageUrl
    self.plotSource = None
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoSource = None
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
    self.triviaSource = None
    self.triviaUrl = "%strivia" % self.pageUrl

  def getPage(self, forcereload = False):
    if forcereload or not self.pageSource:
      self.pageSource = read_url(self.pageUrl)
    return self.pageSource

  def parse_raw_value(self, key, value):
    if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
      value = stripTags(value).strip()
    if key == 'runtime':
      parsed_value = _getTerm(value, '(.*?) min')
      parsed_value = _getTerm(parsed_value, '([0-9]+)')
      if not parsed_value:
        parsed_value = _getTerm(value, '(.*?) sec')
        parsed_value = _getTerm(parsed_value, '([0-9]+)')
        if not parsed_value:
          parsed_value = 0
        else:
          parsed_value = int(parsed_value)
      else:
        parsed_value = int(parsed_value) * 60
    elif key in ('country', 'language'):
      parsed_value = value.split(' / ')
    elif key == 'genre':
      parsed_value = value.replace('more', '').strip().split(' / ')
    elif key == 'tagline':
      parsed_value = value.replace('more', '').strip()
    elif key == 'plot_outline':
      parsed_value = value.replace('(view trailer)', '').strip()
      if parsed_value.endswith('more'):
        parsed_value = parsed_value[:-4].strip()
    elif key == 'tv_series':
      m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
      if m:
        parsed_value = m[0][0]
      else:
        parsed_value = ''
    else:
      print value
      parsed_value = value
    return parsed_value

  def parse(self):
    data = self.getPage()
    IMDbDict = {}
    #Poster
    IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
    if not IMDbDict['poster']:
      IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
    #Title, Year
    title = u''
    year = u''
    flat_data = data.replace('\n', '').replace('\r', '')
    html_title = re.compile('<strong class="title">(.*?) <small>\(<a href="/Sections/Years/(.*?)">').findall(flat_data)
    if html_title:
      title = html_title[0][0]
      IMDbDict['year'] = html_title[0][1]
      IMDbDict['title'] = stripTags(title).strip()
    else:
      title = _getTerm(data, '<title>(.*?)</title>').split('(')
      year = title[-1].split(')')[0].strip()
      title = title[0].strip().decode('utf-8')
      IMDbDict['title'] = title
      IMDbDict['year'] = year
    IMDbDict['title'] = htmldecode(IMDbDict['title'])
    if IMDbDict['title'][0] == '"' and IMDbDict['title'][-1] == '"':
      IMDbDict['title'] = IMDbDict['title'][1:-1]

    #Votes
    m = re.compile('<b>(.*?)/10</b> \(<a href="ratings">(.*?) votes</a>\)', re.IGNORECASE).search(data)
    if m:
      IMDbDict['rating'] = int(float(m.group(1)) * 1000)
      IMDbDict['votes'] = int(m.group(2).replace(',', ''))
    else:
      IMDbDict['rating'] = -1
      IMDbDict['votes'] = -1

    data = data.replace('\n',' ')
    #some values
    keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series')
    for key in keys:
      IMDbDict[key] = ''
    IMDbDict['runtime'] = 0
    soup = BeautifulSoup(data)
    for info in soup('div', {'class': 'info'}):
      key = str(info).split('</h5>')[0].split('<h5>')
      if len(key) > 1:
        raw_value = str(info).split('</h5>')[1]
        key = key[1][:-1].lower().replace(' ', '_')
        if key in keys:
          IMDbDict[key] = self.parse_raw_value(key, raw_value)

    #is episode
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')

    IMDbDict['episodes'] = self.parseEpisodes()
    IMDbDict['credits'] = self.parseCredits()
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = self.parseKeywords()

    IMDbDict['trivia'] = self.parseTrivia()
    IMDbDict['connections'] = self.parseConnections()
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
    self.IMDbDict = IMDbDict
    return self.IMDbDict

  def getCredits(self, forcereload = False):
    if forcereload or not self.creditsSource:
      self.creditsSource = read_url(self.creditsUrl)
    return self.creditsSource

  def parseCredits(self):
    data = self.getCredits()
    credits = {}
    credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
    credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>')
    credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>')
    #credits['cast'] = _castList(data, 'Cast</b>.*?(<tr.*?)</table>')
    credits['cast'] = []
    soup = re.compile('Cast</b>.*?(<tr.*?)</table>').findall(data)
    soup = BeautifulSoup(data)
    cast = soup('table', {'class': 'cast'})
    if cast:
      cast = str(cast[0])
      names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
      for name in names:
        real_name = name[0]
        role_name = name[1]
        if role_name:
          role_name = role_name.split('(')[0].replace('/ ...','').strip()
        credits['cast'].append((stripTags(real_name), stripTags(role_name)))
    self.credits = credits
    return self.credits

  def getPlot(self, forcereload = False):
    if forcereload or not self.plotSource:
      self.plotSource = read_url(self.plotUrl)
    return self.plotSource

  def parsePlot(self):
    soup = BeautifulSoup(self.getPlot())
    plot = soup('p', {'class':'plotpar'})
    if plot:
      plot = str(plot[0]).split('<i>')[0]
    else:
      plot = u''
    plot = stripTags(plot).strip()
    self.plot = plot
    return plot

  def getEpisodes(self, forcereload = False):
    if forcereload or not self.episodesSource:
      self.episodesSource = read_url(self.episodesUrl)
    return self.episodesSource

  def parseEpisodes(self):
    episodes = {}
    cdata = self.getEpisodes().replace('\r\n',' ')
    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?</b><br>(.*?)<br/>'''
    #regexp = r'''Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></span><br>.*?<br>(.*?)</td>'''
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.findall(cdata)
    for match in m:
      try:
        episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
        episodes[episode] = {}
        episodes[episode]['imdb'] = match[2]
        episodes[episode]['title'] = match[3].strip()
        description = htmldecode(match[4])
        description = stripTags(description.split('Next US airings:')[0])
        episodes[episode]['description'] = description
      except:
        import traceback
        print traceback.print_exc()
        pass
    self.episodes = episodes
    return self.episodes

  def getKeywords(self, forcereload = False):
    if forcereload or not self.keywordSource:
      self.keywordSource = read_url(self.keywordUrl)
    return self.keywordSource

  def parseKeywords(self):
    soup = BeautifulSoup(self.getKeywords())
    keywords = []
    for key in soup('a', {'href': re.compile('/keyword')}):
      keywords.append(htmldecode(key.string))
    self.keywords = keywords
    return self.keywords

  def getTrivia(self, forcereload = False):
    if forcereload or not self.triviaSource:
      self.triviaSource = read_url(self.triviaUrl)
    return self.triviaSource

  def parseTrivia(self):
    trivia = []
    soup = BeautifulSoup(self.getTrivia())
    triviaList = []
    for i in soup('ul', {'class': "trivia"}):
      for t in i('li'):
        t = str(t).replace('<br />', '').strip()
        if t.startswith('<li>') and t.endswith('</li>'):
          t = t[4:-5].strip()
          trivia.append(t)
    self.trivia = trivia
    return self.trivia

  def getConnections(self, forcereload = False):
    if forcereload or not self.connectionsSource:
      self.connectionsSource = read_url(self.connectionsUrl)
    return self.connectionsSource

  def parseConnections(self):
    connections = {}
    soup = BeautifulSoup(self.getConnections())
    content = soup('div', {'id': 'tn15content'})[0]
    blocks = str(content).split('<h5>')[1:]
    for c in blocks:
      connection = c.split('</h5>')[0]
      cs = BeautifulSoup(c)
      if connection:
        #relation -> list of imdb ids
        connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
    return connections

  def getReleaseinfo(self, forcereload = False):
    if forcereload or not self.releaseinfoSource:
      self.releaseinfoSource = read_url(self.releaseinfoUrl)
    return self.releaseinfoSource

  def parseReleaseinfo(self):
    soup = BeautifulSoup(self.getReleaseinfo())
    for row in soup('table',{'border': '0', 'cellpadding':'2'})[0]('tr'):
      d = row('td', {'align':'right'})
      if d:
        try:
          possible_date = stripTags(str(d[0])).strip()
          rdate = time.strptime(possible_date, "%d %B %Y")
          rdate = time.strftime('%Y-%m-%d', rdate)
          return rdate
        except:
          pass
    return None

  def getBusiness(self, forcereload = False):
    if forcereload or not self.businessSource:
      self.businessSource = read_url(self.businessUrl)
    return self.businessSource

  def parseBusiness(self):
    soup = BeautifulSoup(self.getBusiness())
    business = {'budget': 0, 'gross': 0, 'profit': 0}
    content = soup('div', {'id': 'tn15content'})[0]
    blocks = str(content).split('<h5>')[1:]
    for c in blocks:
      cs = BeautifulSoup(c)
      line = c.split('</h5>')
      if line:
        title = line[0]
        line = line[1]
        if title in ['Budget', 'Gross']:
          values = re.compile('\$(.*?) ').findall(line)
          values = [int(value.replace(',','')) for value in values]
          if values:
            business[title.lower()] = max(values)
    if business['budget'] and business['gross']:
      business['profit'] = business['gross'] - business['budget']
    return business

def guess(title, director=''):
  #FIXME: proper file -> title
  title = title.split('-')[0]
  title = title.split('(')[0]
  title = title.split('.')[0]
  title = title.strip()
  imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
  return_url = ''

  #let's first try google
  #i.e. site:imdb.com Michael Stevens Sin
  if director:
    search = 'site:imdb.com %s "%s"' % (director, title)
  else:
    search = 'site:imdb.com "%s"' % title
  for (name, url, desc) in google(search, 1):
    if url.startswith('http://www.imdb.com/title/tt'):
      return url[28:35]

  req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
  u = urllib2.urlopen(req)
  data = u.read()
  return_url = u.url
  u.close()

  if return_url.startswith('http://www.imdb.com/title/tt'):
    return return_url[28:35]
  if data:
    imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
    if imdb_id:
      return imdb_id

  imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
  req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
  u = urllib2.urlopen(req)
  data = u.read()
  return_url = u.url
  u.close()
  if return_url.startswith('http://www.imdb.com/title/tt'):
    return return_url[28:35]

  return None

def getEpisodeData(title, episode, show_url = None):
  '''
  Collect information about an episode.

  Returns dict with title, show, description and episode
  '''
  episodeData = {
    'title': u'',
    'show': title,
    'description': u'',
    'episode': episode,
  }
  description = u''
  if not show_url:
    imdbid = guess(title)
  else:
    imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(show_url)[0])
  if imdbid:
    i = IMDb(imdbid).parse()
    episodeData['title'] = i['episodes'][episode]['title']
    episodeData['description'] = i['episodes'][episode]['description']
    episodeData['imdb'] = i['episodes'][episode]['imdb']
  return episodeData


if __name__ == '__main__':
  import sys
  #print parse(sys.argv[1])
  print "imdb:", guess(sys.argv[1])
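
A minimal lookup sketch using the scraper above (not part of the commit; the title is an arbitrary example, and network access plus the /var/cache/scrapeit cache directory are assumed to be available):

    from scrapeit.imdb import IMDb, guess

    imdb_id = guess('The Conversation')   # resolves a title to a 7-digit IMDb id, or None
    if imdb_id:
      info = IMDb(imdb_id).parse()        # dict with title, year, rating, credits, plot, ...
      print info['title'], info['year']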

scrapeit/mininova.py (new file, 40 lines added)
@@ -0,0 +1,40 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
import socket
from urllib import quote

from BeautifulSoup import BeautifulSoup

from utils import read_url, read_url_utf8
from btutils import torrentsWeLike

socket.setdefaulttimeout(10.0)

def search(query):
  '''search for torrents on mininova
  '''
  torrents = []
  url = "http://www.mininova.org/search/%s/seeds" % quote(query)
  page = read_url(url)
  soup = BeautifulSoup(page)
  for row in soup('tr'):
    links = row('a', {'href':re.compile('/tor')})
    if links and torrentsWeLike(links[0]):
      torrent_url = "http://www.mininova.org%s" % links[0].get('href').replace('/tor', '/get')
      torrents.append(torrent_url)
  return torrents

def searchByImdb(imdb):
  '''search for torrents on mininova by imdb
  '''
  torrents = []
  page = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdb)
  soup = BeautifulSoup(page)
  for row in soup('tr'):
    links = row('a', {'href':re.compile('/get')})
    if links:
      torrent_url = "http://www.mininova.org%s" % links[0].get('href')
      torrents.append(torrent_url)
  return torrents

scrapeit/rottentomatoes.py (new file, 37 lines added)
@@ -0,0 +1,37 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

from urllib import quote
import re

from BeautifulSoup import BeautifulSoup

from utils import read_url


def getRottenTomatoes(rating = 70):
  '''
  Get movie TITLES
  rated ABOVE 70 or value passed as first argument
  from RottenTomatoes
  '''
  movies = []
  offset = 0
  titles = ['1']
  while titles:
    url = "http://www.rottentomatoes.com/movies/browser.php?movietype=1&genre=&tomatometer=&avgrating=%s&numreviews=10&mpaa=&x=40&y=5&start_index=%s" % (rating, offset)
    page = read_url(url)
    soup = BeautifulSoup(page)
    titles = [link.contents[0] for link in soup.findAll('a', {'class': 'movie-link'})]
    data = str(soup)
    ratings = re.compile('<span class="bold">(.*?) %</span>').findall(data)

    ratings = ratings[len(ratings) - len(titles):]

    for title in titles:
      movies.append({'title': title, 'rating': ratings[titles.index(title)], 'torrent': ''})

    offset += 10
  return movies

scrapeit/scrapetorrent.py (new file, 16 lines added)
@@ -0,0 +1,16 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

from urllib import quote
import re

from BeautifulSoup import BeautifulSoup


def search(query):
  '''search for torrents on scrapetorrent
  '''
  torrents = []
  return torrents
104
scrapeit/thepiratebay.py
Normal file
@@ -0,0 +1,104 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import re
import socket
from urllib import quote

from BeautifulSoup import BeautifulSoup

from google import google
from utils import read_url, read_url_utf8


socket.setdefaulttimeout(10.0)

season_episode = re.compile("S..E..", re.IGNORECASE)


def shows(name = None):
  data = read_url_utf8('http://thepiratebay.org/tv/all')
  shows = re.compile('<dt><a href="/tv/(.*?)/">(.*?)</a></dt>').findall(data)
  if not name:
    return shows
  for show in shows:
    id = show[0]
    if name == show[1]:
      return id
  return ''


def findMatch(data, reg):
  m = re.compile(reg).findall(data)
  if m:
    return m[0]
  return u''


def get_info(url):
  url = url.strip()
  if url.startswith('/'):
    url = 'http://thepiratebay.org' + url
  data = read_url(url)
  line = data.replace('\n', ' ')
  info = {}
  info['torrent'] = findMatch(data, '(http://.*?.torrent)"')
  info['files'] = findMatch(data, '<dd><a href="/details.php.id=.*?&fl#show">(.*?)</a></dd>')
  try:
    info['files'] = int(info['files'])
  except:
    info['files'] = 0
  info['spoken_language'] = findMatch(line, '<dt>Spoken language\(s\):</dt>.*?<dd>(.*?)</dd>')
  info['texted_language'] = findMatch(line, '<dt>Texted language\(s\):</dt>.*?<dd>(.*?)</dd>')
  return info


def get_episode_name(string):
  episode = ''
  ep = season_episode.findall(string)
  if ep:
    episode = ep[0].upper()
  return episode


def in_killwords(string):
  string = string.lower()
  match = False
  for w in ['swesub', 'mpeg']:
    if w in string:
      match = True
  return match


def get_episode(show_id, episode):
  if show_id <= 0:
    return ''
  tpbe = get_episodes(show_id)
  for e in tpbe:
    link = e[0]
    ep = get_episode_name(e[1])
    if ep == episode:
      info = get_info(link)
      if not in_killwords(info['torrent']) \
         and info['files'] > 0 and info['files'] < 10 \
         and (not info['texted_language'] or info['texted_language'] == info['spoken_language']):
        return info['torrent']
  return u''


def get_episodes(id):
  data = read_url("http://thepiratebay.org/tv/%s" % id)
  episodes = re.compile('<nobr><a href="(.*?)">(.*?)</a></nobr>').findall(data)
  return episodes


def search(query):
  torrents = []
  url = "http://thepiratebay.org/search.php?video=on&q=%s" % quote(query)
  page = read_url(url)
  soup = BeautifulSoup(page)
  for row in soup('tr'):
    torrentType = row.findAll('td', {'class': 'vertTh'})
    if torrentType:
      torrentType = torrentType[0]('a')[0].get('href').split('/')[-1]
      # 201 = Movies , 202 = Movie DVDR
      if torrentType in ['201']:
        torrent = row.findAll('a', {'href':re.compile('.torrent$')})[0].get('href')
        torrents.append(torrent)
  return torrents


def searchByImdb(imdb):
  return search("tt" + imdb)
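For orientation, a minimal usage sketch of the module above (not part of the commit; the query, show name and episode tag are placeholder values, and the scraper only matches the thepiratebay.org markup of the time):

# hypothetical usage of scrapeit.thepiratebay (illustration only)
from scrapeit import thepiratebay

movie_links = thepiratebay.search('some movie title')        # .torrent URLs from the Movies category
show_id = thepiratebay.shows('Some Show')                    # resolve a show name to its /tv/ id
episode_link = thepiratebay.get_episode(show_id, 'S01E01')   # best matching .torrent URL, or u''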
18
scrapeit/torrent.py
Normal file
@@ -0,0 +1,18 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import mininova
import btjunkie
import thepiratebay


def search(query):
  '''meta function to search with the best known torrent search engine
  '''
  return btjunkie.search(query)


def searchByImdb(imdb):
  '''meta function to search by imdb with the best known torrent search engine
  '''
  return mininova.searchByImdb(imdb)
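A short usage sketch for these meta functions (illustrative only; the query string and IMDb id are placeholders):

# hypothetical usage of scrapeit.torrent (illustration only)
from scrapeit import torrent

links = torrent.search('some movie title')      # currently delegates to btjunkie.search
links_by_id = torrent.searchByImdb('0012345')   # currently delegates to mininova.searchByImdb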
34
scrapeit/tvcom.py
Normal file
@@ -0,0 +1,34 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

import re

from BeautifulSoup import BeautifulSoup

from utils import read_url_utf8, stripTags


def getEpisodeData(url):
  ''' parses information on tvcom episode pages
  returns dict with title, show, description, score
  '''
  tvcom = {
    'description': u''
  }
  data = read_url_utf8(url).replace('\n',' ')
  regexp = r'''<div id="main-col">.*?<div>(.*?)<div class="ta-r mt-10 f-bold">'''
  reg = re.compile(regexp, re.IGNORECASE)
  m = reg.findall(data)
  for match in m:
    description = match.strip()
    description = stripTags(description).replace('Watch Video','')
    tvcom['description'] = description.strip()
  soup = BeautifulSoup(data)
  #optional data
  try:
    tvcom['show'] = soup('h1')[0].contents[0]
    tvcom['title'] = soup('h1')[1].contents[0]
    tvcom['score'] = soup("span", {'class':"f-28 f-bold mt-10 mb-10 f-FF9 db lh-18"})[0].contents[0]
  except:
    pass
  return tvcom
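A hedged usage sketch for getEpisodeData (the tv.com URL below is a made-up placeholder; keys other than 'description' are only present when the page parses cleanly):

# hypothetical usage of scrapeit.tvcom (illustration only)
from scrapeit import tvcom

data = tvcom.getEpisodeData('http://www.tv.com/some-show/some-episode/episode/12345/summary.html')
print data['description']
print data.get('show'), data.get('title'), data.get('score')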
219
scrapeit/tvrss.py
Executable file
@@ -0,0 +1,219 @@
#!/usr/bin/env python
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2

from os.path import *
import sys
import datetime
import time
import re
from urllib2 import urlopen
import Image
import StringIO

import feedparser

from utils import read_url

# NOTE: this revision also references turbogears, sqlobject (AND, SQLObjectNotFound)
# and the btvcr project's model, IMDb, metacritic, tvcom, log and display_item
# objects without importing them here.


hr_hdtv = re.compile('HR HDTV')
hdtv = re.compile('HDTV')


def get_url(title):
  return title.replace(' ','_').replace('/', '_').lower()


def get_show(string):
  return string.split(';')[0].split(':')[1].strip()


def get_title(string):
  title = string.split(';')[1].split(':')[1].strip()
  if title != 'n/a':
    return title
  return ''


def get_season(string):
  try:
    season = int(string.split(';')[2].split(':')[1].strip())
  except:
    return None
  return season


def get_episode(string):
  try:
    episode = int(string.split(';')[3].split(':')[1].strip())
  except:
    return None
  return episode


def get_episodedate(string):
  s = string.split('Episode Date:')
  if len(s) == 2:
    return s[1].strip()
  return None


def choose_item(old, new):
  if old['link'] == new['link']:
    return False
  if not hdtv.search(old['title']):
    if hdtv.search(new['title']):
      display_item(new)
      log.debug("vs.")
      display_item(old)
      return True
  return False


def get_imdbdata(imdbid):
  thumbnail = None
  description = ''
  imdb = IMDb.parse(imdbid)
  if imdb:
    poster = imdb['poster']
    if poster != 'http://i.imdb.com/Heads/npa.gif':
      log.debug("getting poster %s" % poster)
      try:
        thumbnail = read_url(poster)
        im = Image.open(StringIO.StringIO(thumbnail))
        out = StringIO.StringIO()
        im.crop((0,0,100,100)).convert().save(out, 'JPEG')
        thumbnail = out.getvalue()
      except:
        thumbnail = None
    if imdb['summary']:
      description = imdb['summary']
    else:
      description = imdb['tagline']
    return (imdb, description, thumbnail)
  else:
    return (imdb, '', None)


def load():
  log.debug("getting new shows from tvrss...")
  feed = feedparser.parse('http://tvrss.net/feed/combined/')
  shows = {}
  for item in feed['entries']:
    show = get_show(item['description'])
    season = get_season(item['description'])
    episode = get_episode(item['description'])
    episodedate = get_episodedate(item['description'])
    estring = None
    if season and episode:
      estring = "S%02dE%02d" % (season, episode)
    elif episodedate:
      estring = episodedate
    if estring:
      if show and not hr_hdtv.search(item['title']):
        if shows.has_key(show):
          if shows[show].has_key(estring):
            if choose_item(shows[show][estring], item):
              shows[show][estring] = item
          else:
            shows[show][estring] = item
        else:
          shows[show] = {}
          shows[show][estring] = item
  for show in shows:
    imdb = None
    try:
      model.ShowsBlacklist.byShowUrl(get_url(show))
      log.debug("ignoring blacklisted show %s" % show)
      continue
    except:
      pass
    s = None
    try:
      s = model.Shows.byUrl(get_url(show))
    except SQLObjectNotFound:
      try:
        alias = model.ShowsAlias.byAlias(get_url(show))
        s = alias.show
      except SQLObjectNotFound:
        s = None
    if not s:
      log.debug("about to add %s" % show)
      thumbnail = None
      description = ''
      ur = '-'
      try:
        imdbid = IMDb.guess(show)
        if imdbid:
          imdb, description, thumbnail = get_imdbdata(imdbid)
          if imdb:
            ur = imdb['rating']
      except:
        import traceback
        traceback.print_exc()
        pass
      s = model.Shows(
        title = show,
        url = get_url(show),
        description = description,
        imdb = imdbid,
        imdbUserRating = ur
      )
      s.thumbnail = thumbnail
      meta = metacritic.scrapeMetacritic(s.title, s.metacriticUrl)
      if meta:
        s.metacriticUrl = meta['url']
        s.metacriticScore = "%s" % meta['score']
        for review in meta['critics']:
          model.addReview(s, review)
      model.hub.commit()
      log.debug('added %s' % show)
    for episode in shows[show]:
      episode_title = get_title(shows[show][episode]['description'])
      episode_description = ''
      episode_imdb = ''
      q = model.Episodes.select(AND(
        model.Episodes.q.showID == s.id,
        model.Episodes.q.episode == episode))
      if q.count() == 0:
        if not imdb:
          try:
            imdbid = IMDb.guess(show)
            if imdbid:
              imdb = IMDb.parse(imdbid)
          except:
            pass
        if imdb and imdb['episodes'].has_key(episode):
          episode_title = imdb['episodes'][episode]['title']
          episode_description = imdb['episodes'][episode]['description']
          episode_imdb = imdb['episodes'][episode]['imdb']
        if not episode_description or not episode_title:
          tvcom_data = tvcom.get(show, episode)
          if not episode_description:
            episode_description = tvcom_data['description']
          if not episode_title:
            episode_title = tvcom_data['title']
        e = model.Episodes(
          showID = s.id,
          title = episode_title,
          episode = episode,
          torrent = shows[show][episode]['enclosures'][0]['href'],
          description = episode_description,
          imdb = episode_imdb,
          thumbnail = None,
          pubDate = datetime.datetime.fromtimestamp(time.mktime(shows[show][episode]['updated_parsed']))
        )
        s.lastUpdate = datetime.datetime.now()
        model.hub.commit()
        log.debug("from tvrss add %s %s" % (episode, show))
  log.debug("updating tvrss done.")


if __name__ == '__main__':
  # first look on the command line for a desired config file,
  # if it's not on the command line, then
  # look for setup.py in this directory. If it's not there, this script is
  # probably installed
  if len(sys.argv) > 1:
    turbogears.update_config(configfile=sys.argv[1],
      modulename="btvcr.config")
  elif exists(join(dirname(__file__), "setup.py")):
    turbogears.update_config(configfile="dev.cfg",
      modulename="btvcr.config")
  else:
    turbogears.update_config(configfile="prod.cfg",
      modulename="btvcr.config")

  from btvcr.controllers import Root
  load()
150
scrapeit/utils.py
Normal file
@@ -0,0 +1,150 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""
scrape tools
"""

import re
import time
import urllib
import urllib2

import djangohtml


# Default headers for HTTP requests.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5)'}

# --------------------------------------------------------------------
# Functions
# --------------------------------------------------------------------

def quote_plus(s):
  """
  A variant of urllib.quote_plus which handles ASCII and Unicode.
  """
  return urllib.quote_plus(s.encode('utf-8'))


def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read str contents of given str URL.

  Here headers is a map of str -> str for HTTP request headers. If
  blocking is True, returns the str page contents. If blocking is
  False, returns an iterator which gives None until a successful read,
  at which point the str page contents is yielded.
  """
  req = urllib2.Request(url, None, headers)
  f = urllib2.urlopen(req)
  data = f.read()
  f.close()
  ctype = f.headers.getheader('content-type')
  charset = ctype.split('charset=')
  if len(charset)>1: charset = charset[1]
  else: charset = 'latin-1'
  data = unicode(data, charset)
  return data


def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read str contents of given str URL.

  Here headers is a map of str -> str for HTTP request headers. If
  blocking is True, returns the str page contents. If blocking is
  False, returns an iterator which gives None until a successful read,
  at which point the str page contents is yielded.
  """
  req = urllib2.Request(url, None, headers)
  f = urllib2.urlopen(req)
  data = f.read()
  f.close()
  return data


def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  opens given str URL and returns the url after redirection.
  """
  rurl = url
  try:
    req = urllib2.Request(url, None, headers)
    rurl = urllib2.urlopen(req).url
    rurl = rurl.replace('&src=rss', '')
  except:
    rurl = url
  return rurl


def fix_url(url):
  """
  Given url str, trim redirect stuff and return actual URL.

  Currently this just returns the URL unmodified.
  """
  # if url.lower().find('http%3a//') > 0:
  #   return 'http://' + url[url.lower().rindex('http%3a//')+9:]
  # if url.find('http://') > 0:
  #   return url[url.rindex('http://'):]
  return url


_html_entity_re = re.compile(r'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(?:([0-9]+)|[xX]([0-9a-fA-F]+)));?')
import htmlentitydefs

def html_entity_decode(s, encoding = 'utf-8'):
  r = []
  p = 0
  mo = _html_entity_re.search(s, p)
  while mo:
    r.append(s[p:mo.start()].decode(encoding))
    i = mo.lastindex
    e = mo.group(i)
    try:
      if i == 1:
        c = htmlentitydefs.name2codepoint[e]
      elif i == 2:
        c = int(e)
      elif i == 3:
        c = int(e, 16)
      else:
        assert 0
      r.append(unichr(c))
    except KeyError:
      r.append(mo.group(0))

    p = mo.end()
    mo = _html_entity_re.search(s, p)
  r.append(s[p:].decode(encoding))
  return u''.join(r)


def stripTags(s):
  return djangohtml.strip_tags(htmldecode(s))


from htmlentitydefs import name2codepoint

# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')

def htmldecode(text):
  """Decode HTML entities in the given text."""
  if type(text) != unicode:
    text = unicode(text)
  if type(text) is unicode:
    uchr = unichr
  else:
    uchr = lambda value: value > 255 and unichr(value) or chr(value)
  def entitydecode(match, uchr=uchr):
    entity = match.group(1)
    if entity.startswith('#x'):
      return uchr(int(entity[2:], 16))
    elif entity.startswith('#'):
      return uchr(int(entity[1:]))
    elif entity in name2codepoint:
      return uchr(name2codepoint[entity])
    else:
      return match.group(0)
  return charrefpat.sub(entitydecode, text)
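For reference, a small sketch of how the helpers in utils.py are typically combined by the scrapers above (the example URL and query are placeholders):

# hypothetical usage of scrapeit.utils helpers (illustration only)
from scrapeit.utils import read_url, read_url_utf8, stripTags, quote_plus

html = read_url('http://example.com/search?q=%s' % quote_plus(u'some query'))  # raw str body
text = stripTags(html)                        # entity-decoded text with HTML tags removed
page = read_url_utf8('http://example.com/')   # unicode body, decoded via the content-type charset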
31
setup.py
Normal file
@@ -0,0 +1,31 @@
#!/usr/bin/env python
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
# encoding: utf-8
from setuptools import setup, find_packages

import os

setup(
  name="scrapeit",
  version="0.1",

  # uncomment the following lines if you fill them out in release.py
  description="collection of scrapers for various websites",
  author="bot",
  author_email="bot@mailb.org",
  #url=url,
  #download_url=download_url,
  #license=license,
  packages=find_packages(),
  zip_safe=False,
  keywords = [
  ],
  classifiers = [
    'Development Status :: 3 - Alpha',
    'Operating System :: OS Independent',
    'Programming Language :: Python',
    'Topic :: Software Development :: Libraries :: Python Modules',
  ],
)
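Since this is a standard setuptools setup.py, the package can be installed in the usual way, for example:

# install the scrapeit package (run from the repository root)
python setup.py install
# or, for development:
python setup.py develop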