add scrapeit

j 2007-03-01 15:11:35 +00:00
commit ca2a42e773
18 changed files with 1864 additions and 0 deletions

14
scrapeit/__init__.py Normal file

@ -0,0 +1,14 @@
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
# encoding: utf-8
import btjunkie
import google
import imdb
import mininova
import thepiratebay
import torrent
import rottentomatoes
__version__ = '1.0.0'

32
scrapeit/btjunkie.py Normal file

@ -0,0 +1,32 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from urllib import quote
import re
from BeautifulSoup import BeautifulSoup
from utils import read_url, stripTags
from btutils import torrentsWeLike
def search(query):
'''search for torrents on btjunkie
'''
url = "http://btjunkie.org/search?q=%s&c=6&t=0&o=52&m=0&l=1" % quote(query)
page = read_url(url)
soup = BeautifulSoup(page)
torrents = soup.findAll('a', {'class': 'BlckUnd'})
torrents = filter(torrentsWeLike, torrents)
torrent_links = []
for t in torrents:
tlink = "http://btjunkie.org%s.torrent" % t.attrMap['href']
tlink = tlink.replace('do=stat', 'do=download')
torrent_links.append(tlink)
return torrent_links
def searchByImdb(imdb):
'''search for torrents by imdb, not supported on btjunkie right now
'''
return []
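
A minimal usage sketch for this module, assuming the 2007-era btjunkie.org markup, network access and BeautifulSoup installed; the query is only an example:

from scrapeit import btjunkie

# search() returns a list of .torrent download URLs, pre-filtered by btutils.torrentsWeLike
for link in btjunkie.search('night of the living dead'):
  print link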

25
scrapeit/btutils.py Normal file

@ -0,0 +1,25 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from utils import stripTags
def torrentsWeLike(link):
'''check if torrent title looks like something we want to see,
dvdrip / no cam / no dubbed versions
'''
text = stripTags(unicode(link)).lower()
#no cams / telesyncs or other stuff
for word in ('cam', 'telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'vcd', 'ts-screener'):
if word in text:
return False
#no dubbed versions
for word in ('italian', 'german', 'spanish', 'french'):
if word in text:
return False
#only dvdrips or dvdscrs
for word in ('dvdrip', 'dvdscr', 'dvd screener'):
if word in text:
return True
return False
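
A small offline sketch of how torrentsWeLike() filters anchor tags (the sample titles are made up):

from BeautifulSoup import BeautifulSoup
from scrapeit.btutils import torrentsWeLike

links = BeautifulSoup('<a href="/t/1">Some.Movie.2006.DVDRip.XviD</a>'
                      '<a href="/t/2">Some.Movie.2006.CAM.XviD</a>')('a')
print [torrentsWeLike(link) for link in links]  # [True, False]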

115
scrapeit/djangohtml.py Normal file

@ -0,0 +1,115 @@
"HTML utilities suitable for global use."
import re, string
# Configuration for urlize() function
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
# list of possible strings used for bullets in bulleted lists
DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
word_split_re = re.compile(r'(\s+)')
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
del x # Temporary variable
def escape(html):
"Returns the given HTML with ampersands, quotes and carets encoded"
if not isinstance(html, basestring):
html = str(html)
return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')
def linebreaks(value):
"Converts newlines into <p> and <br />s"
value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
paras = re.split('\n{2,}', value)
paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
return '\n\n'.join(paras)
def strip_tags(value):
"Returns the given HTML with all tags stripped"
return re.sub(r'<[^>]*?>', '', value)
def strip_spaces_between_tags(value):
"Returns the given HTML with spaces between tags normalized to a single space"
return re.sub(r'>\s+<', '> <', value)
def strip_entities(value):
"Returns the given HTML with all entities (&something;) stripped"
return re.sub(r'&(?:\w+|#\d);', '', value)
def fix_ampersands(value):
"Returns the given HTML with all unencoded ampersands encoded correctly"
return unencoded_ampersands_re.sub('&amp;', value)
def urlize(text, trim_url_limit=None, nofollow=False):
"""
Converts any URLs in text into clickable links. Works on http://, https:// and
www. links. Links can have trailing punctuation (periods, commas, close-parens)
and leading punctuation (opening parens) and it'll still do the right thing.
If trim_url_limit is not None, the URLs in link text will be limited to
trim_url_limit characters.
If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
"""
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
words = word_split_re.split(text)
nofollow_attr = nofollow and ' rel="nofollow"' or ''
for i, word in enumerate(words):
match = punctuation_re.match(word)
if match:
lead, middle, trail = match.groups()
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
len(middle) > 0 and middle[0] in string.letters + string.digits and \
(middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
                middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if middle.startswith('http://') or middle.startswith('https://'):
middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if '@' in middle and not middle.startswith('www.') and not ':' in middle \
and simple_email_re.match(middle):
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
if lead + middle + trail != word:
words[i] = lead + middle + trail
return ''.join(words)
def clean_html(text):
"""
Cleans the given HTML. Specifically, it does the following:
* Converts <b> and <i> to <strong> and <em>.
* Encodes all ampersands correctly.
* Removes all "target" attributes from <a> tags.
* Removes extraneous HTML, such as presentational tags that open and
immediately close and <br clear="all">.
* Converts hard-coded bullets into HTML unordered lists.
* Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
bottom of the text.
"""
from djangotext import normalize_newlines
text = normalize_newlines(text)
text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
text = fix_ampersands(text)
# Remove all target="" attributes from <a> tags.
text = link_target_attribute_re.sub('\\1', text)
# Trim stupid HTML such as <br clear="all">.
text = html_gunk_re.sub('', text)
# Convert hard-coded bullets into HTML unordered lists.
def replace_p_tags(match):
s = match.group().replace('</p>', '</li>')
for d in DOTS:
s = s.replace('<p>%s' % d, '<li>')
return '<ul>\n%s\n</ul>' % s
text = hard_coded_bullets_re.sub(replace_p_tags, text)
# Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom of the text.
text = trailing_empty_content_re.sub('', text)
return text
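
These helpers come from Django (hence the module name); a few illustrative calls, with expected output in the comments:

from scrapeit.djangohtml import escape, strip_tags, linebreaks, urlize

print escape('5 > 3 & "quotes"')                  # 5 &gt; 3 &amp; &quot;quotes&quot;
print strip_tags('<p>some <b>bold</b> text</p>')  # some bold text
print linebreaks('first paragraph\n\nsecond\nline')
print urlize('docs at example.com', nofollow=True)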

111
scrapeit/djangotext.py Normal file

@ -0,0 +1,111 @@
import re
# Capitalizes the first letter of a string.
capfirst = lambda x: x and x[0].upper() + x[1:]
def wrap(text, width):
"""
A word-wrap function that preserves existing line breaks and most spaces in
the text. Expects that existing line breaks are posix newlines (\n).
See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
"""
return reduce(lambda line, word, width=width: '%s%s%s' %
(line,
' \n'[(len(line[line.rfind('\n')+1:])
+ len(word.split('\n',1)[0]
) >= width)],
word),
text.split(' ')
)
def truncate_words(s, num):
"Truncates a string after a certain number of words."
length = int(num)
words = s.split()
if len(words) > length:
words = words[:length]
if not words[-1].endswith('...'):
words.append('...')
return ' '.join(words)
def get_valid_filename(s):
"""
Returns the given string converted to a string that can be used for a clean
filename. Specifically, leading and trailing spaces are removed; other
spaces are converted to underscores; and all non-filename-safe characters
are removed.
>>> get_valid_filename("john's portrait in 2004.jpg")
'johns_portrait_in_2004.jpg'
"""
s = s.strip().replace(' ', '_')
return re.sub(r'[^-A-Za-z0-9_.]', '', s)
def get_text_list(list_, last_word='or'):
"""
>>> get_text_list(['a', 'b', 'c', 'd'])
'a, b, c or d'
>>> get_text_list(['a', 'b', 'c'], 'and')
'a, b and c'
>>> get_text_list(['a', 'b'], 'and')
'a and b'
>>> get_text_list(['a'])
'a'
>>> get_text_list([])
''
"""
if len(list_) == 0: return ''
if len(list_) == 1: return list_[0]
return '%s %s %s' % (', '.join([str(i) for i in list_][:-1]), last_word, list_[-1])
def normalize_newlines(text):
return re.sub(r'\r\n|\r|\n', '\n', text)
def recapitalize(text):
"Recapitalizes text, placing caps after end-of-sentence punctuation."
# capwords = ()
text = text.lower()
capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
text = capsRE.sub(lambda x: x.group(1).upper(), text)
# for capword in capwords:
# capwordRE = re.compile(r'\b%s\b' % capword, re.I)
# text = capwordRE.sub(capword, text)
return text
def phone2numeric(phone):
"Converts a phone number with letters into its numeric equivalent."
letters = re.compile(r'[A-PR-Y]', re.I)
char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
'y': '9', 'x': '9'}.get(m.group(0).lower())
return letters.sub(char2number, phone)
# From http://www.xhaus.com/alan/python/httpcomp.html#gzip
# Used with permission.
def compress_string(s):
import cStringIO, gzip
zbuf = cStringIO.StringIO()
zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
zfile.write(s)
zfile.close()
return zbuf.getvalue()
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smart_split(text):
"""
Generator that splits a string by spaces, leaving quoted phrases together.
Supports both single and double quotes, and supports escaping quotes with
backslashes. In the output, strings will keep their initial and trailing
quote marks.
>>> list(smart_split('This is "a person\'s" test.'))
['This', 'is', '"a person\'s"', 'test.']
"""
for bit in smart_split_re.finditer(text):
bit = bit.group(0)
if bit[0] == '"':
yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
elif bit[0] == "'":
yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
else:
yield bit
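
A few of these text helpers in action, with expected output in the comments:

from scrapeit.djangotext import truncate_words, get_valid_filename, phone2numeric, smart_split

print truncate_words('The quick brown fox jumps over the lazy dog', 4)  # The quick brown fox ...
print get_valid_filename("john's portrait in 2004.jpg")                 # johns_portrait_in_2004.jpg
print phone2numeric('1-800-FLOWERS')                                    # 1-800-3569377
print list(smart_split('This is "a quoted phrase" here'))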

68
scrapeit/epguides.py Normal file

@ -0,0 +1,68 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
from BeautifulSoup import BeautifulSoup
from google import google
from utils import read_url, read_url_utf8, stripTags
import tvcom
import imdb
def epguidesUrl(title):
'''
Search Epguide Url for Show via Show Title.
Use Google to search the url, this is also done on Epguide.
'''
for (name, url, desc) in google('allintitle: site:epguides.com %s' % title, 1):
if url.startswith('http://epguides.com'):
if re.search(title, name):
return url
return None
def getShowImdb(title):
imdbid = None
url = epguidesUrl(title)
if url:
data = read_url(url)
soup = BeautifulSoup(data)
links = soup('a', {'href': re.compile('imdb.com/title/tt')})
if links:
link = links[0].get('href')
imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(link)[0])
if not imdbid:
imdbid = imdb.guess(title)
return imdbid
def getEpisodeData(title, episode, show_url = None):
'''
Collect information about an episode.
Returns dict with title, show, description and episode
'''
episodeData = {
'title': u'',
'show': title,
'description': u'',
'episode': episode,
}
description = u''
data = u''
if not show_url:
show_url = epguidesUrl(title)
if show_url:
data = read_url_utf8(show_url)
else:
return imdb.getEpisodeData(title, episode)
estring = u'' +episode.replace('S','').replace('E','-').replace('0',' ').strip()
for line in data.split('\n'):
a = line.split(estring)
if len(a) == 2:
soup = BeautifulSoup(line)
episodeData['title'] = soup('a')[0].contents[0]
tvcom_url = soup('a')[0].get('href')
episodeData['description'] = tvcom.getEpisodeData(tvcom_url)['description']
break
return episodeData
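
Usage sketch; it depends on live Google results and the epguides.com / tv.com markup of the time, so the show name is only an example:

from scrapeit import epguides

print epguides.epguidesUrl('Seinfeld')   # epguides page found via Google, or None
print epguides.getShowImdb('Seinfeld')   # 7-digit IMDb id scraped from that page
print epguides.getEpisodeData('Seinfeld', 'S01E01')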

375
scrapeit/google.py Normal file

@ -0,0 +1,375 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""
Query Web search engines.
This module works by filtering the HTML returned by the search engine and thus tends to break when
search engines modify their HTML output.
Public domain, Connelly Barnes 2005-2007. Compatible with Python 2.3-2.5.
See L{examples} for a quick start. See L{description} for the full
explanation, precautions, and legal disclaimers.
"""
import re
import time
import urllib
import urllib2
import weakref
import threading
import Queue
from utils import read_url
__version__ = '1.0.2'
# Default headers for HTTP requests.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5)'}
# Default maximum number of results.
DEFAULT_MAX_RESULTS = 10
# Function names for supported search engines.
SEARCH_ENGINES = ['ask', 'dmoz', 'excite', 'google', 'msn', 'yahoo']
__all__ = SEARCH_ENGINES + ['examples', 'description']
# --------------------------------------------------------------------
# Functions
# --------------------------------------------------------------------
def quote_plus(s):
"""
A variant of urllib.quote_plus which handles ASCII and Unicode.
"""
return urllib.quote_plus(s.encode('utf-8'))
def fix_url(url):
"""
Given url str, trim redirect stuff and return actual URL.
Currently this just returns the URL unmodified.
"""
# if url.lower().find('http%3a//') > 0:
# return 'http://' + url[url.lower().rindex('http%3a//')+9:]
# if url.find('http://') > 0:
# return url[url.rindex('http://'):]
return url
def get_search_page_links(page, results_per_page, begin, end, link_re):
"""
Given str contents of search result page, return list of links.
Returns list of (name, url, desc) str tuples. See make_searcher()
for a description of results_per_page and link_re.
"""
if begin is not None and begin in page:
page = page[page.index(begin):]
if end is not None and end in page:
page = page[:page.index(end)]
ans = []
for match in re.compile(link_re, re.DOTALL).finditer(page):
(name, url, desc) = match.group('name', 'url', 'desc')
url = fix_url(url)
ans += [(html_to_text(name), url, html_to_text(desc))]
return ans
def html_to_text(s):
"""
Given an HTML formatted str, convert it to a text str.
"""
s = re.sub(r'<.*?>', '', s)
s = s.replace('\r', ' ')
s = s.replace('\n', ' ')
s = s.replace('\t', ' ')
s = s.replace('&amp;', '&')
s = s.replace('&lt;', '<')
s = s.replace('&gt;', '>')
s = s.replace('&quot;', '"')
s = s.replace('&middot;', '\xb7')
for i in range(256):
s = s.replace('&#%d;' % i, chr(i))
  while s.replace('  ', ' ') != s:
    s = s.replace('  ', ' ')
return s.strip()
def nonblocking(f, blocking_return=None, sleep_time=0.01):
"""
Wrap a callable which returns an iter so that it no longer blocks.
The wrapped iterator returns blocking_return while callable f is
blocking. The callable f is called in a background thread. If the
wrapped iterator is deleted, then the iterator returned by f is
deleted also and the background thread is terminated.
"""
def g(*args, **kwargs):
f_iter = f(*args, **kwargs)
g_iter = None
def run():
while True:
g_obj = g_iter()
if g_obj is None:
return
if g_obj.q.qsize() == 0:
try:
f_next = f_iter.next()
except Exception, e:
g_obj.exc = e
return
g_obj.q.put(f_next)
else:
del g_obj
time.sleep(sleep_time)
class Iter:
def __init__(self):
self.q = Queue.Queue()
self.exc = None
self.thread = threading.Thread(target=run)
self.thread.setDaemon(True)
def next(self):
if self.exc is not None:
raise self.exc
try:
return self.q.get_nowait()
except Queue.Empty:
return blocking_return
def __iter__(self):
return self
obj = Iter()
g_iter = weakref.ref(obj)
obj.thread.start()
try:
return obj
finally:
del obj
return g
def make_searcher(query_url, results_per_page, page_url, page_mode,
begin, end, link_re):
"""
Return a search function for the given search engine.
Here query_url is the URL for the initial search, with %(q)s for
the query string, results_per_page is the number of search results
per page, page_url is the URL for the 2nd and subsequent pages of
search results, with %(q)s for the query string and %(n)s for the
page "number." Here page_mode controls the actual value for the
page "number:"
- page_mode='page0': Use 0-based index of the page.
- page_mode='page1': Use 1-based index of the page.
- page_mode='offset0': Use 0-based index of the search result,
which is a multiple of results_per_page.
- page_mode='offset1': Use 1-based index of the search result
(one plus a multiple of results_per_page).
If begin is not None, then only text after the first occurrence of
begin will be used in the search results page. If end is not None,
then only text before the first occurrence of end will be used.
Finally, link_re is a regex string (see module re) which matches
three named groups: 'name', 'url', and 'desc'. These correspond to
the name, URL and description of each search result. The regex is
applied in re.DOTALL mode.
Returns a search() function which has the same interface as
described in the module docstring.
"""
def search_blocking(query, max_results):
last_links = None
page_num = 0
# done = False
q = Queue.Queue()
for i in range(max_results):
if q.qsize() == 0:
if page_num == 0:
page = read_url(query_url % {'q': quote_plus(query)})
else:
# if done:
# break
if page_mode == 'page0':
n = page_num
elif page_mode == 'page1':
n = page_num + 1
elif page_mode == 'offset0':
n = page_num * results_per_page
elif page_mode == 'offset1':
n = page_num * results_per_page + 1
else:
raise ValueError('unknown page mode')
page = read_url(page_url % {'n': n, 'q': quote_plus(query)})
page_num += 1
links = get_search_page_links(page, results_per_page, begin, end, link_re)
if len(links) == 0 or links == last_links:
break
# if len(links) < results_per_page:
# done = True
last_links = links
for link in links:
q.put(link)
yield q.get()
search_nonblocking = nonblocking(search_blocking)
def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):
"""
See docstring for web_search module.
"""
if blocking:
return search_blocking(query, max_results)
else:
return search_nonblocking(query, max_results)
return search
def examples():
"""
Examples of the web_search module.
Example 1:
>>> from web_search import google
>>> for (name, url, desc) in google('python', 20):
... print name, url
...
(First 20 results for Google search of "python").
Example 2:
>>> from web_search import dmoz
>>> list(dmoz('abc', 10))
[('ABC.com', 'http://www.abc.com', "What's on ABC..."), ...]
"""
print examples.__doc__
def description():
"""
Full explanation and precautions for web_search module.
The search functions in this module follow a common interface::
search(query, max_results=10, blocking=True) =>
iterator of (name, url, description) search results.
Here query is the query string, max_results gives the maximum number
of search results, and the items in the returned iterator are string
3-tuples containing the Website name, URL, and description for each
search result.
If blocking=False, then an iterator is returned which does not block
execution: the iterator yields None when the next search result is
not yet available (a background thread is created).
Supported search engines are 'ask', 'dmoz', 'excite', 'google', 'msn',
'yahoo'. This module is not associated with or endorsed by any of
these search engine corporations.
Be warned that if searches are made too frequently, or max_results is
large and you enumerate all search results, then you will be a drain
on the search engine's bandwidth, and the search engine organization
may respond by banning your IP address or IP address range.
This software has been placed in the public domain with the
following legal notice::
http://oregonstate.edu/~barnesc/documents/public_domain.txt
"""
print description.__doc__
# --------------------------------------------------------------------
# Search engines
# --------------------------------------------------------------------
ask = make_searcher('http://www.ask.com/web?q=%(q)s', 10,
'http://www.ask.com/web?page=%(n)d&q=%(q)s', 'page1',
None, None,
r'<a .*? class="L4" href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
r'.*?</div>(?P<desc>.*?)</div>')
dmoz = make_searcher('http://search.dmoz.org/cgi-bin/search?search=%(q)s', 20,
'http://search.dmoz.org/cgi-bin/search?start=%(n)d&search=%(q)s', 'offset1',
None, None,
r'<li><a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
r'.*? - (?P<desc>.*?)<br>')
excite = make_searcher('http://msxml.excite.com/info.xcite/search/web/%(q)s', 20,
'http://msxml.excite.com/info.xcite/search/web/%(q)s/%(n)d', 'offset1',
None, None,
r'<div class="listingmain" style=""><a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
r'(?P<desc>.*?)</span>')
google = make_searcher('http://www.google.com/search?q=%(q)s', 10,
'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0',
None, None,
r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' +
r'.*?(?:<br>|<table.*?>)' +
r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)')
msn = make_searcher('http://search.msn.com/results.aspx?q=%(q)s', 10,
'http://search.msn.com/results.aspx?q=%(q)s&first=%(n)d', 'offset1',
'<h2>Results</h2>', '<div id="ads_rightC">',
r'<h3><a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
r'(?P<desc>.*?)<li class="first">')
yahoo = make_searcher('http://search.yahoo.com/search?p=%(q)s', 10,
'http://search.yahoo.com/search?p=%(q)s&b=%(n)d', 'offset1',
None, None,
'<li><div><a class=yschttl.*?href="(?P<url>.*?)".*?>(?P<name>.*?)</a>' +
r'.*?<div class=yschabstr>(?P<desc>.*?)</div>')
# --------------------------------------------------------------------
# Unit tests
# --------------------------------------------------------------------
def test_engine(search):
"""
Test a search engine function returned by make_searcher().
"""
for query in ['abc', 'microsoft', 'love', 'pweropieiw', 'addfdae']:
popular = query in ['abc', 'microsoft', 'love', 'run']
for n in [6, 17, 31]:
n1 = len(list(search(query, n)))
if popular:
assert n1 == n
else:
assert n1 <= n
n2 = 0
for item in search(query, n, False):
if item is not None:
n2 += 1
else:
time.sleep(0.01)
if popular:
assert n2 == n
else:
assert n2 <= n
def test():
"""
Unit test main routine.
"""
import inspect
print 'Testing:'
for name in SEARCH_ENGINES:
print ' ' + (name + ':').ljust(20),
test_engine(getattr(inspect.getmodule(test), name))
print 'OK'
if __name__ == '__main__':
test()
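
The functions built by make_searcher() share one interface; a short, network-dependent sketch:

from scrapeit.google import google

# iterator of (name, url, desc) tuples; pass blocking=False to poll a background thread instead
for (name, url, desc) in google('python', 5):
  print name, url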

34
scrapeit/googlemovie.py Normal file

@ -0,0 +1,34 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
from urllib import quote
from BeautifulSoup import BeautifulSoup
from utils import read_url, read_url_utf8, stripTags
def getGoogleMovieId(title):
url = 'http://google.com/movies?q=%s&btnG=Search+Movies' % quote(title)
data = read_url(url)
cids = re.compile('reviews\?cid=(.*?)&').findall(data)
if cids:
return cids[0]
return ''
def getGoogleMovieData(title, year = None, cid = None):
gdata = {
'title': title,
'year': year,
'cid': cid,
'rating': '',
}
if not cid:
cid = getGoogleMovieId("%s (%s)" % (title, year))
if cid:
gdata['cid'] = cid
data = read_url('http://www.google.com/movies/reviews?cid=%s' % cid)
gdata['rating'] = re.compile('font size=.3><b><nobr>(.*?)&nbsp;/&nbsp;5').findall(data)[0]
gdata['reviews'] = re.compile('Based on (.*?) reviews').findall(data)[0]
gdata['year'] = re.compile("<title>.*?\((.*?)\).*?</title").findall(data)[0]
return gdata
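
Sketch of the intended calls; the regexps target the 2007 Google Movies pages and the title/year are only examples:

from scrapeit import googlemovie

print googlemovie.getGoogleMovieId('The Prestige (2006)')
print googlemovie.getGoogleMovieData('The Prestige', 2006)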

441
scrapeit/imdb.py Normal file

@ -0,0 +1,441 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import urllib2
from urllib import quote
import re, time
import os
from elementtree.ElementTree import parse, tostring
from BeautifulSoup import BeautifulSoup
from google import google
from utils import stripTags, read_url_utf8, htmldecode
import utils
def read_url(url):
base = "/var/cache/scrapeit/cache/"
path = os.path.join(base, url.replace('http://',''))
if path.endswith('/'):
path = "%sindex.html" % path
if os.path.isdir(path):
path = "%s/index.html" % path
if os.path.exists(path):
f = open(path)
data = f.read()
f.close()
return data
else:
data = utils.read_url(url)
folder = os.path.dirname(path)
if not os.path.exists(folder):
os.makedirs(folder)
f = open(path, 'w')
f.write(data)
f.close()
return data
def _get_data(url):
data = None
try:
data = read_url(url)
except:
print "error reading data from", url
return data
def get_image(url):
return read_url(url)
def _castList(data, regexp):
soup = re.compile(regexp).findall(data)
if soup:
soup = BeautifulSoup(soup[0])
names = []
for i in soup('a', {'href': re.compile('/name/nm')}):
if i.string:
cast = stripTags(i.string)
if cast not in names:
names.append(cast)
return names
return []
def _getTerm(data, regexp):
term = ''
try:
reg = re.compile(regexp, re.IGNORECASE)
m = reg.search(data)
if m:
term = stripTags(m.group(1)).strip()
except:
print "waring, parsing failed for", regexp
return term.encode('utf8')
class IMDb:
def __init__(self, imdb):
self.imdb = imdb
self.pageSource = None
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
self.businessSource = None
self.businessUrl = "%sbusiness" % self.pageUrl
self.connectionsSource = None
self.connectionsUrl = "%smovieconnections" % self.pageUrl
self.creditsSource = None
self.creditsUrl = "%sfullcredits" % self.pageUrl
self.episodesSource = None
self.episodesUrl = "%sepisodes" % self.pageUrl
self.keywordSource = None
self.keywordUrl = "%skeywords" % self.pageUrl
self.plotSource = None
self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoSource = None
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
self.triviaSource = None
self.triviaUrl = "%strivia" % self.pageUrl
def getPage(self, forcereload = False):
if forcereload or not self.pageSource:
self.pageSource = read_url(self.pageUrl)
return self.pageSource
def parse_raw_value(self, key, value):
if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
value = stripTags(value).strip()
if key == 'runtime':
parsed_value = _getTerm(value, '(.*?) min')
parsed_value = _getTerm(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = _getTerm(value, '(.*?) sec')
parsed_value = _getTerm(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = 0
else:
parsed_value = int(parsed_value)
else:
parsed_value = int(parsed_value) * 60
elif key in ('country', 'language'):
parsed_value = value.split(' / ')
elif key == 'genre':
parsed_value = value.replace('more', '').strip().split(' / ')
elif key == 'tagline':
parsed_value = value.replace('more', '').strip()
elif key == 'plot_outline':
parsed_value = value.replace('(view trailer)', '').strip()
if parsed_value.endswith('more'):
parsed_value = parsed_value[:-4].strip()
elif key == 'tv_series':
m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
if m:
parsed_value = m[0][0]
else:
parsed_value = ''
else:
print value
parsed_value = value
return parsed_value
def parse(self):
data = self.getPage()
IMDbDict ={}
#Poster
IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year
title = u''
year = u''
flat_data = data.replace('\n', '').replace('\r', '')
html_title = re.compile('<strong class="title">(.*?) <small>\(<a href="/Sections/Years/(.*?)">').findall(flat_data)
if html_title:
title = html_title[0][0]
IMDbDict['year'] = html_title[0][1]
IMDbDict['title'] = stripTags(title).strip()
else:
title = _getTerm(data, '<title>(.*?)</title>').split('(')
year = title[-1].split(')')[0].strip()
title = title[0].strip().decode('utf-8')
IMDbDict['title'] = title
IMDbDict['year'] = year
IMDbDict['title'] = htmldecode(IMDbDict['title'])
if IMDbDict['title'][0] == '"' and IMDbDict['title'][-1] == '"':
IMDbDict['title'] = IMDbDict['title'][1:-1]
#Votes
m = re.compile('<b>(.*?)/10</b> \(<a href="ratings">(.*?) votes</a>\)', re.IGNORECASE).search(data)
if m:
IMDbDict['rating'] = int(float(m.group(1)) * 1000)
IMDbDict['votes'] = int(m.group(2).replace(',', ''))
else:
IMDbDict['rating'] = -1
IMDbDict['votes'] = -1
data = data.replace('\n',' ')
#some values
keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series')
for key in keys:
IMDbDict[key] = ''
IMDbDict['runtime'] = 0
soup = BeautifulSoup(data)
for info in soup('div', {'class': 'info'}):
key = str(info).split('</h5>')[0].split('<h5>')
if len(key) > 1:
raw_value = str(info).split('</h5>')[1]
key = key[1][:-1].lower().replace(' ', '_')
if key in keys:
IMDbDict[key] = self.parse_raw_value(key, raw_value)
#is episode
IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
IMDbDict['episodes'] = self.parseEpisodes()
IMDbDict['credits'] = self.parseCredits()
IMDbDict['plot'] = self.parsePlot()
IMDbDict['keywords'] = self.parseKeywords()
IMDbDict['trivia'] = self.parseTrivia()
IMDbDict['connections'] = self.parseConnections()
IMDbDict['release_date'] = self.parseReleaseinfo()
IMDbDict['business'] = self.parseBusiness()
self.IMDbDict = IMDbDict
return self.IMDbDict
def getCredits(self, forcereload = False):
if forcereload or not self.creditsSource:
self.creditsSource = read_url(self.creditsUrl)
return self.creditsSource
def parseCredits(self):
data = self.getCredits()
credits = {}
credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>')
credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>')
#credits['cast'] = _castList(data, 'Cast</b>.*?(<tr.*?)</table>')
credits['cast'] = []
soup = re.compile('Cast</b>.*?(<tr.*?)</table>').findall(data)
soup = BeautifulSoup(data)
cast = soup('table', {'class': 'cast'})
if cast:
cast = str(cast[0])
names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
for name in names:
real_name = name[0]
role_name = name[1]
if role_name:
role_name = role_name.split('(')[0].replace('/ ...','').strip()
credits['cast'].append((stripTags(real_name), stripTags(role_name)))
self.credits = credits
return self.credits
def getPlot(self, forcereload = False):
if forcereload or not self.plotSource:
self.plotSource = read_url(self.plotUrl)
return self.plotSource
def parsePlot(self):
soup = BeautifulSoup(self.getPlot())
plot = soup('p', {'class':'plotpar'})
if plot:
plot = str(plot[0]).split('<i>')[0]
else:
plot = u''
plot = stripTags(plot).strip()
self.plot = plot
return plot
def getEpisodes(self, forcereload = False):
if forcereload or not self.episodesSource:
self.episodesSource = read_url(self.episodesUrl)
return self.episodesSource
def parseEpisodes(self):
episodes = {}
cdata = self.getEpisodes().replace('\r\n',' ')
regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>.*?</b><br>(.*?)<br/>'''
#regexp = r'''Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></span><br>.*?<br>(.*?)</td>'''
reg = re.compile(regexp, re.IGNORECASE)
m = reg.findall(cdata)
for match in m:
try:
episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
episodes[episode] = {}
episodes[episode]['imdb'] = match[2]
episodes[episode]['title'] = match[3].strip()
description = htmldecode(match[4])
description = stripTags(description.split('Next US airings:')[0])
episodes[episode]['description'] = description
except:
import traceback
        traceback.print_exc()
pass
self.episodes = episodes
return self.episodes
def getKeywords(self, forcereload = False):
if forcereload or not self.keywordSource:
self.keywordSource = read_url(self.keywordUrl)
return self.keywordSource
def parseKeywords(self):
soup = BeautifulSoup(self.getKeywords())
keywords = []
for key in soup('a', {'href': re.compile('/keyword')}):
keywords.append(htmldecode(key.string))
self.keywords = keywords
return self.keywords
def getTrivia(self, forcereload = False):
if forcereload or not self.triviaSource:
self.triviaSource = read_url(self.triviaUrl)
return self.triviaSource
def parseTrivia(self):
trivia = []
soup = BeautifulSoup(self.getTrivia())
triviaList = []
for i in soup('ul', {'class': "trivia"}):
for t in i('li'):
t = str(t).replace('<br />', '').strip()
if t.startswith('<li>') and t.endswith('</li>'):
t = t[4:-5].strip()
trivia.append(t)
self.trivia = trivia
return self.trivia
def getConnections(self, forcereload = False):
if forcereload or not self.connectionsSource:
self.connectionsSource = read_url(self.connectionsUrl)
return self.connectionsSource
def parseConnections(self):
connections = {}
soup = BeautifulSoup(self.getConnections())
content = soup('div', {'id': 'tn15content'})[0]
blocks = str(content).split('<h5>')[1:]
for c in blocks:
connection = c.split('</h5>')[0]
cs = BeautifulSoup(c)
if connection:
#relation -> list of imdb ids
connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
return connections
def getReleaseinfo(self, forcereload = False):
if forcereload or not self.releaseinfoSource:
self.releaseinfoSource = read_url(self.releaseinfoUrl)
return self.releaseinfoSource
def parseReleaseinfo(self):
soup = BeautifulSoup(self.getReleaseinfo())
for row in soup('table',{'border': '0', 'cellpadding':'2'})[0]('tr'):
d = row('td', {'align':'right'})
if d:
try:
possible_date = stripTags(str(d[0])).strip()
rdate = time.strptime(possible_date, "%d %B %Y")
rdate = time.strftime('%Y-%m-%d', rdate)
return rdate
except:
pass
return None
def getBusiness(self, forcereload = False):
if forcereload or not self.businessSource:
self.businessSource = read_url(self.businessUrl)
return self.businessSource
def parseBusiness(self):
soup = BeautifulSoup(self.getBusiness())
business = {'budget': 0, 'gross': 0, 'profit': 0}
content = soup('div', {'id': 'tn15content'})[0]
blocks = str(content).split('<h5>')[1:]
for c in blocks:
cs = BeautifulSoup(c)
line = c.split('</h5>')
if line:
title = line[0]
line = line[1]
if title in ['Budget', 'Gross']:
values = re.compile('\$(.*?) ').findall(line)
values = [int(value.replace(',','')) for value in values]
if values:
business[title.lower()] = max(values)
if business['budget'] and business['gross']:
business['profit'] = business['gross'] - business['budget']
return business
def guess(title, director=''):
#FIXME: proper file -> title
title = title.split('-')[0]
title = title.split('(')[0]
title = title.split('.')[0]
title = title.strip()
imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
return_url = ''
  #let's try google first
#i.e. site:imdb.com Michael Stevens Sin
if director:
search = 'site:imdb.com %s "%s"' % (director, title)
else:
search = 'site:imdb.com "%s"' % title
for (name, url, desc) in google(search, 1):
if url.startswith('http://www.imdb.com/title/tt'):
return url[28:35]
req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id:
return imdb_id
imdb_url = 'http://www.imdb.com/find?q=%s;s=tt;site=aka' % quote(title.encode('utf-8'))
req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
u = urllib2.urlopen(req)
data = u.read()
return_url = u.url
u.close()
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
return None
def getEpisodeData(title, episode, show_url = None):
'''
Collect information about an episode.
Returns dict with title, show, description and episode
'''
episodeData = {
'title': u'',
'show': title,
'description': u'',
'episode': episode,
}
description = u''
if not show_url:
imdbid = guess(title)
else:
imdbid = "%07d" % int(re.compile('title/tt(\d*)').findall(link)[0])
if imdbid:
i = IMDb(imdbid).parse()
episodeData['title'] = i['episodes'][episode]['title']
episodeData['description'] = i['episodes'][episode]['description']
episodeData['imdb'] = i['episodes'][episode]['imdb']
return episodeData
if __name__ == '__main__':
import sys
#print parse(sys.argv[1])
print "imdb:", guess(sys.argv[1])

40
scrapeit/mininova.py Normal file

@ -0,0 +1,40 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
import socket
from urllib import quote
from BeautifulSoup import BeautifulSoup
from utils import read_url, read_url_utf8
from btutils import torrentsWeLike
socket.setdefaulttimeout(10.0)
def search(query):
'''search for torrents on mininova
'''
torrents = []
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
page = read_url(url)
soup = BeautifulSoup(page)
for row in soup('tr'):
links = row('a', {'href':re.compile('/tor')})
if links and torrentsWeLike(links[0]):
torrent_url = "http://www.mininova.org%s" % links[0].get('href').replace('/tor', '/get')
torrents.append(torrent_url)
return torrents
def searchByImdb(imdb):
'''search for torrents on mininova by imdb
'''
torrents = []
page = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdb)
soup = BeautifulSoup(page)
for row in soup('tr'):
links = row('a', {'href':re.compile('/get')})
if links:
torrent_url = "http://www.mininova.org%s" % links[0].get('href')
torrents.append(torrent_url)
return torrents
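
Usage sketch (live-site dependent; the id is a placeholder for any 7-digit IMDb number without the 'tt' prefix):

from scrapeit import mininova

print mininova.search('night of the living dead')
print mininova.searchByImdb('0123456')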

37
scrapeit/rottentomatoes.py Normal file

@ -0,0 +1,37 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from urllib import quote
import re
from BeautifulSoup import BeautifulSoup
from utils import read_url
def getRottenTomatoes(rating = 70):
'''
  Get movie titles rated above 70 (or the rating passed as the first argument)
  from RottenTomatoes
'''
movies = []
offset = 0
titles = ['1']
while titles:
url = "http://www.rottentomatoes.com/movies/browser.php?movietype=1&genre=&tomatometer=&avgrating=%s&numreviews=10&mpaa=&x=40&y=5&start_index=%s" % (rating, offset)
page = read_url(url)
soup = BeautifulSoup(page)
titles = [link.contents[0] for link in soup.findAll('a', {'class': 'movie-link'})]
data = str(soup)
ratings = re.compile('<span class="bold">(.*?) %</span>').findall(data)
ratings = ratings[len(ratings)- len(titles):]
for title in titles:
movies.append({'title': title, 'rating': ratings[titles.index(title)], 'torrent': ''})
offset += 10
return movies
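
Usage sketch; it pages through the RottenTomatoes browser ten titles at a time until no titles are returned:

from scrapeit import rottentomatoes

for movie in rottentomatoes.getRottenTomatoes(80):
  print movie['rating'], movie['title']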

16
scrapeit/scrapetorrent.py Normal file

@ -0,0 +1,16 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from urllib import quote
import re
from BeautifulSoup import BeautifulSoup
def search(query):
'''search for torrents on scrapetorrent
'''
torrents = []
return torrents

104
scrapeit/thepiratebay.py Normal file

@ -0,0 +1,104 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
import socket
from urllib import quote
from BeautifulSoup import BeautifulSoup
from google import google
from utils import read_url, read_url_utf8
socket.setdefaulttimeout(10.0)
season_episode = re.compile("S..E..", re.IGNORECASE)
def shows(name = None):
data = read_url_utf8('http://thepiratebay.org/tv/all')
shows = re.compile('<dt><a href="/tv/(.*?)/">(.*?)</a></dt>').findall(data)
if not name:
return shows
for show in shows:
id = show[0]
if name == show[1]:
return id
return ''
def findMatch(data, reg):
m = re.compile(reg).findall(data)
if m:
return m[0]
return u''
def get_info(url):
url = url.strip()
if url.startswith('/'):
url = 'http://thepiratebay.org' + url
data = read_url(url)
line = data.replace('\n', ' ')
info = {}
info['torrent'] = findMatch(data, '(http://.*?.torrent)"')
info['files'] = findMatch(data, '<dd><a href="/details.php.id=.*?&amp;fl#show">(.*?)</a></dd>')
try:
info['files'] = int(info['files'])
except:
info['files'] = 0
info['spoken_language'] = findMatch(line, '<dt>Spoken language\(s\):</dt>.*?<dd>(.*?)</dd>')
info['texted_language'] = findMatch(line, '<dt>Texted language\(s\):</dt>.*?<dd>(.*?)</dd>')
return info
def get_episode_name(string):
episode = ''
ep = season_episode.findall(string)
if ep:
episode = ep[0].upper()
return episode
def in_killwords(string):
string = string.lower()
match = False
for w in ['swesub', 'mpeg']:
if w in string:
match = True
return match
def get_episode(show_id, episode):
if show_id <= 0:
return ''
tpbe = get_episodes(show_id)
for e in tpbe:
    link = e[0]
ep = get_episode_name(e[1])
if ep == episode:
info = get_info(link)
if not in_killwords(info['torrent']) \
and info['files'] > 0 and info['files'] < 10 \
and (not info['texted_language'] or info['texted_language'] == info['spoken_language']):
return info['torrent']
return u''
def get_episodes(id):
data = read_url("http://thepiratebay.org/tv/%s" % id)
episodes = re.compile('<nobr><a href="(.*?)">(.*?)</a></nobr>').findall(data)
return episodes
def search(query):
torrents = []
url = "http://thepiratebay.org/search.php?video=on&q=%s" % quote(query)
page = read_url(url)
soup = BeautifulSoup(page)
for row in soup('tr'):
torrentType = row.findAll('td', {'class': 'vertTh'})
if torrentType:
torrentType = torrentType[0]('a')[0].get('href').split('/')[-1]
# 201 = Movies , 202 = Movie DVDR
if torrentType in ['201']:
torrent = row.findAll('a', {'href':re.compile('.torrent$')})[0].get('href')
torrents.append(torrent)
return torrents
def searchByImdb(imdb):
return search("tt" + imdb)

18
scrapeit/torrent.py Normal file

@ -0,0 +1,18 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import mininova
import btjunkie
import thepiratebay
def search(query):
'''meta function to search with the best known torrent search engine
'''
return btjunkie.search(query)
def searchByImdb(imdb):
'''meta function to search by imdb with the best known torrent search engine
'''
return mininova.searchByImdb(imdb)
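
The torrent module is only a front-end; a sketch:

from scrapeit import torrent

print torrent.search('night of the living dead')  # delegates to btjunkie.search
print torrent.searchByImdb('0123456')             # delegates to mininova.searchByImdb (placeholder id)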

34
scrapeit/tvcom.py Normal file

@ -0,0 +1,34 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
from BeautifulSoup import BeautifulSoup
from utils import read_url_utf8, stripTags
def getEpisodeData(url):
  ''' parses information on tv.com episode pages
returns dict with title, show, description, score
'''
tvcom = {
'description': u''
}
data = read_url_utf8(url).replace('\n',' ')
regexp = r'''<div id="main-col">.*?<div>(.*?)<div class="ta-r mt-10 f-bold">'''
reg = re.compile(regexp, re.IGNORECASE)
m = reg.findall(data)
for match in m:
description = match.strip()
description = stripTags(description).replace('Watch Video','')
tvcom['description'] = description.strip()
soup = BeautifulSoup(data)
#optional data
try:
tvcom['show'] = soup('h1')[0].contents[0]
tvcom['title'] = soup('h1')[1].contents[0]
tvcom['score'] = soup("span", {'class':"f-28 f-bold mt-10 mb-10 f-FF9 db lh-18"})[0].contents[0]
except:
pass
return tvcom
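
Usage sketch; the URL below is a made-up placeholder for a tv.com episode page (in practice epguides.getEpisodeData() scrapes real ones from the epguides listings):

from scrapeit import tvcom

url = 'http://www.tv.com/some-show/some-episode/episode/12345/summary.html'  # hypothetical URL
print tvcom.getEpisodeData(url)['description']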

219
scrapeit/tvrss.py Executable file

@ -0,0 +1,219 @@
#!/usr/bin/env python
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
from os.path import *
import sys
import datetime
import time
import re
from urllib2 import urlopen
import Image
import StringIO
import feedparser
from utils import read_url
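# NOTE: this script is meant to run inside the btvcr TurboGears application
# (see the __main__ block below); log, model, IMDb, tvcom, metacritic,
# turbogears, SQLObjectNotFound and AND are provided by that environment
# and are not defined or imported in this module.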
hr_hdtv = re.compile('HR HDTV')
hdtv = re.compile('HDTV')
def get_url(title):
return title.replace(' ','_').replace('/', '_').lower()
def get_show(string):
return string.split(';')[0].split(':')[1].strip()
def get_title(string):
title = string.split(';')[1].split(':')[1].strip()
if title != 'n/a':
return title
return ''
def get_season(string):
try:
season = int(string.split(';')[2].split(':')[1].strip())
except:
return None
return season
def get_episode(string):
try:
episode = int(string.split(';')[3].split(':')[1].strip())
except:
return None
return episode
def get_episodedate(string):
s = string.split('Episode Date:')
if len(s) == 2:
return s[1].strip()
return None
def choose_item(old, new):
if old['link'] == new['link']:
return False
if not hdtv.search(old['title']):
if hdtv.search(new['title']):
display_item(new)
log.debug("vs.")
display_item(old)
return True
return False
def get_imdbdata(imdbid):
thumbnail = None
description=''
imdb = IMDb.parse(imdbid)
if imdb:
poster = imdb['poster']
if poster != 'http://i.imdb.com/Heads/npa.gif':
log.debug("getting poster %s" % poster)
try:
thumbnail = read_url(poster)
im = Image.open(StringIO.StringIO(thumbnail))
out = StringIO.StringIO()
im.crop((0,0,100,100)).convert().save(out, 'JPEG')
thumbnail = out.getvalue()
except:
thumbnail = None
if imdb['summary']:
description=imdb['summary']
else:
description=imdb['tagline']
return (imdb, description, thumbnail)
else:
return(imdb, '', None)
def load():
log.debug("getting new shows from tvrss...")
feed = feedparser.parse('http://tvrss.net/feed/combined/')
shows = {}
for item in feed['entries']:
show = get_show(item['description'])
season = get_season(item['description'])
episode = get_episode(item['description'])
episodedate = get_episodedate(item['description'])
estring = None
if season and episode:
estring = "S%02dE%02d" %(season, episode)
elif episodedate:
estring = episodedate
if estring:
if show and not hr_hdtv.search(item['title']):
if shows.has_key(show):
if shows[show].has_key(estring):
if choose_item(shows[show][estring], item):
shows[show][estring] = item
else:
shows[show][estring] = item
else:
shows[show] = {}
shows[show][estring] = item
for show in shows:
imdb = None
try:
model.ShowsBlacklist.byShowUrl(get_url(show))
log.debug("ignoring blacklisted show %s" % show)
continue
except:
pass
s = None
try:
s = model.Shows.byUrl(get_url(show))
except SQLObjectNotFound:
try:
alias = model.ShowsAlias.byAlias(get_url(show))
s = alias.show
except SQLObjectNotFound:
s = None
if not s:
log.debug("about to add %s" % show)
thumbnail = None
description=''
ur = '-'
try:
imdbid = IMDb.guess(show)
if imdbid:
imdb, description, thumbnail = get_imdbdata(imdbid)
if imdb:
ur = imdb['rating']
except:
import traceback
        traceback.print_exc()
pass
s= model.Shows(
title = show,
url = get_url(show),
description = description,
imdb = imdbid,
imdbUserRating = ur
)
s.thumbnail = thumbnail
meta = metacritic.scrapeMetacritic(s.title, s.metacriticUrl)
if meta:
s.metacriticUrl = meta['url']
s.metacriticScore = "%s" % meta['score']
for review in meta['critics']:
model.addReview(s, review)
model.hub.commit()
log.debug('added %s' % show)
for episode in shows[show]:
episode_title = get_title(shows[show][episode]['description'])
episode_description = ''
episode_imdb = ''
q = model.Episodes.select(AND(
model.Episodes.q.showID == s.id,
model.Episodes.q.episode == episode))
if q.count() == 0:
if not imdb:
try:
imdbid = IMDb.guess(show)
if imdbid:
imdb = IMDb.parse(imdbid)
except:
pass
if imdb and imdb['episodes'].has_key(episode):
episode_title = imdb['episodes'][episode]['title']
episode_description = imdb['episodes'][episode]['description']
episode_imdb = imdb['episodes'][episode]['imdb']
if not episode_description or not episode_title:
tvcom_data = tvcom.get(show, episode)
if not episode_description:
episode_description = tvcom_data['description']
if not episode_title:
episode_title = tvcom_data['title']
e = model.Episodes(
showID = s.id,
title = episode_title,
episode = episode,
torrent = shows[show][episode]['enclosures'][0]['href'],
description = episode_description,
imdb = episode_imdb,
thumbnail = None,
pubDate = datetime.datetime.fromtimestamp(time.mktime(shows[show][episode]['updated_parsed']))
)
s.lastUpdate = datetime.datetime.now()
model.hub.commit()
log.debug("from tvrss add %s %s" %(episode, show))
log.debug("updating tvrss done.")
if __name__ == '__main__':
# first look on the command line for a desired config file,
# if it's not on the command line, then
# look for setup.py in this directory. If it's not there, this script is
# probably installed
if len(sys.argv) > 1:
turbogears.update_config(configfile=sys.argv[1],
modulename="btvcr.config")
elif exists(join(dirname(__file__), "setup.py")):
turbogears.update_config(configfile="dev.cfg",
modulename="btvcr.config")
else:
turbogears.update_config(configfile="prod.cfg",
modulename="btvcr.config")
from btvcr.controllers import Root
load()

150
scrapeit/utils.py Normal file

@ -0,0 +1,150 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""
scraping tools
"""
import re
import time
import urllib
import urllib2
import djangohtml
# Default headers for HTTP requests.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5)'}
# --------------------------------------------------------------------
# Functions
# --------------------------------------------------------------------
def quote_plus(s):
"""
A variant of urllib.quote_plus which handles ASCII and Unicode.
"""
return urllib.quote_plus(s.encode('utf-8'))
def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
"""
  Read the contents of the given URL, decoded to unicode using the
  charset from the Content-Type header (falling back to latin-1).
  Here headers is a map of str -> str for HTTP request headers; the
  blocking argument is accepted for interface compatibility, but the
  read is always performed synchronously.
"""
req = urllib2.Request(url, None, headers)
f = urllib2.urlopen(req)
data = f.read()
f.close()
ctype = f.headers.getheader('content-type')
charset = ctype.split('charset=')
if len(charset)>1: charset = charset[1]
else: charset = 'latin-1'
data = unicode(data, charset)
return data
def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
"""
  Read the raw str contents of the given URL.
  Here headers is a map of str -> str for HTTP request headers; the
  blocking argument is accepted for interface compatibility, but the
  read is always performed synchronously.
"""
req = urllib2.Request(url, None, headers)
f = urllib2.urlopen(req)
data = f.read()
f.close()
return data
def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
"""
opens given str URL and returns the url after redirection.
"""
rurl = url
try:
req = urllib2.Request(url, None, headers)
rurl = urllib2.urlopen(req).url
rurl = rurl.replace('&src=rss', '')
except:
rurl = url
return rurl
def fix_url(url):
"""
Given url str, trim redirect stuff and return actual URL.
Currently this just returns the URL unmodified.
"""
# if url.lower().find('http%3a//') > 0:
# return 'http://' + url[url.lower().rindex('http%3a//')+9:]
# if url.find('http://') > 0:
# return url[url.rindex('http://'):]
return url
_html_entity_re = re.compile(r'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(?:([0-9]+)|[xX]([0-9a-fA-F]+)));?')
import htmlentitydefs
def html_entity_decode(s, encoding = 'utf-8'):
r = []
p = 0
mo = _html_entity_re.search(s, p)
while mo:
r.append(s[p:mo.start()].decode(encoding))
i = mo.lastindex
e = mo.group(i)
try:
if i == 1:
c = htmlentitydefs.name2codepoint[e]
elif i == 2:
c = int(e)
elif i == 3:
c = int(e, 16)
else:
assert 0
r.append(unichr(c))
except KeyError:
r.append(mo.group(0))
p = mo.end()
mo = _html_entity_re.search(s, p)
r.append(s[p:].decode(encoding))
return u''.join(r)
def stripTags(s):
return djangohtml.strip_tags(htmldecode(s))
from htmlentitydefs import name2codepoint
# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def htmldecode(text):
"""Decode HTML entities in the given text."""
if type(text) != unicode:
text = unicode(text)
if type(text) is unicode:
uchr = unichr
else:
uchr = lambda value: value > 255 and unichr(value) or chr(value)
def entitydecode(match, uchr=uchr):
entity = match.group(1)
if entity.startswith('#x'):
return uchr(int(entity[2:], 16))
elif entity.startswith('#'):
return uchr(int(entity[1:]))
elif entity in name2codepoint:
return uchr(name2codepoint[entity])
else:
return match.group(0)
return charrefpat.sub(entitydecode, text)
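
A few of these helpers in isolation (read_url does a plain GET with the spoofed User-Agent above; expected output in the comments):

from scrapeit.utils import read_url, stripTags, htmldecode

print htmldecode('Tom &amp; Jerry &lt;3')  # Tom & Jerry <3
print stripTags('<b>AT&amp;T</b> news')    # AT&T news
html = read_url('http://example.com/')     # raw page contents as str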

31
setup.py Normal file

@ -0,0 +1,31 @@
#!/usr/bin/env python
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
# encoding: utf-8
from setuptools import setup, find_packages
import os
setup(
name="scrapeit",
version="0.1",
# uncomment the following lines if you fill them out in release.py
description="collection of scrapers for various websites",
author="bot",
author_email="bot@mailb.org",
#url=url,
#download_url=download_url,
#license=license,
packages=find_packages(),
zip_safe=False,
keywords = [
],
classifiers = [
'Development Status :: 3 - Alpha',
'Operating System :: OS Independent',
'Programming Language :: Python',
'Topic :: Software Development :: Libraries :: Python Modules',
],
)