Let's start with Google and IMDb

2008-04-28 11:52:21 +02:00 · 2008-04-28 11:52:21 +02:00 · bbe4542bd2
commit bbe4542bd2
4 changed files with 805 additions and 0 deletions
--- a/ox/init.py
+++ b/ox/init.py
@ -0,0 +1,8 @@
 # -*- Mode: Python; -*-
 # vi:si:et:sw=2:sts=2:ts=2
 # encoding: utf-8
 # version string of the ox package
 __version__ = '0.1.0'
 from net import *
--- a/ox/google.py
+++ b/ox/google.py
@ -0,0 +1,187 @@
 # -*- Mode: Python; -*-
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=2:sts=2:ts=2
 import re
 import time
 import urllib
 import urllib2
 import weakref
 import threading
 import Queue
 import oxutils 
 from oxutils import stripTags
 '''
 FIXME: this should be replaced by a more minimal find function
 usage:
 import google
 google.find(query)
 <generator object at 0x833aeac>
 for result in google.find(query): result
 result is title, url, description
 google.find(query, max_results)
 '''
 # default for the max_results argument of the search functions built by make_searcher()
 DEFAULT_MAX_RESULTS = 10
 def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
  """
  Fetch url through oxutils.cache.getUrl.
  url, data and headers are passed on unchanged; the fourth argument is
  always 24 hours expressed in seconds.
  """
  one_day_in_seconds = 24*60*60
  return oxutils.cache.getUrl(url, data, headers, one_day_in_seconds)
 def quote_plus(s):
  return urllib.quote_plus(s.encode('utf-8'))
 def get_search_page_links(page, results_per_page, begin, end, link_re):
  """
  Extract the search results from the source of one result page.
  Only the text from the first occurrence of begin up to the first
  occurrence of end is searched; a marker that is None or not found
  leaves that side of the page untrimmed.  link_re must define the
  named groups 'name', 'url' and 'desc' and is applied in re.DOTALL mode.
  results_per_page is accepted but not used here.
  Returns a list of (name, url, desc) tuples with the tags stripped from
  name and desc.
  """
  if begin is not None:
    start = page.find(begin)
    if start != -1:
      page = page[start:]
  if end is not None:
    stop = page.find(end)
    if stop != -1:
      page = page[:stop]
  pattern = re.compile(link_re, re.DOTALL)
  return [
    (stripTags(m.group('name')), m.group('url'), stripTags(m.group('desc')))
    for m in pattern.finditer(page)
  ]
 def nonblocking(f, blocking_return=None, sleep_time=0.01):
  """
  Wrap a callable which returns an iter so that it no longer blocks.
  The wrapped iterator returns blocking_return while callable f is
  blocking.  The callable f is called in a background thread.  If the
  wrapped iterator is deleted, then the iterator returned by f is
  deleted also and the background thread is terminated.

  f is the callable to wrap, blocking_return the value handed out by
  next() while no result is available yet and sleep_time the number of
  seconds the background thread sleeps while a result is still waiting
  to be picked up.  Returns the wrapping callable g, which takes the
  same arguments as f.
  """
  def g(*args, **kwargs):
    f_iter = f(*args, **kwargs)
    # replaced by a weak reference to the Iter instance below, before the
    # thread is started
    g_iter = None
    def run():
      # body of the background thread
      while True:
        # resolve the weak reference; None means the wrapped iterator
        # has been garbage collected and the thread can stop
        g_obj = g_iter()
        if g_obj is None:
          return
        # only fetch the next value once the previous one was consumed,
        # so the queue never holds more than one item
        if g_obj.q.qsize() == 0:
          try:
            f_next = f_iter.next()
          except Exception, e:
            # this includes StopIteration: it is stored and raised
            # again by Iter.next(), which ends the iteration
            g_obj.exc = e
            return
          g_obj.q.put(f_next)
        else:
          # drop the strong reference while sleeping, otherwise the
          # wrapped iterator could not be collected in the meantime
          del g_obj
          time.sleep(sleep_time)
    class Iter:
      # iterator handed to the caller; q carries results from the
      # thread, exc the exception that ended the wrapped iterator
      def __init__(self):
        self.q = Queue.Queue()
        self.exc = None
        self.thread = threading.Thread(target=run)
        self.thread.setDaemon(True)
      def next(self):
        # a stored exception takes precedence over the queue
        if self.exc is not None:
          raise self.exc
        try:
          return self.q.get_nowait()
        except Queue.Empty:
          return blocking_return
      def __iter__(self):
        return self
    obj = Iter()
    g_iter = weakref.ref(obj)
    obj.thread.start()
    try:
      return obj
    finally:
      # remove the local name once the return value has been determined
      del obj
  return g
 def make_searcher(query_url, results_per_page, page_url, page_mode,
                   begin, end, link_re):
  """
  Build a search function for one search engine.
  query_url is the url of the first result page and contains %(q)s for
  the quoted query.  page_url is used for all following pages and
  contains %(q)s as well as %(n)s for the page "number", whose value
  depends on page_mode:
   - 'page0':   0-based index of the page
   - 'page1':   1-based index of the page
   - 'offset0': 0-based index of the first result on the page
                (page index times results_per_page)
   - 'offset1': the same plus one
  Any other page_mode raises ValueError('unknown page mode') as soon as a
  second page is requested.
  begin, end and link_re are handed to get_search_page_links() for every
  page: begin / end limit the part of the page that is searched (None
  disables the limit) and link_re is a regex string with the named groups
  'name', 'url' and 'desc'.
  Returns search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True).
  """
  def page_number(index):
    # value for %(n)s in page_url, given the 0-based index of the page
    if page_mode == 'page0':
      return index
    if page_mode == 'page1':
      return index + 1
    if page_mode == 'offset0':
      return index * results_per_page
    if page_mode == 'offset1':
      return index * results_per_page + 1
    raise ValueError('unknown page mode')
  def fetch_page(query, index):
    # the first page has its own url without a page number
    if index == 0:
      return getUrl(query_url % {'q': quote_plus(query)})
    n = page_number(index)
    return getUrl(page_url % {'n': n, 'q': quote_plus(query)})
  def search_blocking(query, max_results):
    previous_links = None
    next_page = 0
    pending = Queue.Queue()
    for _ in range(max_results):
      if pending.empty():
        # all results handed out so far, load the next page
        page = fetch_page(query, next_page)
        next_page += 1
        links = get_search_page_links(page, results_per_page, begin, end, link_re)
        # stop on an empty page or when the same page comes back again
        if not links or links == previous_links:
          return
        previous_links = links
        for link in links:
          pending.put(link)
      yield pending.get()
  search_nonblocking = nonblocking(search_blocking)
  def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True):
    """
    Return an iterator over at most max_results (name, url, desc) tuples
    for query.  With blocking=False the iterator is wrapped by
    nonblocking().
    """
    if not blocking:
      return search_nonblocking(query, max_results)
    return search_blocking(query, max_results)
  return search
 # Google web search: 10 results per page; pages after the first are
 # requested with the 0-based offset of their first result as start
 # parameter ('offset0').  No begin / end markers are used, link_re is
 # applied to the whole page.
 find = make_searcher('http://www.google.com/search?q=%(q)s', 10,
                          'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0',
                          None, None,
                          r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' +
                          r'.*?(?:<br>|<table.*?>)' +
                          r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)')
--- a/ox/imdb.py
+++ b/ox/imdb.py
@ -0,0 +1,580 @@
 # -*- Mode: Python; -*-
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=2:sts=2:ts=2
 from oxutils import *
 import urllib2
 from urllib import quote
 import re, time
 import os
 import time
 from BeautifulSoup import BeautifulSoup
 import chardet
 import oxutils
 from oxutils import stripTags, htmldecode
 from oxutils.cache import getUrl, getUrlUnicode
 from oxutils.normalize import normalizeTitle
 import google
 def _get_data(url):
  """
  Return the data found at url, or None if it could not be read.
  Failures are reported on stdout instead of being raised.
  """
  data = None
  try:
    data = getUrl(url)
  except Exception:
    # best effort: report and return None.  KeyboardInterrupt and
    # SystemExit are no longer swallowed as with a bare except.
    print("error reading data from %s" % url)
  return data
 def get_image(url):
  """
  Return the data of the image at url, as delivered by getUrl.
  """
  image = getUrl(url)
  return image
 def _castList(data, regexp):
  soup = re.compile(regexp).findall(data)
  if soup:
    soup = BeautifulSoup(soup[0])
    names = []
    for i in soup('a', {'href': re.compile('/name/nm')}):
      if i.string:
        cast = stripTags(i.string)
        if cast not in names:
          names.append(cast)
    return names
  return []
 def _getTerm(data, regexp):
  term = ''
  try:
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.search(data)
    if m:
      term = stripTags(m.group(1)).strip()
  except:
    print "waring, parsing failed for", regexp
  return term.encode('utf8')
 class IMDb:
  """
  Scraper for the imdb.com pages of a single title.

  The get*() methods download one page each with getUrlUnicode and keep
  the source on the instance, so a page is only requested again if
  forcereload is set.  The parse*() methods extract values from these
  sources; parse() combines them into one dict.
  """
  def __init__(self, imdb):
    """
    Store the title id and derive the urls of all pages that are used.
    imdb is inserted into the title url right after 'tt'.
    """
    self.imdb = imdb
    self.pageSource = None
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
    self.businessSource = None
    self.businessUrl = "%sbusiness" % self.pageUrl
    self.connectionsSource = None
    self.connectionsUrl = "%smovieconnections" % self.pageUrl
    self.creditsSource = None
    self.creditsUrl = "%sfullcredits" % self.pageUrl
    self.episodesSource = None
    self.episodesUrl = "%sepisodes" % self.pageUrl
    self.keywordSource = None
    self.keywordUrl = "%skeywords" % self.pageUrl
    self.plotSource = None
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoSource = None
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
    self.triviaSource = None
    self.triviaUrl = "%strivia" % self.pageUrl
    self.locationSource = None
    self.locationUrl = "%slocations" % self.pageUrl
    self.externalreviewsSource = None
    self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
    self.trailerSource = None
    self.trailerUrl = "%strailers" % self.pageUrl
  def _getSource(self, name, forcereload):
    """
    Return the source stored in self.<name>Source, downloading it from
    self.<name>Url first if it is still empty or forcereload is set.
    All get*() methods go through here so that each page can only end up
    in its own attribute.
    """
    source = getattr(self, name + 'Source')
    if forcereload or not source:
      source = getUrlUnicode(getattr(self, name + 'Url'))
      setattr(self, name + 'Source', source)
    return source
  def getPage(self, forcereload = False):
    """Return the source of the main title page."""
    return self._getSource('page', forcereload)
  def parse_raw_value(self, key, value):
    """
    Convert the raw html following a heading of the info boxes on the
    title page into a value.

    key is the normalized heading, value the html.  For 'runtime',
    'language', 'genre', 'country', 'tagline' and 'plot_outline' value is
    decoded from utf-8 and stripped of tags first.
    Returns
     - 'runtime': int, minutes are multiplied by 60, a value given in
       sec is used as is, 0 if neither is found
     - 'country', 'language', 'genre': list, split at ' / '
     - 'tagline', 'plot_outline': cleaned up string
     - 'tv_series': id from the first title link or ''
     - 'also_known_as': the international english or USA title, with the
       name of the first director removed, or ''
    Any other key is printed and returned unchanged.
    """
    if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
      value = unicode(value, 'utf-8')
      value = stripTags(value).strip()
    if key == 'runtime':
      minutes = _getTerm(_getTerm(value, '(.*?) min'), '([0-9]+)')
      if minutes:
        parsed_value = int(minutes) * 60
      else:
        seconds = _getTerm(_getTerm(value, '(.*?) sec'), '([0-9]+)')
        if seconds:
          parsed_value = int(seconds)
        else:
          parsed_value = 0
    elif key in ('country', 'language'):
      parsed_value = value.split(' / ')
    elif key == 'genre':
      parsed_value = value.replace('more', '').strip().split(' / ')
    elif key == 'tagline':
      parsed_value = value.replace('more', '').strip()
    elif key == 'plot_outline':
      parsed_value = value.replace('(view trailer)', '').strip()
      if parsed_value.endswith('more'):
        parsed_value = parsed_value[:-4].strip()
    elif key == 'tv_series':
      m = re.compile('<a href="/title/tt(.*?)/">(.*?)</a>').findall(value)
      if m:
        parsed_value = m[0][0]
      else:
        parsed_value = ''
    elif key == 'also_known_as':
      parsed_value = ''
      m = re.compile(r'(.*) \(International: English title').findall(value)
      if m:
        parsed_value = m[0]
      else:
        m = re.compile(r'(.*) \(USA').findall(value)
        if m:
          parsed_value = m[0]
      # keep only the last entry before the match and drop its annotation
      parsed_value = parsed_value.split('<br />')[-1].split('(')[0]
      director = self.parseCredits().get('director', None)
      if director:
        director = director[0]
        parsed_value = parsed_value.replace(director, '')
      # left over from titles in the form "<director>'s <title>"
      if parsed_value.startswith("'s"):
        parsed_value = parsed_value[2:].strip()
      parsed_value = parsed_value.strip()
    else:
      print(value)
      parsed_value = value
    return parsed_value
  def parseTitle(self):
    """
    Return the normalized title from the main title page.
    The title is taken from the div with id tn15title or, without it,
    from the title tag.  Year and type markers like (TV) are removed.  A
    title of the form "Show" Episode is turned into 'Show (SxxEyy)
    Episode' if season and episode can be found on the page, otherwise
    into 'Show: Episode'.
    """
    title = ''
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if html_title:
      html_title = str(html_title[0])
      html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
      title = stripTags(html_title)
      title = re.sub(r'\(\d\d\d\d\)', '', title)
      title = re.sub(r'\(\d\d\d\d/I*\)', '', title)
      for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
        title = title.replace(t, '')
    # everything after the first non breaking space is dropped
    if title.find(u'\xa0') > -1:
      title = title[:title.find(u'\xa0')]
    title = normalizeTitle(title.strip())
    if title.startswith('"') and title.endswith('"'):
      title = normalizeTitle(title[1:-1])
    elif title.startswith('"') and title.find('"',1) > 0 and \
        title.find('"',1) == title.rfind('"'):
      # exactly two quotes: quoted show name followed by the episode title
      show = normalizeTitle(title[1:title.rfind('"')])
      rest = title[title.rfind('"')+1:]
      se = re.compile(r"Season (\d*), Episode (\d*)\)").findall(data)
      if se:
        se = se[0]
        se = ' (S%02dE%02d)' % (int(se[0]), int(se[1]))
        title = show + se + rest
      else:
        title = show + ':' + rest
    return normalizeTitle(title)
  def parseYear(self):
    """
    Return the four digit year found in parentheses in the title of the
    main page as string, '' if there is none.
    """
    year = ''
    data = self.getPage()
    soup = BeautifulSoup(data)
    html_title = soup('div', {'id': 'tn15title'})
    if not html_title:
      html_title = soup('title')
    if html_title:
      html_title = str(html_title[0])
      html_title = stripTags(html_title)
      year = re.compile(r'\((\d\d\d\d)\)').findall(html_title)
      if not year:
        # years in the form (2004/I)
        year = re.compile(r'\((\d\d\d\d)/').findall(html_title)
      if year:
        year = year[0]
      else:
        year = ''
    return year
  def parse(self):
    """
    Collect all values for the title in one dict.
    This calls every parse*() method except parseTrailer() and
    getMovieStills(); if the title is an episode, the series is parsed
    as well to fill in a missing country or language.  The dict is stored
    in self.IMDbDict and returned.
    """
    data = self.getPage()
    IMDbDict = {}
    #Poster
    IMDbDict['poster'] = _getTerm(data, 'name="poster".*?<img .*?src="(.*?)"')
    if not IMDbDict['poster']:
      IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
    #Title, Year
    IMDbDict['year'] = self.parseYear()
    IMDbDict['title'] = self.parseTitle()
    #Rating, stored as int(rating * 1000), -1 if there is none
    m = re.compile('<b>(.*?)/10</b>', re.IGNORECASE).search(data)
    if m:
      IMDbDict['rating'] = int(float(m.group(1)) * 1000)
    else:
      IMDbDict['rating'] = -1
    #Votes, -1 if there are none
    m = re.compile(r'<small>\(<a href="ratings">(.*?) votes</a>\)</small>', re.IGNORECASE).findall(data)
    if m:
      IMDbDict['votes'] = int(m[0].replace(',', ''))
    else:
      IMDbDict['votes'] = -1
    data = data.replace('\n',' ')
    #some values
    keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
    for key in keys:
      IMDbDict[key] = ''
    IMDbDict['runtime'] = 0
    soup = BeautifulSoup(data)
    for info in soup('div', {'class': 'info'}):
      # the heading between <h5> and </h5> without its last character is
      # the key, everything after </h5> the raw value
      key = str(info).split('</h5>')[0].split('<h5>')
      if len(key) > 1:
        raw_value = str(info).split('</h5>')[1]
        key = key[1][:-1].lower().replace(' ', '_')
        if key in keys:
          IMDbDict[key] = self.parse_raw_value(key, raw_value)
    # NOTE(review): also_known_as is always set above, so the title is
    # never used as default here and title_english can be '' - confirm
    # whether a fallback to the title was intended
    IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
    #is episode
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
    IMDbDict['episodes'] = self.parseEpisodes()
    IMDbDict['tvshow'] = bool(IMDbDict['episodes'])
    IMDbDict['credits'] = self.parseCredits()
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = self.parseKeywords()
    IMDbDict['trivia'] = self.parseTrivia()
    IMDbDict['connections'] = self.parseConnections()
    IMDbDict['locations'] = self.parseLocations()
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
    IMDbDict['reviews'] = self.parseExternalreviews()
    IMDbDict['stills'] = getMovieStills(self.imdb)
    #IMDbDict['trailer'] = self.parseTrailer()
    self.IMDbDict = IMDbDict
    if IMDbDict['episode_of']:
      episode_of = IMDb(IMDbDict['episode_of']).parse()
      for key in ('country', 'language'):
        if not IMDbDict[key]:
          IMDbDict[key] = episode_of[key]
    return self.IMDbDict
  def getCredits(self, forcereload = False):
    """Return the source of the fullcredits page."""
    return self._getSource('credits', forcereload)
  def parseCredits(self):
    """
    Return the credits as dict with the keys 'director', 'writer' and
    'producer' (lists of names) and 'cast' (list of (name, role) tuples).
    The result is also stored in self.credits.
    """
    data = self.getCredits()
    credits = {}
    credits['director'] = _castList(data, 'Directed by.*?(<tr>.*?)</table>')
    credits['writer'] = _castList(data, 'Writing credits.*?(<tr>.*?)</table>')
    credits['producer'] = _castList(data, 'Produced by.*?(<tr>.*?)</table>')
    credits['cast'] = []
    soup = BeautifulSoup(data)
    cast = soup('table', {'class': 'cast'})
    if cast:
      # NOTE(review): this replaces a unicode value in the result of
      # str() - check that this works for cast tables with non ascii names
      cast = str(cast[0]).replace(u'\xa0', ' ')
      names = re.compile('<a href="/name/nm.*?/">(.*?)</a>.*?</td><td class="char">(.*?)</td></tr>').findall(cast)
      for name in names:
        real_name = name[0]
        role_name = name[1]
        if role_name:
          # only keep the role itself, without notes in parentheses
          role_name = role_name.split('(')[0].replace('/ ...','')
        credits['cast'].append((stripTags(real_name), stripTags(role_name)))
    self.credits = credits
    return self.credits
  def getPlot(self, forcereload = False):
    """Return the source of the plotsummary page."""
    return self._getSource('plot', forcereload)
  def parsePlot(self):
    """
    Return the first plot summary without tags, u'' if there is none.
    The text from the first <i> on is cut off.  The result is also stored
    in self.plot.
    """
    soup = BeautifulSoup(self.getPlot())
    plot = soup('p', {'class':'plotpar'})
    if plot:
      plot = unicode(plot[0]).split('<i>')[0]
    else:
      plot = u''
    plot = stripTags(plot).strip()
    self.plot = plot
    return plot
  def getEpisodes(self, forcereload = False):
    """Return the source of the episodes page."""
    return self._getSource('episodes', forcereload)
  def parseEpisodes(self):
    """
    Return the episodes of a series as dict.
    Keys have the form 'S01E02', each value is a dict with 'imdb',
    'title', 'description' and 'date'; date is formatted as %Y-%m-%d or
    '' if it can not be parsed.  Episodes that fail to parse are reported
    with a traceback and otherwise skipped.  The result is also stored in
    self.episodes.
    """
    episodes = {}
    cdata = self.getEpisodes().replace('\r\n', ' ')
    regexp = r'''<h4>Season (.*?), Episode (.*?): <a href="/title/tt(.*?)/">(.*?)</a></h4>(.*?)</b><br>(.*?)<br/>'''
    reg = re.compile(regexp, re.IGNORECASE)
    m = reg.findall(cdata)
    for match in m:
      try:
        episode = "S%02dE%02d" % (int(match[0]), int(match[1]))
        episodes[episode] = {}
        episodes[episode]['imdb'] = match[2]
        episodes[episode]['title'] = match[3].strip()
        # titles starting with 'Episode #<season>' are treated as no title
        if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])):
          episodes[episode]['title'] = u''
        description = htmldecode(match[5])
        description = stripTags(description.split('Next US airings:')[0])
        episodes[episode]['description'] = description
        episodes[episode]['date'] = ''
        try:
          d = stripTags(match[4])
          d = d.replace('Original Air Date: ', '')
          d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
          episodes[episode]['date'] = d
        except Exception:
          # no or unknown date format, keep ''
          pass
      except Exception:
        import traceback
        # print_exc() writes the traceback itself and returns None, so
        # its result must not be printed
        traceback.print_exc()
    self.episodes = episodes
    return self.episodes
  def getLocations(self, forcereload = False):
    """
    Return the source of the locations page.
    The source is kept in locationSource; storing it in keywordSource
    made getKeywords() return this page and caused a new request on
    every call.
    """
    return self._getSource('location', forcereload)
  def parseLocations(self):
    """
    Return the decoded text of all links whose href starts with /List.
    The result is also stored in self.locations.
    """
    soup = BeautifulSoup(self.getLocations())
    locations = []
    for key in soup('a', {'href': re.compile('^/List')}):
      locations.append(htmldecode(key.string))
    self.locations = locations
    return self.locations
  def getKeywords(self, forcereload = False):
    """Return the source of the keywords page."""
    return self._getSource('keyword', forcereload)
  def parseKeywords(self):
    """
    Return the decoded text of all links whose href starts with
    /keyword/, with non breaking spaces replaced by spaces.  The result
    is also stored in self.keywords.
    """
    soup = BeautifulSoup(self.getKeywords())
    keywords = []
    for key in soup('a', {'href': re.compile('^/keyword/')}):
      k = htmldecode(key.string)
      k = k.replace(u'\xa0', ' ')
      keywords.append(k)
    self.keywords = keywords
    return self.keywords
  def getTrivia(self, forcereload = False):
    """Return the source of the trivia page."""
    return self._getSource('trivia', forcereload)
  def parseTrivia(self):
    """
    Return the html of all list items inside ul elements with the class
    trivia, without <br /> and the enclosing <li> tags.  The result is
    also stored in self.trivia.
    """
    trivia = []
    soup = BeautifulSoup(self.getTrivia())
    for i in soup('ul', {'class': "trivia"}):
      for t in i('li'):
        t = str(t).replace('<br />', '').strip()
        if t.startswith('<li>') and t.endswith('</li>'):
          t = t[4:-5].strip()
        trivia.append(t)
    self.trivia = trivia
    return self.trivia
  def getConnections(self, forcereload = False):
    """Return the source of the movieconnections page."""
    return self._getSource('connections', forcereload)
  def parseConnections(self):
    """
    Return the connections to other titles as dict that maps the heading
    of each section to the list of ids linked in it.  The dict is empty
    if the page has no div with the id tn15content.
    """
    connections = {}
    soup = BeautifulSoup(self.getConnections())
    content = soup('div', {'id': 'tn15content'})
    if not content:
      return connections
    blocks = str(content[0]).split('<h5>')[1:]
    for c in blocks:
      connection = c.split('</h5>')[0]
      cs = BeautifulSoup(c)
      if connection:
        #relation -> list of imdb ids
        # the id is taken from the 7 characters before the last one of the href
        connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})]
    return connections
  def getReleaseinfo(self, forcereload = False):
    """Return the source of the releaseinfo page."""
    return self._getSource('releaseinfo', forcereload)
  def parseReleaseinfo(self):
    """
    Return the first date of the release table that can be parsed as
    '%d %B %Y', formatted as %Y-%m-%d, or None.
    """
    soup = BeautifulSoup(self.getReleaseinfo())
    info = soup('table',{'border': '0', 'cellpadding':'2'})
    if info:
      for row in info[0]('tr'):
        d = row('td', {'align':'right'})
        if d:
          try:
            possible_date = stripTags(str(d[0])).strip()
            rdate = time.strptime(possible_date, "%d %B %Y")
            rdate = time.strftime('%Y-%m-%d', rdate)
            return rdate
          except Exception:
            # not a full date, try the next row
            pass
    return None
  def getBusiness(self, forcereload = False):
    """Return the source of the business page."""
    return self._getSource('business', forcereload)
  def parseBusiness(self):
    """
    Return a dict with 'budget', 'gross' and 'profit'.
    budget and gross are the largest $ value found in the section with
    that heading, 0 without one; profit is gross - budget if both are
    known, otherwise 0.
    """
    soup = BeautifulSoup(self.getBusiness())
    business = {'budget': 0, 'gross': 0, 'profit': 0}
    content = soup('div', {'id': 'tn15content'})
    if not content:
      return business
    blocks = str(content[0]).split('<h5>')[1:]
    for c in blocks:
      line = c.split('</h5>')
      # a block without closing </h5> has no heading to look at
      if len(line) < 2:
        continue
      title = line[0]
      line = line[1]
      if title in ['Budget', 'Gross']:
        values = re.compile(r'\$(.*?) ').findall(line)
        values = [int(value.replace(',','')) for value in values]
        if values:
          business[title.lower()] = max(values)
    if business['budget'] and business['gross']:
      business['profit'] = business['gross'] - business['budget']
    return business
  def getExternalreviews(self, forcereload = False):
    """Return the source of the externalreviews page."""
    return self._getSource('externalreviews', forcereload)
  def parseExternalreviews(self):
    """
    Return a dict that maps the href of the first link of every item in
    the first ordered list of the page to the first child of that link.
    Items that can not be parsed are skipped.
    """
    soup = BeautifulSoup(self.getExternalreviews())
    ol = soup('ol')
    if ol:
      ol = ol[0]
      ret = {}
      for li in ol('li'):
        try:
          a = li('a')[0]
          href = a.get('href')
          txt = a.contents[0]
          ret[href] = txt
        except Exception:
          # item without link or link without content
          pass
      return ret
    return {}
  def getTrailer(self, forcereload = False):
    """Return the source of the trailers page."""
    return self._getSource('trailer', forcereload)
  def parseTrailer(self):
    """
    Return a dict that maps the href of external trailer links (those
    starting with http) to their text without 'www.'.
    """
    ret = {}
    soup = BeautifulSoup(self.getTrailer())
    for p in soup('p'):
      if p('a') and p.firstText():
        a = p('a')[0]
        href = a['href']
        if href and href.startswith('http'):
          title = a.string
          title = title.replace('www.', '')
          ret[href] = title
    return ret
 def _readUrl(url):
  """
  Request url with the default oxutils headers.
  Returns (data, url of the response); the connection is closed even if
  reading fails.
  """
  req = urllib2.Request(url, None, oxutils.net.DEFAULT_HEADERS)
  u = urllib2.urlopen(req)
  try:
    return u.read(), u.url
  finally:
    u.close()
 def _titleId(url):
  """
  Return the 7 characters following the title prefix if url is an imdb
  title url, None otherwise.
  """
  prefix = 'http://www.imdb.com/title/tt'
  if url.startswith(prefix):
    return url[len(prefix):len(prefix) + 7]
  return None
 def guess(title, director=''):
  """
  Guess the imdb id for title.

  Only the part of title before the first '-', '(' or '.' is used.  The
  id is taken from, in this order:
   - the first two google results for the title (and director, if given)
     on imdb.com
   - the imdb search, if it ends up on a title page
   - the first entry of the popular results of the imdb search
   - the imdb aka search, if it ends up on a title page
  Returns the id as string, or None if nothing was found or one of the
  requests to imdb failed.
  """
  #FIXME: proper file -> title
  title = title.split('-')[0]
  title = title.split('(')[0]
  title = title.split('.')[0]
  title = title.strip()
  query = quote(title.encode('utf-8'))
  #lets first try google
  #i.e. site:imdb.com Michael Stevens Sin
  if director:
    search = 'site:imdb.com %s "%s"' % (director, title)
  else:
    search = 'site:imdb.com "%s"' % title
  for (name, url, desc) in google.find(search, 2):
    imdb_id = _titleId(url)
    if imdb_id is not None:
      return imdb_id
  try:
    data, return_url = _readUrl('http://www.imdb.com/find?q=%s' % query)
  except Exception:
    return None
  imdb_id = _titleId(return_url)
  if imdb_id is not None:
    return imdb_id
  if data:
    imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
    if imdb_id:
      return imdb_id
  # a failure of the second request is handled like one of the first
  # instead of raising to the caller
  try:
    data, return_url = _readUrl('http://www.imdb.com/find?q=%s;s=tt;site=aka' % query)
  except Exception:
    return None
  return _titleId(return_url)
 def getEpisodeData(title, episode, show_url = None):
  '''
    Collect information about an episode.
    title is the name of the show and episode the key of the episode in
    the form used by IMDb.parseEpisodes ('S01E02').  show_url is an
    optional imdb url of the show; without it, or if it contains no title
    id, the show is looked up with guess(title).
    Returns dict with title, show, description and episode, plus imdb if
    the episode was found.  title and description stay empty if the show
    or the episode is unknown.
  '''
  episodeData = {
    'title': u'',
    'show': title,
    'description': u'',
    'episode': episode,
  }
  imdbid = None
  if show_url:
    # the id has to come from the show_url argument; the name link used
    # here before is not defined anywhere
    ids = re.compile(r'title/tt(\d+)').findall(show_url)
    if ids:
      imdbid = "%07d" % int(ids[0])
  if not imdbid:
    imdbid = guess(title)
  if imdbid:
    episodes = IMDb(imdbid).parse()['episodes']
    # unknown episodes keep the empty defaults instead of raising KeyError
    info = episodes.get(episode)
    if info:
      episodeData['title'] = info['title']
      episodeData['description'] = info['description']
      episodeData['imdb'] = info['imdb']
  return episodeData
 def getMovieStills(id):
  """
  Return the urls of the stills in the imdb gallery of the title id.
  Only images whose thumbnail is wider than high are used.  If the
  gallery has no thumbnails in the Photos/Ss/<id>/th-<name>.jpg form,
  images ending in p.jpg are used instead, with the p replaced by f.
  """
  data = getUrl("http://imdb.com/gallery/ss/%s" % id)
  thumbnail = '''<img width="(\d*?)" height="(\d*?)" src="http://i.imdb.com/Photos/Ss/%s/th-(.*?).jpg"''' % id
  stills = [
    "http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (id, name)
    for (width, height, name) in re.findall(thumbnail, data)
    if int(width) > int(height)
  ]
  if stills:
    return stills
  fallback = '''<img width="(\d*?)" height="(\d*?)" src="http://(.*?)p.jpg"'''
  return [
    "http://%sf.jpg" % location
    for (width, height, location) in re.findall(fallback, data)
    if int(width) > int(height)
  ]
 # command line use: print the imdb id guessed for the title given as
 # first argument
 if __name__ == '__main__':
  import sys
  #print parse(sys.argv[1])
  print "imdb:", guess(sys.argv[1])
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,30 @@
 #!/usr/bin/env python
 # vi:si:et:sw=2:sts=2:ts=2
 # encoding: utf-8
 from setuptools import setup, find_packages
 import os
 # package metadata for setuptools; all packages found next to this file
 # are installed
 setup(
  name="ox",
  # NOTE(review): the ox package itself declares __version__ = '0.1.0' -
  # confirm which of the two is authoritative
  version="0.1",
  # uncomment the following lines if you fill them out in release.py
  description="collection of scrapers for various websites",
  author="bot",
  author_email="bot@0xdb.org",
  url="http://ox.0xdb.org",
  download_url="http://ox.0xdb.org/download",
  license="GPL",
  packages=find_packages(),
  zip_safe=False,
  keywords = [
  ],
  classifiers = [
      'Development Status :: 3 - Alpha',
      'Operating System :: OS Independent',
      'Programming Language :: Python',
      'Topic :: Software Development :: Libraries :: Python Modules',
  ],
  )