add one dailymotion function

2008-04-30 15:31:50 +02:00 · 2008-04-30 15:31:50 +02:00 · 7668ceafc1
commit 7668ceafc1
parent 40185f89ab
2 changed files with 114 additions and 81 deletions
--- a/ox/dailymotion.py
+++ b/ox/dailymotion.py
@ -0,0 +1,15 @@
 import re
 from urllib import unquote
 from oxutils.cache import getUrl
 def getVideoUrl(url):
  '''
  Return the absolute dailymotion.com URL of the video found on the page at url.

  The page is fetched with getUrl and searched for the first occurrence of
  the pattern video", "..."; the captured value is url-unquoted, cut at the
  first '@@' and appended to the dailymotion host.
  Returns an empty string when the pattern does not occur in the page.
  '''
  page = getUrl(url)
  found = re.search('''video", "(.*?)"''', page)
  if found is None:
   return ''
  path = unquote(found.group(1)).split('@@')[0]
  return "http://www.dailymotion.com" + path
 # Manual check when run as a script: resolve two sample dailymotion pages
 # with getVideoUrl and print whatever it returns for each of them.
 if __name__ == '__main__':
  print getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
  print getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
--- a/ox/imdb.py
+++ b/ox/imdb.py
@ -38,14 +38,22 @@ def getUrlBase(imdbId):
 def getRawMovieData(imdbId):
  imdbId = normalizeImdbId(imdbId)
-  data = dict()
+  data = getMovieInfo(imdbId)
-  data['title'] = getTitle(imdbId)
+  data['credits'] = getMovieCredits(imdbId)
-  data['credits'] = getCredits(imdbId)
+  data['poster'] = getMoviePoster(imdbId)
-  data['poster'] = getPoster(imdbId)
+  data['connections'] = getMovieConnections(imdbId)
-  data['trailers'] = getMovieTrailers(imdbId)
+  data['company credits'] = getMovieCompanyCredits(imdbId)
-  data['companyCredits'] = getMovieCompanyCredits(imdbId)
+  data['filming locations'] = getMovieLocations(imdbId)
  data['movie connections'] = getMovieConnections(imdbId)
  data['external reviews'] = getMovieExternalReviews(imdbId)
  data['trivia'] = getMovieTrivia(imdbId)
  data['keywords'] = getMovieKeywords(imdbId)
  data['media'] = {}
  data['media']['images'] = getMovieImages(imdbId)
  data['media']['trailers'] = getMovieTrailers(imdbId)
  return data
-def parseBase(imdbId):
+def getMovieInfo(imdbId):
  data = getUrl(getUrlBase(imdbId))
  soup = BeautifulSoup(data)
  info = dict()
@ -76,6 +84,7 @@ def parseBase(imdbId):
  #get Title
  title = ''
  year = ''
  html_title = soup('div', {'id': 'tn15title'})
  if not html_title:
    html_title = soup('title')
@ -84,8 +93,11 @@ def parseBase(imdbId):
    html_title = html_title.replace('<br />', ' ').replace('  ', ' ')
    title = htmldecode(html_title)
    title = stripTags(title)
-    title = re.sub('\(\d\d\d\d\)', '', title)
+    year = findRegexp(title, '\((\d{4})\)')
-    title = re.sub('\(\d\d\d\d/I*\)', '', title)
+    if not year:
      year = findRegexp(title, '\((\d{4})')
    title = re.sub('\(\d{4}\)', '', title)
    title = re.sub('\(\d{4}/I*\)', '', title)
    for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
      title = title.replace(t, '')
  title = title.strip()
@ -94,14 +106,34 @@ def parseBase(imdbId):
  if title.startswith('"') and title.endswith('"'):
    title = title[1:-1]
  info['title'] = title
  info['year'] = year
  '''
  #Rating
  rating = findRegexp(data, '<b>(.*?)/10</b>')
  if rating:
    info['rating'] = int(float(rating) * 1000)
  else:
    info['rating'] = -1
  #Votes
  votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
  if votes:
    info['votes'] = int(votes.replace(',', ''))
  else:
    info['votes'] = -1
  '''
  return info
-def getPoster(imdbId):
+def getMoviePoster(imdbId):
-  info = parseBase(imdbId)
+  info = getMovieInfo(imdbId)
  return info['poster']
-def getTitle(imdbId):
+def getMovieYear(imdbId):
-  info = parseBase(imdbId)
+  info = getMovieInfo(imdbId)
  return info['year']
 def getMovieTitle(imdbId):
  '''
  Return the 'title' entry of the info dict that getMovieInfo builds for imdbId.
  '''
  return getMovieInfo(imdbId)['title']
 def creditList(data, section=None):
@ -118,7 +150,7 @@ def creditList(data, section=None):
    credits.append(c)
  return credits
-def getCredits(imdbId):
+def getMovieCredits(imdbId):
  credits = dict()
  url = "%s/fullcredits" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
@ -216,6 +248,52 @@ def getMovieTrivia(imdbId):
      trivia.append(t)
  return trivia
 def getMovieConnections(imdbId):
  '''
  Return the connections of a movie as a dict: relation heading -> list of ids.

  Fetches the "movieconnections" page below getUrlBase(imdbId), takes the
  first div with id tn15content and splits its markup at every <h5>. The
  text up to the closing </h5> is used as the key; every link in that
  section whose href matches /title/tt contributes whatever findRegexp
  extracts from the href with a seven-digit pattern (presumably the
  numeric imdb id). Sections with an empty heading are ignored.
  Raises IndexError if the page has no tn15content div.
  '''
  page = getUrlUnicode("%s/movieconnections" % getUrlBase(imdbId))
  parsed = BeautifulSoup(page)
  connections = {}
  content = parsed('div', {'id': 'tn15content'})[0]
  # everything before the first <h5> belongs to no relation, so drop it
  for block in str(content).split('<h5>')[1:]:
    relation = block.split('</h5>')[0]
    blockSoup = BeautifulSoup(block)
    if not relation:
      continue
    titleLinks = blockSoup('a', {'href': re.compile('/title/tt')})
    #relation -> list of imdb ids
    connections[relation] = [findRegexp(link.get('href'), "\d{7}") for link in titleLinks]
  return connections
 def getMovieKeywords(imdbId):
  '''
  Return the keywords of a movie as a list of strings.

  Fetches the "keywords" page below getUrlBase(imdbId) and takes the string
  of every link whose href starts with /keyword/. Each one is passed through
  htmldecode and has its non-breaking spaces replaced by plain spaces.
  '''
  page = getUrlUnicode("%s/keywords" % getUrlBase(imdbId))
  links = BeautifulSoup(page)('a', {'href': re.compile('^/keyword/')})
  return [htmldecode(link.string).replace(u'\xa0', ' ') for link in links]
 def getMovieExternalReviews(imdbId):
  '''
  Return the external reviews of a movie as a dict: link href -> link content.

  Fetches the "externalreviews" page below getUrlBase(imdbId) and reads the
  <li> items of the first <ol> on that page. For every item the first <a>
  is used: its href becomes the key and its first child (normally the link
  text) the value. Items without a link, or whose link has no content, are
  skipped. Returns an empty dict if the page contains no <ol>.
  '''
  url = "%s/externalreviews" % getUrlBase(imdbId)
  data = getUrlUnicode(url)
  soup = BeautifulSoup(data)
  reviews = {}
  ol = soup('ol')
  if not ol:
    return reviews
  for li in ol[0]('li'):
    try:
      a = li('a')[0]
      txt = a.contents[0]
    except IndexError:
      # No <a> in this item, or an <a> without content: nothing to record.
      # Only IndexError is caught, so KeyboardInterrupt, SystemExit and
      # unrelated errors are no longer silently swallowed.
      continue
    reviews[a.get('href')] = txt
  return reviews
 '''the old code below'''
 class IMDb:
@ -224,14 +302,10 @@ class IMDb:
    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
    self.businessUrl = "%sbusiness" % self.pageUrl
    self.connectionsUrl = "%smovieconnections" % self.pageUrl
    self.creditsUrl = "%sfullcredits" % self.pageUrl
    self.episodesUrl = "%sepisodes" % self.pageUrl
    self.keywordUrl = "%skeywords" % self.pageUrl
    self.plotUrl = "%splotsummary" % self.pageUrl
    self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
    self.locationUrl = "%slocations" % self.pageUrl
    self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
  def getPage(self):
    '''Return the content of self.pageUrl as fetched by getUrlUnicode.'''
    return getUrlUnicode(self.pageUrl)
@ -293,7 +367,7 @@ class IMDb:
    return parsed_value
  def parseTitle(self):
-    title = getTitle(self.imdb)
+    title = getMovieTitle(self.imdb)
    title = normalizeTitle(title)
    if title.startswith('"') and title.find('"',1) > 0 and \
      title.find('"',1) == title.rfind('"'):
@ -328,7 +402,7 @@ class IMDb:
    data = self.getPage()
    IMDbDict ={}
    #Poster
-    IMDbDict['poster'] = getPoster(self.imdb)
+    IMDbDict['poster'] = getMoviePoster(self.imdb)
    if not IMDbDict['poster']:
      IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
    #Title, Year
@ -373,16 +447,16 @@ class IMDb:
      IMDbDict['tvshow'] = False
    IMDbDict['credits'] = self.getCredits()
    IMDbDict['plot'] = self.parsePlot()
-    IMDbDict['keywords'] = self.parseKeywords()
+    IMDbDict['keywords'] = getMovieKeywords(self.imdb)
    IMDbDict['trivia'] = getMovieTrivia(self.imdb)
-    IMDbDict['connections'] = self.parseConnections()
+    IMDbDict['connections'] = getMovieConnections(self.imdb)
-    IMDbDict['locations'] = self.parseLocations()
+    IMDbDict['locations'] = getMovieLocations(self.imdb)
    IMDbDict['release_date'] = self.parseReleaseinfo()
    IMDbDict['business'] = self.parseBusiness()
-    IMDbDict['reviews'] = self.parseExternalreviews()
+    IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
    IMDbDict['stills'] = getMovieStills(self.imdb)
-    #IMDbDict['trailer'] = self.parseTrailer()
+    #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
    self.IMDbDict = IMDbDict
    if IMDbDict['episode_of']:
@ -452,42 +526,6 @@ class IMDb:
    self.episodes = episodes
    return self.episodes
  def parseLocations(self):
    '''
    Fetch self.locationUrl and collect the html-decoded string of every link
    whose href starts with /List. The resulting list is stored on
    self.locations and returned.
    '''
    page = BeautifulSoup(getUrlUnicode(self.locationUrl))
    links = page('a', {'href': re.compile('^/List')})
    self.locations = [htmldecode(link.string) for link in links]
    return self.locations
  def parseKeywords(self):
    '''
    Fetch self.keywordUrl and collect the string of every link whose href
    starts with /keyword/, html-decoded and with non-breaking spaces turned
    into plain spaces. The resulting list is stored on self.keywords and
    returned.
    '''
    page = BeautifulSoup(getUrlUnicode(self.keywordUrl))
    links = page('a', {'href': re.compile('^/keyword/')})
    self.keywords = [htmldecode(link.string).replace(u'\xa0', ' ') for link in links]
    return self.keywords
  def getConnections(self):
    '''Return the content of self.connectionsUrl as fetched by getUrlUnicode.'''
    return getUrlUnicode(self.connectionsUrl)
  def parseConnections(self):
    '''
    Parse the page returned by self.getConnections() into a dict:
    relation heading -> list of ids.

    The markup of the first div with id tn15content is split at every <h5>;
    the text up to the closing </h5> is the key. Every link in that section
    whose href matches /title/tt contributes href[-8:-1], i.e. the seven
    characters before the last one (presumably the numeric imdb id of an
    href ending in a slash). Sections with an empty heading are ignored.
    Raises IndexError if the page has no tn15content div.
    '''
    connections = {}
    page = BeautifulSoup(self.getConnections())
    content = page('div', {'id': 'tn15content'})[0]
    # everything before the first <h5> belongs to no relation, so drop it
    for block in str(content).split('<h5>')[1:]:
      relation = block.split('</h5>')[0]
      blockSoup = BeautifulSoup(block)
      if not relation:
        continue
      titleLinks = blockSoup('a', {'href': re.compile('/title/tt')})
      #relation -> list of imdb ids
      connections[relation] = [link.get('href')[-8:-1] for link in titleLinks]
    return connections
  def getReleaseinfo(self):
    '''Return the content of self.releaseinfoUrl as fetched by getUrlUnicode.'''
    return getUrlUnicode(self.releaseinfoUrl)
@ -530,26 +568,6 @@ class IMDb:
      business['profit'] = business['gross'] - business['budget']
    return business
  def getExternalreviews(self):
    '''Return the content of self.externalreviewsUrl as fetched by getUrlUnicode.'''
    return getUrlUnicode(self.externalreviewsUrl)
  def parseExternalreviews(self):
    '''
    Parse the page returned by self.getExternalreviews() into a dict:
    link href -> link content.

    Reads the <li> items of the first <ol> on the page. For every item the
    first <a> is used: its href becomes the key and its first child
    (normally the link text) the value. Items without a link, or whose link
    has no content, are skipped. Returns an empty dict if the page contains
    no <ol>.
    '''
    soup = BeautifulSoup(self.getExternalreviews())
    reviews = {}
    ol = soup('ol')
    if not ol:
      return reviews
    for li in ol[0]('li'):
      try:
        a = li('a')[0]
        txt = a.contents[0]
      except IndexError:
        # No <a> in this item, or an <a> without content: nothing to record.
        # Only IndexError is caught, so KeyboardInterrupt, SystemExit and
        # unrelated errors are no longer silently swallowed.
        continue
      reviews[a.get('href')] = txt
    return reviews
 def guess(title, director=''):
  #FIXME: proper file -> title
  title = title.split('-')[0]