add one dailymotion function

This commit is contained in:
j 2008-04-30 15:31:50 +02:00
parent 40185f89ab
commit 7668ceafc1
2 changed files with 114 additions and 81 deletions

15
ox/dailymotion.py Normal file
View file

@ -0,0 +1,15 @@
import re
from urllib import unquote
from oxutils.cache import getUrl
def getVideoUrl(url):
    """Resolve a dailymotion page URL to the direct video URL.

    Fetches the page, pulls the first `video", "..."` reference out of the
    embedded player markup, and returns it as an absolute URL.
    Returns '' when the page contains no video reference.
    """
    page = getUrl(url)
    matches = re.compile('''video", "(.*?)"''').findall(page)
    if matches:
        # The value is URL-encoded and may carry '@@'-separated variants;
        # keep only the first variant, as the original code did.
        path = unquote(matches[0]).split('@@')[0]
        return "http://www.dailymotion.com" + path
    return ''
if __name__ == '__main__':
    # Manual smoke test against two known dailymotion result pages.
    for test_url in (
        'http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms',
        'http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms',
    ):
        print(getVideoUrl(test_url))

View file

@ -38,14 +38,22 @@ def getUrlBase(imdbId):
def getRawMovieData(imdbId):
    """Collect all scraped metadata for one movie into a single dict.

    Starts from the main info page (title, year, poster, ...) and layers
    the per-page scrapers on top. Every value comes from a separate HTTP
    fetch, so this is slow and network-dependent.
    """
    imdbId = normalizeImdbId(imdbId)
    # NOTE(review): the diff rendering had fused the removed pre-getMovieInfo
    # dict-building lines into this body; only the new-side lines are kept.
    data = getMovieInfo(imdbId)
    data['credits'] = getMovieCredits(imdbId)
    data['poster'] = getMoviePoster(imdbId)
    data['connections'] = getMovieConnections(imdbId)
    data['company credits'] = getMovieCompanyCredits(imdbId)
    data['filming locations'] = getMovieLocations(imdbId)
    data['movie connections'] = getMovieConnections(imdbId)
    data['external reviews'] = getMovieExternalReviews(imdbId)
    data['trivia'] = getMovieTrivia(imdbId)
    data['keywords'] = getMovieKeywords(imdbId)
    data['media'] = {}
    data['media']['images'] = getMovieImages(imdbId)
    data['media']['trailers'] = getMovieTrailers(imdbId)
    return data
def parseBase(imdbId):
def getMovieInfo(imdbId):
data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data)
info = dict()
@ -76,6 +84,7 @@ def parseBase(imdbId):
#get Title
title = ''
year = ''
html_title = soup('div', {'id': 'tn15title'})
if not html_title:
html_title = soup('title')
@ -84,8 +93,11 @@ def parseBase(imdbId):
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = htmldecode(html_title)
title = stripTags(title)
title = re.sub('\(\d\d\d\d\)', '', title)
title = re.sub('\(\d\d\d\d/I*\)', '', title)
year = findRegexp(title, '\((\d{4})\)')
if not year:
year = findRegexp(title, '\((\d{4})')
title = re.sub('\(\d{4}\)', '', title)
title = re.sub('\(\d{4}/I*\)', '', title)
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
@ -94,14 +106,34 @@ def parseBase(imdbId):
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
info['title'] = title
info['year'] = year
'''
#Rating
rating = findRegexp(data, '<b>(.*?)/10</b>')
if rating:
info['rating'] = int(float(rating) * 1000)
else:
info['rating'] = -1
#Votes
votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
if votes:
info['votes'] = int(votes.replace(',', ''))
else:
info['votes'] = -1
'''
return info
def getMoviePoster(imdbId):
    """Return the poster URL parsed from the movie's main info page."""
    # The stale removed-side remnant (getPoster/parseBase) fused in by the
    # diff rendering is dropped; only the renamed new function remains.
    info = getMovieInfo(imdbId)
    return info['poster']
def getMovieYear(imdbId):
    """Return the release year parsed from the movie's main info page."""
    # The stale removed-side remnant (getTitle/parseBase) fused in by the
    # diff rendering is dropped; only the new function remains.
    info = getMovieInfo(imdbId)
    return info['year']
def getMovieTitle(imdbId):
    """Return the cleaned title parsed from the movie's main info page."""
    return getMovieInfo(imdbId)['title']
def creditList(data, section=None):
@ -118,7 +150,7 @@ def creditList(data, section=None):
credits.append(c)
return credits
def getCredits(imdbId):
def getMovieCredits(imdbId):
credits = dict()
url = "%s/fullcredits" % getUrlBase(imdbId)
data = getUrlUnicode(url)
@ -216,6 +248,52 @@ def getMovieTrivia(imdbId):
trivia.append(t)
return trivia
def getMovieConnections(imdbId):
    """Scrape the movieconnections page.

    Returns a dict mapping relation name (e.g. 'Follows', 'Referenced in')
    to a list of 7-digit imdb ids.
    """
    url = "%s/movieconnections" % getUrlBase(imdbId)
    soup = BeautifulSoup(getUrlUnicode(url))
    content = soup('div', {'id': 'tn15content'})[0]
    connections = {}
    # Each relation group is introduced by an <h5>relation</h5> heading.
    for chunk in str(content).split('<h5>')[1:]:
        relation = chunk.split('</h5>')[0]
        if not relation:
            continue
        links = BeautifulSoup(chunk)('a', {'href': re.compile('/title/tt')})
        connections[relation] = [
            findRegexp(link.get('href'), "\d{7}") for link in links
        ]
    return connections
def getMovieKeywords(imdbId):
    """Scrape the keywords page and return the list of keyword strings."""
    url = "%s/keywords" % getUrlBase(imdbId)
    soup = BeautifulSoup(getUrlUnicode(url))
    # Keyword links all point under /keyword/; decode entities and
    # normalize non-breaking spaces to plain spaces.
    return [
        htmldecode(link.string).replace(u'\xa0', ' ')
        for link in soup('a', {'href': re.compile('^/keyword/')})
    ]
def getMovieExternalReviews(imdbId):
    """Scrape the externalreviews page.

    Returns a dict mapping review URL -> review title, or {} when the
    page has no review list.
    """
    url = "%s/externalreviews" % getUrlBase(imdbId)
    data = getUrlUnicode(url)
    soup = BeautifulSoup(data)
    ol = soup('ol')
    if not ol:
        return {}
    ret = {}
    for li in ol[0]('li'):
        # Best-effort parsing: skip malformed list items instead of
        # aborting the whole page. The original bare `except:` also
        # swallowed KeyboardInterrupt/SystemExit; narrowed to the
        # exceptions the lookups below can actually raise.
        try:
            a = li('a')[0]
            ret[a.get('href')] = a.contents[0]
        except (IndexError, AttributeError):
            continue
    return ret
'''the old code below'''
class IMDb:
@ -224,14 +302,10 @@ class IMDb:
self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
self.businessUrl = "%sbusiness" % self.pageUrl
self.connectionsUrl = "%smovieconnections" % self.pageUrl
self.creditsUrl = "%sfullcredits" % self.pageUrl
self.episodesUrl = "%sepisodes" % self.pageUrl
self.keywordUrl = "%skeywords" % self.pageUrl
self.plotUrl = "%splotsummary" % self.pageUrl
self.releaseinfoUrl = "%sreleaseinfo" % self.pageUrl
self.locationUrl = "%slocations" % self.pageUrl
self.externalreviewsUrl = "%sexternalreviews" % self.pageUrl
def getPage(self):
    """Fetch the HTML of this movie's main title page."""
    return getUrlUnicode(self.pageUrl)
@ -293,7 +367,7 @@ class IMDb:
return parsed_value
def parseTitle(self):
title = getTitle(self.imdb)
title = getMovieTitle(self.imdb)
title = normalizeTitle(title)
if title.startswith('"') and title.find('"',1) > 0 and \
title.find('"',1) == title.rfind('"'):
@ -328,7 +402,7 @@ class IMDb:
data = self.getPage()
IMDbDict ={}
#Poster
IMDbDict['poster'] = getPoster(self.imdb)
IMDbDict['poster'] = getMoviePoster(self.imdb)
if not IMDbDict['poster']:
IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
#Title, Year
@ -373,16 +447,16 @@ class IMDb:
IMDbDict['tvshow'] = False
IMDbDict['credits'] = self.getCredits()
IMDbDict['plot'] = self.parsePlot()
IMDbDict['keywords'] = self.parseKeywords()
IMDbDict['keywords'] = getMovieKeywords(self.imdb)
IMDbDict['trivia'] = getMovieTrivia(self.imdb)
IMDbDict['connections'] = self.parseConnections()
IMDbDict['locations'] = self.parseLocations()
IMDbDict['connections'] = getMovieConnections(self.imdb)
IMDbDict['locations'] = getMovieLocations(self.imdb)
IMDbDict['release_date'] = self.parseReleaseinfo()
IMDbDict['business'] = self.parseBusiness()
IMDbDict['reviews'] = self.parseExternalreviews()
IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
IMDbDict['stills'] = getMovieStills(self.imdb)
#IMDbDict['trailer'] = self.parseTrailer()
#IMDbDict['trailer'] = getMovieTrailer(self.imdb)
self.IMDbDict = IMDbDict
if IMDbDict['episode_of']:
@ -452,42 +526,6 @@ class IMDb:
self.episodes = episodes
return self.episodes
def parseLocations(self):
    """Fetch the locations page; cache and return the filming locations."""
    soup = BeautifulSoup(getUrlUnicode(self.locationUrl))
    # Location links all point under /List...; decode HTML entities.
    self.locations = [
        htmldecode(link.string)
        for link in soup('a', {'href': re.compile('^/List')})
    ]
    return self.locations
def parseKeywords(self):
    """Fetch the keywords page; cache and return the keyword list."""
    soup = BeautifulSoup(getUrlUnicode(self.keywordUrl))
    found = []
    for link in soup('a', {'href': re.compile('^/keyword/')}):
        # Decode entities and normalize non-breaking spaces.
        found.append(htmldecode(link.string).replace(u'\xa0', ' '))
    self.keywords = found
    return self.keywords
def getConnections(self):
    """Fetch the raw HTML of the movieconnections page."""
    return getUrlUnicode(self.connectionsUrl)
def parseConnections(self):
    """Parse movie connections: relation name -> list of imdb id strings.

    Ids are taken as a fixed slice of each /title/tt... link href.
    """
    connections = {}
    soup = BeautifulSoup(self.getConnections())
    content = soup('div', {'id': 'tn15content'})[0]
    # Each relation group is introduced by an <h5>relation</h5> heading.
    for chunk in str(content).split('<h5>')[1:]:
        relation = chunk.split('</h5>')[0]
        chunk_soup = BeautifulSoup(chunk)
        if relation:
            links = chunk_soup('a', {'href': re.compile('/title/tt')})
            connections[relation] = [a.get('href')[-8:-1] for a in links]
    return connections
def getReleaseinfo(self):
    """Fetch the raw HTML of the releaseinfo page."""
    return getUrlUnicode(self.releaseinfoUrl)
@ -530,26 +568,6 @@ class IMDb:
business['profit'] = business['gross'] - business['budget']
return business
def getExternalreviews(self):
    """Fetch the raw HTML of the externalreviews page."""
    return getUrlUnicode(self.externalreviewsUrl)
def parseExternalreviews(self):
    """Parse the externalreviews page into {review url: review title}.

    Returns {} when the page has no <ol> review list.
    """
    soup = BeautifulSoup(self.getExternalreviews())
    ol = soup('ol')
    if not ol:
        return {}
    ret = {}
    for li in ol[0]('li'):
        # Best-effort parsing: skip malformed entries rather than failing
        # the whole page. The original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit; narrowed to what can raise here.
        try:
            a = li('a')[0]
            ret[a.get('href')] = a.contents[0]
        except (IndexError, AttributeError):
            continue
    return ret
def guess(title, director=''):
#FIXME: proper file -> title
title = title.split('-')[0]