findRegexp -> findRe, update setup.py

j 2008-05-07 11:45:00 +02:00
parent 1b93ae048d
commit 8e8f8f3896
4 changed files with 37 additions and 33 deletions
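Note: the helper being renamed lives in oxutils and is not part of this diff; only the call sites change. As a reference point, here is a minimal sketch of what a findRe-style helper could look like, with the signature inferred from the call sites below (text first, pattern second, empty string when nothing matches); the actual oxutils implementation may differ:

import re

def findRe(string, pattern):
    # Hypothetical stand-in for oxutils.findRe, inferred from this diff's
    # call sites: return the first captured group of the first match,
    # or '' when the pattern does not match.
    match = re.search(pattern, string, re.DOTALL)
    if match:
        return match.group(1) if match.groups() else match.group(0)
    return ''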

View file

@@ -12,7 +12,7 @@ import time
from BeautifulSoup import BeautifulSoup
import chardet
import oxutils
from oxutils import stripTags, htmldecode, findRegexp, findString
from oxutils import stripTags, htmldecode, findRe, findString
from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle, normalizeImdbId
@@ -57,7 +57,7 @@ def getMovieInfo(imdbId):
data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data)
info = dict()
info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
title = stripTags(i[0]).strip().lower()
@@ -93,11 +93,12 @@ def getMovieInfo(imdbId):
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = htmldecode(html_title)
title = stripTags(title)
year = findRegexp(title, '\((\d{4})\)')
year = findRe(title, '\((\d{4})\)')
if not year:
year = findRegexp(title, '\((\d{4})')
title = re.sub('\(\d{4}\)', '', title)
title = re.sub('\(\d{4}/I*\)', '', title)
year = findRe(title, '\((\d{4})')
_y = findRe(title, r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
if _y:
title = title.replace(_y, '')
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
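The hunk above replaces the two year-stripping re.sub calls with a single pattern that also covers IMDb's disambiguation suffixes: a year may contain '?' for unknown digits and may carry a Roman-numeral suffix such as /II. A quick illustration with made-up titles, reusing the hypothetical findRe sketch above:

pattern = r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)'
findRe('Some Title (2008)', pattern)     # -> '2008'
findRe('Some Title (2008/II)', pattern)  # -> '2008/II'
findRe('Some Title (199?)', pattern)     # -> '199?'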
@@ -109,14 +110,14 @@ def getMovieInfo(imdbId):
info['year'] = year
'''
#Rating
rating = findRegexp(data, '<b>(.*?)/10</b>')
rating = findRe(data, '<b>(.*?)/10</b>')
if rating:
info['rating'] = int(float(rating) * 1000)
else:
info['rating'] = -1
#Votes
votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
votes = findRe(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
if votes:
info['votes'] = int(votes.replace(',', ''))
else:
@@ -171,10 +172,10 @@ def getMovieTrailers(imdbId):
for a in videos[0]('a'):
title = stripTags(unicode(a)).strip()
url = 'http://www.imdb.com' + a['href']
videoId = findRegexp(url, '/(vi\d*?)/')
videoId = findRe(url, '/(vi\d*?)/')
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
iframe = getUrlUnicode(iframeUrl)
videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.*?)"'))
videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
return trailers
@@ -260,7 +261,7 @@ def getMovieConnections(imdbId):
cs = BeautifulSoup(c)
if connection:
#relation -> list of imdb ids
connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
return connections
def getMovieKeywords(imdbId):
@@ -315,11 +316,11 @@ class IMDb:
value = unicode(value, 'utf-8')
value = stripTags(value).strip()
if key == 'runtime':
parsed_value = findRegexp(value, '(.*?) min')
parsed_value = findRegexp(parsed_value, '([0-9]+)')
parsed_value = findRe(value, '(.*?) min')
parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = findRegexp(value, '(.*?) sec')
parsed_value = findRegexp(parsed_value, '([0-9]+)')
parsed_value = findRe(value, '(.*?) sec')
parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = 0
else:
@@ -598,7 +599,7 @@ def guess(title, director=''):
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id:
return imdb_id

View file

@@ -9,7 +9,7 @@ from urllib import quote
import sha
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from oxutils.normalize import normalizeImdbId
from torrent import Torrent
@@ -47,7 +47,7 @@ def findMovieByImdb(imdbId):
def getId(mininovaId):
mininovaId = unicode(mininovaId)
d = findRegexp(mininovaId, "/(\d+)")
d = findRe(mininovaId, "/(\d+)")
if d:
return d
mininovaId = mininovaId.split('/')
@@ -78,9 +78,9 @@ def getData(mininovaId):
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
torrent[u'title'] = findRegexp(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
torrent[u'description'] = findRegexp(data, '<div id="description">(.*?)</div>')
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = getUrl(torrent[u'torrent_link'])
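The torrent scrapers here and in the next file pull each field with one findRe call per pattern. A rough illustration against a made-up snippet (not actual Mininova or Pirate Bay markup), again using the hypothetical findRe sketch above:

html = '<title>Example Torrent: details</title> <a href="/title/tt0123456">imdb</a>'
findRe(html, '<title>(.*?):.*?</title>')  # -> 'Example Torrent'
findRe(html, 'title/tt(\d{7})')           # -> '0123456'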

View file

@@ -10,7 +10,7 @@ from urllib2 import URLError
import sha
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils.normalize import normalizeImdbId
from torrent import Torrent
@@ -61,7 +61,7 @@ def getId(piratebayId):
piratebayId = piratebayId.split('org/')[1]
if 'tor/' in piratebayId:
piratebayId = piratebayId.split('tor/')[1]
d = findRegexp(piratebayId, "/(\d+)")
d = findRe(piratebayId, "/(\d+)")
if d:
piratebayId = d
return piratebayId
@@ -81,18 +81,18 @@ def getData(piratebayId):
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
data = _getUrlUnicode(torrent['comment_link'])
torrent[u'title'] = findRegexp(data, '<title>(.*?) \(download torrent\) - TPB</title>')
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quote(torrent['title']))
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
torrent[u'description'] = findRegexp(data, '<div class="nfo">(.*?)</div>')
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _getUrl(torrent[u'torrent_link'])

View file

@@ -8,16 +8,19 @@ import os
setup(
name="ox",
version="0.1",
# uncomment the following lines if you fill them out in release.py
description="collection of scrapers for various websites",
author="bot",
author_email="bot@0xdb.org",
url="http://ox.0xdb.org",
download_url="http://ox.0xdb.org/download",
license="GPL",
author="0x",
author_email="code@0xdb.org",
url="http://code.0xdb.org/ox",
download_url="http://code.0xdb.org/ox/download",
license="GPLv3",
packages=find_packages(),
zip_safe=False,
install_requires=[
'oxutils',
'feedparser',
'beautifulsoup',
],
keywords = [
],
classifiers = [