adding movies to itunes.py

Rolux 2008-05-07 13:36:47 +02:00
commit c4f0505ae8
4 changed files with 42 additions and 38 deletions
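
The diff below renames two oxutils helpers throughout the IMDb, Mininova and The Pirate Bay scrapers (findRegexp becomes findRe, htmldecode becomes decodeHtml) and updates the package metadata in setup.py. As a minimal sketch of how the renamed helper is used by these call sites, assuming findRe(string, pattern) returns the first captured group of the first match, or an empty string when nothing matches (behaviour inferred from the calls below, not from oxutils itself):

import re

def findRe(string, pattern):
    # assumed behaviour: first capture group of the first match, '' if no match
    match = re.compile(pattern, re.DOTALL).search(string)
    return match.group(1).strip() if match else ''

# example mirroring the votes lookup in the IMDb scraper below
data = '<small>(<a href="ratings">1,234 votes</a>)</small>'
votes = findRe(data, r'<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
print(votes)  # prints: 1,234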

View File

@@ -12,7 +12,7 @@ import time
from BeautifulSoup import BeautifulSoup
import chardet
import oxutils
-from oxutils import stripTags, htmldecode, findRegexp, findString
+from oxutils import stripTags, decodeHtml, findRe, findString
from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle, normalizeImdbId
@@ -57,13 +57,13 @@ def getMovieInfo(imdbId):
data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data)
info = dict()
-info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"')
+info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
title = stripTags(i[0]).strip().lower()
txt= stripTags(i[1]).strip()
def cleanUp(k):
-k = htmldecode(k).replace(u'\xa0', ' ').strip()
+k = decodeHtml(k).replace(u'\xa0', ' ').strip()
if k.endswith('more'): k=k[:-len('more')].strip()
return k
txt = cleanUp(txt)
@@ -91,13 +91,14 @@ def getMovieInfo(imdbId):
if html_title:
html_title = str(html_title[0])
html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
-title = htmldecode(html_title)
+title = decodeHtml(html_title)
title = stripTags(title)
-year = findRegexp(title, '\((\d{4})\)')
+year = findRe(title, '\((\d{4})\)')
if not year:
-year = findRegexp(title, '\((\d{4})')
-title = re.sub('\(\d{4}\)', '', title)
-title = re.sub('\(\d{4}/I*\)', '', title)
+year = findRe(title, '\((\d{4})')
+_y = findRe(title, r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
+if _y:
+    title = title.replace(_y, '')
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '')
title = title.strip()
@@ -109,14 +110,14 @@ def getMovieInfo(imdbId):
info['year'] = year
'''
#Rating
-rating = findRegexp(data, '<b>(.*?)/10</b>')
+rating = findRe(data, '<b>(.*?)/10</b>')
if rating:
info['rating'] = int(float(rating) * 1000)
else:
info['rating'] = -1
#Votes
-votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
+votes = findRe(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
if votes:
info['votes'] = int(votes.replace(',', ''))
else:
@@ -171,10 +172,10 @@ def getMovieTrailers(imdbId):
for a in videos[0]('a'):
title = stripTags(unicode(a)).strip()
url = 'http://www.imdb.com' + a['href']
-videoId = findRegexp(url, '/(vi\d*?)/')
+videoId = findRe(url, '/(vi\d*?)/')
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
iframe = getUrlUnicode(iframeUrl)
-videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.*?)"'))
+videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
return trailers
@@ -209,7 +210,7 @@ def getMovieLocations(imdbId):
soup = BeautifulSoup(data)
locations = []
for key in soup('a', {'href': re.compile('^/List')}):
-locations.append(htmldecode(key.string))
+locations.append(decodeHtml(key.string))
return locations
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
@@ -260,7 +261,7 @@ def getMovieConnections(imdbId):
cs = BeautifulSoup(c)
if connection:
#relation -> list of imdb ids
-connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
+connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
return connections
def getMovieKeywords(imdbId):
@@ -269,7 +270,7 @@ def getMovieKeywords(imdbId):
soup = BeautifulSoup(data)
keywords = []
for key in soup('a', {'href': re.compile('^/keyword/')}):
-k = htmldecode(key.string)
+k = decodeHtml(key.string)
k = k.replace(u'\xa0', ' ')
keywords.append(k)
return keywords
@@ -315,11 +316,11 @@ class IMDb:
value = unicode(value, 'utf-8')
value = stripTags(value).strip()
if key == 'runtime':
-parsed_value = findRegexp(value, '(.*?) min')
-parsed_value = findRegexp(parsed_value, '([0-9]+)')
+parsed_value = findRe(value, '(.*?) min')
+parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value:
-parsed_value = findRegexp(value, '(.*?) sec')
-parsed_value = findRegexp(parsed_value, '([0-9]+)')
+parsed_value = findRe(value, '(.*?) sec')
+parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value:
parsed_value = 0
else:
@@ -508,7 +509,7 @@ class IMDb:
episodes[episode]['title'] = match[3].strip()
if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])):
episodes[episode]['title'] = u''
-description = htmldecode(match[5])
+description = decodeHtml(match[5])
description = stripTags(description.split('Next US airings:')[0])
episodes[episode]['description'] = description
episodes[episode]['date'] = ''
@@ -598,7 +599,7 @@ def guess(title, director=''):
if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35]
if data:
-imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
+imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id:
return imdb_id

View File

@@ -9,7 +9,7 @@ from urllib import quote
import sha
from oxutils.cache import getUrl, getUrlUnicode
-from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
+from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from oxutils.normalize import normalizeImdbId
from torrent import Torrent
@@ -47,7 +47,7 @@ def findMovieByImdb(imdbId):
def getId(mininovaId):
mininovaId = unicode(mininovaId)
-d = findRegexp(mininovaId, "/(\d+)")
+d = findRe(mininovaId, "/(\d+)")
if d:
return d
mininovaId = mininovaId.split('/')
@@ -78,9 +78,9 @@ def getData(mininovaId):
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
-torrent[u'title'] = findRegexp(data, '<title>(.*?):.*?</title>')
-torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
-torrent[u'description'] = findRegexp(data, '<div id="description">(.*?)</div>')
+torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
+torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
+torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = getUrl(torrent[u'torrent_link'])

View File

@@ -10,7 +10,7 @@ from urllib2 import URLError
import sha
from oxutils.cache import getUrl, getUrlUnicode
-from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
+from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils.normalize import normalizeImdbId
from torrent import Torrent
@@ -61,7 +61,7 @@ def getId(piratebayId):
piratebayId = piratebayId.split('org/')[1]
if 'tor/' in piratebayId:
piratebayId = piratebayId.split('tor/')[1]
-d = findRegexp(piratebayId, "/(\d+)")
+d = findRe(piratebayId, "/(\d+)")
if d:
piratebayId = d
return piratebayId
@@ -81,18 +81,18 @@ def getData(piratebayId):
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
data = _getUrlUnicode(torrent['comment_link'])
-torrent[u'title'] = findRegexp(data, '<title>(.*?) \(download torrent\) - TPB</title>')
+torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
-torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})')
+torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quote(torrent['title']))
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value
-torrent[u'description'] = findRegexp(data, '<div class="nfo">(.*?)</div>')
+torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _getUrl(torrent[u'torrent_link'])

View File

@@ -8,16 +8,19 @@ import os
setup(
name="ox",
version="0.1",
-# uncomment the following lines if you fill them out in release.py
description="collection of scrapers for various websites",
author="bot",
author_email="bot@0xdb.org",
url="http://ox.0xdb.org",
download_url="http://ox.0xdb.org/download",
license="GPL",
author="0x",
author_email="code@0xdb.org",
url="http://code.0xdb.org/ox",
download_url="http://code.0xdb.org/ox/download",
license="GPLv3",
packages=find_packages(),
zip_safe=False,
install_requires=[
'oxutils',
'feedparser',
'beautifulsoup',
],
keywords = [
],
classifiers = [