adding movies to itunes.py

This commit is contained in:
Rolux 2008-05-07 13:36:47 +02:00
commit c4f0505ae8
4 changed files with 42 additions and 38 deletions

View file

@ -12,7 +12,7 @@ import time
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
import chardet import chardet
import oxutils import oxutils
from oxutils import stripTags, htmldecode, findRegexp, findString from oxutils import stripTags, decodeHtml, findRe, findString
from oxutils.cache import getUrl, getUrlUnicode from oxutils.cache import getUrl, getUrlUnicode
from oxutils.normalize import normalizeTitle, normalizeImdbId from oxutils.normalize import normalizeTitle, normalizeImdbId
@ -57,13 +57,13 @@ def getMovieInfo(imdbId):
data = getUrl(getUrlBase(imdbId)) data = getUrl(getUrlBase(imdbId))
soup = BeautifulSoup(data) soup = BeautifulSoup(data)
info = dict() info = dict()
info['poster'] = findRegexp(data, 'name="poster".*?<img .*?src="(.*?)"') info['poster'] = findRe(data, 'name="poster".*?<img .*?src="(.*?)"')
for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data): for i in re.compile('<h5>(.*?):</h5>(.*?)<div class="info"', re.DOTALL).findall(data):
title = stripTags(i[0]).strip().lower() title = stripTags(i[0]).strip().lower()
txt= stripTags(i[1]).strip() txt= stripTags(i[1]).strip()
def cleanUp(k): def cleanUp(k):
k = htmldecode(k).replace(u'\xa0', ' ').strip() k = decodeHtml(k).replace(u'\xa0', ' ').strip()
if k.endswith('more'): k=k[:-len('more')].strip() if k.endswith('more'): k=k[:-len('more')].strip()
return k return k
txt = cleanUp(txt) txt = cleanUp(txt)
@ -91,13 +91,14 @@ def getMovieInfo(imdbId):
if html_title: if html_title:
html_title = str(html_title[0]) html_title = str(html_title[0])
html_title = html_title.replace('<br />', ' ').replace(' ', ' ') html_title = html_title.replace('<br />', ' ').replace(' ', ' ')
title = htmldecode(html_title) title = decodeHtml(html_title)
title = stripTags(title) title = stripTags(title)
year = findRegexp(title, '\((\d{4})\)') year = findRe(title, '\((\d{4})\)')
if not year: if not year:
year = findRegexp(title, '\((\d{4})') year = findRe(title, '\((\d{4})')
title = re.sub('\(\d{4}\)', '', title) _y = findRe(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
title = re.sub('\(\d{4}/I*\)', '', title) if _y:
title = title.replace(_y, '')
for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'): for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '') title = title.replace(t, '')
title = title.strip() title = title.strip()
@ -109,14 +110,14 @@ def getMovieInfo(imdbId):
info['year'] = year info['year'] = year
''' '''
#Rating #Rating
rating = findRegexp(data, '<b>(.*?)/10</b>') rating = findRe(data, '<b>(.*?)/10</b>')
if rating: if rating:
info['rating'] = int(float(rating) * 1000) info['rating'] = int(float(rating) * 1000)
else: else:
info['rating'] = -1 info['rating'] = -1
#Votes #Votes
votes = findRegexp(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>') votes = findRe(data, '<small>\(<a href="ratings">(.*?) votes</a>\)</small>')
if votes: if votes:
info['votes'] = int(votes.replace(',', '')) info['votes'] = int(votes.replace(',', ''))
else: else:
@ -171,10 +172,10 @@ def getMovieTrailers(imdbId):
for a in videos[0]('a'): for a in videos[0]('a'):
title = stripTags(unicode(a)).strip() title = stripTags(unicode(a)).strip()
url = 'http://www.imdb.com' + a['href'] url = 'http://www.imdb.com' + a['href']
videoId = findRegexp(url, '/(vi\d*?)/') videoId = findRe(url, '/(vi\d*?)/')
iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
iframe = getUrlUnicode(iframeUrl) iframe = getUrlUnicode(iframeUrl)
videoUrl = unquote(findRegexp(iframe, 'addVariable\("file", "(.*?)"')) videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl}) trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
return trailers return trailers
@ -209,7 +210,7 @@ def getMovieLocations(imdbId):
soup = BeautifulSoup(data) soup = BeautifulSoup(data)
locations = [] locations = []
for key in soup('a', {'href': re.compile('^/List')}): for key in soup('a', {'href': re.compile('^/List')}):
locations.append(htmldecode(key.string)) locations.append(decodeHtml(key.string))
return locations return locations
def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')): def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
@ -260,7 +261,7 @@ def getMovieConnections(imdbId):
cs = BeautifulSoup(c) cs = BeautifulSoup(c)
if connection: if connection:
#relation -> list of imdb ids #relation -> list of imdb ids
connections[connection] = [findRegexp(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})] connections[connection] = [findRe(a.get('href'), "\d{7}") for a in cs('a', {'href': re.compile('/title/tt')})]
return connections return connections
def getMovieKeywords(imdbId): def getMovieKeywords(imdbId):
@ -269,7 +270,7 @@ def getMovieKeywords(imdbId):
soup = BeautifulSoup(data) soup = BeautifulSoup(data)
keywords = [] keywords = []
for key in soup('a', {'href': re.compile('^/keyword/')}): for key in soup('a', {'href': re.compile('^/keyword/')}):
k = htmldecode(key.string) k = decodeHtml(key.string)
k = k.replace(u'\xa0', ' ') k = k.replace(u'\xa0', ' ')
keywords.append(k) keywords.append(k)
return keywords return keywords
@ -315,11 +316,11 @@ class IMDb:
value = unicode(value, 'utf-8') value = unicode(value, 'utf-8')
value = stripTags(value).strip() value = stripTags(value).strip()
if key == 'runtime': if key == 'runtime':
parsed_value = findRegexp(value, '(.*?) min') parsed_value = findRe(value, '(.*?) min')
parsed_value = findRegexp(parsed_value, '([0-9]+)') parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value: if not parsed_value:
parsed_value = findRegexp(value, '(.*?) sec') parsed_value = findRe(value, '(.*?) sec')
parsed_value = findRegexp(parsed_value, '([0-9]+)') parsed_value = findRe(parsed_value, '([0-9]+)')
if not parsed_value: if not parsed_value:
parsed_value = 0 parsed_value = 0
else: else:
@ -508,7 +509,7 @@ class IMDb:
episodes[episode]['title'] = match[3].strip() episodes[episode]['title'] = match[3].strip()
if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])): if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])):
episodes[episode]['title'] = u'' episodes[episode]['title'] = u''
description = htmldecode(match[5]) description = decodeHtml(match[5])
description = stripTags(description.split('Next US airings:')[0]) description = stripTags(description.split('Next US airings:')[0])
episodes[episode]['description'] = description episodes[episode]['description'] = description
episodes[episode]['date'] = '' episodes[episode]['date'] = ''
@ -598,7 +599,7 @@ def guess(title, director=''):
if return_url.startswith('http://www.imdb.com/title/tt'): if return_url.startswith('http://www.imdb.com/title/tt'):
return return_url[28:35] return return_url[28:35]
if data: if data:
imdb_id = findRegexp(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)') imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?<ol><li>.*?<a href="/title/tt(.......)')
if imdb_id: if imdb_id:
return imdb_id return imdb_id

View file

@ -9,7 +9,7 @@ from urllib import quote
import sha import sha
from oxutils.cache import getUrl, getUrlUnicode from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from oxutils.normalize import normalizeImdbId from oxutils.normalize import normalizeImdbId
from torrent import Torrent from torrent import Torrent
@ -47,7 +47,7 @@ def findMovieByImdb(imdbId):
def getId(mininovaId): def getId(mininovaId):
mininovaId = unicode(mininovaId) mininovaId = unicode(mininovaId)
d = findRegexp(mininovaId, "/(\d+)") d = findRe(mininovaId, "/(\d+)")
if d: if d:
return d return d
mininovaId = mininovaId.split('/') mininovaId = mininovaId.split('/')
@ -78,9 +78,9 @@ def getData(mininovaId):
value = decodeHtml(stripTags(d[1].strip())) value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value torrent[key] = value
torrent[u'title'] = findRegexp(data, '<title>(.*?):.*?</title>') torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})') torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'description'] = findRegexp(data, '<div id="description">(.*?)</div>') torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
if torrent['description']: if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip() torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = getUrl(torrent[u'torrent_link']) t = getUrl(torrent[u'torrent_link'])

View file

@ -10,7 +10,7 @@ from urllib2 import URLError
import sha import sha
from oxutils.cache import getUrl, getUrlUnicode from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRegexp, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils.normalize import normalizeImdbId from oxutils.normalize import normalizeImdbId
from torrent import Torrent from torrent import Torrent
@ -61,7 +61,7 @@ def getId(piratebayId):
piratebayId = piratebayId.split('org/')[1] piratebayId = piratebayId.split('org/')[1]
if 'tor/' in piratebayId: if 'tor/' in piratebayId:
piratebayId = piratebayId.split('tor/')[1] piratebayId = piratebayId.split('tor/')[1]
d = findRegexp(piratebayId, "/(\d+)") d = findRe(piratebayId, "/(\d+)")
if d: if d:
piratebayId = d piratebayId = d
return piratebayId return piratebayId
@ -81,18 +81,18 @@ def getData(piratebayId):
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
data = _getUrlUnicode(torrent['comment_link']) data = _getUrlUnicode(torrent['comment_link'])
torrent[u'title'] = findRegexp(data, '<title>(.*?) \(download torrent\) - TPB</title>') torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']: if not torrent[u'title']:
return None return None
torrent[u'title'] = decodeHtml(torrent[u'title']).strip() torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
torrent[u'imdbId'] = findRegexp(data, 'title/tt(\d{7})') torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quote(torrent['title'])) torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, quote(torrent['title']))
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data): for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip() key = d[0].lower().strip()
key = _key_map.get(key, key) key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip())) value = decodeHtml(stripTags(d[1].strip()))
torrent[key] = value torrent[key] = value
torrent[u'description'] = findRegexp(data, '<div class="nfo">(.*?)</div>') torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']: if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip() torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _getUrl(torrent[u'torrent_link']) t = _getUrl(torrent[u'torrent_link'])

View file

@ -8,16 +8,19 @@ import os
setup( setup(
name="ox", name="ox",
version="0.1", version="0.1",
# uncomment the following lines if you fill them out in release.py
description="collection of scrapers for various websites", description="collection of scrapers for various websites",
author="bot", author="0x",
author_email="bot@0xdb.org", author_email="code@0xdb.org",
url="http://ox.0xdb.org", url="http://code.0xdb.org/ox",
download_url="http://ox.0xdb.org/download", download_url="http://code.0xdb.org/ox/download",
license="GPL", license="GPLv3",
packages=find_packages(), packages=find_packages(),
zip_safe=False, zip_safe=False,
install_requires=[
'oxutils',
'feedparser',
'beautifulsoup',
],
keywords = [ keywords = [
], ],
classifiers = [ classifiers = [