update some scrapers

j 2008-03-16 12:16:07 +00:00
parent 4d5b87a890
commit 6bedcaa9d6
5 changed files with 133 additions and 8 deletions

View File

@@ -1,6 +1,7 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
from utils import stripTags
@@ -27,4 +28,40 @@ def torrentsWeLike(link):
  for word in ('dvdrip', 'dvdscr', 'dvd screener'):
    if word in text:
      return True
  return False

def movieType(movie):
  if 'cam' in movie['title'].lower():
    return 'cam'
  if 'vcd' in movie['title'].lower():
    return 'vcd'
  for key in ('telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'ts-screener'):
    if key in movie['title'].lower():
      return 'telecine'
  for key in ('dvdrip', 'dvdscrs'):
    if key in movie['title'].lower():
      return 'dvdrip'
  if 'screener' in movie['title'].lower():
    return 'screener'
  if 'xvid' in movie['title'].lower():
    return 'Xvid'
  if '1080p' in movie['title'].lower():
    return '1080p'
  if '720p' in movie['title'].lower():
    return '720p'
  if 'dvdr' in movie['title'].lower():
    return 'DVDR'
  return ''

def filterMovies(movies):
  m2 = []
  for movie in movies:
    imdb_id = re.compile('title/tt(\d{7})').findall(movie['txt'])
    if imdb_id:
      movie['imdb'] = imdb_id[0]
    else:
      movie['imdb'] = ''
    movie['source_type'] = movieType(movie)
    m2.append(movie)
  return m2
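As a rough illustration of what the new btutils helpers produce (the sample movie dict below is invented; 'txt' just needs to contain an IMDb title URL for the id to be picked up):

# Hypothetical usage sketch -- the sample dict is made up for illustration.
from btutils import filterMovies

sample = dict(
  title='Example Movie DVDRip XviD',
  txt='see http://www.imdb.com/title/tt0123456/ for details',
  comment_link='',
  torrent_link='',
)
tagged = filterMovies([sample])[0]
print tagged['imdb']         # '0123456', extracted from the txt blob
print tagged['source_type']  # 'dvdrip', first keyword that matches the title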

View File

@@ -185,7 +185,7 @@ class IMDb:
    title = stripTags(html_title)
    title = re.sub('\(\d\d\d\d\)', '', title)
    title = re.sub('\(\d\d\d\d/I*\)', '', title)
-    for t in ('TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
+    for t in ('TV series', 'TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
      title = title.replace(t, '')
    if title.find(u'\xa0') > -1:
      title = title[:title.find(u'\xa0')]
@@ -264,6 +264,10 @@ class IMDb:
    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
    IMDbDict['episodes'] = self.parseEpisodes()
    if IMDbDict['episodes']:
      IMDbDict['tvshow'] = True
    else:
      IMDbDict['tvshow'] = False
    IMDbDict['credits'] = self.parseCredits()
    IMDbDict['plot'] = self.parsePlot()
    IMDbDict['keywords'] = self.parseKeywords()
@@ -528,10 +532,10 @@ def guess(title, director=''):
  if director:
    search = 'site:imdb.com %s "%s"' % (director, title)
  else:
    search = 'site:imdb.com "%s"' % title
-  for (name, url, desc) in google(search, 1):
+  for (name, url, desc) in google(search, 2):
    if url.startswith('http://www.imdb.com/title/tt'):
      return url[28:35]
  try:
    req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
    u = urllib2.urlopen(req)
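For reference, the url[28:35] slice in guess() relies on 'http://www.imdb.com/title/tt' being exactly 28 characters long, so the next seven characters are the IMDb id; a quick sketch with a made-up result URL:

# Made-up URL, only to show what the slice returns.
url = 'http://www.imdb.com/title/tt0123456/'
if url.startswith('http://www.imdb.com/title/tt'):
  print url[28:35]   # '0123456'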

View File

@@ -6,9 +6,10 @@ import socket
from urllib import quote
from BeautifulSoup import BeautifulSoup
import feedparser
from utils import read_url, read_url_utf8
-from btutils import torrentsWeLike
+from btutils import torrentsWeLike, filterMovies
socket.setdefaulttimeout(10.0)
@@ -47,3 +48,40 @@ def searchByImdb(imdb):
    torrent_url = "http://www.mininova.org%s" % torrent.get('href').replace('/tor', '/get')
    torrents.append(torrent_url)
  return torrents

def getId(s):
  s = s.split('/')
  if len(s) == 1:
    return s[0]
  else:
    return s[-1]

def getInfo(mid):
  mid = getId(mid)
  comment_link = "http://www.mininova.org/tor/%s" % mid
  torrent_link = "http://www.mininova.org/get/%s" % mid
  details_link = "http://www.mininova.org/det/%s" % mid
  txt = read_url(comment_link) + '\n' + read_url(details_link)
  txt = txt.decode('utf-8', 'replace')
  title = re.compile('<title>(.*?):.*?</title>').findall(txt)[0]
  if "This torrent does not exist..." in txt:
    print "This torrent does not exist...", mid
    return None
  movie = dict(
    title=title,
    txt=txt,
    comment_link=comment_link,
    torrent_link=torrent_link,
  )
  return filterMovies([movie,])[0]

def newMovies(preFilter):
  url = "http://www.mininova.org/rss.xml?cat=4"
  page = read_url(url)
  fd = feedparser.parse(page)
  movies = []
  for entry in fd.entries:
    if not preFilter or preFilter(entry):
      movie = dict(
        title=entry.title,
        txt=entry.summary,
        comment_link=entry.link,
        torrent_link=entry.link.replace('/tor/','/get/')
      )
      movies.append(movie)
  movies = filterMovies(movies)
  return movies
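A short, hypothetical usage sketch for the new mininova helpers (the torrent id below is invented; newMovies has no default for preFilter here, so None is passed to accept every feed entry):

# Hypothetical id; getId() accepts either a bare id or a full /tor/ URL.
print getId('http://www.mininova.org/tor/123456')   # '123456'
print getId('123456')                               # '123456'

info = getInfo('123456')        # None if mininova reports the torrent as gone
if info:
  print info['title'], info['imdb'], info['source_type']

for movie in newMovies(None):   # None: no pre-filtering of the RSS entries
  print movie['title'], movie['torrent_link']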

View File

@@ -20,7 +20,7 @@ def getRottenTomatoes(rating = 70):
  offset = 0
  titles = ['1']
  while titles:
-    url = "http://www.rottentomatoes.com/movies/browser.php?movietype=1&genre=&tomatometer=&avgrating=%s&numreviews=10&mpaa=&x=40&y=5&start_index=%s" % (rating, offset)
+    url = "http://www.rottentomatoes.com/browser.php?movietype=1&genre=&tomatometer=&avgrating=%s&numreviews=10&mpaa=&x=56&y=10&start_index=%d" % (rating, offset)
    page = read_url(url)
    soup = BeautifulSoup(page)
    titles = [link.contents[0] for link in soup.findAll('a', {'class': 'movie-link'})]
@@ -34,4 +34,4 @@ def getRottenTomatoes(rating = 70):
    offset += 10
  return movies
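Only the browser.php query changes here; a minimal call sketch (what each entry of the returned movies list contains is not visible in this hunk, so only its length is printed):

# Sketch only -- fetches everything rated 80 or better, ten titles per page.
movies = getRottenTomatoes(rating=80)
print len(movies)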

View File

@@ -5,10 +5,12 @@
import re
import socket
from urllib import quote
from urllib2 import URLError
from BeautifulSoup import BeautifulSoup
import feedparser
-from btutils import torrentsWeLike
+from btutils import torrentsWeLike, filterMovies
from google import google
from utils import read_url, read_url_utf8
@@ -116,3 +118,40 @@ def search(query, filterResult = False):
def searchByImdb(imdb):
  return search("tt" + imdb)

def getId(pid):
  if pid.startswith('http://torrents.thepiratebay.org/'):
    pid = pid.split('org/')[1]
  if 'tor/' in pid:
    pid = pid.split('tor/')[1]
  return pid

def getInfo(piratebayID):
  piratebayID = getId(piratebayID)
  url = 'http://thepiratebay.org/tor/%s' % piratebayID
  try:
    txt = read_url(url).decode('utf-8', 'replace')
  except URLError, e:
    if e.code == 404:
      return None
  title = re.compile('<title>(.*?) \(download torrent\) - TPB</title>').findall(txt)[0]
  movie = dict(
    title=title,
    txt=txt,
    comment_link=url,
    torrent_link="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayID, title)
  )
  return filterMovies([movie,])[0]

def newMovies(preFilter=None):
  url = "http://rss.thepiratebay.org/201"
  page = read_url(url)
  fd = feedparser.parse(page)
  movies = []
  for entry in fd.entries:
    if not preFilter or preFilter(entry):
      movie = getInfo(entry.comments)
      movies.append(movie)
  movies = filterMovies(movies)
  return movies
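And a matching sketch for the new Pirate Bay helpers, again with invented ids; getId() normalises both /tor/ page URLs and bare ids down to the numeric id:

# Hypothetical ids/URLs, for illustration only.
print getId('http://thepiratebay.org/tor/4000000')   # '4000000'
print getId('tor/4000000')                           # '4000000'
print getId('4000000')                               # '4000000'

info = getInfo('4000000')       # None when thepiratebay.org answers with a 404
if info:
  print info['title'], info['source_type'], info['torrent_link']

for movie in newMovies():       # preFilter defaults to None here
  print movie['title'], movie['imdb']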