# Ordered (keywords, label) rules used by movieType. The first rule that has a
# keyword occurring anywhere in the lower-cased title wins, so order matters
# (e.g. 'dvdrip' must be tested before the shorter 'dvdr').
# NOTE(review): 'dvdscrs' may be a typo for the 'dvdscr' release tag -- confirm
# before changing, since it alters how such titles are classified.
_SOURCE_TYPE_RULES = (
  (('cam',), 'cam'),
  (('vcd',), 'vcd'),
  (('telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'ts-screener'),
   'telecine'),
  (('dvdrip', 'dvdscrs'), 'dvdrip'),
  (('screener',), 'screener'),
  (('xvid',), 'Xvid'),
  (('1080p',), '1080p'),
  (('720p',), '720p'),
  (('dvdr',), 'DVDR'),
)


def movieType(movie):
  """Guess the source type of a release from its title.

  movie is a dict with a 'title' string. The title is lower-cased once and
  matched by plain substring search against _SOURCE_TYPE_RULES, in order.

  Returns the label of the first matching rule ('cam', 'vcd', 'telecine',
  'dvdrip', 'screener', 'Xvid', '1080p', '720p' or 'DVDR'), or '' when no
  keyword occurs in the title.
  """
  title = movie['title'].lower()
  for keywords, label in _SOURCE_TYPE_RULES:
    for keyword in keywords:
      if keyword in title:
        return label
  return ''


def filterMovies(movies):
  """Annotate movie dicts with an IMDb id and a source type.

  Each dict in movies must have 'txt' and 'title' keys. It is modified in
  place:
    movie['imdb']        -- the 7 digits of the first 'title/tt<digits>'
                            occurrence in movie['txt'], or '' if none
    movie['source_type'] -- the result of movieType(movie)

  None entries (a lookup that produced no movie) are skipped instead of
  raising TypeError.

  Returns a new list holding the annotated dicts in their original order.
  """
  # Compile once per call rather than once per movie.
  imdb_re = re.compile(r'title/tt(\d{7})')
  annotated = []
  for movie in movies:
    if movie is None:
      continue
    ids = imdb_re.findall(movie['txt'])
    if ids:
      movie['imdb'] = ids[0]
    else:
      movie['imdb'] = ''
    movie['source_type'] = movieType(movie)
    annotated.append(movie)
  return annotated
def getId(s):
  """Return the text after the last '/' in s (s itself if it has no '/').

  Lets callers pass either a bare mininova id or a URL ending in the id.
  """
  return s.split('/')[-1]


def getInfo(mid):
  """Fetch the mininova pages for one torrent and describe it as a dict.

  mid is a torrent id or a URL ending in one (see getId). The comment
  ('/tor/') and details ('/det/') pages are downloaded with read_url, joined
  and decoded as UTF-8 with undecodable bytes replaced.

  Returns the dict produced by filterMovies for
  {title, txt, comment_link, torrent_link}, or None when the page says the
  torrent does not exist or no title can be extracted from it.
  """
  mid = getId(mid)
  comment_link = "http://www.mininova.org/tor/%s" % mid
  torrent_link = "http://www.mininova.org/get/%s" % mid
  details_link = "http://www.mininova.org/det/%s" % mid
  txt = read_url(comment_link) + '\n' + read_url(details_link)
  txt = txt.decode('utf-8', 'replace')
  # Check for the "missing torrent" page before parsing it, so a page without
  # a usable title yields None instead of an IndexError.
  if "This torrent does not exist..." in txt:
    print("This torrent does not exist... %s" % mid)
    return None
  # NOTE(review): the pattern is unanchored, so it captures whatever precedes
  # the first ':' on the first line that contains one -- presumably meant to
  # pull the page title; confirm against the live markup.
  titles = re.compile('(.*?):.*?').findall(txt)
  if not titles:
    return None
  movie = dict(
    title=titles[0],
    txt=txt,
    comment_link=comment_link,
    torrent_link=torrent_link,
  )
  return filterMovies([movie])[0]


def newMovies(preFilter=None):
  """Return movie dicts built from the mininova RSS feed.

  preFilter is an optional callable taking a feed entry; only entries for
  which it returns a true value are kept. With None (the default) every entry
  is kept.

  Each entry becomes {title, txt, comment_link, torrent_link}, where the
  torrent link is the entry link with '/tor/' replaced by '/get/'. The list
  is passed through filterMovies before being returned.
  """
  # NOTE(review): cat=4 is presumably the movies category -- confirm.
  url = "http://www.mininova.org/rss.xml?cat=4"
  page = read_url(url)
  fd = feedparser.parse(page)
  movies = []
  for entry in fd.entries:
    if preFilter and not preFilter(entry):
      continue
    movies.append(dict(
      title=entry.title,
      txt=entry.summary,
      comment_link=entry.link,
      torrent_link=entry.link.replace('/tor/', '/get/'),
    ))
  return filterMovies(movies)
def searchByImdb(imdb):
  """Search for an IMDb id by querying search() with 'tt' + imdb."""
  return search("tt" + imdb)


def getId(pid):
  """Reduce a thepiratebay URL to the id part used in '/tor/<id>' URLs.

  For a 'http://torrents.thepiratebay.org/' URL everything after the first
  'org/' is kept; after that, if 'tor/' occurs, everything after it (up to a
  further 'tor/') is kept. Any other string is returned unchanged.

  NOTE(review): for a torrents.thepiratebay.org URL the result still carries
  the '/<name>.torrent' tail -- confirm whether callers expect a bare id.
  """
  if pid.startswith('http://torrents.thepiratebay.org/'):
    pid = pid.split('org/')[1]
  if 'tor/' in pid:
    pid = pid.split('tor/')[1]
  return pid


def getInfo(piratebayID):
  """Fetch one thepiratebay torrent page and describe it as a dict.

  piratebayID is an id or URL accepted by getId. The page at
  'http://thepiratebay.org/tor/<id>' is downloaded with read_url and decoded
  as UTF-8 with undecodable bytes replaced.

  Returns the dict produced by filterMovies for
  {title, txt, comment_link, torrent_link}, or None when the server answers
  404 or no title can be extracted from the page.

  Raises URLError for any fetch failure other than an HTTP 404.
  """
  piratebayID = getId(piratebayID)
  url = 'http://thepiratebay.org/tor/%s' % piratebayID
  try:
    txt = read_url(url).decode('utf-8', 'replace')
  except URLError as e:
    # Only HTTPError carries .code; a plain URLError (DNS failure, refused
    # connection, ...) does not, so read it defensively.
    if getattr(e, 'code', None) == 404:
      return None
    # Previously this fell through and failed on the unbound 'txt'; surface
    # the real error instead.
    raise
  titles = re.compile(r'(.*?) \(download torrent\) - TPB').findall(txt)
  if not titles:
    return None
  title = titles[0]
  movie = dict(
    title=title,
    txt=txt,
    comment_link=url,
    torrent_link="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayID, title)
  )
  return filterMovies([movie])[0]


def newMovies(preFilter=None):
  """Return info dicts for the entries of a thepiratebay RSS feed.

  preFilter is an optional callable taking a feed entry; only entries for
  which it returns a true value are looked up. Each accepted entry's
  'comments' URL is passed to getInfo; entries for which getInfo returns None
  are left out.
  """
  # NOTE(review): feed 201 is presumably a movies category -- confirm.
  url = "http://rss.thepiratebay.org/201"
  page = read_url(url)
  fd = feedparser.parse(page)
  movies = []
  for entry in fd.entries:
    if preFilter and not preFilter(entry):
      continue
    movie = getInfo(entry.comments)
    # getInfo has already run the dict through filterMovies, so no second
    # pass is needed; just drop failed lookups.
    if movie is not None:
      movies.append(movie)
  return movies