import re

# Captures the 7-digit id from a "title/tt0123456" style IMDb link.
_IMDB_ID_RE = re.compile(r'title/tt(\d{7})')

# Ordered (keywords, label) rules: the first rule with a keyword found in the
# lowercased title wins, so the order here is significant.
_SOURCE_TYPE_RULES = (
  (('cam',), 'cam'),
  (('vcd',), 'vcd'),
  (('telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'ts-screener'),
   'telecine'),
  # 'dvdscr' (not 'dvdscrs') so that plain "DVDSCR" titles are recognised;
  # it still matches everything the longer spelling matched.
  (('dvdrip', 'dvdscr'), 'dvdrip'),
  (('screener',), 'screener'),
  (('xvid',), 'Xvid'),
  (('1080p',), '1080p'),
  (('720p',), '720p'),
  (('dvdr',), 'DVDR'),
)


def movieType(movie):
  """Classify a release by keywords found in its title.

  movie is a dict; its 'title' value is matched case-insensitively against
  the keyword rules above using plain substring tests. A missing or empty
  title is treated as an empty string.

  Returns the label of the first matching rule ('cam', 'vcd', 'telecine',
  'dvdrip', 'screener', 'Xvid', '1080p', '720p' or 'DVDR'), or '' when no
  keyword is present.
  """
  # NOTE(review): these are substring matches, so e.g. 'cam' also matches
  # inside longer words -- confirm whether word boundaries are wanted.
  title = (movie.get('title') or '').lower()
  for keywords, label in _SOURCE_TYPE_RULES:
    for key in keywords:
      if key in title:
        return label
  return ''


def filterMovies(movies):
  """Annotate each movie dict with 'imdb' and 'source_type' keys.

  For every dict in movies (modified in place):
    - 'imdb' is set to the first 7-digit id found in a "title/tt<id>" link
      inside movie['txt'], or '' when there is none (or no 'txt' at all);
    - 'source_type' is set to the result of movieType(movie).

  Returns a new list containing the same (now annotated) dicts, in order.
  """
  annotated = []
  for movie in movies:
    match = _IMDB_ID_RE.search(movie.get('txt') or '')
    if match:
      movie['imdb'] = match.group(1)
    else:
      movie['imdb'] = ''
    movie['source_type'] = movieType(movie)
    annotated.append(movie)
  return annotated
def getId(s):
  """Return the id part of a torrent reference.

  s is either a bare id ('12345') or a '/'-separated path or URL whose last
  component is the id ('http://www.mininova.org/tor/12345'). Trailing
  slashes are ignored so that '.../tor/12345/' still yields '12345'.

  Returns the last non-empty path component, or '' if s has none.
  """
  # rsplit(..., 1)[-1] is the final component whether or not s contains a '/'.
  return s.rstrip('/').rsplit('/', 1)[-1]