# Patch summary: git diff over three modules under scrapeit/.
#
# scrapeit/btutils.py -- torrentsWeLike(link)
#   * Adds 'nl sub' to the words tested by the loop under "#no dubbed versions".
#   * Adds a new loop that returns False when the text contains any of
#     'chd', 'hd ', 'hd-', 'dvdr-', 'dvdr.', 'dvdr '.
#
# scrapeit/mininova.py
#   * search() gains a second parameter, filterResult (default False).
#     torrentsWeLike(links[0]) is now consulted only when filterResult is
#     true; otherwise every row with a '/tor' link is appended.
#   * searchByImdb() now matches '/tor' links (rewritten to '/get' when the
#     URL is built) instead of '/get' links, skips rows that contain a link
#     matching '/faq/#pt', and appends only when
#     torrentsWeLike(unicode(torrent.contents[0])) is truthy.
#
# scrapeit/thepiratebay.py
#   * Imports torrentsWeLike from btutils.
#   * search() gains filterResult (default False) and now starts from
#     "http://thepiratebay.org/search/%s/0/3/200". It loops while `next` is
#     non-empty and page_count < 4; page_count starts at 1 and is incremented
#     before each fetch, so at most three pages are read per call.
#   * Relative URLs taken from `next` are prefixed with "/" (if missing) and
#     "http://thepiratebay.org" before fetching.
#   * With filterResult true, a torrent href is appended only when
#     torrentsWeLike(torrent) is truthy.
#
# NOTE(review): torrentsWeLike receives three different argument kinds in
#   this patch -- links[0] (an element from row('a', ...)) in mininova.search,
#   a unicode string in mininova.searchByImdb, and an href string in
#   thepiratebay.search. Its body is not visible here; confirm it accepts all.
# NOTE(review): the pagination pattern '.*?next.gif.*?' has no capture group,
#   so findall() yields whole matched substrings rather than an extracted
#   URL; the pattern may have lost markup in transit -- verify against the
#   real file. The dots in 'next.gif' and '.torrent$' are unescaped.
# NOTE(review): the local name `next` shadows the Python builtin.
# NOTE(review): line breaks and indentation of the hunks below are collapsed
#   in this copy; the patch text is kept exactly as received.
diff --git a/scrapeit/btutils.py b/scrapeit/btutils.py index 474ea7c..a340b4f 100644 --- a/scrapeit/btutils.py +++ b/scrapeit/btutils.py @@ -15,9 +15,14 @@ def torrentsWeLike(link): if word in text: return False #no dubbed versions - for word in ('italian', 'german', 'spanish', 'french'): + for word in ('italian', 'german', 'spanish', 'french', 'nl sub'): if word in text: return False + #not blueray or hddvd version right now or even DVDRs + for word in ('chd', 'hd ', 'hd-', 'dvdr-', 'dvdr.', 'dvdr '): + if word in text: + return False + #only dvdrips or dvdscrs for word in ('dvdrip', 'dvdscr', 'dvd screener'): if word in text: diff --git a/scrapeit/mininova.py b/scrapeit/mininova.py index 7cf0059..f5091a3 100644 --- a/scrapeit/mininova.py +++ b/scrapeit/mininova.py @@ -12,7 +12,7 @@ from btutils import torrentsWeLike socket.setdefaulttimeout(10.0) -def search(query): +def search(query, filterResult = False): '''search for torrents on mininova ''' torrents = [] @@ -21,9 +21,14 @@ def search(query): soup = BeautifulSoup(page) for row in soup('tr'): links = row('a', {'href':re.compile('/tor')}) - if links and torrentsWeLike(links[0]): + if links: torrent_url = "http://www.mininova.org%s" % links[0].get('href').replace('/tor', '/get') - torrents.append(torrent_url) + if filterResult: + if torrentsWeLike(links[0]): + torrents.append(torrent_url) + else: + torrents.append(torrent_url) + return torrents def searchByImdb(imdb): @@ -32,9 +37,13 @@ def searchByImdb(imdb): torrents = [] page = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdb) soup = BeautifulSoup(page) - for row in soup('tr'): - links = row('a', {'href':re.compile('/get')}) - if links: - torrent_url = "http://www.mininova.org%s" % links[0].get('href') - torrents.append(torrent_url) + for row in soup('tr'): + #filter private trackers + private_tracker = row('a', {'href':re.compile('/faq/#pt')}) + links = row('a', {'href':re.compile('/tor')}) + if not private_tracker and links: + torrent = links[0] + if 
torrentsWeLike(unicode(torrent.contents[0])): + torrent_url = "http://www.mininova.org%s" % torrent.get('href').replace('/tor', '/get') + torrents.append(torrent_url) return torrents diff --git a/scrapeit/thepiratebay.py b/scrapeit/thepiratebay.py index 1d42b51..fafa042 100644 --- a/scrapeit/thepiratebay.py +++ b/scrapeit/thepiratebay.py @@ -8,6 +8,7 @@ from urllib import quote from BeautifulSoup import BeautifulSoup +from btutils import torrentsWeLike from google import google from utils import read_url, read_url_utf8 @@ -85,19 +86,32 @@ def get_episodes(id): episodes = re.compile('(.*?)').findall(data) return episodes -def search(query): +def search(query, filterResult = False): torrents = [] - url = "http://thepiratebay.org/search.php?video=on&q=%s" % quote(query) - page = read_url(url) - soup = BeautifulSoup(page) - for row in soup('tr'): - torrentType = row.findAll('td', {'class': 'vertTh'}) - if torrentType: - torrentType = torrentType[0]('a')[0].get('href').split('/')[-1] - # 201 = Movies , 202 = Movie DVDR - if torrentType in ['201']: - torrent = row.findAll('a', {'href':re.compile('.torrent$')})[0].get('href') - torrents.append(torrent) + next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ] + page_count = 1 + while next and page_count < 4: + page_count += 1 + url = next[0] + if not url.startswith('http'): + if not url.startswith('/'): + url = "/" + url + url = "http://thepiratebay.org" + url + page = read_url(url) + soup = BeautifulSoup(page) + for row in soup('tr'): + torrentType = row.findAll('td', {'class': 'vertTh'}) + if torrentType: + torrentType = torrentType[0]('a')[0].get('href').split('/')[-1] + # 201 = Movies , 202 = Movie DVDR + if torrentType in ['201']: + torrent = row.findAll('a', {'href':re.compile('.torrent$')})[0].get('href') + if filterResult: + if torrentsWeLike(torrent): + torrents.append(torrent) + else: + torrents.append(torrent) + next = re.compile('.*?next.gif.*?').findall(page) return torrents def 
searchByImdb(imdb):