Add optional result filtering to the search functions, return more results from TPB by following pagination, and reject HD content in torrentsWeLike

2007-07-09 13:11:56 +00:00 · 2007-07-09 13:11:56 +00:00 · ee5864a99d
commit ee5864a99d
parent 472c99240b
3 changed files with 49 additions and 21 deletions
--- a/scrapeit/btutils.py
+++ b/scrapeit/btutils.py
@ -15,9 +15,14 @@ def torrentsWeLike(link):
    if word in text:
      return False
  #no dubbed versions
-  for word in ('italian', 'german', 'spanish', 'french'):
+  for word in ('italian', 'german', 'spanish', 'french', 'nl sub'):
    if word in text:
      return False
+  #no Blu-ray or HD DVD versions right now, and no DVDRs either
+  for word in ('chd', 'hd ', 'hd-', 'dvdr-', 'dvdr.', 'dvdr '):
+    if word in text:
+      return False
+  
  #only dvdrips or dvdscrs
  for word in ('dvdrip', 'dvdscr', 'dvd screener'):
    if word in text:
--- a/scrapeit/mininova.py
+++ b/scrapeit/mininova.py
@ -12,7 +12,7 @@ from btutils import torrentsWeLike

 socket.setdefaulttimeout(10.0)

-def search(query):
+def search(query, filterResult = False):
  '''search for torrents on mininova
  '''
  torrents = []
@ -21,9 +21,14 @@ def search(query):
  soup = BeautifulSoup(page)
  for row in soup('tr'): 
    links = row('a', {'href':re.compile('/tor')})
-    if links and torrentsWeLike(links[0]):
+    if links:
      torrent_url = "http://www.mininova.org%s" % links[0].get('href').replace('/tor', '/get')
-      torrents.append(torrent_url)
+      if filterResult:
+        if torrentsWeLike(links[0]):
+          torrents.append(torrent_url)
+      else:
+        torrents.append(torrent_url)
+        
  return torrents

 def searchByImdb(imdb):
@ -32,9 +37,13 @@ def searchByImdb(imdb):
  torrents = []
  page = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdb)
  soup = BeautifulSoup(page)
-  for row in soup('tr'): 
-    links = row('a', {'href':re.compile('/get')})
-    if links:
-      torrent_url = "http://www.mininova.org%s" % links[0].get('href')
-      torrents.append(torrent_url)
+  for row in soup('tr'):
+    #filter private trackers
+    private_tracker = row('a', {'href':re.compile('/faq/#pt')})
+    links = row('a', {'href':re.compile('/tor')})
+    if not private_tracker and links:
+      torrent = links[0]
+      if torrentsWeLike(unicode(torrent.contents[0])):
+        torrent_url = "http://www.mininova.org%s" % torrent.get('href').replace('/tor', '/get')
+        torrents.append(torrent_url)
  return torrents
--- a/scrapeit/thepiratebay.py
+++ b/scrapeit/thepiratebay.py
@ -8,6 +8,7 @@ from urllib import quote

 from BeautifulSoup import BeautifulSoup

+from btutils import torrentsWeLike
 from google import google
 from utils import read_url, read_url_utf8

@ -85,19 +86,32 @@ def get_episodes(id):
  episodes = re.compile('<nobr><a href="(.*?)">(.*?)</a></nobr>').findall(data)
  return episodes
  
-def search(query):
+def search(query, filterResult = False):
  torrents = []
-  url = "http://thepiratebay.org/search.php?video=on&q=%s" % quote(query)
-  page = read_url(url)
-  soup = BeautifulSoup(page)
-  for row in soup('tr'): 
-    torrentType = row.findAll('td', {'class': 'vertTh'})
-    if torrentType:
-      torrentType = torrentType[0]('a')[0].get('href').split('/')[-1]
-      # 201 = Movies , 202 = Movie DVDR
-      if torrentType in ['201']:
-        torrent =  row.findAll('a', {'href':re.compile('.torrent$')})[0].get('href')
-        torrents.append(torrent)
+  next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ] 
+  page_count = 1
+  while next and page_count < 4:
+    page_count += 1
+    url = next[0]
+    if not url.startswith('http'):
+      if not url.startswith('/'):
+        url = "/" + url
+      url = "http://thepiratebay.org" + url
+    page = read_url(url)
+    soup = BeautifulSoup(page)
+    for row in soup('tr'): 
+      torrentType = row.findAll('td', {'class': 'vertTh'})
+      if torrentType:
+        torrentType = torrentType[0]('a')[0].get('href').split('/')[-1]
+        # 201 = Movies , 202 = Movie DVDR
+        if torrentType in ['201']:
+          torrent =  row.findAll('a', {'href':re.compile('.torrent$')})[0].get('href')
+          if filterResult:
+            if torrentsWeLike(torrent):
+              torrents.append(torrent)
+          else:
+            torrents.append(torrent)
+    next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(page)
  return torrents
  
 def searchByImdb(imdb):