# scrapeit/scrapeit/mininova.py

# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
import socket
from urllib import quote
from BeautifulSoup import BeautifulSoup
import feedparser
from utils import read_url, read_url_utf8
from btutils import torrentsWeLike, filterMovies
socket.setdefaulttimeout(10.0)
def search(query, filterResult = False):
    '''Search mininova for torrents matching *query*.

    Queries the site's seed-ordered search listing and returns a list of
    .torrent download URLs. When *filterResult* is true, only links
    accepted by btutils.torrentsWeLike are kept.
    '''
    url = "http://www.mininova.org/search/%s/seeds" % quote(query)
    soup = BeautifulSoup(read_url(url))
    results = []
    for row in soup('tr'):
        anchors = row('a', {'href': re.compile('/tor')})
        if not anchors:
            continue
        link = anchors[0]
        # NOTE(review): torrentsWeLike receives the anchor tag itself here,
        # while searchByImdb passes the anchor's text — confirm which is intended.
        if filterResult and not torrentsWeLike(link):
            continue
        # /tor/<id> is the detail page; /get/<id> is the .torrent download
        results.append("http://www.mininova.org%s" % link.get('href').replace('/tor', '/get'))
    return results
def searchByImdb(imdb):
    '''Look up torrents on mininova by IMDb id.

    Rows that link to the private-tracker FAQ (/faq/#pt) are skipped,
    remaining candidates are filtered through torrentsWeLike applied to
    the link text. Returns a list of .torrent download URLs.
    '''
    found = []
    soup = BeautifulSoup(read_url("http://www.mininova.org/imdb/?imdb=%s" % imdb))
    for row in soup('tr'):
        # a /faq/#pt link in the row marks a private tracker: skip it
        if row('a', {'href': re.compile('/faq/#pt')}):
            continue
        anchors = row('a', {'href': re.compile('/tor')})
        if not anchors:
            continue
        link = anchors[0]
        if torrentsWeLike(unicode(link.contents[0])):
            # /tor/<id> is the detail page; /get/<id> is the .torrent download
            found.append("http://www.mininova.org%s" % link.get('href').replace('/tor', '/get'))
    return found
def getId(s):
    '''Return the mininova id from *s*.

    Accepts either a bare id ("123") or any path/URL whose last segment
    is the id ("http://www.mininova.org/tor/123"). The original if/else
    was redundant (split()[-1] already covers the no-slash case) and
    returned "" for inputs with a trailing slash; rstrip fixes that.
    '''
    return s.rstrip('/').split('/')[-1]
def getInfo(mid):
mid = getId(mid)
comment_link = "http://www.mininova.org/tor/%s" % mid
torrent_link = "http://www.mininova.org/get/%s" % mid
details_link = "http://www.mininova.org/det/%s" % mid
txt = read_url(comment_link) + '\n' + read_url(details_link)
txt = txt.decode('utf-8', 'replace')
title = re.compile('<title>(.*?):.*?</title>').findall(txt)[0]
if "This torrent does not exist..." in txt:
print "This torrent does not exist...", mid
return None
movie = dict(
title=title,
txt=txt,
comment_link=comment_link,
torrent_link=torrent_link,
)
return filterMovies([movie,])[0]
def newMovies(preFilter=None):
    '''Return newly listed movie torrents from the mininova RSS feed.

    preFilter -- optional callable applied to each feed entry; entries
    for which it returns a falsy value are dropped. Defaults to None
    (keep everything) -- backward compatible, since callers that passed
    a falsy value already got the keep-all behavior.

    Each surviving entry is normalized into a movie dict and the whole
    list is run through btutils.filterMovies before being returned.
    '''
    page = read_url("http://www.mininova.org/rss.xml?cat=4")
    feed = feedparser.parse(page)
    movies = []
    for entry in feed.entries:
        if preFilter and not preFilter(entry):
            continue
        movies.append(dict(
            title=entry.title,
            txt=entry.summary,
            comment_link=entry.link,
            # the RSS item links to the /tor/ detail page; /get/ is the download
            torrent_link=entry.link.replace('/tor/', '/get/'),
        ))
    return filterMovies(movies)