update some scrapers

This commit is contained in:
j 2008-03-16 12:16:07 +00:00
parent 4d5b87a890
commit 6bedcaa9d6
5 changed files with 133 additions and 8 deletions

View file

@ -1,6 +1,7 @@
# -*- Mode: Python; -*- # -*- Mode: Python; -*-
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2 # vi:si:et:sw=2:sts=2:ts=2
import re
from utils import stripTags from utils import stripTags
@ -27,4 +28,40 @@ def torrentsWeLike(link):
for word in ('dvdrip', 'dvdscr', 'dvd screener'): for word in ('dvdrip', 'dvdscr', 'dvd screener'):
if word in text: if word in text:
return True return True
return False return False
def movieType(movie):
  """Guess the source/quality type of a release from movie['title'].

  Returns one of 'cam', 'vcd', 'telecine', 'dvdrip', 'screener',
  'Xvid', '1080p', '720p', 'DVDR', or '' when nothing matches.
  Check order matters: e.g. a '.ts dvdrip' title is classified as
  'telecine', and 'dvdr' must come after 'dvdrip'.
  """
  # lowercase once instead of on every membership test
  title = movie['title'].lower()
  if 'cam' in title:
    return 'cam'
  if 'vcd' in title:
    return 'vcd'
  for key in ('telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'ts-screener'):
    if key in title:
      return 'telecine'
  # NOTE(review): 'dvdscrs' looks like a typo for 'dvdscr' (cf. torrentsWeLike),
  # so plain 'dvdscr' titles fall through — kept as-is to preserve behavior.
  for key in ('dvdrip', 'dvdscrs'):
    if key in title:
      return 'dvdrip'
  # ordered substring -> label table; first hit wins
  for key, label in (
      ('screener', 'screener'),
      ('xvid', 'Xvid'),
      ('1080p', '1080p'),
      ('720p', '720p'),
      ('dvdr', 'DVDR'),
  ):
    if key in title:
      return label
  return ''
def filterMovies(movies):
  """Annotate each movie dict in place with 'imdb' and 'source_type'.

  'imdb' is the 7-digit id found via a 'title/tt<digits>' link in
  movie['txt'] ('' when absent); 'source_type' comes from movieType().
  Returns the (same, mutated) movie dicts as a new list.
  """
  # compile once, outside the loop; raw string for the regex
  imdb_re = re.compile(r'title/tt(\d{7})')
  annotated = []
  for movie in movies:
    match = imdb_re.search(movie['txt'])
    movie['imdb'] = match.group(1) if match else ''
    movie['source_type'] = movieType(movie)
    annotated.append(movie)
  return annotated

View file

@ -185,7 +185,7 @@ class IMDb:
title = stripTags(html_title) title = stripTags(html_title)
title = re.sub('\(\d\d\d\d\)', '', title) title = re.sub('\(\d\d\d\d\)', '', title)
title = re.sub('\(\d\d\d\d/I*\)', '', title) title = re.sub('\(\d\d\d\d/I*\)', '', title)
for t in ('TV-Series', '(mini)', '(VG)', '(V)', '(TV)'): for t in ('TV series', 'TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
title = title.replace(t, '') title = title.replace(t, '')
if title.find(u'\xa0') > -1: if title.find(u'\xa0') > -1:
title = title[:title.find(u'\xa0')] title = title[:title.find(u'\xa0')]
@ -264,6 +264,10 @@ class IMDb:
IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '') IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
IMDbDict['episodes'] = self.parseEpisodes() IMDbDict['episodes'] = self.parseEpisodes()
if IMDbDict['episodes']:
IMDbDict['tvshow'] = True
else:
IMDbDict['tvshow'] = False
IMDbDict['credits'] = self.parseCredits() IMDbDict['credits'] = self.parseCredits()
IMDbDict['plot'] = self.parsePlot() IMDbDict['plot'] = self.parsePlot()
IMDbDict['keywords'] = self.parseKeywords() IMDbDict['keywords'] = self.parseKeywords()
@ -528,10 +532,10 @@ def guess(title, director=''):
search = 'site:imdb.com %s "%s"' % (director, title) search = 'site:imdb.com %s "%s"' % (director, title)
else: else:
search = 'site:imdb.com "%s"' % title search = 'site:imdb.com "%s"' % title
for (name, url, desc) in google(search, 1): for (name, url, desc) in google(search, 2):
if url.startswith('http://www.imdb.com/title/tt'): if url.startswith('http://www.imdb.com/title/tt'):
return url[28:35] return url[28:35]
try: try:
req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS) req = urllib2.Request(imdb_url, None, utils.DEFAULT_HEADERS)
u = urllib2.urlopen(req) u = urllib2.urlopen(req)

View file

@ -6,9 +6,10 @@ import socket
from urllib import quote from urllib import quote
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
import feedparser
from utils import read_url, read_url_utf8 from utils import read_url, read_url_utf8
from btutils import torrentsWeLike from btutils import torrentsWeLike, filterMovies
socket.setdefaulttimeout(10.0) socket.setdefaulttimeout(10.0)
@ -47,3 +48,47 @@ def searchByImdb(imdb):
torrent_url = "http://www.mininova.org%s" % torrent.get('href').replace('/tor', '/get') torrent_url = "http://www.mininova.org%s" % torrent.get('href').replace('/tor', '/get')
torrents.append(torrent_url) torrents.append(torrent_url)
return torrents return torrents
def getId(s):
  """Return the trailing path component of a mininova id or path.

  '123' -> '123', 'tor/123' -> '123'.
  """
  # splitting always yields at least one element, so [-1] covers
  # both the bare-id and the path case
  return s.split('/')[-1]
def getInfo(mid):
  """Fetch the mininova comment + details pages for a torrent id and
  return an annotated movie dict (via filterMovies), or None when the
  torrent does not exist.

  `mid` may be a bare id or a path/URL tail; getId() normalizes it.
  """
  mid = getId(mid)
  comment_link = "http://www.mininova.org/tor/%s" % mid
  torrent_link = "http://www.mininova.org/get/%s" % mid
  details_link = "http://www.mininova.org/det/%s" % mid
  txt = read_url(comment_link) + '\n' + read_url(details_link)
  txt = txt.decode('utf-8', 'replace')
  # check for the error page BEFORE parsing the title: the original
  # parsed first, and findall(...)[0] can IndexError on a missing torrent
  if "This torrent does not exist..." in txt:
    print("This torrent does not exist... %s" % mid)
    return None
  title = re.compile('<title>(.*?):.*?</title>').findall(txt)[0]
  movie = dict(
      title=title,
      txt=txt,
      comment_link=comment_link,
      torrent_link=torrent_link,
  )
  return filterMovies([movie, ])[0]
def newMovies(preFilter=None):
  """Return annotated movie dicts for the mininova movies RSS feed.

  preFilter: optional predicate over a feedparser entry; entries for
  which it returns falsy are skipped. Default None (keep everything) —
  now consistent with the piratebay newMovies() signature.
  """
  url = "http://www.mininova.org/rss.xml?cat=4"
  page = read_url(url)
  fd = feedparser.parse(page)
  movies = []
  for entry in fd.entries:
    if preFilter and not preFilter(entry):
      continue
    movies.append(dict(
        title=entry.title,
        txt=entry.summary,
        comment_link=entry.link,
        # the /tor/ comment page maps to the /get/ torrent download
        torrent_link=entry.link.replace('/tor/', '/get/'),
    ))
  return filterMovies(movies)

View file

@ -20,7 +20,7 @@ def getRottenTomatoes(rating = 70):
offset = 0 offset = 0
titles = ['1'] titles = ['1']
while titles: while titles:
url = "http://www.rottentomatoes.com/movies/browser.php?movietype=1&genre=&tomatometer=&avgrating=%s&numreviews=10&mpaa=&x=40&y=5&start_index=%s" % (rating, offset) url = "http://www.rottentomatoes.com/browser.php?movietype=1&genre=&tomatometer=&avgrating=%s&numreviews=10&mpaa=&x=56&y=10&start_index=%d" % (rating, offset)
page = read_url(url) page = read_url(url)
soup = BeautifulSoup(page) soup = BeautifulSoup(page)
titles = [link.contents[0] for link in soup.findAll('a', {'class': 'movie-link'})] titles = [link.contents[0] for link in soup.findAll('a', {'class': 'movie-link'})]
@ -34,4 +34,4 @@ def getRottenTomatoes(rating = 70):
offset += 10 offset += 10
return movies return movies

View file

@ -5,10 +5,12 @@
import re import re
import socket import socket
from urllib import quote from urllib import quote
from urllib2 import URLError
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
import feedparser
from btutils import torrentsWeLike from btutils import torrentsWeLike, filterMovies
from google import google from google import google
from utils import read_url, read_url_utf8 from utils import read_url, read_url_utf8
@ -116,3 +118,40 @@ def search(query, filterResult = False):
def searchByImdb(imdb): def searchByImdb(imdb):
return search("tt" + imdb) return search("tt" + imdb)
def getId(pid):
  """Normalize a piratebay id: accepts a bare id, a 'tor/<id>' path,
  or a full torrents.thepiratebay.org URL, and returns the bare id."""
  # strip a full download-host URL down to its path first
  if pid.startswith('http://torrents.thepiratebay.org/'):
    pid = pid.split('org/')[1]
  # then drop a leading 'tor/' path component if present
  return pid.split('tor/')[1] if 'tor/' in pid else pid
def getInfo(piratebayID):
piratebayID = getId(piratebayID)
url = 'http://thepiratebay.org/tor/%s' % piratebayID
try:
txt = read_url(url).decode('utf-8', 'replace')
except URLError, e:
if e.code == 404:
return None
title = re.compile('<title>(.*?) \(download torrent\) - TPB</title>').findall(txt)[0]
movie = dict(
title=title,
txt=txt,
comment_link=url,
torrent_link="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayID, title)
)
return filterMovies([movie,])[0]
def newMovies(preFilter=None):
  """Return annotated movie dicts for the piratebay movies RSS feed.

  preFilter: optional predicate over a feedparser entry; entries for
  which it returns falsy are skipped.
  """
  url = "http://rss.thepiratebay.org/201"
  page = read_url(url)
  fd = feedparser.parse(page)
  movies = []
  for entry in fd.entries:
    if preFilter and not preFilter(entry):
      continue
    movie = getInfo(entry.comments)
    # getInfo returns None for 404'd torrents; appending None would
    # crash filterMovies (movie['txt'] on None)
    if movie:
      movies.append(movie)
  return filterMovies(movies)