update some scrapers
This commit is contained in:
parent
4d5b87a890
commit
6bedcaa9d6
5 changed files with 133 additions and 8 deletions
|
@ -1,6 +1,7 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
import re
|
||||
|
||||
from utils import stripTags
|
||||
|
||||
|
@ -28,3 +29,39 @@ def torrentsWeLike(link):
|
|||
if word in text:
|
||||
return True
|
||||
return False
|
||||
|
||||
def movieType(movie):
    """Guess the source/rip type of a release from its title.

    movie -- dict with at least a 'title' key.
    Returns a tag such as 'cam', 'vcd', 'telecine', 'dvdrip', 'screener',
    'Xvid', '1080p', '720p' or 'DVDR', or '' when nothing matches.

    Check order is significant and preserved from the original: e.g.
    'dvdrip' must be tested before the more generic 'dvdr', and the
    telecine markers before 'screener' (so 'ts-screener' is telecine).
    """
    # Hoisted: the original called movie['title'].lower() on every test.
    title = movie['title'].lower()
    if 'cam' in title:
        return 'cam'
    if 'vcd' in title:
        return 'vcd'
    for key in ('telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'ts-screener'):
        if key in title:
            return 'telecine'
    for key in ('dvdrip', 'dvdscrs'):
        if key in title:
            return 'dvdrip'
    if 'screener' in title:
        return 'screener'
    if 'xvid' in title:
        return 'Xvid'
    if '1080p' in title:
        return '1080p'
    if '720p' in title:
        return '720p'
    if 'dvdr' in title:
        return 'DVDR'
    return ''
|
||||
|
||||
def filterMovies(movies):
    """Annotate each movie dict with 'imdb' and 'source_type' keys.

    movies -- list of dicts, each with at least 'title' and 'txt' keys.
    Returns the same dicts (mutated in place) in a new list.
    'imdb' is the first 7-digit IMDb title id found in movie['txt'],
    or '' if none is present; 'source_type' comes from movieType().
    """
    # Compile once, outside the loop -- the original recompiled the
    # pattern for every movie.  Raw string avoids the invalid '\d'
    # escape in a plain string literal.
    imdb_re = re.compile(r'title/tt(\d{7})')
    m2 = []
    for movie in movies:
        ids = imdb_re.findall(movie['txt'])
        if ids:
            movie['imdb'] = ids[0]
        else:
            movie['imdb'] = ''
        movie['source_type'] = movieType(movie)
        m2.append(movie)
    return m2
|
||||
|
||||
|
|
|
@ -185,7 +185,7 @@ class IMDb:
|
|||
title = stripTags(html_title)
|
||||
title = re.sub('\(\d\d\d\d\)', '', title)
|
||||
title = re.sub('\(\d\d\d\d/I*\)', '', title)
|
||||
for t in ('TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
|
||||
for t in ('TV series', 'TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
|
||||
title = title.replace(t, '')
|
||||
if title.find(u'\xa0') > -1:
|
||||
title = title[:title.find(u'\xa0')]
|
||||
|
@ -264,6 +264,10 @@ class IMDb:
|
|||
IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
|
||||
|
||||
IMDbDict['episodes'] = self.parseEpisodes()
|
||||
if IMDbDict['episodes']:
|
||||
IMDbDict['tvshow'] = True
|
||||
else:
|
||||
IMDbDict['tvshow'] = False
|
||||
IMDbDict['credits'] = self.parseCredits()
|
||||
IMDbDict['plot'] = self.parsePlot()
|
||||
IMDbDict['keywords'] = self.parseKeywords()
|
||||
|
@ -528,7 +532,7 @@ def guess(title, director=''):
|
|||
search = 'site:imdb.com %s "%s"' % (director, title)
|
||||
else:
|
||||
search = 'site:imdb.com "%s"' % title
|
||||
for (name, url, desc) in google(search, 1):
|
||||
for (name, url, desc) in google(search, 2):
|
||||
if url.startswith('http://www.imdb.com/title/tt'):
|
||||
return url[28:35]
|
||||
|
||||
|
|
|
@ -6,9 +6,10 @@ import socket
|
|||
from urllib import quote
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
import feedparser
|
||||
|
||||
from utils import read_url, read_url_utf8
|
||||
from btutils import torrentsWeLike
|
||||
from btutils import torrentsWeLike, filterMovies
|
||||
|
||||
socket.setdefaulttimeout(10.0)
|
||||
|
||||
|
@ -47,3 +48,47 @@ def searchByImdb(imdb):
|
|||
torrent_url = "http://www.mininova.org%s" % torrent.get('href').replace('/tor', '/get')
|
||||
torrents.append(torrent_url)
|
||||
return torrents
|
||||
|
||||
def getId(s):
    """Return the last '/'-separated component of s.

    A bare id (no '/') is returned unchanged, since str.split always
    yields at least one element -- this collapses the original
    len(parts) == 1 special case.
    """
    return s.split('/')[-1]
|
||||
|
||||
def getInfo(mid):
|
||||
mid = getId(mid)
|
||||
comment_link = "http://www.mininova.org/tor/%s" % mid
|
||||
torrent_link = "http://www.mininova.org/get/%s" % mid
|
||||
details_link = "http://www.mininova.org/det/%s" % mid
|
||||
txt = read_url(comment_link) + '\n' + read_url(details_link)
|
||||
txt = txt.decode('utf-8', 'replace')
|
||||
title = re.compile('<title>(.*?):.*?</title>').findall(txt)[0]
|
||||
if "This torrent does not exist..." in txt:
|
||||
print "This torrent does not exist...", mid
|
||||
return None
|
||||
movie = dict(
|
||||
title=title,
|
||||
txt=txt,
|
||||
comment_link=comment_link,
|
||||
torrent_link=torrent_link,
|
||||
)
|
||||
return filterMovies([movie,])[0]
|
||||
|
||||
def newMovies(preFilter=None):
    """Fetch the mininova movie RSS feed and return annotated movie dicts.

    preFilter -- optional callable(feed entry) -> bool applied to raw
    feed entries before parsing; None (the new default) keeps every
    entry.  The default makes this signature consistent with the
    thepiratebay scraper's newMovies(preFilter=None) and is backward
    compatible with existing callers that pass preFilter positionally.
    """
    url = "http://www.mininova.org/rss.xml?cat=4"
    page = read_url(url)
    fd = feedparser.parse(page)
    movies = []
    for entry in fd.entries:
        if not preFilter or preFilter(entry):
            movie = dict(
                title=entry.title,
                txt=entry.summary,
                comment_link=entry.link,
                torrent_link=entry.link.replace('/tor/','/get/')
            )
            movies.append(movie)
    movies = filterMovies(movies)
    return movies
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ def getRottenTomatoes(rating = 70):
|
|||
offset = 0
|
||||
titles = ['1']
|
||||
while titles:
|
||||
url = "http://www.rottentomatoes.com/movies/browser.php?movietype=1&genre=&tomatometer=&avgrating=%s&numreviews=10&mpaa=&x=40&y=5&start_index=%s" % (rating, offset)
|
||||
url = "http://www.rottentomatoes.com/browser.php?movietype=1&genre=&tomatometer=&avgrating=%s&numreviews=10&mpaa=&x=56&y=10&start_index=%d" % (rating, offset)
|
||||
page = read_url(url)
|
||||
soup = BeautifulSoup(page)
|
||||
titles = [link.contents[0] for link in soup.findAll('a', {'class': 'movie-link'})]
|
||||
|
|
|
@ -5,10 +5,12 @@
|
|||
import re
|
||||
import socket
|
||||
from urllib import quote
|
||||
from urllib2 import URLError
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
import feedparser
|
||||
|
||||
from btutils import torrentsWeLike
|
||||
from btutils import torrentsWeLike, filterMovies
|
||||
from google import google
|
||||
from utils import read_url, read_url_utf8
|
||||
|
||||
|
@ -116,3 +118,40 @@ def search(query, filterResult = False):
|
|||
|
||||
def searchByImdb(imdb):
|
||||
return search("tt" + imdb)
|
||||
|
||||
def getId(pid):
    """Normalize a thepiratebay reference (URL or bare id) to the id.

    Strips the 'http://torrents.thepiratebay.org/' prefix and any
    leading 'tor/' path segment, leaving a bare id untouched.
    """
    ref = pid
    if ref.startswith('http://torrents.thepiratebay.org/'):
        ref = ref.split('org/')[1]
    return ref.split('tor/')[1] if 'tor/' in ref else ref
|
||||
|
||||
def getInfo(piratebayID):
|
||||
piratebayID = getId(piratebayID)
|
||||
url = 'http://thepiratebay.org/tor/%s' % piratebayID
|
||||
try:
|
||||
txt = read_url(url).decode('utf-8', 'replace')
|
||||
except URLError, e:
|
||||
if e.code == 404:
|
||||
return None
|
||||
title = re.compile('<title>(.*?) \(download torrent\) - TPB</title>').findall(txt)[0]
|
||||
movie = dict(
|
||||
title=title,
|
||||
txt=txt,
|
||||
comment_link=url,
|
||||
torrent_link="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayID, title)
|
||||
)
|
||||
return filterMovies([movie,])[0]
|
||||
|
||||
def newMovies(preFilter=None):
    """Fetch thepiratebay's movie RSS feed and return annotated movie dicts.

    preFilter -- optional callable(feed entry) -> bool applied to raw
    feed entries before fetching each torrent page; None keeps all.
    """
    url = "http://rss.thepiratebay.org/201"
    page = read_url(url)
    fd = feedparser.parse(page)
    movies = []
    for entry in fd.entries:
        if not preFilter or preFilter(entry):
            movie = getInfo(entry.comments)
            # getInfo() returns None for vanished (404) torrents; the
            # original appended the None and filterMovies() then crashed
            # on movie['title'].  Skip them instead.
            if movie:
                movies.append(movie)
    movies = filterMovies(movies)
    return movies
|
||||
|
||||
|
|
Loading…
Reference in a new issue