update some scrapers

commit 6bedcaa9d6 (parent 4d5b87a890)
5 changed files with 133 additions and 8 deletions
@@ -1,6 +1,7 @@
 # -*- Mode: Python; -*-
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=2:sts=2:ts=2
+import re
 
 from utils import stripTags
 
@@ -28,3 +29,39 @@ def torrentsWeLike(link):
     if word in text:
       return True
   return False
+
+def movieType(movie):
+  if 'cam' in movie['title'].lower():
+    return 'cam'
+  if 'vcd' in movie['title'].lower():
+    return 'vcd'
+  for key in ('telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'ts-screener'):
+    if key in movie['title'].lower():
+      return 'telecine'
+  for key in ('dvdrip', 'dvdscrs'):
+    if key in movie['title'].lower():
+      return 'dvdrip'
+  if 'screener' in movie['title'].lower():
+    return 'screener'
+  if 'xvid' in movie['title'].lower():
+    return 'Xvid'
+  if '1080p' in movie['title'].lower():
+    return '1080p'
+  if '720p' in movie['title'].lower():
+    return '720p'
+  if 'dvdr' in movie['title'].lower():
+    return 'DVDR'
+  return ''
+
+def filterMovies(movies):
+  m2 = []
+  for movie in movies:
+    imdb_id = re.compile('title/tt(\d{7})').findall(movie['txt'])
+    if imdb_id:
+      movie['imdb'] = imdb_id[0]
+    else:
+      movie['imdb'] = ''
+    movie['source_type'] = movieType(movie)
+    m2.append(movie)
+  return m2
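A quick sanity check of the two new helpers, as a sketch (the sample title, link, and id below are invented for illustration):

  # hypothetical input; 'txt' normally holds the scraped page text
  movie = dict(title='Some Movie DVDRip XviD', txt='see http://www.imdb.com/title/tt0012345')
  movie = filterMovies([movie])[0]
  print movie['imdb']         # '0012345', captured by the title/tt(\d{7}) pattern
  print movie['source_type']  # 'dvdrip'; movieType() checks cam/vcd/telecine before dvdrip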
@@ -185,7 +185,7 @@ class IMDb:
     title = stripTags(html_title)
     title = re.sub('\(\d\d\d\d\)', '', title)
     title = re.sub('\(\d\d\d\d/I*\)', '', title)
-    for t in ('TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
+    for t in ('TV series', 'TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
       title = title.replace(t, '')
     if title.find(u'\xa0') > -1:
       title = title[:title.find(u'\xa0')]
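The tuple now also strips the plain 'TV series' marker. A small sketch of the cleanup path with an invented title:

  import re
  title = 'Some Show (2004/II) TV series'
  title = re.sub('\(\d\d\d\d/I*\)', '', title)
  for t in ('TV series', 'TV-Series', '(mini)', '(VG)', '(V)', '(TV)'):
    title = title.replace(t, '')
  print title.strip()  # 'Some Show'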
@@ -264,6 +264,10 @@ class IMDb:
     IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
 
     IMDbDict['episodes'] = self.parseEpisodes()
+    if IMDbDict['episodes']:
+      IMDbDict['tvshow'] = True
+    else:
+      IMDbDict['tvshow'] = False
     IMDbDict['credits'] = self.parseCredits()
     IMDbDict['plot'] = self.parsePlot()
     IMDbDict['keywords'] = self.parseKeywords()
@@ -528,7 +532,7 @@ def guess(title, director=''):
     search = 'site:imdb.com %s "%s"' % (director, title)
   else:
     search = 'site:imdb.com "%s"' % title
-  for (name, url, desc) in google(search, 1):
+  for (name, url, desc) in google(search, 2):
     if url.startswith('http://www.imdb.com/title/tt'):
       return url[28:35]
 
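guess() now walks two pages of Google results instead of one. The url[28:35] slice relies on the fixed prefix length of an IMDb title URL; the arithmetic, with an invented id:

  url = 'http://www.imdb.com/title/tt0123456/'
  # len('http://www.imdb.com/title/tt') == 28, so the next seven characters are the id
  print url[28:35]  # '0123456'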
@@ -6,9 +6,10 @@ import socket
 from urllib import quote
 
 from BeautifulSoup import BeautifulSoup
+import feedparser
 
 from utils import read_url, read_url_utf8
-from btutils import torrentsWeLike
+from btutils import torrentsWeLike, filterMovies
 
 socket.setdefaulttimeout(10.0)
 
@@ -47,3 +48,47 @@ def searchByImdb(imdb):
     torrent_url = "http://www.mininova.org%s" % torrent.get('href').replace('/tor', '/get')
     torrents.append(torrent_url)
   return torrents
+
+def getId(s):
+  s = s.split('/')
+  if len(s) == 1:
+    return s[0]
+  else:
+    return s[-1]
+
+def getInfo(mid):
+  mid = getId(mid)
+  comment_link = "http://www.mininova.org/tor/%s" % mid
+  torrent_link = "http://www.mininova.org/get/%s" % mid
+  details_link = "http://www.mininova.org/det/%s" % mid
+  txt = read_url(comment_link) + '\n' + read_url(details_link)
+  txt = txt.decode('utf-8', 'replace')
+  # bail out before the title findall, which would IndexError on the error page
+  if "This torrent does not exist..." in txt:
+    print "This torrent does not exist...", mid
+    return None
+  title = re.compile('<title>(.*?):.*?</title>').findall(txt)[0]
+  movie = dict(
+    title=title,
+    txt=txt,
+    comment_link=comment_link,
+    torrent_link=torrent_link,
+  )
+  return filterMovies([movie,])[0]
+
+def newMovies(preFilter):
+  url = "http://www.mininova.org/rss.xml?cat=4"
+  page = read_url(url)
+  fd = feedparser.parse(page)
+  movies = []
+  for entry in fd.entries:
+    if not preFilter or preFilter(entry):
+      movie = dict(
+        title=entry.title,
+        txt=entry.summary,
+        comment_link=entry.link,
+        torrent_link=entry.link.replace('/tor/','/get/')
+      )
+      movies.append(movie)
+  movies = filterMovies(movies)
+  return movies
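A minimal usage sketch for the new mininova helpers (invented id; network access and the then-current page markup are assumed):

  info = getInfo('http://www.mininova.org/tor/123456')  # getId() also accepts a bare id
  if info:  # None when the torrent no longer exists
    print info['title'], info['source_type'], info['imdb']
  movies = newMovies(torrentsWeLike)  # preFilter is applied to each RSS entry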
@@ -20,7 +20,7 @@ def getRottenTomatoes(rating = 70):
   offset = 0
   titles = ['1']
   while titles:
-    url = "http://www.rottentomatoes.com/movies/browser.php?movietype=1&genre=&tomatometer=&avgrating=%s&numreviews=10&mpaa=&x=40&y=5&start_index=%s" % (rating, offset)
+    url = "http://www.rottentomatoes.com/browser.php?movietype=1&genre=&tomatometer=&avgrating=%s&numreviews=10&mpaa=&x=56&y=10&start_index=%d" % (rating, offset)
     page = read_url(url)
     soup = BeautifulSoup(page)
     titles = [link.contents[0] for link in soup.findAll('a', {'class': 'movie-link'})]
@@ -5,10 +5,12 @@
 import re
 import socket
 from urllib import quote
+from urllib2 import URLError
 
 from BeautifulSoup import BeautifulSoup
+import feedparser
 
-from btutils import torrentsWeLike
+from btutils import torrentsWeLike, filterMovies
 from google import google
 from utils import read_url, read_url_utf8
 
@@ -116,3 +118,40 @@ def search(query, filterResult = False):
 
 def searchByImdb(imdb):
   return search("tt" + imdb)
+
+def getId(pid):
+  if pid.startswith('http://torrents.thepiratebay.org/'):
+    pid = pid.split('org/')[1]
+  if 'tor/' in pid:
+    pid = pid.split('tor/')[1]
+  return pid
+
+def getInfo(piratebayID):
+  piratebayID = getId(piratebayID)
+  url = 'http://thepiratebay.org/tor/%s' % piratebayID
+  try:
+    txt = read_url(url).decode('utf-8', 'replace')
+  except URLError, e:
+    # caveat: only HTTPError carries .code; a non-404 failure leaves txt unbound
+    if e.code == 404:
+      return None
+  title = re.compile('<title>(.*?) \(download torrent\) - TPB</title>').findall(txt)[0]
+  movie = dict(
+    title=title,
+    txt=txt,
+    comment_link=url,
+    torrent_link="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayID, title)
+  )
+  return filterMovies([movie,])[0]
+
+def newMovies(preFilter=None):
+  url = "http://rss.thepiratebay.org/201"
+  page = read_url(url)
+  fd = feedparser.parse(page)
+  movies = []
+  for entry in fd.entries:
+    if not preFilter or preFilter(entry):
+      movie = getInfo(entry.comments)
+      movies.append(movie)
+  movies = filterMovies(movies)
+  return movies
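And the matching sketch for the Pirate Bay helpers (invented id; same assumptions). One caveat: newMovies() appends getInfo()'s result unchecked, so a torrent page that 404s would put a None into the list handed to filterMovies(); guarding the append would avoid that.

  info = getInfo('http://thepiratebay.org/tor/3015359')  # getId() strips the URL prefix
  if info:  # None when the tracker returns a 404
    print info['title'], info['torrent_link']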