diff --git a/scrapeit/opensubtitles.py b/scrapeit/opensubtitles.py index 315f73f..7efc74e 100644 --- a/scrapeit/opensubtitles.py +++ b/scrapeit/opensubtitles.py @@ -3,12 +3,15 @@ # vi:si:et:sw=2:sts=2:ts=2 import utils -import feedparser import StringIO import zipfile import re import socket +from BeautifulSoup import BeautifulSoup +import feedparser +import chardet + def read_url(url): t0 = socket.getdefaulttimeout() socket.setdefaulttimeout(100) @@ -16,7 +19,7 @@ def read_url(url): socket.setdefaulttimeout(t0) return data -def searchSubtitlesByIMDb(imdb, parts = 1, language = "eng"): +def findSubtitlesByIMDb(imdb, parts = 1, language = "eng"): url = "http://www.opensubtitles.org/en/search/sublanguageid-%s/subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (language, parts, imdb) data = read_url(url) fd = feedparser.parse(data) @@ -50,7 +53,26 @@ def extractSubtitles(zip_data): srts[f] = zfile.read(f) return srts - +def loadSrtUnicode(data): + encoding = chardet.detect(data)['encoding'] + try: + udata = unicode(data, encoding) + except: + try: + udata = unicode(data, 'latin-1') + except: + print "failed to detect encoding, giving up" + udata = u'' + return udata + def downloadSubtitleByID(opensubtitle_id): - zip_file = getZipFileLink(opensubtitle_id) - return downloadSubtitle(zip_file) + data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id) + soup = BeautifulSoup(data) + srts = {} + for a in soup('a', {'href': re.compile('download/file')}): + download_url = 'http://www.opensubtitles.org' + a['href'] + file_name = a.contents[-1].split('\n')[0].strip() + data = loadSrtUnicode(read_url(download_url)) + if data: + srts[file_name] = data + return srts