scrapeit/scrapeit/opensubtitles.py
2007-08-11 15:47:43 +00:00

80 lines
2.1 KiB
Python

# -*- coding: utf-8 -*-
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
import utils
import StringIO
import zipfile
import re
import socket
from BeautifulSoup import BeautifulSoup
import feedparser
import chardet
def read_url(url):
  """Fetch *url* through utils.read_url with a temporary 100 second socket timeout.

  The process-wide default socket timeout is raised for the duration of the
  request and put back to its previous value afterwards, whether the request
  succeeds or raises.

  Returns whatever utils.read_url returns for *url*; exceptions raised by
  utils.read_url propagate to the caller.
  """
  previous_timeout = socket.getdefaulttimeout()
  socket.setdefaulttimeout(100)
  try:
    return utils.read_url(url)
  finally:
    # Restore on failure too; otherwise one failed download would leave the
    # 100 second default in place for every socket created afterwards.
    socket.setdefaulttimeout(previous_timeout)
def findSubtitlesByIMDb(imdb, parts=1, language="eng"):
  """Look up the opensubtitles.org subtitle id for an IMDb id.

  Queries the site's RSS search feed for srt subtitles and takes the first
  entry of the result.

  imdb     -- value substituted into the feed's ``imdbid-`` filter
  parts    -- value substituted into the ``subsumcd-`` filter
              (presumably the number of CDs/parts; confirm against the site)
  language -- value substituted into the ``sublanguageid-`` filter

  Returns the id found between ``subtitles/`` and the next ``/`` in the first
  entry's first link, or None when the feed has no entries or the link does
  not contain such a segment.
  """
  url = "http://www.opensubtitles.org/en/search/sublanguageid-%s/subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (language, parts, imdb)
  fd = feedparser.parse(read_url(url))
  if not fd.entries:
    return None
  link = fd.entries[0]['links'][0]['href']
  match = re.search(r'subtitles/(.*?)/', link)
  if match is None:
    # Always report "not found" as None (a non-matching link used to yield an
    # empty list here).
    return None
  return match.group(1)
def getZipFileLink(opensubtitle_id):
  """Return the opensubtitles.org download URL for the given subtitle id."""
  link_template = "http://www.opensubtitles.org/en/download/sub/%s"
  return link_template % opensubtitle_id
def downloadSubtitle(zip_link):
  """Fetch *zip_link* with read_url and return the result of extractSubtitles on it."""
  zip_data = read_url(zip_link)
  return extractSubtitles(zip_data)
def extractSubtitles(zip_data):
  """Extract subtitle files from the raw bytes of a zip archive.

  Selection rule:
    * if exactly one member name does not end in 'nfo', that member is
      returned whatever its extension;
    * otherwise every member whose name ends in '.srt' is returned.

  zip_data -- the complete zip archive as a byte string

  Returns a dict mapping member name to that member's raw content. Errors
  raised by zipfile for invalid archive data propagate to the caller.
  """
  # Local import: the zip payload is binary, and io.BytesIO is the byte buffer
  # type; imported here so the block does not depend on the file's import list.
  import io

  srts = {}
  zfile = zipfile.ZipFile(io.BytesIO(zip_data))
  try:
    names = zfile.namelist()
    # A list (not filter()) so len() is valid regardless of filter semantics.
    candidates = [name for name in names if not name.endswith('nfo')]
    if len(candidates) == 1:
      wanted = candidates
    else:
      wanted = [name for name in names if name.endswith('.srt')]
    for name in wanted:
      srts[name] = zfile.read(name)
  finally:
    # Release the archive handle even if reading a member fails.
    zfile.close()
  return srts
def loadSrtUnicode(data):
  """Decode subtitle bytes to a unicode string.

  The encoding reported by chardet.detect is tried first; if none is reported
  or decoding with it fails, 'latin-1' is tried. If that fails as well, a
  message is printed and an empty unicode string is returned.

  data -- the raw subtitle content as a byte string

  Returns the decoded unicode text, or u'' when decoding was not possible.
  """
  encoding = chardet.detect(data)['encoding']
  if encoding:
    try:
      return unicode(data, encoding)
    except (UnicodeError, LookupError, TypeError):
      # Wrong guess, unknown codec name or undecodable input: fall back below.
      # Only decode-related errors are caught so that KeyboardInterrupt and
      # SystemExit are no longer swallowed.
      pass
  try:
    return unicode(data, 'latin-1')
  except (UnicodeError, TypeError):
    # Parenthesised single argument: prints the same text under the Python 2
    # print statement.
    print("failed to detect encoding, giving up")
    return u''
def downloadSubtitleByID(opensubtitle_id):
  """Download the subtitle files linked from an opensubtitles.org subtitle page.

  Fetches http://www.opensubtitles.org/en/subtitles/<opensubtitle_id>, follows
  every anchor whose href matches 'download/file', and decodes each download
  with loadSrtUnicode.

  Returns a dict mapping "<3-digit counter>_<file name>" to the decoded
  subtitle text. Downloads that decode to an empty value are left out.
  """
  data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
  soup = BeautifulSoup(data)
  srts = {}
  c = 0
  for a in soup('a', {'href': re.compile('download/file')}):
    download_url = 'http://www.opensubtitles.org' + a['href']
    # File name: first line of the anchor's last child, stripped of whitespace.
    file_name = a.contents[-1].split('\n')[0].strip()
    # Rebinds `data`: from here on it holds the decoded subtitle, not the page.
    data = loadSrtUnicode(read_url(download_url))
    if data:
      srts["%03d_%s" %(c, file_name)] = data
      # NOTE(review): the source's indentation was lost; the counter is assumed
      # to advance only for stored subtitles. Confirm it was not meant to count
      # every link (i.e. sit one level out).
      c += 1
  return srts