(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
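+    # row = (date, extra cell, torrent id, title); the extra cell is only
+    # checked for the priv.gif marker that flags private trackers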
+ for row in re.compile(regexp, re.DOTALL).findall(data):
+ torrentDate = row[0]
+ torrentExtra = row[1]
+ torrentId = row[2]
+ torrentTitle = decodeHtml(row[3]).strip()
+ torrentLink = "http://www.mininova.org/tor/" + torrentId
+ privateTracker = 'priv.gif' in torrentExtra
+ if not privateTracker:
+ results.append((torrentTitle, torrentLink, ''))
+ return results
+
+def findMovie(query, max_results=10):
+ '''search for torrents on mininova
+ '''
+ url = "http://www.mininova.org/search/%s/seeds" % quote(query)
+ data = readUrlUnicode(url)
+ return _parseResultsPage(data, max_results)
+
+def findMovieByImdb(imdbId):
+ '''find torrents on mininova for a given imdb id
+ '''
+ results = []
+ imdbId = normalizeImdbId(imdbId)
+ data = readUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
+ return _parseResultsPage(data)
+
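+# getId accepts either a bare mininova id or a full torrent URL and returns
+# just the numeric id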
+def getId(mininovaId):
+ mininovaId = unicode(mininovaId)
+ d = findRe(mininovaId, "/(\d+)")
+ if d:
+ return d
+ mininovaId = mininovaId.split('/')
+ if len(mininovaId) == 1:
+ return mininovaId[0]
+ else:
+ return mininovaId[-1]
+
+def exists(mininovaId):
+ mininovaId = getId(mininovaId)
+ data = ox.net.readUrl("http://www.mininova.org/tor/%s" % mininovaId)
+ if not data or 'Torrent not found...' in data:
+ return False
+ if 'tracker of this torrent requires registration.' in data:
+ return False
+ return True
+
+def getData(mininovaId):
+ _key_map = {
+ 'by': u'uploader',
+ }
+ mininovaId = getId(mininovaId)
+ torrent = dict()
+ torrent[u'id'] = mininovaId
+ torrent[u'domain'] = 'mininova.org'
+ torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
+ torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
+ torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
+
+ data = readUrlUnicode(torrent['comment_link']) + readUrlUnicode(torrent['details_link'])
+    if '<h1>Torrent not found...</h1>' in data:
+ return None
+
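+    # the detail pages list most fields as "<label>: <value>" pairs; scrape
+    # them generically and normalize a few label names via _key_map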
+    for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
+ key = d[0].lower().strip()
+ key = _key_map.get(key, key)
+ value = decodeHtml(stripTags(d[1].strip()))
+ torrent[key] = value
+
+    torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
+ torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
+    torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
+ if torrent['description']:
+ torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
+ t = readUrl(torrent[u'torrent_link'])
+ torrent[u'torrent_info'] = getTorrentInfo(t)
+ return torrent
+
+class Mininova(Torrent):
+ '''
+ >>> Mininova('123')
+ {}
+ >>> Mininova('1072195')['infohash']
+ '72dfa59d2338e4a48c78cec9de25964cddb64104'
+ '''
+ def __init__(self, mininovaId):
+ self.data = getData(mininovaId)
+ if not self.data:
+ return
+ Torrent.__init__(self)
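+        # 'share ratio' and 'downloads' come from the scraped details table;
+        # the ratio field is assumed to read "<seeds>, <leechers>"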
+ ratio = self.data['share ratio'].split(',')
+ self['seeder'] = -1
+ self['leecher'] = -1
+ if len(ratio) == 2:
+ val = intValue(ratio[0].replace(',','').strip())
+ if val:
+ self['seeder'] = int(val)
+ val = intValue(ratio[1].replace(',','').strip())
+ if val:
+ self['leecher'] = int(val)
+ val = intValue(self.data['downloads'].replace(',','').strip())
+ if val:
+ self['downloaded'] = int(val)
+ else:
+ self['downloaded'] = -1
+ published = self.data['added on']
+ published = published.split(' +')[0]
+ self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")
+
diff --git a/ox/web/movieposterdb.py b/ox/web/movieposterdb.py
new file mode 100644
index 0000000..0068123
--- /dev/null
+++ b/ox/web/movieposterdb.py
@@ -0,0 +1,44 @@
+# -*- coding: UTF-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+
+import re
+
+from ox.cache import readUrlUnicode
+from ox import findRe
+
+def getData(id):
+ '''
+ >>> getData('0060304')['posters'][0]
+ u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg'
+ >>> getData('0123456')['posters']
+ []
+ '''
+ data = {
+ "url": getUrl(id)
+ }
+ data["posters"] = getPostersByUrl(data["url"])
+ return data
+
+def getId(url):
+ return url.split("/")[-2]
+
+def getPostersByUrl(url, group=True):
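+    # with group=True, first collect the poster group pages linked from the
+    # movie page and recurse into each one, then pick up individual poster pages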
+ posters = []
+ html = readUrlUnicode(url)
+ if url in html:
+ if group:
+            results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)"', re.DOTALL).findall(html)
+ for result in results:
+ posters += getPostersByUrl(result, False)
+        results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)"', re.DOTALL).findall(html)
+ for result in results:
+ html = readUrlUnicode(result)
+ posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
+ return posters
+
+def getUrl(id):
+ return "http://www.movieposterdb.com/movie/%s/" % id
+
+if __name__ == '__main__':
+ print getData('0060304')
+ print getData('0133093')
diff --git a/ox/web/opensubtitles.py b/ox/web/opensubtitles.py
new file mode 100644
index 0000000..2986f7d
--- /dev/null
+++ b/ox/web/opensubtitles.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+import re
+
+import feedparser
+from ox.cache import readUrl, readUrlUnicode
+from ox import findRe, stripTags
+from ox import langCode2To3, langTo3Code
+
+def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
+ if len(language) == 2:
+ language = langCode2To3(language)
+ elif len(language) != 3:
+ language = langTo3Code(language)
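+    # opensubtitles expects ISO 639-2 three-letter language codes in its URLs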
+ url = "http://www.opensubtitles.org/en/search/"
+ if language:
+ url += "sublanguageid-%s/" % language
+ url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
+    data = readUrl(url)
+    return findRe(data, '/en/subtitles/(.*?)/')
+
+def downloadSubtitleById(opensubtitle_id):
+    srts = {}
+    data = readUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
+    reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
+    for f in re.compile(reg_exp, re.DOTALL).findall(data):
+ name = stripTags(f[1]).split('\n')[0]
+ url = "http://www.opensubtitles.com%s" % f[0]
+ srts[name] = readUrlUnicode(url)
+ return srts
+
diff --git a/ox/web/oxdb.py b/ox/web/oxdb.py
new file mode 100644
index 0000000..8443c94
--- /dev/null
+++ b/ox/web/oxdb.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+import ox.cache
+
+def getPosterUrl(id):
+ url = "http://0xdb.org/%s/poster.0xdb.jpg" % id
+ if ox.cache.exists(url):
+ return url
+ return ''
+
diff --git a/ox/web/piratecinema.py b/ox/web/piratecinema.py
new file mode 100644
index 0000000..5a58721
--- /dev/null
+++ b/ox/web/piratecinema.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+import ox.cache
+from ox.cache import exists
+
+
+def getPosterUrl(id):
+ url = "http://piratecinema.org/posters/%s/%s.jpg" % (id[:4], id)
+ if ox.cache.exists(url):
+ return url
+ return ''
+
diff --git a/ox/web/rottentomatoes.py b/ox/web/rottentomatoes.py
new file mode 100644
index 0000000..1a8106a
--- /dev/null
+++ b/ox/web/rottentomatoes.py
@@ -0,0 +1,34 @@
+# -*- coding: UTF-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+import re
+
+from ox.cache import getHeaders, readUrl, readUrlUnicode
+from ox import findRe, stripTags
+
+
+def readUrlByImdb(imdb):
+    #this would also work, but does not cache:
+ '''
+ from urllib2 import urlopen
+ u = urlopen(url)
+ return u.url
+ '''
+ url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
+ data = readUrl(url)
+ if "movie_title" in data:
+ movies = re.compile('(/m/.*?/)').findall(data)
+ if movies:
+ return "http://www.rottentomatoes.com" + movies[0]
+ return None
+
+def getData(url):
+ data = readUrlUnicode(url)
+ r = {}
+    r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
+ if '(' in r['title']:
+ r['year'] = findRe(r['title'], '\((\d*?)\)')
+ r['title'] = re.sub('\((\d*?)\)', '', r['title']).strip()
+    r['synopsis'] = findRe(data, '<span id="movieSynopsis">(.*?)</span>')
+    r['average rating'] = findRe(data, '<div id="bubble_allCritics">(.*?)</div>').strip()
+ return r
+
diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py
new file mode 100644
index 0000000..ce5ee33
--- /dev/null
+++ b/ox/web/siteparser.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+import re
+from datetime import datetime
+
+from ox.cache import readUrlUnicode
+from ox import stripTags, decodeHtml
+
+
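+# cleanup() decodes and strips the raw findall() results, collapses nested
+# single-element lists, and coerces the value to the declared data_type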
+def cleanup(key, data, data_type):
+ if data:
+ if isinstance(data[0], basestring):
+ #FIXME: some types need stripTags
+ #data = [stripTags(decodeHtml(p)).strip() for p in data]
+ data = [decodeHtml(p).strip() for p in data]
+ elif isinstance(data[0], list) or isinstance(data[0], tuple):
+ data = [cleanup(key, p, data_type) for p in data]
+ while len(data) == 1:
+ data = data[0]
+ if data_type == 'list' and isinstance(data, basestring):
+ data = [data, ]
+ elif data_type != 'list':
+ data = ''
+ return data
+
+class SiteParser(dict):
+ baseUrl = ''
+ regex = {}
+
+ def getUrl(self, page):
+ return "%s%s" % (self.baseUrl, page)
+
+ def __init__(self):
+ for key in self.regex:
+ url = self.getUrl(self.regex[key]['page'])
+ data = readUrlUnicode(url)
+ if isinstance(self.regex[key]['re'], basestring):
+ data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
+ data = cleanup(key, data, self.regex[key]['type'])
+ else:
+ for r in self.regex[key]['re']:
+ if isinstance(data, basestring):
+ data = re.compile(r, re.DOTALL).findall(data)
+ else:
+ data = [re.compile(r, re.DOTALL).findall(d) for d in data]
+ data = cleanup(key, data, self.regex[key]['type'])
+ def apply_f(f, data):
+ if data and isinstance(data[0], list):
+ data = [f(d) for d in data]
+ else:
+ data = f(data)
+ return data
+ if self.regex[key]['type'] == 'float':
+ data = apply_f(float, data)
+ elif self.regex[key]['type'] == 'int':
+ data = apply_f(int, data)
+ elif self.regex[key]['type'] == 'date':
+ parse_date = lambda d: datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
+ data = apply_f(parse_date, data)
+ self[key] = data
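+
+# A minimal usage sketch (hypothetical subclass, not part of this module):
+#
+#   class Example(SiteParser):
+#       baseUrl = 'http://www.example.com/'
+#       regex = {
+#           'title': {'page': 'index.html', 're': '<title>(.*?)</title>', 'type': 'string'},
+#           'year': {'page': 'index.html', 're': '\((\d{4})\)', 'type': 'int'},
+#       }
+#
+# Each entry names the page to fetch relative to baseUrl, one regex (or a list
+# of regexes applied in sequence) and the type handled by cleanup()/apply_f.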
+
diff --git a/ox/web/spiegel.py b/ox/web/spiegel.py
new file mode 100644
index 0000000..855ec47
--- /dev/null
+++ b/ox/web/spiegel.py
@@ -0,0 +1,292 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+from datetime import datetime
+import re
+import time
+
+import ox.cache
+from ox.html import decodeHtml, stripTags
+import ox.net
+
+
+def getNews(year, month, day):
+ sections = [
+ 'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
+ 'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
+ ]
+ dt = datetime(year, month, day)
+ day = int(dt.strftime('%j'))
+ date = dt.strftime('%d.%m.%Y')
+ news = []
+ for section in sections:
+ url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
+ if date == time.strftime('%d.%m.%Y', time.localtime()):
+ html = ox.net.readUrl(url)
+ else:
+ html = ox.cache.readUrl(url)
+        for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
+            dateString = stripTags(re.compile('<div class="spDateTime(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
+            try:
+                description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
+            except:
+                description = ''
+            try:
+                imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
+            except:
+                imageUrl = ''
+ if new['title1'][-1:] == ':':
+ new['title1'] = new['title1'][0:-1]
+ new['title2'] = new['title'][len(new['title1']) + 2:]
+            new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
+            news.append(new)
+    return news
+
+def formatString(string):
+    string = string.replace('\n', ' ').replace('  ', ' ').strip()
+    string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
+    return string
+
+def formatSection(string):
+ return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
+
+def formatSubsection(string):
+ # SPIEGEL, SPIEGEL special
+ subsection = {
+ 'abi': 'Abi - und dann?',
+ 'formel1': 'Formel 1',
+ 'jobundberuf': 'Job & Beruf',
+ 'leben': 'Leben U21',
+ 'mensch': 'Mensch & Technik',
+ 'sonst': '',
+        'staedte': u'St\xe4dte',
+ 'ussports': 'US-Sports',
+ 'wunderbar': 'wunderBAR'
+ }
+ if subsection.has_key(string):
+        return subsection[string].replace(u'\xe4', 'ae')
+ return string[:1].upper() + string[1:]
+
+def getIssue(year, week):
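+    # epaper page scans follow the pattern ROSPANZ<year><week:03d><page:04d>-<size>.jpg;
+    # page 0001 at size 312 is the cover, the 205 variants below are the inner pages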
+ coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
+ if not ox.net.exists(coverUrl):
+ return None
+ url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
+ contents = []
+ data = ox.cache.readUrl(url)
+ items = re.compile('(.*?)').findall(data)
+ for item in items:
+ item = item[1]
+ page = int(re.compile('&SE=(.*?)"').findall(item)[0])
+ title = stripTags(item).strip()
+ contents.append({'title': title, 'page': page})
+ pageUrl = {}
+ pages = page + 2
+ for page in range(1, pages + 10):
+ url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
+ if ox.cache.exists(url):
+ pageUrl[page] = url
+ else:
+ pageUrl[page] = ''
+ return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
+
+
+def archiveIssues():
+ '''
+ this is just an example of an archiving application
+ '''
+ p = {}
+ import os
+ import simplejson
+ import time
+ archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
+ localtime = time.localtime()
+ year = int(time.strftime('%Y', localtime))
+ week = int(time.strftime('%W', localtime))
+ for y in range(year, 1993, -1):
+ if y == year:
+ wMax = week + 1
+ else:
+ wMax = 53
+ for w in range(wMax, 0, -1):
+ print 'getIssue(%d, %d)' % (y, w)
+ issue = getIssue(y, w)
+ if issue:
+ dirname = '%s/%d/%02d' % (archivePath, y, w)
+ if not os.path.exists(dirname):
+ os.makedirs(dirname)
+ filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
+ if not os.path.exists(filename):
+ data = simplejson.dumps(issue, ensure_ascii = False)
+ f = open(filename, 'w')
+ f.write(data)
+ f.close()
+ filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
+ if not os.path.exists(filename):
+ data = []
+ for item in issue['contents']:
+ data.append('%3d %s' % (item['page'], item['title']))
+ data = '\n'.join(data)
+ f = open(filename, 'w')
+ f.write(data)
+ f.close()
+ filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
+ if not os.path.exists(filename):
+ data = ox.cache.readUrl(issue['coverUrl'])
+ f = open(filename, 'w')
+ f.write(data)
+ f.close()
+ for page in issue['pageUrl']:
+ url = issue['pageUrl'][page]
+ if url:
+ filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
+ if not os.path.exists(filename):
+ data = ox.cache.readUrl(url)
+ f = open(filename, 'w')
+ f.write(data)
+ f.close()
+ if not p:
+ p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
+ else:
+ p['num'] += 1
+ p['sum'] += issue['pages']
+ if issue['pages'] < p['min']:
+ p['min'] = issue['pages']
+ if issue['pages'] > p['max']:
+ p['max'] = issue['pages']
+ print p['min'], p['sum'] / p['num'], p['max']
+
+
+def archiveNews():
+ '''
+ this is just an example of an archiving application
+ '''
+ import os
+ import simplejson
+ import time
+
+ count = {}
+ colon = []
+
+ archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
+ days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
+ localtime = time.localtime()
+ year = int(time.strftime('%Y', localtime))
+ month = int(time.strftime('%m', localtime))
+ day = int(time.strftime('%d', localtime)) - 1
+ for y in range(year, 1999, -1):
+ if y == year:
+ mMax = month
+ else:
+ mMax = 12
+ for m in range(mMax, 0, -1):
+ if y == year and m == month:
+ dMax = day
+            elif m == 2 and y % 4 == 0 and (y % 100 != 0 or y % 400 == 0):
+ dMax = days[m] + 1
+ else:
+ dMax = days[m]
+ for d in range(dMax, 0, -1):
+ print 'getNews(%d, %d, %d)' % (y, m, d)
+                news = getNews(y, m, d)
+ for new in news:
+ dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
+ if not os.path.exists(dirname):
+ os.makedirs(dirname)
+ if new['url'][-5:] == '.html':
+ filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
+ else:
+ filename = dirname + '/' + new['url'] + '.json'
+ if not os.path.exists(filename) or True:
+ data = simplejson.dumps(new, ensure_ascii = False)
+ f = open(filename, 'w')
+ f.write(data)
+ f.close()
+ filename = filename[:-5] + '.txt'
+ if not os.path.exists(filename) or True:
+ data = splitTitle(new['title'])
+ data.append(new['description'])
+ data = '\n'.join(data)
+ f = open(filename, 'w')
+ f.write(data)
+ f.close()
+ filename = dirname + '/' + new['imageUrl'].split('/')[-1]
+ if not os.path.exists(filename):
+ data = ox.cache.readUrl(new['imageUrl'])
+ f = open(filename, 'w')
+ f.write(data)
+ f.close()
+
+ strings = new['url'].split('/')
+ string = strings[3]
+ if len(strings) == 6:
+ string += '/' + strings[4]
+ if not count.has_key(string):
+ count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
+ else:
+ count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
+ strings = splitTitle(new['title'])
+ if strings[0] != new['title1'] or strings[1] != new['title2']:
+ colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
+ for key in sortDictByKey(count):
+ print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
+ for value in colon:
+ print value
+
+def sortDictByKey(d):
+ keys = d.keys()
+ keys.sort()
+ return keys
+
+if __name__ == '__main__':
+ # spiegel = Spiegel(2008, 8)
+ # print spiegel.getContents()
+ # news = News(2001, 9, 10)
+ # output(news.getNews())
+ '''
+ x = []
+ for d in range(10, 30):
+ print '2/%d' % d
+ news = getNews(2008, 2, d)
+ for new in news:
+ strings = new['url'].split('/')
+ string = formatSection(strings[3])
+ if len(strings) == 6:
+ string += '/' + formatSubsection(strings[4])
+ if not string in x:
+ x.append(string)
+ print x
+ '''
+ # archiveIssues()
+ archiveNews()
diff --git a/ox/web/thepiratebay.py b/ox/web/thepiratebay.py
new file mode 100644
index 0000000..4202a4d
--- /dev/null
+++ b/ox/web/thepiratebay.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+from datetime import datetime
+import re
+import socket
+from urllib import quote, urlencode
+from urllib2 import URLError
+
+from ox.cache import readUrl, readUrlUnicode
+from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
+from ox.normalize import normalizeImdbId
+import ox
+
+from torrent import Torrent
+
+cache_timeout = 24*60*60 # cache search only for 24 hours
+
+season_episode = re.compile("S..E..", re.IGNORECASE)
+
+
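+# wrap the cache readers so every request carries an English-language cookie;
+# the regexes below assume the English version of the site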
+def _readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
+ headers = headers.copy()
+ headers['Cookie'] = 'language=en_EN'
+ return cache.readUrl(url, data, headers, timeout)
+
+def _readUrlUnicode(url, timeout=cache.cache_timeout):
+ return cache.readUrlUnicode(url, _readUrl=_readUrl, timeout=timeout)
+
+def findMovies(query, max_results=10):
+ results = []
+ next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
+ page_count = 1
+ while next and page_count < 4:
+ page_count += 1
+ url = next[0]
+ if not url.startswith('http'):
+ if not url.startswith('/'):
+ url = "/" + url
+ url = "http://thepiratebay.org" + url
+ data = _readUrlUnicode(url, timeout=cache_timeout)
+        regexp = '''<td class="vertTh">.*?<a href="/browse/(.*?)".*?<td><a href="(.*?)" class="detLink.*?>(.*?)</a>.*?</td>.*?</tr>'''
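+        # row = (category id, detail page path, title)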
+ for row in re.compile(regexp, re.DOTALL).findall(data):
+ torrentType = row[0]
+ torrentLink = "http://thepiratebay.org" + row[1]
+ torrentTitle = decodeHtml(row[2])
+ # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
+ if torrentType in ['201']:
+ results.append((torrentTitle, torrentLink, ''))
+ if len(results) >= max_results:
+ return results
+        next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
+ return results
+
+def findMovieByImdb(imdb):
+ return findMovies("tt" + normalizeImdbId(imdb))
+
+def getId(piratebayId):
+ if piratebayId.startswith('http://torrents.thepiratebay.org/'):
+ piratebayId = piratebayId.split('org/')[1]
+ d = findRe(piratebayId, "tor/(\d+)")
+ if d:
+ piratebayId = d
+ d = findRe(piratebayId, "torrent/(\d+)")
+ if d:
+ piratebayId = d
+ return piratebayId
+
+def exists(piratebayId):
+ piratebayId = getId(piratebayId)
+ return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
+
+def getData(piratebayId):
+ _key_map = {
+ 'spoken language(s)': u'language',
+ 'texted language(s)': u'subtitle language',
+ 'by': u'uploader',
+ 'leechers': 'leecher',
+ 'seeders': 'seeder',
+ }
+ piratebayId = getId(piratebayId)
+ torrent = dict()
+ torrent[u'id'] = piratebayId
+ torrent[u'domain'] = 'thepiratebay.org'
+ torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
+
+ data = _readUrlUnicode(torrent['comment_link'])
+ torrent[u'title'] = findRe(data, '(.*?) \(download torrent\) - TPB')
+ if not torrent[u'title']:
+ return None
+ torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
+ torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
+ title = quote(torrent['title'].encode('utf-8'))
+ torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
+    for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
+ key = d[0].lower().strip()
+ key = _key_map.get(key, key)
+ value = decodeHtml(stripTags(d[1].strip()))
+ torrent[key] = value
+    torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
+ if torrent[u'description']:
+ torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
+ t = _readUrl(torrent[u'torrent_link'])
+ torrent[u'torrent_info'] = getTorrentInfo(t)
+ return torrent
+
+class Thepiratebay(Torrent):
+ '''
+ >>> Thepiratebay('123')
+ {}
+
+ >>> Thepiratebay('3951349')['infohash']
+ '4e84415d36ed7b54066160c05a0b0f061898d12b'
+ '''
+ def __init__(self, piratebayId):
+ self.data = getData(piratebayId)
+ if not self.data:
+ return
+ Torrent.__init__(self)
+ published = self.data['uploaded']
+ published = published.replace(' GMT', '').split(' +')[0]
+ self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
+
diff --git a/ox/web/torrent.py b/ox/web/torrent.py
new file mode 100644
index 0000000..68cd274
--- /dev/null
+++ b/ox/web/torrent.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+from ox import intValue
+
+
+class Torrent(dict):
+ '''
+ >>> Torrent()
+ {'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
+ '''
+ _string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
+ 'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
+ _int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
+ _dict_keys = ('torrent_info', )
+ _list_keys = ()
+ data = {'torrent_info': {}}
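+    # subclasses (Mininova, Thepiratebay, ...) replace self.data with the
+    # scraped dict before calling Torrent.__init__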
+
+ def __init__(self):
+ for key in self._string_keys:
+ self[key] = self.data.get(key, u'')
+ for key in self._dict_keys:
+ self[key] = self.data.get(key, {})
+ for key in self._list_keys:
+ self[key] = self.data.get(key, [])
+ for key in self._int_keys:
+ value = self.data.get(key, -1)
+ if not isinstance(value, int):
+ value = int(intValue(value))
+ self[key] = value
+ self['infohash'] = self.data['torrent_info'].get('hash', '')
+ self['size'] = self.data['torrent_info'].get('size', -1)
+ self['announce'] = self.data['torrent_info'].get('announce', '')
+ if 'files' in self.data['torrent_info']:
+ self['files'] = len(self.data['torrent_info']['files'])
+ else:
+ self['files'] = 1
+
diff --git a/ox/web/tv.py b/ox/web/tv.py
new file mode 100644
index 0000000..3808bbd
--- /dev/null
+++ b/ox/web/tv.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+import re
+import time
+
+from ox import stripTags, findRe
+from ox.cache import readUrlUnicode
+
+
+def getEpisodeData(url):
+ '''
+    parses information on tv.com episode pages
+ returns dict with title, show, description, score
+ example:
+ getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
+ '''
+ data = readUrlUnicode(url)
+ r = {}
+    r['description'] = stripTags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div>'))
+    r['show'] = findRe(data, '<h1>(.*?)</h1>')
+ r['title'] = findRe(data, '.*?: (.*?) - TV.com ')
+ #episode score
+ r['episode score'] = findRe(data, '(.*?)')
+
+ match = re.compile('Episode Number: (\d*?) Season Num: (\d*?) First Aired: (.*?)  ').findall(data)
+ if match:
+ r['season'] = int(match[0][1])
+ r['episode'] = int(match[0][0])
+ #'Wednesday September 29, 2004' -> 2004-09-29
+ r['air date'] = time.strftime('%Y-%m-%d', time.strptime(match[0][2], '%A %B %d, %Y'))
+ return r
+
diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py
new file mode 100644
index 0000000..3d99688
--- /dev/null
+++ b/ox/web/wikipedia.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+from urllib import urlencode
+
+import simplejson
+from ox.cache import readUrl, readUrlUnicode
+from ox import findRe, decodeHtml
+
+
+def getId(url):
+ return url.split("/")[-1]
+
+def getUrl(id):
+ return "http://en.wikipedia.org/wiki/%s" % id
+
+
+def getMovieId(title, director='', year=''):
+ query = '"%s" film %s %s' % (title, director, year)
+ result = find(query, 1)
+ if result:
+ return result[0][1]
+ return ''
+
+def getUrlByImdbId(imdbId):
+ query = '"%s"'% imdbId
+ result = find(query)
+ if result:
+ url = result[0][1]
+ return url
+ return ""
+
+def getUrlByImdb(imdbId):
+ # deprecated, use getUrlByImdbId()
+ return getUrlByImdbId(imdbId)
+
+def getUrlByAllmovieId(allmovieId):
+ query = '"amg_id = 1:%s"'% allmovieId
+ result = find(query)
+ if result:
+ url = result[0][1]
+ return url
+ return ''
+
+def getWikiData(wikipediaUrl):
+ url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
+ url = "%s&action=raw" % url
+ data = readUrlUnicode(url)
+ return data
+
+def getMovieData(wikipediaUrl):
+ if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl)
+ data = getWikiData(wikipediaUrl)
+ filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''')
+ filmbox = {}
+ _box = filmbox_data.strip().split('\n|')
+ if len(_box) == 1:
+ _box = _box[0].split('|\n')
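+    # infobox fields are usually separated by '\n|'; fall back to '|\n' for
+    # templates written with the pipe at the end of the line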
+ for row in _box:
+ d = row.split('=')
+ if len(d) == 2:
+ key = d[0].strip()
+ if key[0] == '|':
+ key = key[1:]
+ value = d[1].strip()
+ filmbox[key] = value
+ if 'imdb title' in data:
+ filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|')
+ elif 'imdb episode' in data:
+ filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|')
+ if 'Amg movie' in data:
+ filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|')
+ if 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'):
+ filmbox['amg_id'] = filmbox['amg_id'][2:]
+
+ if 'rotten-tomatoes' in data:
+ filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|')
+ if not filmbox['rottentomatoes_id']:
+ filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|')
+ if 'google video' in data:
+ filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)\|')
+ if 'DEFAULTSORT' in data:
+ filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
+ return filmbox
+
+def getImageUrl(name):
+ data = readUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name)
+ url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
+ return url
+
+def getPosterUrl(wikipediaUrl):
+ if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl)
+ data = getMovieData(wikipediaUrl)
+ if 'image' in data:
+ return getImageUrl(data['image'])
+ return ''
+
+def getMoviePoster(wikipediaUrl):
+ # deprecated, use getPosterUrl()
+ return getPosterUrl(wikipediaUrl)
+
+def getAllmovieId(wikipediaUrl):
+ data = getMovieData(wikipediaUrl)
+ return data.get('amg_id', '')
+
+def find(query, max_results=10):
+ query = {'action': 'query', 'list':'search', 'format': 'json',
+ 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
+ url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
+ data = readUrl(url)
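+    # an empty response usually means a stale or empty cache entry;
+    # timeout=0 retries with a fresh, uncached request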
+ if not data:
+ data = readUrl(url, timeout=0)
+ result = simplejson.loads(data)
+ results = []
+ if result and 'query' in result:
+ for r in result['query']['search']:
+ title = r['title']
+ url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
+ results.append((title, url, ''))
+ return results
+
diff --git a/ox/web/youtube.py b/ox/web/youtube.py
new file mode 100644
index 0000000..5042c59
--- /dev/null
+++ b/ox/web/youtube.py
@@ -0,0 +1,107 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+from urllib import quote, unquote
+import httplib
+import xml.etree.ElementTree as ET
+import re
+
+import feedparser
+from ox.cache import readUrl, readUrlUnicode
+from ox import findString, findRe
+
+
+def getVideoKey(youtubeId):
+ data = readUrl("http://www.youtube.com/get_video_info?&video_id=%s" % youtubeId)
+ match = re.compile("token=(.+)&thumbnail").findall(data)
+ if match:
+ return unquote(match[0])
+ return False
+
+def getVideoUrl(youtubeId, format='mp4'):
+ youtubeKey = getVideoKey(youtubeId)
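+    # fmt values are YouTube's legacy format codes:
+    # 37 = 1080p MP4, 22 = 720p MP4, 18 = 360p MP4, 35 = 480p FLV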
+ if format == '1080p':
+ fmt=37
+ url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt)
+    elif format == '720p':
+ fmt=22
+ url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt)
+ elif format == 'mp4':
+ fmt=18
+ url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt)
+ elif format == 'high':
+ fmt=35
+ url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt)
+ else:
+ url = "http://youtube.com/get_video.php?video_id=%s&t=%s" % (youtubeId, youtubeKey)
+ return url
+
+def getMovieInfo(youtubeId, video_url_base=None):
+ url = "http://gdata.youtube.com/feeds/api/videos/%s" % youtubeId
+ data = readUrl(url)
+ fd = feedparser.parse(data)
+ return getInfoFromAtom(fd.entries[0], video_url_base)
+
+def getInfoFromAtom(entry, video_url_base=None):
+ info = dict()
+ info['title'] = entry['title']
+ info['description'] = entry['description']
+ info['author'] = entry['author']
+ #info['published'] = entry['published_parsed']
+ if 'media_keywords' in entry:
+ info['keywords'] = entry['media_keywords'].split(', ')
+ info['url'] = entry['links'][0]['href']
+ info['id'] = findString(info['url'], "/watch?v=")
+ info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id']
+ if video_url_base:
+ info['flv'] = "%s/%s.%s" % (video_url_base, info['id'], 'flv')
+ info['mp4'] = "%s/%s.%s" % (video_url_base, info['id'], 'mp4')
+ else:
+ info['flv'] = getVideoUrl(info['id'], 'flv')
+ info['flv_high'] = getVideoUrl(info['id'], 'high')
+ info['mp4'] = getVideoUrl(info['id'], 'mp4')
+ info['720p'] = getVideoUrl(info['id'], '720p')
+ info['1080p'] = getVideoUrl(info['id'], '1080p')
+    info['embed'] = '<object width="425" height="355"><param name="movie" value="http://www.youtube.com/v/%s&hl=en"></param><embed src="http://www.youtube.com/v/%s&hl=en" type="application/x-shockwave-flash" width="425" height="355"></embed></object>' % (info['id'], info['id'])
+ return info
+
+def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None):
+ query = quote(query)
+ url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
+ data = readUrlUnicode(url)
+ fd = feedparser.parse(data)
+ videos = []
+ for entry in fd.entries:
+ v = getInfoFromAtom(entry, video_url_base)
+ videos.append(v)
+ if len(videos) >= max_results:
+ return videos
+ return videos
+
+'''
+def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None):
+ url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query)
+ data = readUrlUnicode(url)
+ regx = re.compile(' ')
+ id_title = regx.findall(data)
+ data_flat = data.replace('\n', ' ')
+ videos = {}
+ for video in id_title:
+ vid = video[0]
+ if vid not in videos:
+ v = dict()
+ v['id'] = vid
+ v['link'] = "http//youtube.com/watch.v=%s" % v['id']
+ v['title'] = video[2].strip()
+ if video_url_base:
+ v['video_link'] = "%s/%s" % (video_url_base, v['id'])
+ else:
+ v['video_url'] = getVideoUrl(v['id'])
+ v['description'] = findRe(data, 'BeginvidDesc%s">(.*?)' % v['id']).strip().replace('', ' ').replace('', '')
+ v['thumbnail'] = video[1]
+ videos[vid] = v
+ if len(videos) >= max_results:
+ return videos.values()
+ return videos.values()
+'''
+