diff --git a/README b/README index 1fabf7b..acdfb8c 100644 --- a/README +++ b/README @@ -1,22 +1,37 @@ -python-oxlib some tools to build tools +python-ox some tools to build tools Depends: python2.5 python-chardet (http://chardet.feedparser.org/) + python-feedparser (http://www.feedparser.org/) + python-beautifulsoup (http://www.crummy.com/software/BeautifulSoup/) Usage: - import oxlib + import ox - data = oxlib.cache.readUrl('http:/...') - text = oxlib.stripTags(data) - oxlib.normalizeNewlines(text) - oxlib.formatBytes(len(data)) + data = ox.cache.readUrl('http:/...') + text = ox.stripTags(data) + ox.normalizeNewlines(text) + ox.formatBytes(len(data)) - oxlib.formatBytes(1234567890) + ox.formatBytes(1234567890) '1.15 GB' + import ox.web.imdb + imdbId = ox.web.imdb.guess('The Matrix') + info = ox.web.imdb.Imdb(imdbId) + info['year'] + 1999 + Install: python setup.py install +Cookies: + some ox.web modules require user accont information or cookies to work, + those are saved in ~/.ox/auth.json, most basic form looks like this: + { + "key": "value" + } + Tests: - nosetests --with-doctest oxlib + nosetests --with-doctest ox diff --git a/ox/__init__.py b/ox/__init__.py index 7961627..4004411 100644 --- a/ox/__init__.py +++ b/ox/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 # GPL 2008 -__version__ = '1.0.0' +__version__ = '2.0.0' from file import * from format import * diff --git a/ox/web/__init__.py b/ox/web/__init__.py new file mode 100644 index 0000000..1b94046 --- /dev/null +++ b/ox/web/__init__.py @@ -0,0 +1,9 @@ +# vi:si:et:sw=4:sts=4:ts=4 +# encoding: utf-8 +__version__ = '1.0.0' + +import imdb +import wikipedia +import google +import piratecinema +import oxdb diff --git a/ox/web/aaaarg.py b/ox/web/aaaarg.py new file mode 100644 index 0000000..43f9d55 --- /dev/null +++ b/ox/web/aaaarg.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +import re +import os +import string + +from ox import cache +from ox.html import stripTags, decodeHtml +from ox.text import findRe +from ox.normalize import canonicalName +import auth + + +def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None): + headers = headers.copy() + headers["Cookie"] = auth.get("aaaarg.cookie") + return cache.readUrl(url, data, headers, timeout) + +def readUrlUnicode(url, timeout=cache.cache_timeout): + return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout) + +def downloadText(id, filename=None): + #FIXME, what about the cache, this keeps all pdfs in oxcache... + url='http://a.aaaarg.org/node/%d/download' % id + data = readUrl(url, timeout=-1) + headers = cache.getHeaders(url, timeout=-1) + if filename: + with open(filename, "w") as f: + f.write(data) + return + return data + +def getTextByLetter(letter): + texts = [] + url = 'http://a.aaaarg.org/library/%s' % letter + data = readUrlUnicode(url) + txts = re.compile('
(.*?)
(.*?)
  • ').findall(data) + author = 'Unknown Author' + for r in txts: + if r[0] != ' ': + author = r[0] + link = r[1] + id = findRe(link, '/(\d+)') + title = decodeHtml(r[2]) + author_foder = canonicalName(author) + author_foder = os.path.join(author_foder[0], author_foder) + filename = os.path.join(author_foder, '%s (aaarg %s).pdf' % (title.replace('/', '_'), id)) + texts.append({ + 'author': author, + 'title': title, + 'id': id, + 'filename': filename, + }) + return texts + +def getTexts(): + texts = [] + for letter in string.letters[:26]: + texts += getTextByLetter(letter) + return texts + diff --git a/ox/web/allmovie.py b/ox/web/allmovie.py new file mode 100644 index 0000000..b189645 --- /dev/null +++ b/ox/web/allmovie.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +import re +import time + +from ox import stripTags, findRe +from ox.cache import readUrlUnicode + + +def getId(url): + return url.split("/")[-2] + +def getData(id): + ''' + >>> getData('129689')['cast'][1][1] + u'Marianne' + >>> getData('129689')['credits'][0][0] + u'Jean-Luc Godard' + >>> getData('129689')['posters'][0] + u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg' + >>> getData('129689')['rating'] + u'4.5' + ''' + data = { + "url": getUrl(id) + } + html = readUrlUnicode(data["url"]) + data['aka'] = parseList(html, 'AKA') + data['category'] = findRe(html, 'http://allmovie.com/explore/category/.*?">(.*?)') + data['countries'] = parseList(html, 'Countries') + data['director'] = parseEntry(html, 'Director') + data['genres'] = parseList(html, 'Genres') + data['keywords'] = parseList(html, 'Keywords') + data['posters'] = [findRe(html, '(\d+) min.') + data['set'] = parseEntry(html, 'Set In') + data['synopsis'] = parseText(html, 'Plot Synopsis') + data['themes'] = parseList(html, 'Themes') + data['types'] = parseList(html, 'Types') + data['year'] = findRe(html, '"http://allmovie.com/explore/year/(.*?)"') + html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id) + data['cast'] = parseTable(html) + html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id) + data['credits'] = parseTable(html) + html = readUrlUnicode("http://allmovie.com/work/%s/review" % id) + data['review'] = parseText(html, 'Review') + return data + +def getUrl(id): + return "http://allmovie.com/work/%s/" % id + +def parseEntry(html, title): + return stripTags(findRe(html, '%s(.*?)' % title)).strip() + +def parseList(html, title): + html = findRe(html, '%s(.*?)' % title) + return map(lambda x: stripTags(x), re.compile('
(.*?)
  • ', re.DOTALL).findall(html)) + +def parseTable(html): + return map( + lambda x: map( + lambda x: stripTags(x).strip().replace(' ', ''), + x.split('-') + ), + findRe(html, '
    (.*?)').split('')[:-1] + ) + +def parseText(html, title): + return stripTags(findRe(html, '%s.*?

    (.*?)' % title)).strip() + +if __name__ == '__main__': + print getData('129689') + # print getData('177524') + diff --git a/ox/web/auth.py b/ox/web/auth.py new file mode 100644 index 0000000..fdb283d --- /dev/null +++ b/ox/web/auth.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +# GPL 2009 +import os +import simplejson + + +def get(key): + user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json')) + auth = {} + if os.path.exists(user_auth): + f = open(user_auth, "r") + data = f.read() + f.close() + auth = simplejson.loads(data) + if key in auth: + return auth[key] + print "please add key %s to json file '%s'" % (key, user_auth) + return "" + diff --git a/ox/web/criterion.py b/ox/web/criterion.py new file mode 100644 index 0000000..c204360 --- /dev/null +++ b/ox/web/criterion.py @@ -0,0 +1,90 @@ +# -*- coding: UTF-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +import re + +import ox.cache +from ox.cache import readUrlUnicode +from ox.html import stripTags +from ox.text import findRe, removeSpecialCharacters + +import imdb + +def getId(url): + return url.split("/")[-1] + +def getUrl(id): + return "http://www.criterion.com/films/%s" % id + +def getData(id): + ''' + >>> getData('1333')['imdbId'] + '0060304' + + >>> getData('236')['posters'][0] + 'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg' + + >>> getData('786')['posters'][0] + 'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg' + ''' + data = { + "url": getUrl(id) + } + try: + html = readUrlUnicode(data["url"]) + except: + html = ox.cache.getUrl(data["url"]) + data["number"] = findRe(html, "

    (.*?)

    ") + data["title"] = findRe(html, "

    (.*?)

    ") + data["director"] = findRe(html, "

    (.*?)

    ") + results = re.compile("

    (.*?)

    ").findall(html) + data["country"] = results[0] + data["year"] = results[1] + result = findRe(html, "
    (.*?)
    ") + data["synopsis"] = findRe(result, "

    (.*?)

    ") + result = findRe(html, "
    (.*?)
    ") + if 'Blu-Ray' in result or 'Essential Art House DVD' in result: + result = re.compile("
    (.*?)
    ", re.DOTALL).findall(html)[1] + result = findRe(result, "") + if not "/boxsets/" in result: + data["posters"] = [result] + else: + html_ = readUrlUnicode(result) + result = findRe(html_, "(.*?)" % id) + result = findRe(result, "src=\"(.*?)\"") + data["posters"] = [result.replace("_w100", "")] + result = findRe(html, "\"Film>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?key')[0] + 'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv' + + >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?key')[0] + 'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv' + ''' + data = readUrl(url) + video = re.compile('''video", "(.*?)"''').findall(data) + for v in video: + v = unquote(v).split('@@')[0] + return "http://www.dailymotion.com" + v + return '' + diff --git a/ox/web/epguides.py b/ox/web/epguides.py new file mode 100644 index 0000000..d4ad1aa --- /dev/null +++ b/ox/web/epguides.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +import re +import time + +from ox import stripTags, findRe +from ox.cache import readUrlUnicode + +import google + + +def getShowUrl(title): + ''' + Search Epguide Url for Show via Show Title. + Use Google to search the url, this is also done on Epguide. + ''' + for (name, url, desc) in google.find('allintitle: site:epguides.com %s' % title, 1): + if url.startswith('http://epguides.com'): + if re.search(title, name): + return url + return None + +def getShowData(url): + data = readUrlUnicode(url) + r = {} + r['title'] = stripTags(findRe(data, '

    (.*?)

    ')) + r['imdb'] = findRe(data, '

    .*?

    ') + r['episodes'] = {} + #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear + for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) (.*?)').findall(data): + air_date = episode[3].strip() + #'22 Sep 04' -> 2004-09-22 + try: + air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y')) + except: + pass + s = episode[1].split('-')[0].strip() + e = episode[1].split('-')[-1].strip() + try: + r['episodes']['S%02dE%02d' % (int(s), int(e))] = { + 'prod code': episode[2], + 'air date': air_date, + 'url': episode[4], + 'title':episode[5], + } + except: + print "oxweb.epguides failed,", url + return r + diff --git a/ox/web/google.py b/ox/web/google.py new file mode 100644 index 0000000..a980b2e --- /dev/null +++ b/ox/web/google.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +import re +import time +import urllib +import urllib2 +import weakref +import threading +import Queue +import simplejson + + +import ox +from ox import stripTags + + +''' +usage: +import google +google.find(query) + +for result in google.find(query): result + +result is title, url, description + +google.find(query, max_results) + +FIXME: how search depper than first page? +''' +DEFAULT_MAX_RESULTS = 10 +DEFAULT_TIMEOUT = 24*60*60 + +def readUrl(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT): + return ox.cache.readUrl(url, data, headers, timeout) + +def quote_plus(s): + return urllib.quote_plus(s.encode('utf-8')) + +def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT): + url = "http://www.google.com/search?q=%s" % quote_plus(query) + data = readUrl(url, timeout=timeout) + link_re = r'(?P.*?)' + \ + r'.*?(?:
    |)' + \ + r'(?P.*?)' + '(?:| max_results: + results = results[:max_results] + return results + +def _find(query): + url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=%s' % quote_plus(query) + results = simplejson.loads(ox.cache.readUrlUnicode(url))['responseData']['results'] + return results + diff --git a/ox/web/imdb.py b/ox/web/imdb.py new file mode 100644 index 0000000..2e79ac9 --- /dev/null +++ b/ox/web/imdb.py @@ -0,0 +1,210 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +import urllib2 +from urllib import quote, unquote +import re +import os +import time + +import ox +from ox import findRe +from ox.normalize import normalizeTitle, normalizeImdbId + +from siteparser import SiteParser +import google + +class Imdb(SiteParser): + regex = { + 'cast': { + 'page': 'combined', + 're': '.*?>(.*?).*?(.*?)', + 'type': 'list' + }, + 'cinematographers': { + 'page': 'combined', + 're': [ + 'Cinematography by(.*?)', + '(.*?)' + ], + 'type': 'list' + }, + 'connections': { + 'page': 'movieconnections', + 're': '
    (.*?)
    (.*?)\n\n', + 'type': 'list' + }, + 'countries': { + 'page': 'combined', + 're': '(.*?)', + 'type': 'list' + }, + 'directors': { + 'page': 'combined', + 're': [ + 'Directed by(.*?)', + '(.*?)' + ], + 'type': 'list' + }, + 'editors': { + 'page': 'combined', + 're': [ + 'Film Editing by(.*?)', + '(.*?)' + ], + 'type': 'list' + }, + 'filming_locations': { + 'page': 'locations', + 're': '(.*?)', + 'type': 'list' + }, + 'genres': { + 'page': 'combined', + 're': '(.*?)', + 'type': 'list' + }, + 'keywords': { + 'page': 'keywords', + 're': '(.*?)', + 'type': 'list' + }, + 'languages': { + 'page': 'combined', + 're': '(.*?)', + 'type': 'list' + }, + 'plot': { + 'page': 'plotsummary', + 're': '

    (.*?)', + 'type': 'string' + }, + 'poster_id': { + 'page': 'combined', + 're': '/primary-photo/media/rm(.*?)/tt', + 'type': 'list' + }, + 'poster_ids': { + 'page': 'posters', + 're': '/unknown-thumbnail/media/rm(.*?)/tt', + 'type': 'list' + }, + 'producers': { + 'page': 'combined', + 're': [ + 'Produced by(.*?)', + '(.*?)' + ], + 'type': 'list' + }, + 'rating': { + 'page': 'combined', + 're': '

    .*?(.*?)/10', + 'type': 'float' + }, + 'release_date': { + 'page': 'releaseinfo', + 're': '.*? ', + 'type': 'date' + }, + 'runtime': { + 'page': 'combined', + 're': '
    Runtime:
    .*?([0-9]+ sec|[0-9]+ min).*?
    ', + 'type': 'string' + }, + 'title': { + 'page': 'combined', + 're': '

    (.*?) ', + 'type': 'string' + }, + 'trivia': { + 'page': 'trivia', + 're': '
    (.*?)
    ', + 'type': 'list', + }, + 'votes': { + 'page': 'combined', + 're': '
    (.*?) votes', + 'type': 'string' + }, + 'writers': { + 'page': 'combined', + 're': [ + 'Writing credits(.*?)', + '(.*?)' + ], + 'type': 'list' + }, + 'year': { + 'page': 'combined', + 're': '', + 'type': 'int' + } + } + + def __init__(self, id): + self.baseUrl = "http://www.imdb.com/title/tt%s/" % id + super(Imdb, self).__init__() + + if 'runtime' in self: + if 'min' in self['runtime']: base=60 + else: base=1 + self['runtime'] = int(findRe(self['runtime'], '([0-9]+)')) * base + + if 'connections' in self: + cc={} + for rel, data in self['connections']: + cc[unicode(rel)] = re.compile('').findall(data) + self['connections'] = cc + +def guess(title, director='', timeout=google.DEFAULT_TIMEOUT): + #FIXME: proper file -> title + title = title.split('-')[0] + title = title.split('(')[0] + title = title.split('.')[0] + title = title.strip() + imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8')) + return_url = '' + + #lest first try google + #i.e. site:imdb.com Michael Stevens Sin + if director: + search = 'site:imdb.com %s "%s"' % (director, title) + else: + search = 'site:imdb.com "%s"' % title + for (name, url, desc) in google.find(search, 2, timeout=timeout): + if url.startswith('http://www.imdb.com/title/tt'): + return normalizeImdbId(int(ox.intValue(url))) + + try: + req = urllib2.Request(imdb_url, None, ox.net.DEFAULT_HEADERS) + u = urllib2.urlopen(req) + data = u.read() + return_url = u.url + u.close() + except: + return None + if return_url.startswith('http://www.imdb.com/title/tt'): + return return_url[28:35] + if data: + imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?
    1. .*?>> getData('1991/silence_of_the_lambs')['imdbId'] + u'0102926' + + >>> getData('1991/silence_of_the_lambs')['posters'][0] + u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1_xlg.jpg' + + >>> getData('1991/silence_of_the_lambs')['url'] + u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html' + ''' + data = { + 'url': getUrl(id) + } + html = readUrlUnicode(data['url']) + data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ') + data['title'] = stripTags(findRe(html, '

      (.*?) \(')) + data['year'] = findRe(html, '\((.*?)\)') + data['posters'] = [] + results = re.compile('')) + 1 + for page in range(pages, 0, -1): + for id in getIdsByPage(page): + if not id in ids: + ids.append(id) + return ids + +def getIdsByPage(page): + ids = [] + html = readUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1) + results = re.compile('', re.DOTALL).findall(html) + for result in results: + url = 'http://impawards.com/%s' % result + ids.append(getId(url)) + return set(ids) + +def getUrl(id): + url = "http://www.impawards.com/%s.html" % id + html = readUrlUnicode(url) + if findRe(html, "No Movie Posters on This Page"): + url = "http://www.impawards.com/%s_ver1.html" % id + return url + +if __name__ == '__main__': + ids = getIds() + print sorted(ids), len(ids) diff --git a/ox/web/itunes.py b/ox/web/itunes.py new file mode 100644 index 0000000..5348e40 --- /dev/null +++ b/ox/web/itunes.py @@ -0,0 +1,187 @@ +# vi:si:et:sw=4:sts=4:ts=4 +# encoding: utf-8 +import re +import urllib + +from ox.cache import readUrl +from ox.html import decodeHtml, stripTags +from ox.text import findRe +from ox.text import findString + + +# to sniff itunes traffic, use something like +# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net + +# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=music&songTerm=&genreIndex=1&flavor=0&mediaType=2&composerTerm=&allArtistNames=Arcadia&ringtone=0&searchButton=submit +# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=movie&movieTerm=The%20Matrix&descriptionTerm=&ratingIndex=1&mediaType=3&directorProducerName=Andy%20Wachowski&flavor=0&releaseYearTerm=1999&closedCaption=0&actorTerm=&searchButton=submit + +ITUNES_HEADERS = { + 'X-Apple-Tz': '0', + 'X-Apple-Storefront': '143441-1', + 'User-Agent': 'iTunes/7.6.2 (Macintosh; U; Intel Mac OS X 10.5.2)', + 'Accept-Language': 'en-us, en;q=0.50', + 'Accept-Encoding': 'gzip', + 'Connection': 'close', +} + +def composeUrl(request, parameters): + if request == 'advancedSearch': + url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?' 
+ if parameters['media'] == 'music': + url += urllib.urlencode({ + 'albumTerm': parameters['title'], + 'allArtistNames': parameters['artist'], + 'composerTerm': '', + 'flavor': 0, + 'genreIndex': 1, + 'media': 'music', + 'mediaType': 2, + 'ringtone': 0, + 'searchButton': 'submit', + 'songTerm': '' + }) + elif parameters['media'] == 'movie': + url += urllib.urlencode({ + 'actorTerm': '', + 'closedCaption': 0, + 'descriptionTerm': '', + 'directorProducerName': parameters['director'], + 'flavor': 0, + 'media': 'movie', + 'mediaType': 3, + 'movieTerm': parameters['title'], + 'ratingIndex': 1, + 'releaseYearTerm': '', + 'searchButton': 'submit' + }) + elif request == 'viewAlbum': + url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id'] + elif request == 'viewMovie': + url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id'] + return url + +def parseXmlDict(xml): + values = {} + strings = xml.split('') + for string in strings: + if string.find('') != -1: + key = findRe(string, '(.*?)') + type = findRe(string, '<(.*?)>') + if type == 'true/': + value = True + else: + value = findRe(string, '<%s>(.*?)' % (type, type)) + if type == 'integer': + value = int(value) + elif type == 'string': + value = decodeHtml(value) + values[key] = value + return values + +def parseCast(xml, title): + list = [] + try: + strings = findRe(xml, '%s(.*?)' % title[:-1].upper()).split('') + strings.pop() + for string in strings: + list.append(findRe(string, '(.*?)')) + return list + except: + return list + +def parseMovies(xml, title): + list = [] + try: + strings = findRe(xml, '%s(.*?)' % title[:-1].upper()).split('') + strings.pop() + for string in strings: + list.append({ + 'id': findRe(string, 'viewMovie\?id=(.*?)&'), + 'title': findRe(string, '(.*?)') + }) + return list + except: + return list + +class ItunesAlbum: + def __init__(self, id = '', title = '', artist = ''): + self.id = id + self.title = title + self.artist = artist + if not id: + self.id = self.getId() + + def getId(self): + url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist}) + xml = readUrl(url, headers = ITUNES_HEADERS) + id = findRe(xml, 'viewAlbum\?id=(.*?)&') + return id + + def getData(self): + data = {'id': self.id} + url = composeUrl('viewAlbum', {'id': self.id}) + xml = readUrl(url, None, ITUNES_HEADERS) + data['albumName'] = findRe(xml, '(.*?)') + data['artistName'] = findRe(xml, '(.*?)') + data['coverUrl'] = findRe(xml, 'reflection="." 
url="(.*?)"') + data['genre'] = findRe(xml, 'Genre:(.*?)<') + data['releaseDate'] = findRe(xml, 'Released(.*?)<') + data['review'] = stripTags(findRe(xml, 'REVIEW.*?(.*?)')) + data['tracks'] = [] + strings = findRe(xml, 'items.*?(.*?)$').split('') + for string in strings: + data['tracks'].append(parseXmlDict(string)) + data['type'] = findRe(xml, 'listType(.*?)<') + return data + +class ItunesMovie: + def __init__(self, id = '', title = '', director = ''): + self.id = id + self.title = title + self.director = director + if not id: + self.id = self.getId() + + def getId(self): + url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director}) + xml = readUrl(url, headers = ITUNES_HEADERS) + id = findRe(xml, 'viewMovie\?id=(.*?)&') + return id + + def getData(self): + data = {'id': self.id} + url = composeUrl('viewMovie', {'id': self.id}) + xml = readUrl(url, None, ITUNES_HEADERS) + f = open('/Users/rolux/Desktop/iTunesData.xml', 'w') + f.write(xml) + f.close() + data['actors'] = parseCast(xml, 'actors') + string = findRe(xml, 'Average Rating:(.*?)') + data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5 + data['directors'] = parseCast(xml, 'directors') + data['format'] = findRe(xml, 'Format:(.*?)<') + data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<')) + data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY.*?(.*?)')) + data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"') + data['producers'] = parseCast(xml, 'producers') + data['rated'] = findRe(xml, 'Rated(.*?)<') + data['relatedMovies'] = parseMovies(xml, 'related movies') + data['releaseDate'] = findRe(xml, 'Released(.*?)<') + data['runTime'] = findRe(xml, 'Run Time:(.*?)<') + data['screenwriters'] = parseCast(xml, 'screenwriters') + data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&') + data['trailerUrl'] = findRe(xml, 'autoplay="." 
url="(.*?)"') + return data + +if __name__ == '__main__': + import simplejson + data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData() + print simplejson.dumps(data, sort_keys = True, indent = 4) + data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData() + print simplejson.dumps(data, sort_keys = True, indent = 4) + for v in data['relatedMovies']: + data = ItunesMovie(id = v['id']).getData() + print simplejson.dumps(data, sort_keys = True, indent = 4) + data = ItunesMovie(id='272960052').getData() + print simplejson.dumps(data, sort_keys = True, indent = 4) + diff --git a/ox/web/karagarga.py b/ox/web/karagarga.py new file mode 100644 index 0000000..27b4d0a --- /dev/null +++ b/ox/web/karagarga.py @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +import re +from ox import cache +from ox.html import stripTags +from ox.text import findRe + +import auth + + +def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None): + headers = headers.copy() + headers["Cookie"] = auth.get("karagarga.cookie") + return cache.readUrl(url, data, headers, timeout) + +def readUrlUnicode(url, timeout=cache.cache_timeout): + return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout) + +def getData(id): + data = { + "url": getUrl(id) + } + html = readUrlUnicode("%s%s" % (data["url"], "&filelist=1")) + if 'No torrent with ID' in html: + return False + data['added'] = stripTags(parseTable(html, 'Added')) + data['country'] = findRe(html, 'title="([\w ]*?)" border="0" width="32" height="20"') + # data['description'] = parseTable(html, 'Description') + data['director'] = stripTags(parseTable(html, 'Director / Artist')) + data['files'] = [] + result = findRe(html, '(.*?)
      ') + results = re.compile('(.*?)(.*?)', re.DOTALL).findall(result) + for name, size in results: + data['files'].append({ + 'name': name, + 'size': '%s %s' % (size[:-2], size[-2:].strip().upper()) + }) + data['format'] = '' + if html.find('genreimages/dvdr.png') != -1: + data['format'] = 'DVD' + elif html.find('genreimages/hdrip.png') != -1: + data['format'] = 'HD' + data['genre'] = [] + result = parseTable(html, 'Genres') + for string in result.split('\n'): + string = stripTags(findRe(string, '
      (.*?)')) + if string: + data['genre'].append(string) + data['id'] = id + data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})') + data['language'] = stripTags(parseTable(html, 'Language')) + data['leechers'] = int(findRe(html, 'seeder\(s\), (.*?) leecher\(s\)')) + data['link'] = stripTags(parseTable(html, 'Internet Link')) + data['links'] = [] + results = re.compile('(.*?)', re.DOTALL).findall(parseTable(html, 'Description')) + for (url, title) in results: + if url.find('javascript') == -1: + data['links'].append({ + 'title': title, + 'url': url.replace('http://anonym.to/?', '') + }) + data['people'] = 0 + result = stripTags(findRe(html, '(.*?) seeder\(s\)')) + data['size'] = int(findRe(parseTable(html, 'Size'), '\((.*?) ').replace(',', '')) + data['snatched'] = int(findRe(html, '.*?colspan=2>(.*?) ')) + data['subtitle'] = findRe(parseTable(html, 'Subtitles'), '>(.*?)


      ').replace('included: ', '') + data['subtitles'] = [] + results = re.compile('(.*?)', re.DOTALL).findall(parseTable(html, 'Subtitles')) + for (url, language) in results: + data['subtitles'].append({ + 'language': language.replace('click here for ', ''), + 'url': url + }) + data['torrent'] = 'http://karagarga.net/%s' % findRe(html, '(down.php/.*?)"') + data['year'] = stripTags(parseTable(html, 'Year')) + data['title'] = stripTags(findRe(html, '

      (.*?)

      ')).strip() + data['title'] = re.sub('^%s - ' % re.escape(data['director']), '', data['title']) + data['title'] = re.sub(' \(%s\)$' % re.escape(data['year']), '', data['title']) + return data + +def getId(url): + return url.split("=")[-1] + +def getTorrent(id): + return readUrl(getData(id)['torrent']) + +def getIds(lastId = 20): + lastId = '%s' % lastId + ids = [] + page = 0 + while True: + for id in getIdsByPage(page): + if not id in ids: + ids.append(id) + if lastId in ids: + break + page += 1 + return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids)))) + +def getIdsByPage(page): + ids = [] + url = 'http://karagarga.net/browse.php?page=%s&cat=1&sort=added&d=DESC' % page + html = readUrlUnicode(url, timeout = 23*60*60) #get new ids once per day + strings = html.split('') + strings.pop(0) + for string in strings: + ids.append(findRe(string, '"details.php\?id=(.*?)"')) + return ids + +def getUrl(id): + return "http://karagarga.net/details.php?id=%s" % id + +def parseTable(html, title): + if title == 'Genres': + return findRe(html, '(.*?)
      ') + url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title) + xml = readUrl(url) + lyrics = findRe(xml, '(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com') + lyrics = lyrics.replace('\n', '').replace('\r', '') + lyrics = lyrics.replace('[br]', '\n').strip() + lyrics.replace('\n\n\n', '\n\n') + lyrics = decodeHtml(lyrics.replace('&', '&')) + return lyrics + +if __name__ == '__main__': + print getLyrics('Election Day', 'Arcadia') diff --git a/ox/web/metacritic.py b/ox/web/metacritic.py new file mode 100644 index 0000000..34e20a3 --- /dev/null +++ b/ox/web/metacritic.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +import re +from urllib import quote + +from ox.cache import readUrl, readUrlUnicode +from ox import findRe, decodeHtml, stripTags + + +def getMetacriticShowUrl(title): + title = quote(title) + url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title + data = readUrl(url) + return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?') + +def getData(title, url=None): + if not url: + url = getMetacriticShowUrl(title) + if not url: + return None + data = readUrlUnicode(url) + score = findRe(data, 'ALT="Metascore: (.*?)"') + if score: + score = int(score) + else: + score = -1 + + reviews = re.compile( + '
      (.*?)
      ' + '.*?(.*?)' + '.*?(.*?)
      ' + '.*?
      (.*?)
      ' + '.*?(.*?)(.*?)(.*?).*?.*?''' + for row in re.compile(regexp, re.DOTALL).findall(data): + torrentDate = row[0] + torrentExtra = row[1] + torrentId = row[2] + torrentTitle = decodeHtml(row[3]).strip() + torrentLink = "http://www.mininova.org/tor/" + torrentId + privateTracker = 'priv.gif' in torrentExtra + if not privateTracker: + results.append((torrentTitle, torrentLink, '')) + return results + +def findMovie(query, max_results=10): + '''search for torrents on mininova + ''' + url = "http://www.mininova.org/search/%s/seeds" % quote(query) + data = readUrlUnicode(url) + return _parseResultsPage(data, max_results) + +def findMovieByImdb(imdbId): + '''find torrents on mininova for a given imdb id + ''' + results = [] + imdbId = normalizeImdbId(imdbId) + data = readUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId) + return _parseResultsPage(data) + +def getId(mininovaId): + mininovaId = unicode(mininovaId) + d = findRe(mininovaId, "/(\d+)") + if d: + return d + mininovaId = mininovaId.split('/') + if len(mininovaId) == 1: + return mininovaId[0] + else: + return mininovaId[-1] + +def exists(mininovaId): + mininovaId = getId(mininovaId) + data = ox.net.readUrl("http://www.mininova.org/tor/%s" % mininovaId) + if not data or 'Torrent not found...' in data: + return False + if 'tracker of this torrent requires registration.' in data: + return False + return True + +def getData(mininovaId): + _key_map = { + 'by': u'uploader', + } + mininovaId = getId(mininovaId) + torrent = dict() + torrent[u'id'] = mininovaId + torrent[u'domain'] = 'mininova.org' + torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId + torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId + torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId + + data = readUrlUnicode(torrent['comment_link']) + readUrlUnicode(torrent['details_link']) + if '

      Torrent not found...

      ' in data: + return None + + for d in re.compile('

      .(.*?):(.*?)

      ', re.DOTALL).findall(data): + key = d[0].lower().strip() + key = _key_map.get(key, key) + value = decodeHtml(stripTags(d[1].strip())) + torrent[key] = value + + torrent[u'title'] = findRe(data, '(.*?):.*?') + torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})') + torrent[u'description'] = findRe(data, '
      (.*?)
      ') + if torrent['description']: + torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip() + t = readUrl(torrent[u'torrent_link']) + torrent[u'torrent_info'] = getTorrentInfo(t) + return torrent + +class Mininova(Torrent): + ''' + >>> Mininova('123') + {} + >>> Mininova('1072195')['infohash'] + '72dfa59d2338e4a48c78cec9de25964cddb64104' + ''' + def __init__(self, mininovaId): + self.data = getData(mininovaId) + if not self.data: + return + Torrent.__init__(self) + ratio = self.data['share ratio'].split(',') + self['seeder'] = -1 + self['leecher'] = -1 + if len(ratio) == 2: + val = intValue(ratio[0].replace(',','').strip()) + if val: + self['seeder'] = int(val) + val = intValue(ratio[1].replace(',','').strip()) + if val: + self['leecher'] = int(val) + val = intValue(self.data['downloads'].replace(',','').strip()) + if val: + self['downloaded'] = int(val) + else: + self['downloaded'] = -1 + published = self.data['added on'] + published = published.split(' +')[0] + self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S") + diff --git a/ox/web/movieposterdb.py b/ox/web/movieposterdb.py new file mode 100644 index 0000000..0068123 --- /dev/null +++ b/ox/web/movieposterdb.py @@ -0,0 +1,44 @@ +# -*- coding: UTF-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 + +import re + +from ox.cache import readUrlUnicode +from ox import findRe + +def getData(id): + ''' + >>> getData('0060304')['posters'][0] + u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg' + >>> getData('0123456')['posters'] + [] + ''' + data = { + "url": getUrl(id) + } + data["posters"] = getPostersByUrl(data["url"]) + return data + +def getId(url): + return url.split("/")[-2] + +def getPostersByUrl(url, group=True): + posters = [] + html = readUrlUnicode(url) + if url in html: + if group: + results = re.compile('', re.DOTALL).findall(html) + for result in results: + posters += getPostersByUrl(result, False) + results = re.compile('', re.DOTALL).findall(html) + for result in results: + html = readUrlUnicode(result) + posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"')) + return posters + +def getUrl(id): + return "http://www.movieposterdb.com/movie/%s/" % id + +if __name__ == '__main__': + print getData('0060304') + print getData('0133093') diff --git a/ox/web/opensubtitles.py b/ox/web/opensubtitles.py new file mode 100644 index 0000000..2986f7d --- /dev/null +++ b/ox/web/opensubtitles.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +import re + +import feedparser +from ox.cache import readUrl, readUrlUnicode +from ox import findRe, stripTags +from ox import langCode2To3, langTo3Code + +def findSubtitlesByImdb(imdb, parts = 1, language = "eng"): + if len(language) == 2: + language = langCode2To3(language) + elif len(language) != 3: + language = langTo3Code(language) + url = "http://www.opensubtitles.org/en/search/" + if language: + url += "sublanguageid-%s/" % language + url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb) + data = readUrl(url) + if "title>opensubtitles.com - search results(.*?)

    ') + if '(' in r['title']: + r['year'] = findRe(r['title'], '\((\d*?)\)') + r['title'] = re.sub('\((\d*?)\)', '', r['title']).strip() + r['synopsis'] = findRe(data, '(.*?)') + r['average rating'] = findRe(data, '
    (.*?)
    ').strip() + return r + diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py new file mode 100644 index 0000000..ce5ee33 --- /dev/null +++ b/ox/web/siteparser.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +import re +from datetime import datetime + +from ox.cache import readUrlUnicode +from ox import stripTags, decodeHtml + + +def cleanup(key, data, data_type): + if data: + if isinstance(data[0], basestring): + #FIXME: some types need stripTags + #data = [stripTags(decodeHtml(p)).strip() for p in data] + data = [decodeHtml(p).strip() for p in data] + elif isinstance(data[0], list) or isinstance(data[0], tuple): + data = [cleanup(key, p, data_type) for p in data] + while len(data) == 1: + data = data[0] + if data_type == 'list' and isinstance(data, basestring): + data = [data, ] + elif data_type != 'list': + data = '' + return data + +class SiteParser(dict): + baseUrl = '' + regex = {} + + def getUrl(self, page): + return "%s%s" % (self.baseUrl, page) + + def __init__(self): + for key in self.regex: + url = self.getUrl(self.regex[key]['page']) + data = readUrlUnicode(url) + if isinstance(self.regex[key]['re'], basestring): + data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data) + data = cleanup(key, data, self.regex[key]['type']) + else: + for r in self.regex[key]['re']: + if isinstance(data, basestring): + data = re.compile(r, re.DOTALL).findall(data) + else: + data = [re.compile(r, re.DOTALL).findall(d) for d in data] + data = cleanup(key, data, self.regex[key]['type']) + def apply_f(f, data): + if data and isinstance(data[0], list): + data = [f(d) for d in data] + else: + data = f(data) + return data + if self.regex[key]['type'] == 'float': + data = apply_f(float, data) + elif self.regex[key]['type'] == 'int': + data = apply_f(int, data) + elif self.regex[key]['type'] == 'date': + parse_date = lambda d: datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d') + data = apply_f(parse_date, data) + self[key] = data + diff --git a/ox/web/spiegel.py b/ox/web/spiegel.py new file mode 100644 index 0000000..855ec47 --- /dev/null +++ b/ox/web/spiegel.py @@ -0,0 +1,292 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +from datetime import datetime +import re +import time + +import ox.cache +from ox.html import decodeHtml, stripTags +import ox.net + + +def getNews(year, month, day): + sections = [ + 'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt', + 'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto' + ] + dt = datetime(year, month, day) + day = int(dt.strftime('%j')) + date = dt.strftime('%d.%m.%Y') + news = [] + for section in sections: + url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day) + if date == time.strftime('%d.%m.%Y', time.localtime()): + html = ox.net.readUrl(url) + else: + html = ox.cache.readUrl(url) + for item in re.compile('
    (.*?)
    ', re.DOTALL).findall(item)[0]).strip() + try: + description = formatString(re.compile('

    (.*?)<', re.DOTALL).findall(item)[0]) + except: + description = '' + try: + imageUrl = re.compile('(.*?)', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf') + if new['title1'][-1:] == ':': + new['title1'] = new['title1'][0:-1] + new['title2'] = new['title'][len(new['title1']) + 2:] + new['url'] = re.compile(' ', '') + string = string.replace('\n', ' ').replace(' ', ' ').strip() + string = string.replace('&', '&').replace(''', '\'').replace('"', '"') + return string + +def formatSection(string): + return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL') + +def formatSubsection(string): + # SPIEGEL, SPIEGEL special + subsection = { + 'abi': 'Abi - und dann?', + 'formel1': 'Formel 1', + 'jobundberuf': 'Job & Beruf', + 'leben': 'Leben U21', + 'mensch': 'Mensch & Technik', + 'sonst': '', + 'staedte': u'St\xc3dte', + 'ussports': 'US-Sports', + 'wunderbar': 'wunderBAR' + } + if subsection.has_key(string): + return subsection[string].replace(u'\xc3', 'ae') + return string[:1].upper() + string[1:] + +def getIssue(year, week): + coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week) + if not ox.net.exists(coverUrl): + return None + url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week) + contents = [] + data = ox.cache.readUrl(url) + items = re.compile('(.*?)').findall(data) + for item in items: + item = item[1] + page = int(re.compile('&SE=(.*?)"').findall(item)[0]) + title = stripTags(item).strip() + contents.append({'title': title, 'page': page}) + pageUrl = {} + pages = page + 2 + for page in range(1, pages + 10): + url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page) + if ox.cache.exists(url): + pageUrl[page] = url + else: + pageUrl[page] = '' + return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl} + + +def archiveIssues(): + ''' + this is just an example of an archiving application + ''' + p = {} + import os + import simplejson + import time + archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel' + localtime = time.localtime() + year = int(time.strftime('%Y', localtime)) + week = int(time.strftime('%W', localtime)) + for y in range(year, 1993, -1): + if y == year: + wMax = week + 1 + else: + wMax = 53 + for w in range(wMax, 0, -1): + print 'getIssue(%d, %d)' % (y, w) + issue = getIssue(y, w) + if issue: + dirname = '%s/%d/%02d' % (archivePath, y, w) + if not os.path.exists(dirname): + os.makedirs(dirname) + filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w) + if not os.path.exists(filename): + data = simplejson.dumps(issue, ensure_ascii = False) + f = open(filename, 'w') + f.write(data) + f.close() + filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w) + if not os.path.exists(filename): + data = [] + for item in issue['contents']: + data.append('%3d %s' % (item['page'], item['title'])) + data = '\n'.join(data) + f = open(filename, 'w') + f.write(data) + f.close() + filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w) + if not os.path.exists(filename): + data = ox.cache.readUrl(issue['coverUrl']) + f = open(filename, 'w') + f.write(data) + f.close() + for page in issue['pageUrl']: + url = issue['pageUrl'][page] + if url: + filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page) + if not os.path.exists(filename): + data = ox.cache.readUrl(url) + f = open(filename, 'w') + f.write(data) + f.close() + if not p: + p = {'num': 1, 
'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']} + else: + p['num'] += 1 + p['sum'] += issue['pages'] + if issue['pages'] < p['min']: + p['min'] = issue['pages'] + if issue['pages'] > p['max']: + p['max'] = issue['pages'] + print p['min'], p['sum'] / p['num'], p['max'] + + +def archiveNews(): + ''' + this is just an example of an archiving application + ''' + import os + import simplejson + import time + + count = {} + colon = [] + + archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online' + days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] + localtime = time.localtime() + year = int(time.strftime('%Y', localtime)) + month = int(time.strftime('%m', localtime)) + day = int(time.strftime('%d', localtime)) - 1 + for y in range(year, 1999, -1): + if y == year: + mMax = month + else: + mMax = 12 + for m in range(mMax, 0, -1): + if y == year and m == month: + dMax = day + elif m == 2 and y % 4 == 0 and y % 400 != 0: + dMax = days[m] + 1 + else: + dMax = days[m] + for d in range(dMax, 0, -1): + print 'getNews(%d, %d, %d)' % (y, m, d) + news = getNews(y, m ,d) + for new in news: + dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16] + if not os.path.exists(dirname): + os.makedirs(dirname) + if new['url'][-5:] == '.html': + filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json' + else: + filename = dirname + '/' + new['url'] + '.json' + if not os.path.exists(filename) or True: + data = simplejson.dumps(new, ensure_ascii = False) + f = open(filename, 'w') + f.write(data) + f.close() + filename = filename[:-5] + '.txt' + if not os.path.exists(filename) or True: + data = splitTitle(new['title']) + data.append(new['description']) + data = '\n'.join(data) + f = open(filename, 'w') + f.write(data) + f.close() + filename = dirname + '/' + new['imageUrl'].split('/')[-1] + if not os.path.exists(filename): + data = ox.cache.readUrl(new['imageUrl']) + f = open(filename, 'w') + f.write(data) + f.close() + + strings = new['url'].split('/') + string = strings[3] + if len(strings) == 6: + string += '/' + strings[4] + if not count.has_key(string): + count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))} + else: + count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])} + strings = splitTitle(new['title']) + if strings[0] != new['title1'] or strings[1] != new['title2']: + colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2'])) + for key in sortDictByKey(count): + print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string']) + for value in colon: + print value + +def sortDictByKey(d): + keys = d.keys() + keys.sort() + return keys + +if __name__ == '__main__': + # spiegel = Spiegel(2008, 8) + # print spiegel.getContents() + # news = News(2001, 9, 10) + # output(news.getNews()) + ''' + x = [] + for d in range(10, 30): + print '2/%d' % d + news = getNews(2008, 2, d) + for new in news: + strings = new['url'].split('/') + string = formatSection(strings[3]) + if len(strings) == 6: + string += '/' + formatSubsection(strings[4]) + if not string in x: + x.append(string) + print x + ''' + # archiveIssues() + archiveNews() diff --git a/ox/web/thepiratebay.py b/ox/web/thepiratebay.py new file mode 100644 index 0000000..4202a4d --- /dev/null 
+++ b/ox/web/thepiratebay.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +from datetime import datetime +import re +import socket +from urllib import quote, urlencode +from urllib2 import URLError + +from ox.cache import readUrl, readUrlUnicode +from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines +from ox.normalize import normalizeImdbId +import ox + +from torrent import Torrent + +cache_timeout = 24*60*60 # cache search only for 24 hours + +season_episode = re.compile("S..E..", re.IGNORECASE) + + +def _readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None): + headers = headers.copy() + headers['Cookie'] = 'language=en_EN' + return cache.readUrl(url, data, headers, timeout) + +def _readUrlUnicode(url, timeout=cache.cache_timeout): + return cache.readUrlUnicode(url, _readUrl=_readUrl, timeout=timeout) + +def findMovies(query, max_results=10): + results = [] + next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ] + page_count = 1 + while next and page_count < 4: + page_count += 1 + url = next[0] + if not url.startswith('http'): + if not url.startswith('/'): + url = "/" + url + url = "http://thepiratebay.org" + url + data = _readUrlUnicode(url, timeout=cache_timeout) + regexp = '''(.*?).*?''' + for row in re.compile(regexp, re.DOTALL).findall(data): + torrentType = row[0] + torrentLink = "http://thepiratebay.org" + row[1] + torrentTitle = decodeHtml(row[2]) + # 201 = Movies , 202 = Movie DVDR, 205 TV Shows + if torrentType in ['201']: + results.append((torrentTitle, torrentLink, '')) + if len(results) >= max_results: + return results + next = re.compile('.*?next.gif.*?').findall(data) + return results + +def findMovieByImdb(imdb): + return findMovies("tt" + normalizeImdbId(imdb)) + +def getId(piratebayId): + if piratebayId.startswith('http://torrents.thepiratebay.org/'): + piratebayId = piratebayId.split('org/')[1] + d = findRe(piratebayId, "tor/(\d+)") + if d: + piratebayId = d + d = findRe(piratebayId, "torrent/(\d+)") + if d: + piratebayId = d + return piratebayId + +def exists(piratebayId): + piratebayId = getId(piratebayId) + return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId) + +def getData(piratebayId): + _key_map = { + 'spoken language(s)': u'language', + 'texted language(s)': u'subtitle language', + 'by': u'uploader', + 'leechers': 'leecher', + 'seeders': 'seeder', + } + piratebayId = getId(piratebayId) + torrent = dict() + torrent[u'id'] = piratebayId + torrent[u'domain'] = 'thepiratebay.org' + torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId + + data = _readUrlUnicode(torrent['comment_link']) + torrent[u'title'] = findRe(data, '(.*?) \(download torrent\) - TPB') + if not torrent[u'title']: + return None + torrent[u'title'] = decodeHtml(torrent[u'title']).strip() + torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})') + title = quote(torrent['title'].encode('utf-8')) + torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title) + for d in re.compile('dt>(.*?):.*?(.*?)', re.DOTALL).findall(data): + key = d[0].lower().strip() + key = _key_map.get(key, key) + value = decodeHtml(stripTags(d[1].strip())) + torrent[key] = value + torrent[u'description'] = findRe(data, '

    (.*?)
    ') + if torrent[u'description']: + torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip() + t = _readUrl(torrent[u'torrent_link']) + torrent[u'torrent_info'] = getTorrentInfo(t) + return torrent + +class Thepiratebay(Torrent): + ''' + >>> Thepiratebay('123') + {} + + >>> Thepiratebay('3951349')['infohash'] + '4e84415d36ed7b54066160c05a0b0f061898d12b' + ''' + def __init__(self, piratebayId): + self.data = getData(piratebayId) + if not self.data: + return + Torrent.__init__(self) + published = self.data['uploaded'] + published = published.replace(' GMT', '').split(' +')[0] + self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S") + diff --git a/ox/web/torrent.py b/ox/web/torrent.py new file mode 100644 index 0000000..68cd274 --- /dev/null +++ b/ox/web/torrent.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +from ox import intValue + + +class Torrent(dict): + ''' + >>> Torrent() + {'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1} + ''' + _string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link', + 'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language') + _int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files') + _dict_keys = ('torrent_info', ) + _list_keys = () + data = {'torrent_info': {}} + + def __init__(self): + for key in self._string_keys: + self[key] = self.data.get(key, u'') + for key in self._dict_keys: + self[key] = self.data.get(key, {}) + for key in self._list_keys: + self[key] = self.data.get(key, []) + for key in self._int_keys: + value = self.data.get(key, -1) + if not isinstance(value, int): + value = int(intValue(value)) + self[key] = value + self['infohash'] = self.data['torrent_info'].get('hash', '') + self['size'] = self.data['torrent_info'].get('size', -1) + self['announce'] = self.data['torrent_info'].get('announce', '') + if 'files' in self.data['torrent_info']: + self['files'] = len(self.data['torrent_info']['files']) + else: + self['files'] = 1 + diff --git a/ox/web/tv.py b/ox/web/tv.py new file mode 100644 index 0000000..3808bbd --- /dev/null +++ b/ox/web/tv.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +import re +import time + +from ox import stripTags, findRe +from ox.cache import readUrlUnicode + + +def getEpisodeData(url): + ''' + prases informatin on tvcom episode pages + returns dict with title, show, description, score + example: + getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html') + ''' + data = readUrlUnicode(url) + r = {} + r['description'] = stripTags(findRe(data, 'div id="main-col">.*?
    (.*?)(.*?)') + r['title'] = findRe(data, '.*?: (.*?) - TV.com ') + #episode score + r['episode score'] = findRe(data, '(.*?)') + + match = re.compile('Episode Number: (\d*?)    Season Num: (\d*?)    First Aired: (.*?)  ').findall(data) + if match: + r['season'] = int(match[0][1]) + r['episode'] = int(match[0][0]) + #'Wednesday September 29, 2004' -> 2004-09-29 + r['air date'] = time.strftime('%Y-%m-%d', time.strptime(match[0][2], '%A %B %d, %Y')) + return r + diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py new file mode 100644 index 0000000..3d99688 --- /dev/null +++ b/ox/web/wikipedia.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +from urllib import urlencode + +import simplejson +from ox.cache import readUrl, readUrlUnicode +from ox import findRe, decodeHtml + + +def getId(url): + return url.split("/")[-1] + +def getUrl(id): + return "http://en.wikipedia.org/wiki/%s" % id + + +def getMovieId(title, director='', year=''): + query = '"%s" film %s %s' % (title, director, year) + result = find(query, 1) + if result: + return result[0][1] + return '' + +def getUrlByImdbId(imdbId): + query = '"%s"'% imdbId + result = find(query) + if result: + url = result[0][1] + return url + return "" + +def getUrlByImdb(imdbId): + # deprecated, use getUrlByImdbId() + return getUrlByImdbId(imdbId) + +def getUrlByAllmovieId(allmovieId): + query = '"amg_id = 1:%s"'% allmovieId + result = find(query) + if result: + url = result[0][1] + return url + return '' + +def getWikiData(wikipediaUrl): + url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=') + url = "%s&action=raw" % url + data = readUrlUnicode(url) + return data + +def getMovieData(wikipediaUrl): + if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl) + data = getWikiData(wikipediaUrl) + filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''') + filmbox = {} + _box = filmbox_data.strip().split('\n|') + if len(_box) == 1: + _box = _box[0].split('|\n') + for row in _box: + d = row.split('=') + if len(d) == 2: + key = d[0].strip() + if key[0] == '|': + key = key[1:] + value = d[1].strip() + filmbox[key] = value + if 'imdb title' in data: + filmbox['imdb_id'] = findRe(data, 'imdb title\|.*?(\d*?)\|') + elif 'imdb episode' in data: + filmbox['imdb_id'] = findRe(data, 'imdb episode\|.*?(\d*?)\|') + if 'Amg movie' in data: + filmbox['amg_id'] = findRe(data, 'Amg movie\|.*?(\d*?)\|') + if 'amg_id' in filmbox and filmbox['amg_id'].startswith('1:'): + filmbox['amg_id'] = filmbox['amg_id'][2:] + + if 'rotten-tomatoes' in data: + filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|id\=(.*?)\|') + if not filmbox['rottentomatoes_id']: + filmbox['rottentomatoes_id'] = findRe(data, 'rotten-tomatoes\|(.*?)\|') + if 'google video' in data: + filmbox['google_video_id'] = findRe(data, 'google video\|.*?(\d*?)\|') + if 'DEFAULTSORT' in data: + filmbox['title_sort'] = findRe(data, '''\{\{DEFAULTSORT:(.*?)\}\}''') + return filmbox + +def getImageUrl(name): + data = readUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name) + url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"') + return url + +def getPosterUrl(wikipediaUrl): + if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl) + data = getMovieData(wikipediaUrl) + if 'image' in data: + return getImageUrl(data['image']) + return '' + +def getMoviePoster(wikipediaUrl): + # deprecated, use getPosterUrl() + return getPosterUrl(wikipediaUrl) + +def getAllmovieId(wikipediaUrl): + data = 
getMovieData(wikipediaUrl) + return data.get('amg_id', '') + +def find(query, max_results=10): + query = {'action': 'query', 'list':'search', 'format': 'json', + 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')} + url = "http://en.wikipedia.org/w/api.php?" + urlencode(query) + data = readUrl(url) + if not data: + data = readUrl(url, timeout=0) + result = simplejson.loads(data) + results = [] + if result and 'query' in result: + for r in result['query']['search']: + title = r['title'] + url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_') + results.append((title, url, '')) + return results + diff --git a/ox/web/youtube.py b/ox/web/youtube.py new file mode 100644 index 0000000..5042c59 --- /dev/null +++ b/ox/web/youtube.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +from urllib import quote, unquote +import httplib +import xml.etree.ElementTree as ET +import re + +import feedparser +from ox.cache import readUrl, readUrlUnicode +from ox import findString, findRe + + +def getVideoKey(youtubeId): + data = readUrl("http://www.youtube.com/get_video_info?&video_id=%s" % youtubeId) + match = re.compile("token=(.+)&thumbnail").findall(data) + if match: + return unquote(match[0]) + return False + +def getVideoUrl(youtubeId, format='mp4'): + youtubeKey = getVideoKey(youtubeId) + if format == '1080p': + fmt=37 + url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt) + if format == '720p': + fmt=22 + url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt) + elif format == 'mp4': + fmt=18 + url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt) + elif format == 'high': + fmt=35 + url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, youtubeKey, fmt) + else: + url = "http://youtube.com/get_video.php?video_id=%s&t=%s" % (youtubeId, youtubeKey) + return url + +def getMovieInfo(youtubeId, video_url_base=None): + url = "http://gdata.youtube.com/feeds/api/videos/%s" % youtubeId + data = readUrl(url) + fd = feedparser.parse(data) + return getInfoFromAtom(fd.entries[0], video_url_base) + +def getInfoFromAtom(entry, video_url_base=None): + info = dict() + info['title'] = entry['title'] + info['description'] = entry['description'] + info['author'] = entry['author'] + #info['published'] = entry['published_parsed'] + if 'media_keywords' in entry: + info['keywords'] = entry['media_keywords'].split(', ') + info['url'] = entry['links'][0]['href'] + info['id'] = findString(info['url'], "/watch?v=") + info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id'] + if video_url_base: + info['flv'] = "%s/%s.%s" % (video_url_base, info['id'], 'flv') + info['mp4'] = "%s/%s.%s" % (video_url_base, info['id'], 'mp4') + else: + info['flv'] = getVideoUrl(info['id'], 'flv') + info['flv_high'] = getVideoUrl(info['id'], 'high') + info['mp4'] = getVideoUrl(info['id'], 'mp4') + info['720p'] = getVideoUrl(info['id'], '720p') + info['1080p'] = getVideoUrl(info['id'], '1080p') + info['embed'] = '' % (info['id'], info['id']) + return info + +def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None): + query = quote(query) + url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results) + data = readUrlUnicode(url) + fd = feedparser.parse(data) + videos = [] + for entry in fd.entries: + v = getInfoFromAtom(entry, 
video_url_base) + videos.append(v) + if len(videos) >= max_results: + return videos + return videos + +''' +def find(query, max_results=10, offset=1, orderBy='relevance', video_url_base=None): + url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query) + data = readUrlUnicode(url) + regx = re.compile(' video') + id_title = regx.findall(data) + data_flat = data.replace('\n', ' ') + videos = {} + for video in id_title: + vid = video[0] + if vid not in videos: + v = dict() + v['id'] = vid + v['link'] = "http//youtube.com/watch.v=%s" % v['id'] + v['title'] = video[2].strip() + if video_url_base: + v['video_link'] = "%s/%s" % (video_url_base, v['id']) + else: + v['video_url'] = getVideoUrl(v['id']) + v['description'] = findRe(data, 'BeginvidDesc%s">(.*?)' % v['id']).strip().replace('', ' ').replace('', '') + v['thumbnail'] = video[1] + videos[vid] = v + if len(videos) >= max_results: + return videos.values() + return videos.values() +''' +
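
Note on the structure of the new ox.web package: the Imdb class in ox/web/imdb.py is built on the SiteParser base class added in ox/web/siteparser.py. A subclass declares a baseUrl plus a regex dict -- each entry names the page to fetch, the pattern (or list of patterns) to apply, and the result type ('string', 'list', 'int', 'float' or 'date') -- and the constructor fetches each page through ox.cache, runs the regexes and fills the object (a dict) with the typed results. A minimal sketch of such a subclass follows; the site, class name and patterns are illustrative placeholders, not part of this diff:

    from ox.web.siteparser import SiteParser

    class Example(SiteParser):
        # hypothetical target site -- page name and patterns below are
        # placeholders, not a real parser
        regex = {
            'title': {
                'page': 'summary',
                're': '<h1>(.*?)</h1>',
                'type': 'string'
            },
            'year': {
                'page': 'summary',
                're': 'Year: (\d{4})',
                'type': 'int'
            }
        }

        def __init__(self, id):
            # SiteParser.getUrl() appends each entry's 'page' to baseUrl
            self.baseUrl = 'http://www.example.com/film/%s/' % id
            super(Example, self).__init__()

    # usage: reads http://www.example.com/film/123/summary via ox.cache,
    # then behaves like a dict
    # info = Example('123')
    # info['title'], info['year']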