vi:si:et:sw=4:sts=4:ts=4

This commit is contained in:
j 2008-06-19 11:47:02 +02:00
parent 8886cfe8d3
commit 4c14ce613d
16 changed files with 1088 additions and 1134 deletions

View file

@ -1,7 +1,5 @@
# -*- Mode: Python; -*- # vi:si:et:sw=4:sts=4:ts=4
# vi:si:et:sw=2:sts=2:ts=2
# encoding: utf-8 # encoding: utf-8
__version__ = '0.1.0' __version__ = '0.1.0'
import imdb import imdb

View file

@ -7,6 +7,7 @@ from oxutils.cache import getUrlUnicode
from oxutils.html import stripTags from oxutils.html import stripTags
from oxutils.text import findRe, removeSpecialCharacters from oxutils.text import findRe, removeSpecialCharacters
def getData(criterionId): def getData(criterionId):
''' '''
>>> getData(348)['imdbId'] >>> getData(348)['imdbId']

View file

@ -1,19 +1,22 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re import re
from urllib import unquote from urllib import unquote
from oxutils.cache import getUrl from oxutils.cache import getUrl
def getVideoUrl(url):
    '''
    Return the flv download URL for a dailymotion video page, or '' when
    the page contains no video entry.

    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
    'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv?key=0a710ad6ffbfe980b1252569d16f957313399d0'

    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
    'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv?key=08a18365ca6962c5ff7526f69872c36813399d4'
    '''
    page = getUrl(url)
    candidates = re.findall('''video", "(.*?)"''', page)
    if not candidates:
        return ''
    # only the first entry is used; everything from '@@' on is dropped
    path = unquote(candidates[0]).split('@@')[0]
    return "http://www.dailymotion.com" + path

View file

@ -1,6 +1,5 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2 # vi:si:et:sw=4:sts=4:ts=4
import re import re
import time import time
import urllib import urllib
@ -29,24 +28,23 @@ FIXME: how to search deeper than the first page?
DEFAULT_MAX_RESULTS = 10 DEFAULT_MAX_RESULTS = 10
def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
    '''
    Fetch url through oxutils.cache.getUrl, passing one day (in seconds)
    as its fourth argument.
    '''
    one_day = 24 * 60 * 60
    return oxutils.cache.getUrl(url, data, headers, one_day)
def quote_plus(s):
    '''Encode s as UTF-8 and escape it with urllib.quote_plus.'''
    encoded = s.encode('utf-8')
    return urllib.quote_plus(encoded)
def find(query, max_results=DEFAULT_MAX_RESULTS):
    '''
    Search Google for query and return at most max_results hits.

    Each hit is a (name, url, description) tuple; name and description
    have their HTML tags stripped. Only the page returned for the search
    URL is parsed.
    '''
    search_url = "http://www.google.com/search?q=%s" % quote_plus(query)
    data = getUrl(search_url)
    link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
              r'.*?(?:<br>|<table.*?>)' + \
              r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
    results = []
    for match in re.compile(link_re, re.DOTALL).finditer(data):
        # stop scanning as soon as enough hits are collected
        if len(results) >= max_results:
            break
        (name, hit_url, desc) = match.group('name', 'url', 'desc')
        results.append((stripTags(name), hit_url, stripTags(desc)))
    return results

1136
ox/imdb.py

File diff suppressed because it is too large Load diff

View file

@ -1,3 +1,5 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re import re
import ox.imdb as imdb import ox.imdb as imdb
@ -83,4 +85,4 @@ def archivePosters():
if __name__ == '__main__': if __name__ == '__main__':
archivePosters() archivePosters()
getMovieData('Brick', 'Rian Johnson') getMovieData('Brick', 'Rian Johnson')

View file

@ -1,3 +1,5 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re import re
import urllib import urllib
@ -6,6 +8,7 @@ from oxutils.html import decodeHtml, stripTags
from oxutils.text import findRe from oxutils.text import findRe
from oxutils.text import findString from oxutils.text import findString
# to sniff itunes traffic, use something like # to sniff itunes traffic, use something like
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net # sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
@ -22,162 +25,163 @@ ITUNES_HEADERS = {
} }
def composeUrl(request, parameters):
    '''
    Build the iTunes store URL for request.

    request is one of:
      'advancedSearch' - parameters['media'] is 'music' (needs 'title' and
                         'artist') or 'movie' (needs 'title' and
                         'director'); any other media value yields the
                         search URL without a query string
      'viewAlbum'      - needs parameters['id']
      'viewMovie'      - needs parameters['id']

    Raises ValueError for any other request.
    '''
    if request == 'advancedSearch':
        url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
        if parameters['media'] == 'music':
            url += urllib.urlencode({
                'albumTerm': parameters['title'],
                'allArtistNames': parameters['artist'],
                'composerTerm': '',
                'flavor': 0,
                'genreIndex': 1,
                'media': 'music',
                'mediaType': 2,
                'ringtone': 0,
                'searchButton': 'submit',
                'songTerm': ''
            })
        elif parameters['media'] == 'movie':
            url += urllib.urlencode({
                'actorTerm': '',
                'closedCaption': 0,
                'descriptionTerm': '',
                'directorProducerName': parameters['director'],
                'flavor': 0,
                'media': 'movie',
                'mediaType': 3,
                'movieTerm': parameters['title'],
                'ratingIndex': 1,
                'releaseYearTerm': '',
                'searchButton': 'submit'
            })
    elif request == 'viewAlbum':
        url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
    elif request == 'viewMovie':
        url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
    else:
        # without this branch an unknown request hit "return url" with url
        # unbound and surfaced as an opaque UnboundLocalError
        raise ValueError('unknown request: %r' % (request,))
    return url
def parseXmlDict(xml):
    '''
    Collect the '<key>name</key><tag>value</tag>' pairs found in xml.

    A '<true/>' tag becomes True, 'integer' values are converted with
    int(), 'string' values are passed through decodeHtml; values of any
    other tag are stored as extracted.
    '''
    values = {}
    for chunk in xml.split('<key>'):
        if '</key>' not in chunk:
            continue
        key = findRe(chunk, '(.*?)</key>')
        tag = findRe(chunk, '</key><(.*?)>')
        if tag == 'true/':
            value = True
        else:
            value = findRe(chunk, '<%s>(.*?)</%s>' % (tag, tag))
            if tag == 'integer':
                value = int(value)
            elif tag == 'string':
                value = decodeHtml(value)
        values[key] = value
    return values
def parseCast(xml, title):
    '''
    Return the names listed under a cast heading of an iTunes movie page.

    The heading looked for is title without its last character,
    upper-cased (so 'actors' searches for 'ACTOR'). Parsing is best
    effort: on an error the names collected so far are returned.
    '''
    names = []
    try:
        section = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper())
        entries = section.split('</GotoURL>')
        # whatever follows the last </GotoURL> is not an entry
        entries.pop()
        for entry in entries:
            names.append(findRe(entry, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
    except Exception:
        # stay lenient with unexpected markup, but let KeyboardInterrupt
        # and SystemExit propagate
        pass
    return names
def parseMovies(xml, title):
    '''
    Return the movies listed under a heading of an iTunes movie page as a
    list of {'id': ..., 'title': ...} dicts.

    The heading looked for is title without its last character,
    upper-cased. Parsing is best effort: on an error the movies collected
    so far are returned.
    '''
    movies = []
    try:
        section = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper())
        entries = section.split('</GotoURL>')
        # whatever follows the last </GotoURL> is not an entry
        entries.pop()
        for entry in entries:
            movies.append({
                'id': findRe(entry, r'viewMovie\?id=(.*?)&'),
                'title': findRe(entry, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
            })
    except Exception:
        # stay lenient with unexpected markup, but let KeyboardInterrupt
        # and SystemExit propagate
        pass
    return movies
class ItunesAlbum:
    '''
    An album in the iTunes store, addressed by id or by title and artist.
    '''
    def __init__(self, id = '', title = '', artist = ''):
        self.title = title
        self.artist = artist
        self.id = id
        # no id given: look it up through the advanced search
        if not id:
            self.id = self.getId()

    def getId(self):
        '''Return the id of the first album the advanced search links to.'''
        query = {'media': 'music', 'title': self.title, 'artist': self.artist}
        xml = getUrl(composeUrl('advancedSearch', query), headers = ITUNES_HEADERS)
        return findRe(xml, 'viewAlbum\?id=(.*?)&')

    def getData(self):
        '''Fetch the album page and return its details as a dict.'''
        xml = getUrl(composeUrl('viewAlbum', {'id': self.id}), None, ITUNES_HEADERS)
        data = {'id': self.id}
        data['albumName'] = findRe(xml, '<B>(.*?)</B>')
        data['artistName'] = findRe(xml, '<b>(.*?)</b>')
        data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
        data['genre'] = findRe(xml, 'Genre:(.*?)<')
        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
        review = findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>')
        data['review'] = stripTags(review)
        # one <dict> per track after the <key>items</key> marker
        items = findRe(xml, '<key>items</key>.*?<dict>(.*?)$')
        data['tracks'] = [parseXmlDict(item) for item in items.split('<dict>')]
        data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
        return data
class ItunesMovie:
    '''
    A movie in the iTunes store, addressed by id or by title and director.
    '''
    def __init__(self, id = '', title = '', director = ''):
        self.id = id
        self.title = title
        self.director = director
        # no id given: look it up through the advanced search
        if not id:
            self.id = self.getId()

    def getId(self):
        '''Return the id of the first movie the advanced search links to.'''
        url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
        xml = getUrl(url, headers = ITUNES_HEADERS)
        return findRe(xml, r'viewMovie\?id=(.*?)&')

    def getData(self):
        '''
        Fetch the movie page and return its details as a dict.

        The page is no longer dumped to a hard-coded file on one
        developer's desktop; that debugging leftover made this method
        fail wherever the directory does not exist.
        '''
        data = {'id': self.id}
        url = composeUrl('viewMovie', {'id': self.id})
        xml = getUrl(url, None, ITUNES_HEADERS)
        data['actors'] = parseCast(xml, 'actors')
        rating = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
        # one full-star image per point, '&#189;' (the 1/2 sign) adds a half
        data['averageRating'] = rating.count('rating_star_000033.png') + rating.count('&#189;') * 0.5
        data['directors'] = parseCast(xml, 'directors')
        data['format'] = findRe(xml, 'Format:(.*?)<')
        data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
        data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
        data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
        data['producers'] = parseCast(xml, 'producers')
        data['rated'] = findRe(xml, 'Rated(.*?)<')
        data['relatedMovies'] = parseMovies(xml, 'related movies')
        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
        data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
        data['screenwriters'] = parseCast(xml, 'screenwriters')
        data['soundtrackId'] = findRe(xml, r'viewAlbum\?id=(.*?)&')
        data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
        return data
if __name__ == '__main__': if __name__ == '__main__':
import simplejson import simplejson
data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData() data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
for v in data['relatedMovies']:
data = ItunesMovie(id = v['id']).getData()
print simplejson.dumps(data, sort_keys = True, indent = 4) print simplejson.dumps(data, sort_keys = True, indent = 4)
data = ItunesMovie(id='272960052').getData() data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4) print simplejson.dumps(data, sort_keys = True, indent = 4)
for v in data['relatedMovies']:
data = ItunesMovie(id = v['id']).getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)
data = ItunesMovie(id='272960052').getData()
print simplejson.dumps(data, sort_keys = True, indent = 4)

View file

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from oxutils.cache import getUrl from oxutils.cache import getUrl
from oxutils.html import decodeHtml from oxutils.html import decodeHtml
from oxutils.text import findRe from oxutils.text import findRe
@ -16,4 +18,4 @@ def getLyrics(title, artist):
return lyrics return lyrics
if __name__ == '__main__': if __name__ == '__main__':
print getLyrics('Election Day', 'Arcadia') print getLyrics('Election Day', 'Arcadia')

View file

@ -1,7 +1,5 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2 # vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime from datetime import datetime
import re import re
import socket import socket
@ -14,116 +12,115 @@ import oxutils
from torrent import Torrent from torrent import Torrent
socket.setdefaulttimeout(10.0)
def _parseResultsPage(data, max_results=10): def _parseResultsPage(data, max_results=10):
results=[] results=[]
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>''' regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data): for row in re.compile(regexp, re.DOTALL).findall(data):
torrentDate = row[0] torrentDate = row[0]
torrentExtra = row[1] torrentExtra = row[1]
torrentId = row[2] torrentId = row[2]
torrentTitle = decodeHtml(row[3]).strip() torrentTitle = decodeHtml(row[3]).strip()
torrentLink = "http://www.mininova.org/tor/" + torrentId torrentLink = "http://www.mininova.org/tor/" + torrentId
privateTracker = 'priv.gif' in torrentExtra privateTracker = 'priv.gif' in torrentExtra
if not privateTracker: if not privateTracker:
results.append((torrentTitle, torrentLink, '')) results.append((torrentTitle, torrentLink, ''))
return results return results
def findMovie(query, max_results=10):
    '''search for torrents on mininova
    '''
    search_url = "http://www.mininova.org/search/%s/seeds" % quote(query)
    page = getUrlUnicode(search_url)
    return _parseResultsPage(page, max_results)
def findMovieByImdb(imdbId):
    '''find torrents on mininova for a given imdb id
    '''
    imdbId = normalizeImdbId(imdbId)
    data = getUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
    return _parseResultsPage(data)
def getId(mininovaId):
    '''
    Reduce a mininova id or URL to the bare id.

    Returns what findRe extracts for '/<digits>' when that is non-empty,
    otherwise the text after the last '/' (the whole value when it
    contains no '/').
    '''
    mininovaId = unicode(mininovaId)
    d = findRe(mininovaId, r"/(\d+)")
    if d:
        return d
    # [-1] is also the only element when there is no '/', so no special
    # case is needed for single-segment values
    return mininovaId.split('/')[-1]
def exists(mininovaId):
    '''
    Return True when the torrent page is non-empty and reports neither a
    missing torrent nor a tracker that requires registration.
    '''
    page = oxutils.net.getUrl("http://www.mininova.org/tor/%s" % getId(mininovaId))
    if not page:
        return False
    unavailable = (
        'Torrent not found...',
        'tracker</a> of this torrent requires registration.',
    )
    return not any(marker in page for marker in unavailable)
def getData(mininovaId):
    '''
    Collect the details of a mininova torrent into a dict.

    Reads the comment and details pages plus the .torrent file itself.
    Returns None when the pages report that the torrent was not found.
    '''
    _key_map = {
        'by': u'uploader',
    }
    mininovaId = getId(mininovaId)
    torrent = {
        u'id': mininovaId,
        u'domain': 'mininova.org',
        u'comment_link': "http://www.mininova.org/tor/%s" % mininovaId,
        u'torrent_link': "http://www.mininova.org/get/%s" % mininovaId,
        u'details_link': "http://www.mininova.org/det/%s" % mininovaId,
    }

    data = getUrlUnicode(torrent['comment_link']) + getUrlUnicode(torrent['details_link'])
    if '<h1>Torrent not found...</h1>' in data:
        return None

    # every "<strong>Label:</strong> value" paragraph becomes an entry
    for label, raw in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
        label = label.lower().strip()
        torrent[_key_map.get(label, label)] = decodeHtml(stripTags(raw.strip()))

    torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
    description = findRe(data, '<div id="description">(.*?)</div>')
    if description:
        description = normalizeNewlines(decodeHtml(stripTags(description))).strip()
    torrent[u'description'] = description
    torrent[u'torrent_info'] = getTorrentInfo(getUrl(torrent[u'torrent_link']))
    return torrent
class Mininova(Torrent):
    '''
    >>> Mininova('123')
    {}

    >>> Mininova('1072195')['infohash']
    '72dfa59d2338e4a48c78cec9de25964cddb64104'
    '''
    def __init__(self, mininovaId):
        self.data = getData(mininovaId)
        # nothing was found: leave the instance uninitialised
        if not self.data:
            return
        Torrent.__init__(self)
        ratio = self.data['share ratio'].split(',')
        # -1 marks a count that could not be read
        self['seeder'] = -1
        self['leecher'] = -1
        if len(ratio) == 2:
            for field, part in zip(('seeder', 'leecher'), ratio):
                val = intValue(part.replace(',','').strip())
                if val:
                    self[field] = int(val)
        val = intValue(self.data['downloads'].replace(',','').strip())
        self['downloaded'] = int(val) if val else -1
        # drop the ' +hhmm' offset before parsing the timestamp
        published = self.data['added on'].split(' +')[0]
        self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")

View file

@ -1,6 +1,5 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2 # vi:si:et:sw=4:sts=4:ts=4
import re import re
import feedparser import feedparser
@ -9,37 +8,34 @@ import oxutils
from oxutils.lang import langCode2To3, langTo3Code from oxutils.lang import langCode2To3, langTo3Code
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
    '''
    Return the opensubtitles id of the first srt subtitle found for an
    imdb id, or None when the result feed has no usable entry.

    language may be a 2-letter code, a 3-letter code or any other string;
    2-letter codes go through langCode2To3 and other lengths through
    langTo3Code. parts is the number of CDs.
    '''
    if len(language) == 2:
        language = langCode2To3(language)
    elif len(language) != 3:
        language = langTo3Code(language)
    url = "http://www.opensubtitles.org/en/search/"
    if language:
        url += "sublanguageid-%s/" % language
    url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
    data = getUrl(url)
    if "title>opensubtitles.com - search results</title" not in data:
        # not a result feed: take the id from the first subtitle link
        return oxutils.findRe(data, '/en/subtitles/(.*?)/')
    fd = feedparser.parse(data)
    if not fd.entries:
        return None
    link = fd.entries[0]['links'][0]['href']
    ids = re.compile('subtitles/(.*?)/').findall(link)
    # None rather than an empty list when the link carries no id
    if ids:
        return ids[0]
    return None
def downloadSubtitleById(opensubtitle_id):
    '''
    Download every file linked from a subtitle page.

    Returns a dict that maps the first line of each link text (tags
    stripped) to the downloaded content.
    '''
    page = getUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
    reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
    srts = {}
    for path, label in re.compile(reg_exp, re.DOTALL).findall(page):
        name = oxutils.stripTags(label).split('\n')[0]
        srts[name] = getUrlUnicode("http://www.opensubtitles.com%s" % path)
    return srts

View file

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime from datetime import datetime
import re import re
import time import time
@ -8,6 +10,7 @@ import oxutils.cache
from oxutils.html import decodeHtml, stripTags from oxutils.html import decodeHtml, stripTags
import oxutils.net import oxutils.net
def getNews(year, month, day): def getNews(year, month, day):
sections = [ sections = [
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt', 'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
@ -287,4 +290,4 @@ if __name__ == '__main__':
print x print x
''' '''
# archiveIssues() # archiveIssues()
archiveNews() archiveNews()

View file

@ -1,14 +1,11 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2 # vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime from datetime import datetime
import re import re
import socket import socket
from urllib import quote, urlencode from urllib import quote, urlencode
from urllib2 import URLError from urllib2 import URLError
from oxutils.cache import getUrl, getUrlUnicode from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils.normalize import normalizeImdbId from oxutils.normalize import normalizeImdbId
@ -16,107 +13,106 @@ import oxutils
from torrent import Torrent from torrent import Torrent
socket.setdefaulttimeout(10.0)
season_episode = re.compile("S..E..", re.IGNORECASE) season_episode = re.compile("S..E..", re.IGNORECASE)
def _getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout):
    '''
    Fetch url through cache.getUrl with the 'language=en_EN' cookie set.

    The cookie is added to a copy of headers, so neither the caller's
    dict nor the shared cache.DEFAULT_HEADERS is modified.
    '''
    headers = dict(headers)
    headers['Cookie'] = 'language=en_EN'
    return cache.getUrl(url, data, headers, timeout)
def _getUrlUnicode(url):
    '''Fetch url via cache.getUrlUnicode, using _getUrl as its fetcher.'''
    fetcher = _getUrl
    return cache.getUrlUnicode(url, _getUrl=fetcher)
def findMovies(query, max_results=10):
    '''
    Search thepiratebay for query and return movie torrents as
    (title, link, '') tuples.

    Follows the "next" link for at most three result pages and returns as
    soon as max_results torrents are collected. Only rows of category
    201 are kept.
    '''
    # compiled once instead of once per page
    row_re = re.compile('''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/tor/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>''', re.DOTALL)
    next_re = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>')
    results = []
    next_links = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
    page_count = 1
    while next_links and page_count < 4:
        page_count += 1
        url = next_links[0]
        # "next" links may be relative; turn them into absolute URLs
        if not url.startswith('http'):
            if not url.startswith('/'):
                url = "/" + url
            url = "http://thepiratebay.org" + url
        data = _getUrlUnicode(url)
        for row in row_re.findall(data):
            torrentType = row[0]
            torrentLink = "http://thepiratebay.org" + row[1]
            torrentTitle = decodeHtml(row[2])
            # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
            if torrentType in ['201']:
                results.append((torrentTitle, torrentLink, ''))
            if len(results) >= max_results:
                return results
        next_links = next_re.findall(data)
    return results
def findMovieByImdb(imdb):
    '''Search thepiratebay for "tt" followed by the normalized imdb id.'''
    query = "tt" + normalizeImdbId(imdb)
    return findMovies(query)
def getId(piratebayId):
    '''
    Reduce a thepiratebay id or URL to the id used by the other helpers.

    For torrents.thepiratebay.org URLs only the part after 'org/' is
    considered. When findRe extracts digits following 'tor/' they are
    returned; otherwise the value is returned as it stands.
    '''
    prefix = 'http://torrents.thepiratebay.org/'
    if piratebayId.startswith(prefix):
        piratebayId = piratebayId.split('org/')[1]
    d = findRe(piratebayId, "tor/(\d+)")
    return d if d else piratebayId
def exists(piratebayId):
    '''Return oxutils.net.exists() for the torrent's page URL.'''
    page_url = "http://thepiratebay.org/tor/%s" % getId(piratebayId)
    return oxutils.net.exists(page_url)
def getData(piratebayId):
    '''
    Collect the data of one torrent from its thepiratebay.org page.

    Returns a dict with the keys id, domain, comment_link, title, imdbId,
    torrent_link, description and torrent_info, plus one entry for every
    "<dt>label:</dt><dd>value</dd>" pair found in the page (label
    lower-cased and renamed where listed below).
    Returns None when no title can be found in the page.
    '''
    # page labels (lower-cased) -> key used in the returned dict
    renamedKeys = {
        'spoken language(s)': u'language',
        'texted language(s)': u'subtitle language',
        'by': u'uploader',
        'leechers': 'leecher',
        'seeders': 'seeder',
    }
    torrentId = getId(piratebayId)
    pageUrl = 'http://thepiratebay.org/tor/%s' % torrentId
    torrent = {
        u'id': torrentId,
        u'domain': 'thepiratebay.org',
        u'comment_link': pageUrl,
    }
    html = _getUrlUnicode(pageUrl)

    rawTitle = findRe(html, '<title>(.*?) \(download torrent\) - TPB</title>')
    if not rawTitle:
        return None
    torrent[u'title'] = decodeHtml(rawTitle).strip()
    torrent[u'imdbId'] = findRe(html, 'title/tt(\d{7})')

    # the download link embeds the url-quoted utf-8 title
    quotedTitle = quote(torrent['title'].encode('utf-8'))
    torrent[u'torrent_link'] = "http://torrents.thepiratebay.org/%s/%s.torrent" % (torrentId, quotedTitle)

    detailPattern = re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL)
    for label, content in detailPattern.findall(html):
        label = label.lower().strip()
        torrent[renamedKeys.get(label, label)] = decodeHtml(stripTags(content.strip()))

    description = findRe(html, '<div class="nfo">(.*?)</div>')
    if description:
        description = normalizeNewlines(decodeHtml(stripTags(description))).strip()
    torrent[u'description'] = description

    torrentFile = _getUrl(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = getTorrentInfo(torrentFile)
    return torrent
class Thepiratebay(Torrent):
    '''
    Torrent filled with the result of getData for a thepiratebay.org id.

    When getData returns nothing the instance stays an empty dict.

    >>> Thepiratebay('123')
    {}
    >>> Thepiratebay('3951349')['infohash']
    '4e84415d36ed7b54066160c05a0b0f061898d12b'
    '''
    def __init__(self, piratebayId):
        self.data = getData(piratebayId)
        if self.data:
            Torrent.__init__(self)
            # drop the " GMT" marker and any " +..." suffix before parsing
            uploaded = self.data['uploaded'].replace(' GMT', '')
            uploaded = uploaded.split(' +')[0]
            self['published'] = datetime.strptime(uploaded, "%Y-%m-%d %H:%M:%S")

View file

@ -1,39 +1,37 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2 # vi:si:et:sw=4:sts=4:ts=4
from oxutils import intValue from oxutils import intValue
class Torrent(dict):
    '''
    Dict with a fixed set of keys describing one torrent.

    Subclasses set self.data before calling Torrent.__init__; every known
    key is then copied from self.data, falling back to an empty value
    (u'' for strings, {} for dicts, [] for lists, -1 for ints).
    infohash, size, announce and files are always derived from
    data['torrent_info'].

    >>> Torrent()
    {'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
    '''
    _string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
                    'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
    _int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
    _dict_keys = ('torrent_info', )
    _list_keys = ()
    # class-level default, used when a subclass does not set self.data
    data = {'torrent_info': {}}

    def __init__(self):
        for key in self._string_keys:
            self[key] = self.data.get(key, u'')
        for key in self._dict_keys:
            # copy: storing the object itself would let one instance mutate
            # the class-level default shared by all other instances
            self[key] = dict(self.data.get(key) or {})
        for key in self._list_keys:
            self[key] = list(self.data.get(key) or [])
        for key in self._int_keys:
            value = self.data.get(key, -1)
            if not isinstance(value, int):
                value = int(intValue(value))
            self[key] = value
        # tolerate data without (or with an empty) torrent_info
        torrent_info = self.data.get('torrent_info') or {}
        self['infohash'] = torrent_info.get('hash', '')
        self['size'] = torrent_info.get('size', -1)
        self['announce'] = torrent_info.get('announce', '')
        if 'files' in torrent_info:
            self['files'] = len(torrent_info['files'])
        else:
            self['files'] = 1

View file

@ -1,72 +1,72 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2 # vi:si:et:sw=4:sts=4:ts=4
from urllib import urlencode from urllib import urlencode
import simplejson import simplejson
from oxutils.cache import getUrl, getUrlUnicode from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRe, decodeHtml from oxutils import findRe, decodeHtml
def getMovieId(title, director='', year=''):
    '''
    Look a film up through find() and return the second field of the
    first hit, or '' when there is no hit.

    The query is the quoted title followed by "film", director and year;
    only one result is requested.
    '''
    hits = find('"%s" film %s %s' % (title, director, year), 1)
    return hits[0][1] if hits else ''
def getUrlByImdb(imdbId):
    '''
    Return the url of the first find() hit for "imdb_id = <imdbId>".

    While nothing is found and the id starts with "0", one leading zero
    is removed and the search is repeated.
    Returns '' when no variant of the id yields a hit, matching
    getUrlByAmbId and getMovieId.
    '''
    while True:
        result = find('"imdb_id = %s"' % imdbId)
        if result:
            return result[0][1]
        if not str(imdbId).startswith('0'):
            return ''
        # retry without one leading zero
        imdbId = imdbId[1:]
def getUrlByAmbId(amg_id):
    '''
    Return the url of the first find() hit for "amg_id = <amg_id>",
    or '' when there is no hit.
    '''
    hits = find('"amg_id = %s"' % amg_id)
    if not hits:
        return ''
    return hits[0][1]
def getWikiData(wikipediaUrl):
    '''
    Return the html-decoded content of the <textarea> on the edit page
    of an en.wikipedia.org article.

    The article title is whatever remains of wikipediaUrl after removing
    the "http://en.wikipedia.org/wiki/" prefix.
    '''
    pageTitle = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
    editUrl = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % pageTitle
    textarea = findRe(getUrlUnicode(editUrl), "<textarea.*?>(.*?)</textarea>")
    return decodeHtml(textarea)
def getMovieData(wikipediaUrl):
    '''
    Return the fields of the "Infobox Film" template of an article
    as a dict of stripped key/value strings.

    The template text is split on "|"; every piece containing "=" becomes
    one entry, split at the first "=" only so that values which themselves
    contain "=" (e.g. urls with a query string) are kept.
    Pieces without "=" are ignored.
    '''
    data = getWikiData(wikipediaUrl)
    filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''')
    filmbox = {}
    for row in filmbox_data.strip().split('|'):
        key, separator, value = row.partition('=')
        if separator:
            filmbox[key.strip()] = value.strip()
    return filmbox
def getAmgId(wikipediaUrl):
    '''
    Return the amg_id field from getMovieData(wikipediaUrl),
    or '' when the field is not present.
    '''
    filmbox = getMovieData(wikipediaUrl)
    return filmbox.get('amg_id', '')
def find(query, max_results=10):
    '''
    Full-text search through the en.wikipedia.org api.

    Returns a list of (title, url, '') tuples, one per search hit, where
    url is the article url with spaces in the title replaced by "_".
    '''
    params = {
        'action': 'query', 'list': 'search', 'format': 'json',
        'srlimit': max_results, 'srwhat': 'text',
        'srsearch': query.encode('utf-8'),
    }
    apiUrl = "http://en.wikipedia.org/w/api.php?" + urlencode(params)
    response = getUrl(apiUrl)
    if not response:
        # empty answer: ask again with timeout=0
        # NOTE(review): presumably this bypasses the oxutils cache - confirm
        response = getUrl(apiUrl, timeout=0)
    hits = simplejson.loads(response)['query']['search']
    return [
        (hit['title'], "http://en.wikipedia.org/wiki/%s" % hit['title'].replace(' ', '_'), '')
        for hit in hits
    ]

View file

@ -1,6 +1,5 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2 # vi:si:et:sw=4:sts=4:ts=4
from urllib import quote from urllib import quote
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
@ -8,49 +7,50 @@ import feedparser
from oxutils.cache import getUrl from oxutils.cache import getUrl
from oxutils import findString from oxutils import findString
def getVideoUrl(youtubeId, format='mp4'):
    '''
    Build the get_video.php url for a youtube video.

    The text of the <t> element in the get_video_token api response is
    used as the "t" parameter. For format 'mp4' the url additionally
    carries fmt=18; any other format gets the url without fmt.
    '''
    tokenUrl = 'http://www.youtube.com/api2_rest?method=youtube.videos.get_video_token&video_id=' + youtubeId
    token = ET.fromstring(getUrl(tokenUrl)).find('t').text
    if format != 'mp4':
        return "http://youtube.com/get_video.php?video_id=%s&t=%s" % (youtubeId, token)
    return "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s" % (youtubeId, token, 18)
def getMovieInfo(youtubeId):
    '''
    Fetch the gdata feed of one youtube video and return
    getInfoFromAtom() for its first entry.

    Raises IndexError when the parsed feed has no entries.
    '''
    # no trailing whitespace in the url: it would become part of the request
    url = "http://gdata.youtube.com/feeds/api/videos/%s" % youtubeId
    data = getUrl(url)
    fd = feedparser.parse(data)
    return getInfoFromAtom(fd.entries[0])
def getInfoFromAtom(entry):
    '''
    Turn one parsed feed entry into a dict with the keys title,
    description, author, published, keywords, url, id, thumbnail, flv,
    mp4 and embed.

    flv and mp4 are obtained through getVideoUrl, so two lookups are
    made per entry.
    '''
    info = {
        'title': entry['title'],
        'description': entry['description'],
        'author': entry['author'],
        'published': entry['published_parsed'],
        'keywords': entry['media_keywords'].split(', '),
        'url': entry['links'][0]['href'],
    }
    videoId = findString(info['url'], "/watch?v=")
    info['id'] = videoId
    info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % videoId
    info['flv'] = getVideoUrl(videoId, 'flv')
    info['mp4'] = getVideoUrl(videoId, 'mp4')
    info['embed'] = '''<object width="425" height="355"><param name="movie" value="http://www.youtube.com/v/%s&hl=en"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/%s&hl=en" type="application/x-shockwave-flash" wmode="transparent" width="425" height="355"></embed></object>''' % (videoId, videoId)
    return info
def find(query, max_results=10, offset=1, orderBy='relevance'):
    '''
    Search the youtube gdata video feed.

    Returns a list of getInfoFromAtom() dicts, one per feed entry;
    processing stops as soon as max_results entries have been collected.
    '''
    url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s"%(quote(query), orderBy, offset, max_results)
    feed = feedparser.parse(getUrl(url))
    videos = []
    for entry in feed.entries:
        videos.append(getInfoFromAtom(entry))
        # stop early: every further entry would trigger more lookups
        if len(videos) >= max_results:
            break
    return videos

View file

@ -1,33 +1,33 @@
#!/usr/bin/env python #!/usr/bin/env python
# vi:si:et:sw=2:sts=2:ts=2 # vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8 # encoding: utf-8
from setuptools import setup, find_packages from setuptools import setup, find_packages
import os import os
# Package metadata for the "ox" distribution.
setup(
    name="ox",
    version="0.1",
    description="collection of scrapers for various websites",
    license="GPLv3",
    author="0x",
    author_email="code@0xdb.org",
    url="http://code.0xdb.org/ox",
    download_url="http://code.0xdb.org/ox/download",
    packages=find_packages(),
    zip_safe=False,
    # packages required at runtime
    install_requires=[
        'oxutils',
        'feedparser',
        'beautifulsoup',
    ],
    keywords=[],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
)