vi:si:et:sw=4:sts=4:ts=4
commit 4c14ce613d (parent 8886cfe8d3)
16 changed files with 1088 additions and 1134 deletions
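For readers who don't speak vim modelines, the commit title is simply the new modeline itself; decoded (standard vim option names, my annotation, not part of the commit):

# vi:si:et:sw=4:sts=4:ts=4
#   si    -> smartindent
#   et    -> expandtab: indent with spaces, not tabs
#   sw=4  -> shiftwidth: autoindent and >>/<< shift by 4 columns
#   sts=4 -> softtabstop: the Tab key inserts 4 spaces
#   ts=4  -> tabstop: a literal tab character displays as 4 columns

Every file below swaps its old sw=2 modeline for this one and is reindented from 2 to 4 spaces; most hunks are whitespace-only.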
@@ -1,7 +1,5 @@
-# -*- Mode: Python; -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8

__version__ = '0.1.0'

import imdb
@@ -7,6 +7,7 @@ from oxutils.cache import getUrlUnicode
from oxutils.html import stripTags
from oxutils.text import findRe, removeSpecialCharacters


def getData(criterionId):
    '''
    >>> getData(348)['imdbId']
@@ -1,19 +1,22 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import unquote
from oxutils.cache import getUrl


def getVideoUrl(url):
    '''
    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
    'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv?key=0a710ad6ffbfe980b1252569d16f957313399d0'
+
+    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
+    'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv?key=08a18365ca6962c5ff7526f69872c36813399d4'
    '''
    data = getUrl(url)
    video = re.compile('''video", "(.*?)"''').findall(data)
    for v in video:
        v = unquote(v).split('@@')[0]
        return "http://www.dailymotion.com" + v
    return ''
34  ox/google.py
@@ -1,6 +1,5 @@
-# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
import re
import time
import urllib

@@ -29,24 +28,23 @@ FIXME: how search depper than first page?
DEFAULT_MAX_RESULTS = 10

def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
    google_timeout=24*60*60
    return oxutils.cache.getUrl(url, data, headers, google_timeout)

def quote_plus(s):
    return urllib.quote_plus(s.encode('utf-8'))

def find(query, max_results=DEFAULT_MAX_RESULTS):
    url = "http://www.google.com/search?q=%s" % quote_plus(query)
    data = getUrl(url)
    link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
              r'.*?(?:<br>|<table.*?>)' + \
              r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
    results = []
    for match in re.compile(link_re, re.DOTALL).finditer(data):
        (name, url, desc) = match.group('name', 'url', 'desc')
        results.append((stripTags(name), url, stripTags(desc)))
    if len(results) > max_results:
        results = results[:max_results]
    return results
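A minimal usage sketch of the module above (hypothetical query; assumes oxutils is installed and the package layout from setup.py, so the module imports as ox.google):

import ox.google

# find() scrapes the first Google results page and returns up to
# max_results (name, url, description) tuples with tags stripped
for name, url, desc in ox.google.find('0xdb', max_results=5):
    print '%s <%s>' % (name, url)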
1136  ox/imdb.py
File diff suppressed because it is too large

@@ -1,3 +1,5 @@
+# vi:si:et:sw=4:sts=4:ts=4
+# encoding: utf-8
import re

import ox.imdb as imdb

@@ -83,4 +85,4 @@ def archivePosters():

if __name__ == '__main__':
    archivePosters()
    getMovieData('Brick', 'Rian Johnson')
288  ox/itunes.py
@@ -1,3 +1,5 @@
+# vi:si:et:sw=4:sts=4:ts=4
+# encoding: utf-8
import re
import urllib

@@ -6,6 +8,7 @@ from oxutils.html import decodeHtml, stripTags
from oxutils.text import findRe
+from oxutils.text import findString


# to sniff itunes traffic, use something like
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net

@@ -22,162 +25,163 @@ ITUNES_HEADERS = {
}

def composeUrl(request, parameters):
    if request == 'advancedSearch':
        url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
        if parameters['media'] == 'music':
            url += urllib.urlencode({
                'albumTerm': parameters['title'],
                'allArtistNames': parameters['artist'],
                'composerTerm': '',
                'flavor': 0,
                'genreIndex': 1,
                'media': 'music',
                'mediaType': 2,
                'ringtone': 0,
                'searchButton': 'submit',
                'songTerm': ''
            })
        elif parameters['media'] == 'movie':
            url += urllib.urlencode({
                'actorTerm': '',
                'closedCaption': 0,
                'descriptionTerm': '',
                'directorProducerName': parameters['director'],
                'flavor': 0,
                'media': 'movie',
                'mediaType': 3,
                'movieTerm': parameters['title'],
                'ratingIndex': 1,
                'releaseYearTerm': '',
                'searchButton': 'submit'
            })
    elif request == 'viewAlbum':
        url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
    elif request == 'viewMovie':
        url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
    return url

def parseXmlDict(xml):
    values = {}
    strings = xml.split('<key>')
    for string in strings:
        if string.find('</key>') != -1:
            key = findRe(string, '(.*?)</key>')
            type = findRe(string, '</key><(.*?)>')
            if type == 'true/':
                value = True
            else:
                value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
                if type == 'integer':
                    value = int(value)
                elif type == 'string':
                    value = decodeHtml(value)
            values[key] = value
    return values

def parseCast(xml, title):
    list = []
    try:
        strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
        strings.pop()
        for string in strings:
            list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
        return list
    except:
        return list

def parseMovies(xml, title):
    list = []
    try:
        strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
        strings.pop()
        for string in strings:
            list.append({
                'id': findRe(string, 'viewMovie\?id=(.*?)&'),
                'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
            })
        return list
    except:
        return list

class ItunesAlbum:
    def __init__(self, id = '', title = '', artist = ''):
        self.id = id
        self.title = title
        self.artist = artist
        if not id:
            self.id = self.getId()

    def getId(self):
        url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
        xml = getUrl(url, headers = ITUNES_HEADERS)
        id = findRe(xml, 'viewAlbum\?id=(.*?)&')
        return id

    def getData(self):
        data = {'id': self.id}
        url = composeUrl('viewAlbum', {'id': self.id})
        xml = getUrl(url, None, ITUNES_HEADERS)
        data['albumName'] = findRe(xml, '<B>(.*?)</B>')
        data['artistName'] = findRe(xml, '<b>(.*?)</b>')
        data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
        data['genre'] = findRe(xml, 'Genre:(.*?)<')
        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
        data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
        data['tracks'] = []
        strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
        for string in strings:
            data['tracks'].append(parseXmlDict(string))
        data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
        return data

class ItunesMovie:
    def __init__(self, id = '', title = '', director = ''):
        self.id = id
        self.title = title
        self.director = director
        if not id:
            self.id = self.getId()

    def getId(self):
        url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
        xml = getUrl(url, headers = ITUNES_HEADERS)
        id = findRe(xml, 'viewMovie\?id=(.*?)&')
        return id

    def getData(self):
        data = {'id': self.id}
        url = composeUrl('viewMovie', {'id': self.id})
        xml = getUrl(url, None, ITUNES_HEADERS)
        f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
        f.write(xml)
        f.close()
        data['actors'] = parseCast(xml, 'actors')
        string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
        data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
        data['directors'] = parseCast(xml, 'directors')
        data['format'] = findRe(xml, 'Format:(.*?)<')
        data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
        data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
        data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
        data['producers'] = parseCast(xml, 'producers')
        data['rated'] = findRe(xml, 'Rated(.*?)<')
        data['relatedMovies'] = parseMovies(xml, 'related movies')
        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
        data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
        data['screenwriters'] = parseCast(xml, 'screenwriters')
        data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
        data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
        return data

if __name__ == '__main__':
    import simplejson
    data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
    print simplejson.dumps(data, sort_keys = True, indent = 4)
    data = ItunesMovie(id='272960052').getData()
    print simplejson.dumps(data, sort_keys = True, indent = 4)
    data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
    print simplejson.dumps(data, sort_keys = True, indent = 4)
    for v in data['relatedMovies']:
        data = ItunesMovie(id = v['id']).getData()
        print simplejson.dumps(data, sort_keys = True, indent = 4)
    data = ItunesMovie(id='272960052').getData()
    print simplejson.dumps(data, sort_keys = True, indent = 4)
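As a sanity check on parseXmlDict() above, this is the kind of flattened plist fragment it expects; the fragment is hypothetical, traced by hand through the code:

fragment = '<key>itemName</key><string>So Red the Rose</string>' \
           '<key>discCount</key><integer>1</integer>' \
           '<key>explicit</key><true/>'
# split on '<key>' gives one chunk per key; the regexes then pull out
# key, type tag and value, coercing integers and unescaping strings
print parseXmlDict(fragment)
# -> {'itemName': 'So Red the Rose', 'discCount': 1, 'explicit': True}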
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
from oxutils.cache import getUrl
from oxutils.html import decodeHtml
from oxutils.text import findRe

@@ -16,4 +18,4 @@ def getLyrics(title, artist):
    return lyrics

if __name__ == '__main__':
    print getLyrics('Election Day', 'Arcadia')
193  ox/mininova.py
@@ -1,7 +1,5 @@
-# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
-
+# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket

@@ -14,116 +12,115 @@ import oxutils

from torrent import Torrent

socket.setdefaulttimeout(10.0)

def _parseResultsPage(data, max_results=10):
    results=[]
    regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
    for row in re.compile(regexp, re.DOTALL).findall(data):
        torrentDate = row[0]
        torrentExtra = row[1]
        torrentId = row[2]
        torrentTitle = decodeHtml(row[3]).strip()
        torrentLink = "http://www.mininova.org/tor/" + torrentId
        privateTracker = 'priv.gif' in torrentExtra
        if not privateTracker:
            results.append((torrentTitle, torrentLink, ''))
    return results

def findMovie(query, max_results=10):
    '''search for torrents on mininova
    '''
    url = "http://www.mininova.org/search/%s/seeds" % quote(query)
    data = getUrlUnicode(url)
    return _parseResultsPage(data, max_results)

def findMovieByImdb(imdbId):
    '''find torrents on mininova for a given imdb id
    '''
    results = []
    imdbId = normalizeImdbId(imdbId)
    data = getUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
    return _parseResultsPage(data)

def getId(mininovaId):
    mininovaId = unicode(mininovaId)
    d = findRe(mininovaId, "/(\d+)")
    if d:
        return d
    mininovaId = mininovaId.split('/')
    if len(mininovaId) == 1:
        return mininovaId[0]
    else:
        return mininovaId[-1]

def exists(mininovaId):
    mininovaId = getId(mininovaId)
    data = oxutils.net.getUrl("http://www.mininova.org/tor/%s" % mininovaId)
    if not data or 'Torrent not found...' in data:
        return False
    if 'tracker</a> of this torrent requires registration.' in data:
        return False
    return True

def getData(mininovaId):
    _key_map = {
        'by': u'uploader',
    }
    mininovaId = getId(mininovaId)
    torrent = dict()
    torrent[u'id'] = mininovaId
    torrent[u'domain'] = 'mininova.org'
    torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
    torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
    torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId

    data = getUrlUnicode(torrent['comment_link']) + getUrlUnicode(torrent['details_link'])
    if '<h1>Torrent not found...</h1>' in data:
        return None

    for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decodeHtml(stripTags(d[1].strip()))
        torrent[key] = value

    torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
    torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
    if torrent['description']:
        torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
    t = getUrl(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = getTorrentInfo(t)
    return torrent

class Mininova(Torrent):
    '''
    >>> Mininova('123')
    {}
    >>> Mininova('1072195')['infohash']
    '72dfa59d2338e4a48c78cec9de25964cddb64104'
    '''
    def __init__(self, mininovaId):
        self.data = getData(mininovaId)
        if not self.data:
            return
        Torrent.__init__(self)
        ratio = self.data['share ratio'].split(',')
        self['seeder'] = -1
        self['leecher'] = -1
        if len(ratio) == 2:
            val = intValue(ratio[0].replace(',','').strip())
            if val:
                self['seeder'] = int(val)
            val = intValue(ratio[1].replace(',','').strip())
            if val:
                self['leecher'] = int(val)
        val = intValue(self.data['downloads'].replace(',','').strip())
        if val:
            self['downloaded'] = int(val)
        else:
            self['downloaded'] = -1
        published = self.data['added on']
        published = published.split(' +')[0]
        self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")
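The search helpers above return plain (title, link, '') tuples, while the Mininova class wraps a full Torrent dict; a short sketch (hypothetical imdb id):

import ox.mininova as mininova

for title, link, desc in mininova.findMovieByImdb('0133093'):
    print title, link
torrent = mininova.Mininova('1072195')   # id from the doctest above
print torrent['infohash'], torrent['seeder'], torrent['leecher']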
@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
-
+# vi:si:et:sw=4:sts=4:ts=4
import re

import feedparser

@@ -9,37 +8,34 @@ import oxutils
from oxutils.lang import langCode2To3, langTo3Code

def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
    if len(language) == 2:
        language = langCode2To3(language)
    elif len(language) != 3:
        language = langTo3Code(language)
    url = "http://www.opensubtitles.org/en/search/"
    if language:
        url += "sublanguageid-%s/" % language
    url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
    data = getUrl(url)
    if "title>opensubtitles.com - search results</title" in data:
        fd = feedparser.parse(data)
        opensubtitleId = None
-        print url
        if fd.entries:
            link = fd.entries[0]['links'][0]['href']
-            print link
            opensubtitleId = re.compile('subtitles/(.*?)/').findall(link)
            if opensubtitleId:
                opensubtitleId = opensubtitleId[0]
    else:
        opensubtitleId = oxutils.findRe(data, '/en/subtitles/(.*?)/')
    return opensubtitleId

def downloadSubtitleById(opensubtitle_id):
    srts = {}
    data = getUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
    reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
    for f in re.compile(reg_exp, re.DOTALL).findall(data):
        name = oxutils.stripTags(f[1]).split('\n')[0]
        url = "http://www.opensubtitles.com%s" % f[0]
        srts[name] = getUrlUnicode(url)
    return srts
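A hedged usage sketch chaining the two helpers above (hypothetical imdb id; assumes the file lives at ox/opensubtitles.py like its siblings):

import ox.opensubtitles as opensubtitles

opensubtitle_id = opensubtitles.findSubtitlesByImdb('0133093', language='eng')
if opensubtitle_id:
    # maps subtitle file names to unicode srt contents
    srts = opensubtitles.downloadSubtitleById(opensubtitle_id)
    print srts.keys()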
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import time

@@ -8,6 +10,7 @@ import oxutils.cache
from oxutils.html import decodeHtml, stripTags
+import oxutils.net


def getNews(year, month, day):
    sections = [
        'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',

@@ -287,4 +290,4 @@ if __name__ == '__main__':
    print x
    '''
    # archiveIssues()
    archiveNews()
@@ -1,14 +1,11 @@
-# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
-
+# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from urllib import quote, urlencode
from urllib2 import URLError

from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from oxutils.normalize import normalizeImdbId

@@ -16,107 +13,106 @@ import oxutils

from torrent import Torrent

socket.setdefaulttimeout(10.0)

season_episode = re.compile("S..E..", re.IGNORECASE)


def _getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout):
    headers = cache.DEFAULT_HEADERS
    headers['Cookie'] = 'language=en_EN'
    return cache.getUrl(url, data, headers, timeout)

def _getUrlUnicode(url):
    return cache.getUrlUnicode(url, _getUrl=_getUrl)

def findMovies(query, max_results=10):
    results = []
    next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
    page_count = 1
    while next and page_count < 4:
        page_count += 1
        url = next[0]
        if not url.startswith('http'):
            if not url.startswith('/'):
                url = "/" + url
            url = "http://thepiratebay.org" + url
        data = _getUrlUnicode(url)
        regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/tor/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
        for row in re.compile(regexp, re.DOTALL).findall(data):
            torrentType = row[0]
            torrentLink = "http://thepiratebay.org" + row[1]
            torrentTitle = decodeHtml(row[2])
            # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
            if torrentType in ['201']:
                results.append((torrentTitle, torrentLink, ''))
            if len(results) >= max_results:
                return results
        next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
    return results

def findMovieByImdb(imdb):
    return findMovies("tt" + normalizeImdbId(imdb))

def getId(piratebayId):
    if piratebayId.startswith('http://torrents.thepiratebay.org/'):
        piratebayId = piratebayId.split('org/')[1]
    d = findRe(piratebayId, "tor/(\d+)")
    if d:
        piratebayId = d
    return piratebayId

def exists(piratebayId):
    piratebayId = getId(piratebayId)
    return oxutils.net.exists("http://thepiratebay.org/tor/%s" % piratebayId)

def getData(piratebayId):
    _key_map = {
        'spoken language(s)': u'language',
        'texted language(s)': u'subtitle language',
        'by': u'uploader',
        'leechers': 'leecher',
        'seeders': 'seeder',
    }
    piratebayId = getId(piratebayId)
    torrent = dict()
    torrent[u'id'] = piratebayId
    torrent[u'domain'] = 'thepiratebay.org'
    torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId

    data = _getUrlUnicode(torrent['comment_link'])
    torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
    if not torrent[u'title']:
        return None
    torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
    title = quote(torrent['title'].encode('utf-8'))
    torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
    for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decodeHtml(stripTags(d[1].strip()))
        torrent[key] = value
    torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
    if torrent[u'description']:
        torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
    t = _getUrl(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = getTorrentInfo(t)
    return torrent

class Thepiratebay(Torrent):
    '''
    >>> Thepiratebay('123')
    {}

    >>> Thepiratebay('3951349')['infohash']
    '4e84415d36ed7b54066160c05a0b0f061898d12b'
    '''
    def __init__(self, piratebayId):
        self.data = getData(piratebayId)
        if not self.data:
            return
        Torrent.__init__(self)
        published = self.data['uploaded']
        published = published.replace(' GMT', '').split(' +')[0]
        self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
@@ -1,39 +1,37 @@
-# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
-
+# vi:si:et:sw=4:sts=4:ts=4
from oxutils import intValue


class Torrent(dict):
    '''
    >>> Torrent()
    {'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
    '''
    _string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
        'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
    _int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
    _dict_keys = ('torrent_info', )
    _list_keys = ()
    data = {'torrent_info': {}}

    def __init__(self):
        for key in self._string_keys:
            self[key] = self.data.get(key, u'')
        for key in self._dict_keys:
            self[key] = self.data.get(key, {})
        for key in self._list_keys:
            self[key] = self.data.get(key, [])
        for key in self._int_keys:
            value = self.data.get(key, -1)
            if not isinstance(value, int):
                value = int(intValue(value))
            self[key] = value
        self['infohash'] = self.data['torrent_info'].get('hash', '')
        self['size'] = self.data['torrent_info'].get('size', -1)
        self['announce'] = self.data['torrent_info'].get('announce', '')
        if 'files' in self.data['torrent_info']:
            self['files'] = len(self.data['torrent_info']['files'])
        else:
            self['files'] = 1
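Torrent is a dict subclass that backends fill in by assigning self.data before calling Torrent.__init__, exactly as Mininova and Thepiratebay do above. A minimal sketch of a new backend in the same pattern (hypothetical site, illustration only):

class Example(Torrent):
    def __init__(self, exampleId):
        # a real backend would scrape its site here, as the
        # getData() helpers in mininova.py and thepiratebay.py do
        self.data = {'id': exampleId, 'title': u'Example',
                     'torrent_info': {'hash': 'abc', 'size': 123}}
        Torrent.__init__(self)

t = Example(u'1')
print t['infohash'], t['size'], t['files']   # -> abc 123 1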
102  ox/wikipedia.py
@@ -1,72 +1,72 @@
-# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
from urllib import urlencode

import simplejson
from oxutils.cache import getUrl, getUrlUnicode
from oxutils import findRe, decodeHtml


def getMovieId(title, director='', year=''):
    query = '"%s" film %s %s' % (title, director, year)
    result = find(query, 1)
    if result:
        return result[0][1]
    return ''

def getUrlByImdb(imdbId):
    query = '"imdb_id = %s"'% imdbId
    result = find(query)
    if result:
        url = result[0][1]
        return url
    if str(imdbId).startswith('0'):
        imdbId = imdbId[1:]
        return getUrlByImdb(imdbId)

def getUrlByAmbId(amg_id):
    query = '"amg_id = %s"'% amg_id
    result = find(query)
    if result:
        url = result[0][1]
        return url
    return ''

def getWikiData(wikipediaUrl):
    title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
    url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title
    html = getUrlUnicode(url)
    data = decodeHtml(findRe(html, "<textarea.*?>(.*?)</textarea>"))
    return data

def getMovieData(wikipediaUrl):
    data = getWikiData(wikipediaUrl)
    filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''')
    filmbox = {}
    for row in filmbox_data.strip().split('|'):
        d = row.split('=')
        if len(d) == 2:
            key = d[0].strip()
            value = d[1].strip()
            filmbox[key] = value
    return filmbox

def getAmgId(wikipediaUrl):
    data = getMovieData(wikipediaUrl)
    return data.get('amg_id', '')

def find(query, max_results=10):
    query = {'action': 'query', 'list':'search', 'format': 'json',
             'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
    url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
    data = getUrl(url)
    if not data:
        data = getUrl(url, timeout=0)
    result = simplejson.loads(data)
    results = []
    for r in result['query']['search']:
        title = r['title']
        url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
        results.append((title, url, ''))
    return results
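A short sketch tying the helpers above together (hypothetical imdb id):

import ox.wikipedia as wikipedia

url = wikipedia.getUrlByImdb('0133093')    # wikipedia page for an imdb id
if url:
    filmbox = wikipedia.getMovieData(url)  # {{Infobox Film}} key/value pairs
    print filmbox.get('director', '')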
@@ -1,6 +1,5 @@
-# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
from urllib import quote
import xml.etree.ElementTree as ET

@@ -8,49 +7,50 @@ import feedparser
from oxutils.cache import getUrl
from oxutils import findString


def getVideoUrl(youtubeId, format='mp4'):
    url = 'http://www.youtube.com/api2_rest?method=youtube.videos.get_video_token&video_id=' + youtubeId
    data = getUrl(url)
    xml = ET.fromstring(data)
    youtubeKey = xml.find('t').text
    if format == 'mp4':
        fmt=18
        url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s"%(youtubeId, youtubeKey, fmt)
    else:
        url = "http://youtube.com/get_video.php?video_id=%s&t=%s"%(youtubeId, youtubeKey)
    return url

def getMovieInfo(youtubeId):
    url = "http://gdata.youtube.com/feeds/api/videos/%s " % youtubeId
    data = getUrl(url)
    fd = feedparser.parse(data)
    return getInfoFromAtom(fd.entries[0])

def getInfoFromAtom(entry):
    info = dict()
    info['title'] = entry['title']
    info['description'] = entry['description']
    info['author'] = entry['author']
    info['published'] = entry['published_parsed']
    info['keywords'] = entry['media_keywords'].split(', ')
    info['url'] = entry['links'][0]['href']
    info['id'] = findString(info['url'], "/watch?v=")
    info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id']
    info['flv'] = getVideoUrl(info['id'], 'flv')
    info['mp4'] = getVideoUrl(info['id'], 'mp4')
    info['embed'] = '''<object width="425" height="355"><param name="movie" value="http://www.youtube.com/v/%s&hl=en"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/%s&hl=en" type="application/x-shockwave-flash" wmode="transparent" width="425" height="355"></embed></object>''' % (info['id'], info['id'])
    return info

def find(query, max_results=10, offset=1, orderBy='relevance'):
    query = quote(query)
    url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s"%(query, orderBy, offset, max_results)
    data = getUrl(url)
    fd = feedparser.parse(data)
    videos = []
    for entry in fd.entries:
        v = getInfoFromAtom(entry)
        videos.append(v)
        if len(videos) >= max_results:
            return videos
    return videos
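And a sketch for the youtube helpers (hypothetical video id; assumes the file is ox/youtube.py like its siblings):

import ox.youtube as youtube

info = youtube.getMovieInfo('abc123XYZ89')   # hypothetical video id
print info['title'], info['mp4']
for v in youtube.find('jean-luc godard', max_results=3):
    print v['url']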
50  setup.py
@@ -1,33 +1,33 @@
#!/usr/bin/env python
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
from setuptools import setup, find_packages

import os

setup(
    name="ox",
    version="0.1",
    description="collection of scrapers for various websites",
    author="0x",
    author_email="code@0xdb.org",
    url="http://code.0xdb.org/ox",
    download_url="http://code.0xdb.org/ox/download",
    license="GPLv3",
    packages=find_packages(),
    zip_safe=False,
    install_requires=[
        'oxutils',
        'feedparser',
        'beautifulsoup',
    ],
    keywords = [
    ],
    classifiers = [
        'Development Status :: 3 - Alpha',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
)