vi:si:et:sw=4:sts=4:ts=4
This commit is contained in:
parent
8886cfe8d3
commit
4c14ce613d
16 changed files with 1088 additions and 1134 deletions
|
@ -1,7 +1,5 @@
|
||||||
# -*- Mode: Python; -*-
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
# vi:si:et:sw=2:sts=2:ts=2
|
|
||||||
# encoding: utf-8
|
# encoding: utf-8
|
||||||
|
|
||||||
__version__ = '0.1.0'
|
__version__ = '0.1.0'
|
||||||
|
|
||||||
import imdb
|
import imdb
|
||||||
|
|
|
@ -7,6 +7,7 @@ from oxutils.cache import getUrlUnicode
|
||||||
from oxutils.html import stripTags
|
from oxutils.html import stripTags
|
||||||
from oxutils.text import findRe, removeSpecialCharacters
|
from oxutils.text import findRe, removeSpecialCharacters
|
||||||
|
|
||||||
|
|
||||||
def getData(criterionId):
|
def getData(criterionId):
|
||||||
'''
|
'''
|
||||||
>>> getData(348)['imdbId']
|
>>> getData(348)['imdbId']
|
||||||
|
|
|
@ -1,19 +1,22 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
import re
|
import re
|
||||||
from urllib import unquote
|
from urllib import unquote
|
||||||
from oxutils.cache import getUrl
|
from oxutils.cache import getUrl
|
||||||
|
|
||||||
|
|
||||||
def getVideoUrl(url):
|
def getVideoUrl(url):
|
||||||
'''
|
'''
|
||||||
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
|
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
|
||||||
'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv?key=0a710ad6ffbfe980b1252569d16f957313399d0'
|
'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv?key=0a710ad6ffbfe980b1252569d16f957313399d0'
|
||||||
|
|
||||||
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
|
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
|
||||||
'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv?key=08a18365ca6962c5ff7526f69872c36813399d4'
|
'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv?key=08a18365ca6962c5ff7526f69872c36813399d4'
|
||||||
'''
|
'''
|
||||||
data = getUrl(url)
|
data = getUrl(url)
|
||||||
video = re.compile('''video", "(.*?)"''').findall(data)
|
video = re.compile('''video", "(.*?)"''').findall(data)
|
||||||
for v in video:
|
for v in video:
|
||||||
v = unquote(v).split('@@')[0]
|
v = unquote(v).split('@@')[0]
|
||||||
return "http://www.dailymotion.com" + v
|
return "http://www.dailymotion.com" + v
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
|
34
ox/google.py
34
ox/google.py
|
@ -1,6 +1,5 @@
|
||||||
# -*- Mode: Python; -*-
|
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# vi:si:et:sw=2:sts=2:ts=2
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import urllib
|
import urllib
|
||||||
|
@ -29,24 +28,23 @@ FIXME: how search depper than first page?
|
||||||
DEFAULT_MAX_RESULTS = 10
|
DEFAULT_MAX_RESULTS = 10
|
||||||
|
|
||||||
def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
|
def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
|
||||||
google_timeout=24*60*60
|
google_timeout=24*60*60
|
||||||
return oxutils.cache.getUrl(url, data, headers, google_timeout)
|
return oxutils.cache.getUrl(url, data, headers, google_timeout)
|
||||||
|
|
||||||
def quote_plus(s):
|
def quote_plus(s):
|
||||||
return urllib.quote_plus(s.encode('utf-8'))
|
return urllib.quote_plus(s.encode('utf-8'))
|
||||||
|
|
||||||
def find(query, max_results=DEFAULT_MAX_RESULTS):
|
def find(query, max_results=DEFAULT_MAX_RESULTS):
|
||||||
url = "http://www.google.com/search?q=%s" % quote_plus(query)
|
url = "http://www.google.com/search?q=%s" % quote_plus(query)
|
||||||
data = getUrl(url)
|
data = getUrl(url)
|
||||||
link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
|
link_re = r'<a href="(?P<url>[^"]*?)" class=l.*?>(?P<name>.*?)</a>' + \
|
||||||
r'.*?(?:<br>|<table.*?>)' + \
|
r'.*?(?:<br>|<table.*?>)' + \
|
||||||
r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
|
r'(?P<desc>.*?)' + '(?:<font color=#008000>|<a)'
|
||||||
results = []
|
results = []
|
||||||
for match in re.compile(link_re, re.DOTALL).finditer(data):
|
for match in re.compile(link_re, re.DOTALL).finditer(data):
|
||||||
(name, url, desc) = match.group('name', 'url', 'desc')
|
(name, url, desc) = match.group('name', 'url', 'desc')
|
||||||
results.append((stripTags(name), url, stripTags(desc)))
|
results.append((stripTags(name), url, stripTags(desc)))
|
||||||
if len(results) > max_results:
|
if len(results) > max_results:
|
||||||
results = results[:max_results]
|
results = results[:max_results]
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
|
1136
ox/imdb.py
1136
ox/imdb.py
File diff suppressed because it is too large
Load diff
|
@ -1,3 +1,5 @@
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
# encoding: utf-8
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import ox.imdb as imdb
|
import ox.imdb as imdb
|
||||||
|
@ -83,4 +85,4 @@ def archivePosters():
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
archivePosters()
|
archivePosters()
|
||||||
getMovieData('Brick', 'Rian Johnson')
|
getMovieData('Brick', 'Rian Johnson')
|
||||||
|
|
288
ox/itunes.py
288
ox/itunes.py
|
@ -1,3 +1,5 @@
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
# encoding: utf-8
|
||||||
import re
|
import re
|
||||||
import urllib
|
import urllib
|
||||||
|
|
||||||
|
@ -6,6 +8,7 @@ from oxutils.html import decodeHtml, stripTags
|
||||||
from oxutils.text import findRe
|
from oxutils.text import findRe
|
||||||
from oxutils.text import findString
|
from oxutils.text import findString
|
||||||
|
|
||||||
|
|
||||||
# to sniff itunes traffic, use something like
|
# to sniff itunes traffic, use something like
|
||||||
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
|
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
|
||||||
|
|
||||||
|
@ -22,162 +25,163 @@ ITUNES_HEADERS = {
|
||||||
}
|
}
|
||||||
|
|
||||||
def composeUrl(request, parameters):
|
def composeUrl(request, parameters):
|
||||||
if request == 'advancedSearch':
|
if request == 'advancedSearch':
|
||||||
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
|
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
|
||||||
if parameters['media'] == 'music':
|
if parameters['media'] == 'music':
|
||||||
url += urllib.urlencode({
|
url += urllib.urlencode({
|
||||||
'albumTerm': parameters['title'],
|
'albumTerm': parameters['title'],
|
||||||
'allArtistNames': parameters['artist'],
|
'allArtistNames': parameters['artist'],
|
||||||
'composerTerm': '',
|
'composerTerm': '',
|
||||||
'flavor': 0,
|
'flavor': 0,
|
||||||
'genreIndex': 1,
|
'genreIndex': 1,
|
||||||
'media': 'music',
|
'media': 'music',
|
||||||
'mediaType': 2,
|
'mediaType': 2,
|
||||||
'ringtone': 0,
|
'ringtone': 0,
|
||||||
'searchButton': 'submit',
|
'searchButton': 'submit',
|
||||||
'songTerm': ''
|
'songTerm': ''
|
||||||
})
|
})
|
||||||
elif parameters['media'] == 'movie':
|
elif parameters['media'] == 'movie':
|
||||||
url += urllib.urlencode({
|
url += urllib.urlencode({
|
||||||
'actorTerm': '',
|
'actorTerm': '',
|
||||||
'closedCaption': 0,
|
'closedCaption': 0,
|
||||||
'descriptionTerm': '',
|
'descriptionTerm': '',
|
||||||
'directorProducerName': parameters['director'],
|
'directorProducerName': parameters['director'],
|
||||||
'flavor': 0,
|
'flavor': 0,
|
||||||
'media': 'movie',
|
'media': 'movie',
|
||||||
'mediaType': 3,
|
'mediaType': 3,
|
||||||
'movieTerm': parameters['title'],
|
'movieTerm': parameters['title'],
|
||||||
'ratingIndex': 1,
|
'ratingIndex': 1,
|
||||||
'releaseYearTerm': '',
|
'releaseYearTerm': '',
|
||||||
'searchButton': 'submit'
|
'searchButton': 'submit'
|
||||||
})
|
})
|
||||||
elif request == 'viewAlbum':
|
elif request == 'viewAlbum':
|
||||||
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
|
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
|
||||||
elif request == 'viewMovie':
|
elif request == 'viewMovie':
|
||||||
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
|
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def parseXmlDict(xml):
|
def parseXmlDict(xml):
|
||||||
values = {}
|
values = {}
|
||||||
strings = xml.split('<key>')
|
strings = xml.split('<key>')
|
||||||
for string in strings:
|
for string in strings:
|
||||||
if string.find('</key>') != -1:
|
if string.find('</key>') != -1:
|
||||||
key = findRe(string, '(.*?)</key>')
|
key = findRe(string, '(.*?)</key>')
|
||||||
type = findRe(string, '</key><(.*?)>')
|
type = findRe(string, '</key><(.*?)>')
|
||||||
if type == 'true/':
|
if type == 'true/':
|
||||||
value = True
|
value = True
|
||||||
else:
|
else:
|
||||||
value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
|
value = findRe(string, '<%s>(.*?)</%s>' % (type, type))
|
||||||
if type == 'integer':
|
if type == 'integer':
|
||||||
value = int(value)
|
value = int(value)
|
||||||
elif type == 'string':
|
elif type == 'string':
|
||||||
value = decodeHtml(value)
|
value = decodeHtml(value)
|
||||||
values[key] = value
|
values[key] = value
|
||||||
return values
|
return values
|
||||||
|
|
||||||
def parseCast(xml, title):
|
def parseCast(xml, title):
|
||||||
list = []
|
list = []
|
||||||
try:
|
try:
|
||||||
strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
|
strings = findRe(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
|
||||||
strings.pop()
|
strings.pop()
|
||||||
for string in strings:
|
for string in strings:
|
||||||
list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
list.append(findRe(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||||
return list
|
return list
|
||||||
except:
|
except:
|
||||||
return list
|
return list
|
||||||
|
|
||||||
def parseMovies(xml, title):
|
def parseMovies(xml, title):
|
||||||
list = []
|
list = []
|
||||||
try:
|
try:
|
||||||
strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
|
strings = findRe(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
|
||||||
strings.pop()
|
strings.pop()
|
||||||
for string in strings:
|
for string in strings:
|
||||||
list.append({
|
list.append({
|
||||||
'id': findRe(string, 'viewMovie\?id=(.*?)&'),
|
'id': findRe(string, 'viewMovie\?id=(.*?)&'),
|
||||||
'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
|
'title': findRe(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
|
||||||
})
|
})
|
||||||
return list
|
return list
|
||||||
except:
|
except:
|
||||||
return list
|
return list
|
||||||
|
|
||||||
class ItunesAlbum:
|
class ItunesAlbum:
|
||||||
def __init__(self, id = '', title = '', artist = ''):
|
def __init__(self, id = '', title = '', artist = ''):
|
||||||
self.id = id
|
self.id = id
|
||||||
self.title = title
|
self.title = title
|
||||||
self.artist = artist
|
self.artist = artist
|
||||||
if not id:
|
if not id:
|
||||||
self.id = self.getId()
|
self.id = self.getId()
|
||||||
|
|
||||||
def getId(self):
|
def getId(self):
|
||||||
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
|
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
|
||||||
xml = getUrl(url, headers = ITUNES_HEADERS)
|
xml = getUrl(url, headers = ITUNES_HEADERS)
|
||||||
id = findRe(xml, 'viewAlbum\?id=(.*?)&')
|
id = findRe(xml, 'viewAlbum\?id=(.*?)&')
|
||||||
return id
|
return id
|
||||||
|
|
||||||
def getData(self):
|
def getData(self):
|
||||||
data = {'id': self.id}
|
data = {'id': self.id}
|
||||||
url = composeUrl('viewAlbum', {'id': self.id})
|
url = composeUrl('viewAlbum', {'id': self.id})
|
||||||
xml = getUrl(url, None, ITUNES_HEADERS)
|
xml = getUrl(url, None, ITUNES_HEADERS)
|
||||||
data['albumName'] = findRe(xml, '<B>(.*?)</B>')
|
data['albumName'] = findRe(xml, '<B>(.*?)</B>')
|
||||||
data['artistName'] = findRe(xml, '<b>(.*?)</b>')
|
data['artistName'] = findRe(xml, '<b>(.*?)</b>')
|
||||||
data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
|
data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
|
||||||
data['genre'] = findRe(xml, 'Genre:(.*?)<')
|
data['genre'] = findRe(xml, 'Genre:(.*?)<')
|
||||||
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
|
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
|
||||||
data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||||
data['tracks'] = []
|
data['tracks'] = []
|
||||||
strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
|
strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
|
||||||
for string in strings:
|
for string in strings:
|
||||||
data['tracks'].append(parseXmlDict(string))
|
data['tracks'].append(parseXmlDict(string))
|
||||||
data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
|
data['type'] = findRe(xml, '<key>listType</key><string>(.*?)<')
|
||||||
return data
|
return data
|
||||||
|
|
||||||
class ItunesMovie:
|
class ItunesMovie:
|
||||||
def __init__(self, id = '', title = '', director = ''):
|
def __init__(self, id = '', title = '', director = ''):
|
||||||
self.id = id
|
self.id = id
|
||||||
self.title = title
|
self.title = title
|
||||||
self.director = director
|
self.director = director
|
||||||
if not id:
|
if not id:
|
||||||
self.id = self.getId()
|
self.id = self.getId()
|
||||||
|
|
||||||
def getId(self):
|
def getId(self):
|
||||||
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
|
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
|
||||||
xml = getUrl(url, headers = ITUNES_HEADERS)
|
xml = getUrl(url, headers = ITUNES_HEADERS)
|
||||||
id = findRe(xml, 'viewMovie\?id=(.*?)&')
|
id = findRe(xml, 'viewMovie\?id=(.*?)&')
|
||||||
return id
|
return id
|
||||||
|
|
||||||
def getData(self):
|
def getData(self):
|
||||||
data = {'id': self.id}
|
data = {'id': self.id}
|
||||||
url = composeUrl('viewMovie', {'id': self.id})
|
url = composeUrl('viewMovie', {'id': self.id})
|
||||||
xml = getUrl(url, None, ITUNES_HEADERS)
|
xml = getUrl(url, None, ITUNES_HEADERS)
|
||||||
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
|
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
|
||||||
f.write(xml)
|
f.write(xml)
|
||||||
f.close()
|
f.close()
|
||||||
data['actors'] = parseCast(xml, 'actors')
|
data['actors'] = parseCast(xml, 'actors')
|
||||||
string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
|
string = findRe(xml, 'Average Rating:(.*?)</HBoxView>')
|
||||||
data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
|
data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
|
||||||
data['directors'] = parseCast(xml, 'directors')
|
data['directors'] = parseCast(xml, 'directors')
|
||||||
data['format'] = findRe(xml, 'Format:(.*?)<')
|
data['format'] = findRe(xml, 'Format:(.*?)<')
|
||||||
data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
|
data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
|
||||||
data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||||
data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
|
data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
|
||||||
data['producers'] = parseCast(xml, 'producers')
|
data['producers'] = parseCast(xml, 'producers')
|
||||||
data['rated'] = findRe(xml, 'Rated(.*?)<')
|
data['rated'] = findRe(xml, 'Rated(.*?)<')
|
||||||
data['relatedMovies'] = parseMovies(xml, 'related movies')
|
data['relatedMovies'] = parseMovies(xml, 'related movies')
|
||||||
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
|
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
|
||||||
data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
|
data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
|
||||||
data['screenwriters'] = parseCast(xml, 'screenwriters')
|
data['screenwriters'] = parseCast(xml, 'screenwriters')
|
||||||
data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
|
data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
|
||||||
data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
|
data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
|
||||||
return data
|
return data
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import simplejson
|
import simplejson
|
||||||
data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
|
data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
|
||||||
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
|
||||||
data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
|
|
||||||
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
|
||||||
for v in data['relatedMovies']:
|
|
||||||
data = ItunesMovie(id = v['id']).getData()
|
|
||||||
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
||||||
data = ItunesMovie(id='272960052').getData()
|
data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
|
||||||
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
||||||
|
for v in data['relatedMovies']:
|
||||||
|
data = ItunesMovie(id = v['id']).getData()
|
||||||
|
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
||||||
|
data = ItunesMovie(id='272960052').getData()
|
||||||
|
print simplejson.dumps(data, sort_keys = True, indent = 4)
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
from oxutils.cache import getUrl
|
from oxutils.cache import getUrl
|
||||||
from oxutils.html import decodeHtml
|
from oxutils.html import decodeHtml
|
||||||
from oxutils.text import findRe
|
from oxutils.text import findRe
|
||||||
|
@ -16,4 +18,4 @@ def getLyrics(title, artist):
|
||||||
return lyrics
|
return lyrics
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print getLyrics('Election Day', 'Arcadia')
|
print getLyrics('Election Day', 'Arcadia')
|
||||||
|
|
193
ox/mininova.py
193
ox/mininova.py
|
@ -1,7 +1,5 @@
|
||||||
# -*- Mode: Python; -*-
|
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# vi:si:et:sw=2:sts=2:ts=2
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import re
|
import re
|
||||||
import socket
|
import socket
|
||||||
|
@ -14,116 +12,115 @@ import oxutils
|
||||||
|
|
||||||
from torrent import Torrent
|
from torrent import Torrent
|
||||||
|
|
||||||
socket.setdefaulttimeout(10.0)
|
|
||||||
|
|
||||||
def _parseResultsPage(data, max_results=10):
|
def _parseResultsPage(data, max_results=10):
|
||||||
results=[]
|
results=[]
|
||||||
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
|
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
|
||||||
for row in re.compile(regexp, re.DOTALL).findall(data):
|
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||||
torrentDate = row[0]
|
torrentDate = row[0]
|
||||||
torrentExtra = row[1]
|
torrentExtra = row[1]
|
||||||
torrentId = row[2]
|
torrentId = row[2]
|
||||||
torrentTitle = decodeHtml(row[3]).strip()
|
torrentTitle = decodeHtml(row[3]).strip()
|
||||||
torrentLink = "http://www.mininova.org/tor/" + torrentId
|
torrentLink = "http://www.mininova.org/tor/" + torrentId
|
||||||
privateTracker = 'priv.gif' in torrentExtra
|
privateTracker = 'priv.gif' in torrentExtra
|
||||||
if not privateTracker:
|
if not privateTracker:
|
||||||
results.append((torrentTitle, torrentLink, ''))
|
results.append((torrentTitle, torrentLink, ''))
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def findMovie(query, max_results=10):
|
def findMovie(query, max_results=10):
|
||||||
'''search for torrents on mininova
|
'''search for torrents on mininova
|
||||||
'''
|
'''
|
||||||
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
|
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
|
||||||
data = getUrlUnicode(url)
|
data = getUrlUnicode(url)
|
||||||
return _parseResultsPage(data, max_results)
|
return _parseResultsPage(data, max_results)
|
||||||
|
|
||||||
def findMovieByImdb(imdbId):
|
def findMovieByImdb(imdbId):
|
||||||
'''find torrents on mininova for a given imdb id
|
'''find torrents on mininova for a given imdb id
|
||||||
'''
|
'''
|
||||||
results = []
|
results = []
|
||||||
imdbId = normalizeImdbId(imdbId)
|
imdbId = normalizeImdbId(imdbId)
|
||||||
data = getUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
|
data = getUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
|
||||||
return _parseResultsPage(data)
|
return _parseResultsPage(data)
|
||||||
|
|
||||||
def getId(mininovaId):
|
def getId(mininovaId):
|
||||||
mininovaId = unicode(mininovaId)
|
mininovaId = unicode(mininovaId)
|
||||||
d = findRe(mininovaId, "/(\d+)")
|
d = findRe(mininovaId, "/(\d+)")
|
||||||
if d:
|
if d:
|
||||||
return d
|
return d
|
||||||
mininovaId = mininovaId.split('/')
|
mininovaId = mininovaId.split('/')
|
||||||
if len(mininovaId) == 1:
|
if len(mininovaId) == 1:
|
||||||
return mininovaId[0]
|
return mininovaId[0]
|
||||||
else:
|
else:
|
||||||
return mininovaId[-1]
|
return mininovaId[-1]
|
||||||
|
|
||||||
def exists(mininovaId):
|
def exists(mininovaId):
|
||||||
mininovaId = getId(mininovaId)
|
mininovaId = getId(mininovaId)
|
||||||
data = oxutils.net.getUrl("http://www.mininova.org/tor/%s" % mininovaId)
|
data = oxutils.net.getUrl("http://www.mininova.org/tor/%s" % mininovaId)
|
||||||
if not data or 'Torrent not found...' in data:
|
if not data or 'Torrent not found...' in data:
|
||||||
return False
|
return False
|
||||||
if 'tracker</a> of this torrent requires registration.' in data:
|
if 'tracker</a> of this torrent requires registration.' in data:
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def getData(mininovaId):
|
def getData(mininovaId):
|
||||||
_key_map = {
|
_key_map = {
|
||||||
'by': u'uploader',
|
'by': u'uploader',
|
||||||
}
|
}
|
||||||
mininovaId = getId(mininovaId)
|
mininovaId = getId(mininovaId)
|
||||||
torrent = dict()
|
torrent = dict()
|
||||||
torrent[u'id'] = mininovaId
|
torrent[u'id'] = mininovaId
|
||||||
torrent[u'domain'] = 'mininova.org'
|
torrent[u'domain'] = 'mininova.org'
|
||||||
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
|
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
|
||||||
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
|
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
|
||||||
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
|
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
|
||||||
|
|
||||||
data = getUrlUnicode(torrent['comment_link']) + getUrlUnicode(torrent['details_link'])
|
data = getUrlUnicode(torrent['comment_link']) + getUrlUnicode(torrent['details_link'])
|
||||||
if '<h1>Torrent not found...</h1>' in data:
|
if '<h1>Torrent not found...</h1>' in data:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
|
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
|
||||||
key = d[0].lower().strip()
|
key = d[0].lower().strip()
|
||||||
key = _key_map.get(key, key)
|
key = _key_map.get(key, key)
|
||||||
value = decodeHtml(stripTags(d[1].strip()))
|
value = decodeHtml(stripTags(d[1].strip()))
|
||||||
torrent[key] = value
|
torrent[key] = value
|
||||||
|
|
||||||
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
|
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
|
||||||
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
|
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
|
||||||
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
|
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
|
||||||
if torrent['description']:
|
if torrent['description']:
|
||||||
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
|
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
|
||||||
t = getUrl(torrent[u'torrent_link'])
|
t = getUrl(torrent[u'torrent_link'])
|
||||||
torrent[u'torrent_info'] = getTorrentInfo(t)
|
torrent[u'torrent_info'] = getTorrentInfo(t)
|
||||||
return torrent
|
return torrent
|
||||||
|
|
||||||
class Mininova(Torrent):
|
class Mininova(Torrent):
|
||||||
'''
|
'''
|
||||||
>>> Mininova('123')
|
>>> Mininova('123')
|
||||||
{}
|
{}
|
||||||
>>> Mininova('1072195')['infohash']
|
>>> Mininova('1072195')['infohash']
|
||||||
'72dfa59d2338e4a48c78cec9de25964cddb64104'
|
'72dfa59d2338e4a48c78cec9de25964cddb64104'
|
||||||
'''
|
'''
|
||||||
def __init__(self, mininovaId):
|
def __init__(self, mininovaId):
|
||||||
self.data = getData(mininovaId)
|
self.data = getData(mininovaId)
|
||||||
if not self.data:
|
if not self.data:
|
||||||
return
|
return
|
||||||
Torrent.__init__(self)
|
Torrent.__init__(self)
|
||||||
ratio = self.data['share ratio'].split(',')
|
ratio = self.data['share ratio'].split(',')
|
||||||
self['seeder'] = -1
|
self['seeder'] = -1
|
||||||
self['leecher'] = -1
|
self['leecher'] = -1
|
||||||
if len(ratio) == 2:
|
if len(ratio) == 2:
|
||||||
val = intValue(ratio[0].replace(',','').strip())
|
val = intValue(ratio[0].replace(',','').strip())
|
||||||
if val:
|
if val:
|
||||||
self['seeder'] = int(val)
|
self['seeder'] = int(val)
|
||||||
val = intValue(ratio[1].replace(',','').strip())
|
val = intValue(ratio[1].replace(',','').strip())
|
||||||
if val:
|
if val:
|
||||||
self['leecher'] = int(val)
|
self['leecher'] = int(val)
|
||||||
val = intValue(self.data['downloads'].replace(',','').strip())
|
val = intValue(self.data['downloads'].replace(',','').strip())
|
||||||
if val:
|
if val:
|
||||||
self['downloaded'] = int(val)
|
self['downloaded'] = int(val)
|
||||||
else:
|
else:
|
||||||
self['downloaded'] = -1
|
self['downloaded'] = -1
|
||||||
published = self.data['added on']
|
published = self.data['added on']
|
||||||
published = published.split(' +')[0]
|
published = published.split(' +')[0]
|
||||||
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")
|
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# vi:si:et:sw=2:sts=2:ts=2
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import feedparser
|
import feedparser
|
||||||
|
@ -9,37 +8,34 @@ import oxutils
|
||||||
from oxutils.lang import langCode2To3, langTo3Code
|
from oxutils.lang import langCode2To3, langTo3Code
|
||||||
|
|
||||||
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
|
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
|
||||||
if len(language) == 2:
|
if len(language) == 2:
|
||||||
language = langCode2To3(language)
|
language = langCode2To3(language)
|
||||||
elif len(language) != 3:
|
elif len(language) != 3:
|
||||||
language = langTo3Code(language)
|
language = langTo3Code(language)
|
||||||
url = "http://www.opensubtitles.org/en/search/"
|
url = "http://www.opensubtitles.org/en/search/"
|
||||||
if language:
|
if language:
|
||||||
url += "sublanguageid-%s/" % language
|
url += "sublanguageid-%s/" % language
|
||||||
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
|
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
|
||||||
data = getUrl(url)
|
data = getUrl(url)
|
||||||
if "title>opensubtitles.com - search results</title" in data:
|
if "title>opensubtitles.com - search results</title" in data:
|
||||||
fd = feedparser.parse(data)
|
fd = feedparser.parse(data)
|
||||||
opensubtitleId = None
|
opensubtitleId = None
|
||||||
print url
|
if fd.entries:
|
||||||
if fd.entries:
|
link = fd.entries[0]['links'][0]['href']
|
||||||
link = fd.entries[0]['links'][0]['href']
|
opensubtitleId = re.compile('subtitles/(.*?)/').findall(link)
|
||||||
print link
|
if opensubtitleId:
|
||||||
opensubtitleId = re.compile('subtitles/(.*?)/').findall(link)
|
opensubtitleId = opensubtitleId[0]
|
||||||
if opensubtitleId:
|
else:
|
||||||
opensubtitleId = opensubtitleId[0]
|
opensubtitleId = oxutils.findRe(data, '/en/subtitles/(.*?)/')
|
||||||
else:
|
return opensubtitleId
|
||||||
opensubtitleId = oxutils.findRe(data, '/en/subtitles/(.*?)/')
|
|
||||||
return opensubtitleId
|
|
||||||
|
|
||||||
def downloadSubtitleById(opensubtitle_id):
|
def downloadSubtitleById(opensubtitle_id):
|
||||||
srts = {}
|
srts = {}
|
||||||
data = getUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
|
data = getUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
|
||||||
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
|
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
|
||||||
for f in re.compile(reg_exp, re.DOTALL).findall(data):
|
for f in re.compile(reg_exp, re.DOTALL).findall(data):
|
||||||
name = oxutils.stripTags(f[1]).split('\n')[0]
|
name = oxutils.stripTags(f[1]).split('\n')[0]
|
||||||
url = "http://www.opensubtitles.com%s" % f[0]
|
url = "http://www.opensubtitles.com%s" % f[0]
|
||||||
srts[name] = getUrlUnicode(url)
|
srts[name] = getUrlUnicode(url)
|
||||||
return srts
|
return srts
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
@ -8,6 +10,7 @@ import oxutils.cache
|
||||||
from oxutils.html import decodeHtml, stripTags
|
from oxutils.html import decodeHtml, stripTags
|
||||||
import oxutils.net
|
import oxutils.net
|
||||||
|
|
||||||
|
|
||||||
def getNews(year, month, day):
|
def getNews(year, month, day):
|
||||||
sections = [
|
sections = [
|
||||||
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
|
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
|
||||||
|
@ -287,4 +290,4 @@ if __name__ == '__main__':
|
||||||
print x
|
print x
|
||||||
'''
|
'''
|
||||||
# archiveIssues()
|
# archiveIssues()
|
||||||
archiveNews()
|
archiveNews()
|
||||||
|
|
|
@ -1,14 +1,11 @@
|
||||||
# -*- Mode: Python; -*-
|
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# vi:si:et:sw=2:sts=2:ts=2
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import re
|
import re
|
||||||
import socket
|
import socket
|
||||||
from urllib import quote, urlencode
|
from urllib import quote, urlencode
|
||||||
from urllib2 import URLError
|
from urllib2 import URLError
|
||||||
|
|
||||||
|
|
||||||
from oxutils.cache import getUrl, getUrlUnicode
|
from oxutils.cache import getUrl, getUrlUnicode
|
||||||
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
|
from oxutils import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
|
||||||
from oxutils.normalize import normalizeImdbId
|
from oxutils.normalize import normalizeImdbId
|
||||||
|
@ -16,107 +13,106 @@ import oxutils
|
||||||
|
|
||||||
from torrent import Torrent
|
from torrent import Torrent
|
||||||
|
|
||||||
socket.setdefaulttimeout(10.0)
|
|
||||||
|
|
||||||
season_episode = re.compile("S..E..", re.IGNORECASE)
|
season_episode = re.compile("S..E..", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
def _getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout):
|
def _getUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout):
|
||||||
headers = cache.DEFAULT_HEADERS
|
headers = cache.DEFAULT_HEADERS
|
||||||
headers['Cookie'] = 'language=en_EN'
|
headers['Cookie'] = 'language=en_EN'
|
||||||
return cache.getUrl(url, data, headers, timeout)
|
return cache.getUrl(url, data, headers, timeout)
|
||||||
|
|
||||||
def _getUrlUnicode(url):
|
def _getUrlUnicode(url):
|
||||||
return cache.getUrlUnicode(url, _getUrl=_getUrl)
|
return cache.getUrlUnicode(url, _getUrl=_getUrl)
|
||||||
|
|
||||||
def findMovies(query, max_results=10):
|
def findMovies(query, max_results=10):
|
||||||
results = []
|
results = []
|
||||||
next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
|
next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
|
||||||
page_count = 1
|
page_count = 1
|
||||||
while next and page_count < 4:
|
while next and page_count < 4:
|
||||||
page_count += 1
|
page_count += 1
|
||||||
url = next[0]
|
url = next[0]
|
||||||
if not url.startswith('http'):
|
if not url.startswith('http'):
|
||||||
if not url.startswith('/'):
|
if not url.startswith('/'):
|
||||||
url = "/" + url
|
url = "/" + url
|
||||||
url = "http://thepiratebay.org" + url
|
url = "http://thepiratebay.org" + url
|
||||||
data = _getUrlUnicode(url)
|
data = _getUrlUnicode(url)
|
||||||
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/tor/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
|
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/tor/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
|
||||||
for row in re.compile(regexp, re.DOTALL).findall(data):
|
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||||
torrentType = row[0]
|
torrentType = row[0]
|
||||||
torrentLink = "http://thepiratebay.org" + row[1]
|
torrentLink = "http://thepiratebay.org" + row[1]
|
||||||
torrentTitle = decodeHtml(row[2])
|
torrentTitle = decodeHtml(row[2])
|
||||||
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
|
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
|
||||||
if torrentType in ['201']:
|
if torrentType in ['201']:
|
||||||
results.append((torrentTitle, torrentLink, ''))
|
results.append((torrentTitle, torrentLink, ''))
|
||||||
if len(results) >= max_results:
|
if len(results) >= max_results:
|
||||||
return results
|
return results
|
||||||
next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
|
next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def findMovieByImdb(imdb):
|
def findMovieByImdb(imdb):
|
||||||
return findMovies("tt" + normalizeImdbId(imdb))
|
return findMovies("tt" + normalizeImdbId(imdb))
|
||||||
|
|
||||||
def getId(piratebayId):
|
def getId(piratebayId):
|
||||||
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
|
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
|
||||||
piratebayId = piratebayId.split('org/')[1]
|
piratebayId = piratebayId.split('org/')[1]
|
||||||
d = findRe(piratebayId, "tor/(\d+)")
|
d = findRe(piratebayId, "tor/(\d+)")
|
||||||
if d:
|
if d:
|
||||||
piratebayId = d
|
piratebayId = d
|
||||||
return piratebayId
|
return piratebayId
|
||||||
|
|
||||||
def exists(piratebayId):
|
def exists(piratebayId):
|
||||||
piratebayId = getId(piratebayId)
|
piratebayId = getId(piratebayId)
|
||||||
return oxutils.net.exists("http://thepiratebay.org/tor/%s" % piratebayId)
|
return oxutils.net.exists("http://thepiratebay.org/tor/%s" % piratebayId)
|
||||||
|
|
||||||
def getData(piratebayId):
|
def getData(piratebayId):
|
||||||
_key_map = {
|
_key_map = {
|
||||||
'spoken language(s)': u'language',
|
'spoken language(s)': u'language',
|
||||||
'texted language(s)': u'subtitle language',
|
'texted language(s)': u'subtitle language',
|
||||||
'by': u'uploader',
|
'by': u'uploader',
|
||||||
'leechers': 'leecher',
|
'leechers': 'leecher',
|
||||||
'seeders': 'seeder',
|
'seeders': 'seeder',
|
||||||
}
|
}
|
||||||
piratebayId = getId(piratebayId)
|
piratebayId = getId(piratebayId)
|
||||||
torrent = dict()
|
torrent = dict()
|
||||||
torrent[u'id'] = piratebayId
|
torrent[u'id'] = piratebayId
|
||||||
torrent[u'domain'] = 'thepiratebay.org'
|
torrent[u'domain'] = 'thepiratebay.org'
|
||||||
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
|
torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
|
||||||
|
|
||||||
data = _getUrlUnicode(torrent['comment_link'])
|
data = _getUrlUnicode(torrent['comment_link'])
|
||||||
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
|
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
|
||||||
if not torrent[u'title']:
|
if not torrent[u'title']:
|
||||||
return None
|
return None
|
||||||
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
|
torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
|
||||||
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
|
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
|
||||||
title = quote(torrent['title'].encode('utf-8'))
|
title = quote(torrent['title'].encode('utf-8'))
|
||||||
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
|
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
|
||||||
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
|
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
|
||||||
key = d[0].lower().strip()
|
key = d[0].lower().strip()
|
||||||
key = _key_map.get(key, key)
|
key = _key_map.get(key, key)
|
||||||
value = decodeHtml(stripTags(d[1].strip()))
|
value = decodeHtml(stripTags(d[1].strip()))
|
||||||
torrent[key] = value
|
torrent[key] = value
|
||||||
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
|
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
|
||||||
if torrent[u'description']:
|
if torrent[u'description']:
|
||||||
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
|
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
|
||||||
t = _getUrl(torrent[u'torrent_link'])
|
t = _getUrl(torrent[u'torrent_link'])
|
||||||
torrent[u'torrent_info'] = getTorrentInfo(t)
|
torrent[u'torrent_info'] = getTorrentInfo(t)
|
||||||
return torrent
|
return torrent
|
||||||
|
|
||||||
class Thepiratebay(Torrent):
|
class Thepiratebay(Torrent):
|
||||||
'''
|
'''
|
||||||
>>> Thepiratebay('123')
|
>>> Thepiratebay('123')
|
||||||
{}
|
{}
|
||||||
|
|
||||||
>>> Thepiratebay('3951349')['infohash']
|
>>> Thepiratebay('3951349')['infohash']
|
||||||
'4e84415d36ed7b54066160c05a0b0f061898d12b'
|
'4e84415d36ed7b54066160c05a0b0f061898d12b'
|
||||||
'''
|
'''
|
||||||
def __init__(self, piratebayId):
|
def __init__(self, piratebayId):
|
||||||
self.data = getData(piratebayId)
|
self.data = getData(piratebayId)
|
||||||
if not self.data:
|
if not self.data:
|
||||||
return
|
return
|
||||||
Torrent.__init__(self)
|
Torrent.__init__(self)
|
||||||
published = self.data['uploaded']
|
published = self.data['uploaded']
|
||||||
published = published.replace(' GMT', '').split(' +')[0]
|
published = published.replace(' GMT', '').split(' +')[0]
|
||||||
self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
|
self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
|
|
@ -1,39 +1,37 @@
|
||||||
# -*- Mode: Python; -*-
|
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# vi:si:et:sw=2:sts=2:ts=2
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
|
|
||||||
from oxutils import intValue
|
from oxutils import intValue
|
||||||
|
|
||||||
|
|
||||||
class Torrent(dict):
|
class Torrent(dict):
|
||||||
'''
|
'''
|
||||||
>>> Torrent()
|
>>> Torrent()
|
||||||
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
|
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
|
||||||
'''
|
'''
|
||||||
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
|
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
|
||||||
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
|
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
|
||||||
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
|
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
|
||||||
_dict_keys = ('torrent_info', )
|
_dict_keys = ('torrent_info', )
|
||||||
_list_keys = ()
|
_list_keys = ()
|
||||||
data = {'torrent_info': {}}
|
data = {'torrent_info': {}}
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
for key in self._string_keys:
|
for key in self._string_keys:
|
||||||
self[key] = self.data.get(key, u'')
|
self[key] = self.data.get(key, u'')
|
||||||
for key in self._dict_keys:
|
for key in self._dict_keys:
|
||||||
self[key] = self.data.get(key, {})
|
self[key] = self.data.get(key, {})
|
||||||
for key in self._list_keys:
|
for key in self._list_keys:
|
||||||
self[key] = self.data.get(key, [])
|
self[key] = self.data.get(key, [])
|
||||||
for key in self._int_keys:
|
for key in self._int_keys:
|
||||||
value = self.data.get(key, -1)
|
value = self.data.get(key, -1)
|
||||||
if not isinstance(value, int):
|
if not isinstance(value, int):
|
||||||
value = int(intValue(value))
|
value = int(intValue(value))
|
||||||
self[key] = value
|
self[key] = value
|
||||||
self['infohash'] = self.data['torrent_info'].get('hash', '')
|
self['infohash'] = self.data['torrent_info'].get('hash', '')
|
||||||
self['size'] = self.data['torrent_info'].get('size', -1)
|
self['size'] = self.data['torrent_info'].get('size', -1)
|
||||||
self['announce'] = self.data['torrent_info'].get('announce', '')
|
self['announce'] = self.data['torrent_info'].get('announce', '')
|
||||||
if 'files' in self.data['torrent_info']:
|
if 'files' in self.data['torrent_info']:
|
||||||
self['files'] = len(self.data['torrent_info']['files'])
|
self['files'] = len(self.data['torrent_info']['files'])
|
||||||
else:
|
else:
|
||||||
self['files'] = 1
|
self['files'] = 1
|
||||||
|
|
||||||
|
|
102
ox/wikipedia.py
102
ox/wikipedia.py
|
@ -1,72 +1,72 @@
|
||||||
# -*- Mode: Python; -*-
|
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# vi:si:et:sw=2:sts=2:ts=2
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
|
|
||||||
import simplejson
|
import simplejson
|
||||||
from oxutils.cache import getUrl, getUrlUnicode
|
from oxutils.cache import getUrl, getUrlUnicode
|
||||||
from oxutils import findRe, decodeHtml
|
from oxutils import findRe, decodeHtml
|
||||||
|
|
||||||
|
|
||||||
def getMovieId(title, director='', year=''):
|
def getMovieId(title, director='', year=''):
|
||||||
query = '"%s" film %s %s' % (title, director, year)
|
query = '"%s" film %s %s' % (title, director, year)
|
||||||
result = find(query, 1)
|
result = find(query, 1)
|
||||||
if result:
|
if result:
|
||||||
return result[0][1]
|
return result[0][1]
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getUrlByImdb(imdbId):
|
def getUrlByImdb(imdbId):
|
||||||
query = '"imdb_id = %s"'% imdbId
|
query = '"imdb_id = %s"'% imdbId
|
||||||
result = find(query)
|
result = find(query)
|
||||||
if result:
|
if result:
|
||||||
url = result[0][1]
|
url = result[0][1]
|
||||||
return url
|
return url
|
||||||
if str(imdbId).startswith('0'):
|
if str(imdbId).startswith('0'):
|
||||||
imdbId = imdbId[1:]
|
imdbId = imdbId[1:]
|
||||||
return getUrlByImdb(imdbId)
|
return getUrlByImdb(imdbId)
|
||||||
|
|
||||||
def getUrlByAmbId(amg_id):
|
def getUrlByAmbId(amg_id):
|
||||||
query = '"amg_id = %s"'% amg_id
|
query = '"amg_id = %s"'% amg_id
|
||||||
result = find(query)
|
result = find(query)
|
||||||
if result:
|
if result:
|
||||||
url = result[0][1]
|
url = result[0][1]
|
||||||
return url
|
return url
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getWikiData(wikipediaUrl):
|
def getWikiData(wikipediaUrl):
|
||||||
title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
|
title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
|
||||||
url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title
|
url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title
|
||||||
html = getUrlUnicode(url)
|
html = getUrlUnicode(url)
|
||||||
data = decodeHtml(findRe(html, "<textarea.*?>(.*?)</textarea>"))
|
data = decodeHtml(findRe(html, "<textarea.*?>(.*?)</textarea>"))
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def getMovieData(wikipediaUrl):
|
def getMovieData(wikipediaUrl):
|
||||||
data = getWikiData(wikipediaUrl)
|
data = getWikiData(wikipediaUrl)
|
||||||
filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''')
|
filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''')
|
||||||
filmbox = {}
|
filmbox = {}
|
||||||
for row in filmbox_data.strip().split('|'):
|
for row in filmbox_data.strip().split('|'):
|
||||||
d = row.split('=')
|
d = row.split('=')
|
||||||
if len(d) == 2:
|
if len(d) == 2:
|
||||||
key = d[0].strip()
|
key = d[0].strip()
|
||||||
value = d[1].strip()
|
value = d[1].strip()
|
||||||
filmbox[key] = value
|
filmbox[key] = value
|
||||||
return filmbox
|
return filmbox
|
||||||
|
|
||||||
def getAmgId(wikipediaUrl):
|
def getAmgId(wikipediaUrl):
|
||||||
data = getMovieData(wikipediaUrl)
|
data = getMovieData(wikipediaUrl)
|
||||||
return data.get('amg_id', '')
|
return data.get('amg_id', '')
|
||||||
|
|
||||||
def find(query, max_results=10):
|
def find(query, max_results=10):
|
||||||
query = {'action': 'query', 'list':'search', 'format': 'json',
|
query = {'action': 'query', 'list':'search', 'format': 'json',
|
||||||
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
|
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
|
||||||
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
|
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
|
||||||
data = getUrl(url)
|
data = getUrl(url)
|
||||||
if not data:
|
if not data:
|
||||||
data = getUrl(url, timeout=0)
|
data = getUrl(url, timeout=0)
|
||||||
result = simplejson.loads(data)
|
result = simplejson.loads(data)
|
||||||
results = []
|
results = []
|
||||||
for r in result['query']['search']:
|
for r in result['query']['search']:
|
||||||
title = r['title']
|
title = r['title']
|
||||||
url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
|
url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
|
||||||
results.append((title, url, ''))
|
results.append((title, url, ''))
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
# -*- Mode: Python; -*-
|
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# vi:si:et:sw=2:sts=2:ts=2
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
from urllib import quote
|
from urllib import quote
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
@ -8,49 +7,50 @@ import feedparser
|
||||||
from oxutils.cache import getUrl
|
from oxutils.cache import getUrl
|
||||||
from oxutils import findString
|
from oxutils import findString
|
||||||
|
|
||||||
|
|
||||||
def getVideoUrl(youtubeId, format='mp4'):
|
def getVideoUrl(youtubeId, format='mp4'):
|
||||||
url = 'http://www.youtube.com/api2_rest?method=youtube.videos.get_video_token&video_id=' + youtubeId
|
url = 'http://www.youtube.com/api2_rest?method=youtube.videos.get_video_token&video_id=' + youtubeId
|
||||||
data = getUrl(url)
|
data = getUrl(url)
|
||||||
xml = ET.fromstring(data)
|
xml = ET.fromstring(data)
|
||||||
youtubeKey = xml.find('t').text
|
youtubeKey = xml.find('t').text
|
||||||
if format == 'mp4':
|
if format == 'mp4':
|
||||||
fmt=18
|
fmt=18
|
||||||
url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s"%(youtubeId, youtubeKey, fmt)
|
url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s"%(youtubeId, youtubeKey, fmt)
|
||||||
else:
|
else:
|
||||||
url = "http://youtube.com/get_video.php?video_id=%s&t=%s"%(youtubeId, youtubeKey)
|
url = "http://youtube.com/get_video.php?video_id=%s&t=%s"%(youtubeId, youtubeKey)
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def getMovieInfo(youtubeId):
|
def getMovieInfo(youtubeId):
|
||||||
url = "http://gdata.youtube.com/feeds/api/videos/%s " % youtubeId
|
url = "http://gdata.youtube.com/feeds/api/videos/%s " % youtubeId
|
||||||
data = getUrl(url)
|
data = getUrl(url)
|
||||||
fd = feedparser.parse(data)
|
fd = feedparser.parse(data)
|
||||||
return getInfoFromAtom(fd.entries[0])
|
return getInfoFromAtom(fd.entries[0])
|
||||||
|
|
||||||
def getInfoFromAtom(entry):
|
def getInfoFromAtom(entry):
|
||||||
info = dict()
|
info = dict()
|
||||||
info['title'] = entry['title']
|
info['title'] = entry['title']
|
||||||
info['description'] = entry['description']
|
info['description'] = entry['description']
|
||||||
info['author'] = entry['author']
|
info['author'] = entry['author']
|
||||||
info['published'] = entry['published_parsed']
|
info['published'] = entry['published_parsed']
|
||||||
info['keywords'] = entry['media_keywords'].split(', ')
|
info['keywords'] = entry['media_keywords'].split(', ')
|
||||||
info['url'] = entry['links'][0]['href']
|
info['url'] = entry['links'][0]['href']
|
||||||
info['id'] = findString(info['url'], "/watch?v=")
|
info['id'] = findString(info['url'], "/watch?v=")
|
||||||
info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id']
|
info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id']
|
||||||
info['flv'] = getVideoUrl(info['id'], 'flv')
|
info['flv'] = getVideoUrl(info['id'], 'flv')
|
||||||
info['mp4'] = getVideoUrl(info['id'], 'mp4')
|
info['mp4'] = getVideoUrl(info['id'], 'mp4')
|
||||||
info['embed'] = '''<object width="425" height="355"><param name="movie" value="http://www.youtube.com/v/%s&hl=en"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/%s&hl=en" type="application/x-shockwave-flash" wmode="transparent" width="425" height="355"></embed></object>''' % (info['id'], info['id'])
|
info['embed'] = '''<object width="425" height="355"><param name="movie" value="http://www.youtube.com/v/%s&hl=en"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/%s&hl=en" type="application/x-shockwave-flash" wmode="transparent" width="425" height="355"></embed></object>''' % (info['id'], info['id'])
|
||||||
return info
|
return info
|
||||||
|
|
||||||
def find(query, max_results=10, offset=1, orderBy='relevance'):
|
def find(query, max_results=10, offset=1, orderBy='relevance'):
|
||||||
query = quote(query)
|
query = quote(query)
|
||||||
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s"%(query, orderBy, offset, max_results)
|
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s"%(query, orderBy, offset, max_results)
|
||||||
data = getUrl(url)
|
data = getUrl(url)
|
||||||
fd = feedparser.parse(data)
|
fd = feedparser.parse(data)
|
||||||
videos = []
|
videos = []
|
||||||
for entry in fd.entries:
|
for entry in fd.entries:
|
||||||
v = getInfoFromAtom(entry)
|
v = getInfoFromAtom(entry)
|
||||||
videos.append(v)
|
videos.append(v)
|
||||||
if len(videos) >= max_results:
|
if len(videos) >= max_results:
|
||||||
return videos
|
return videos
|
||||||
return videos
|
return videos
|
||||||
|
|
||||||
|
|
50
setup.py
50
setup.py
|
@ -1,33 +1,33 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# vi:si:et:sw=2:sts=2:ts=2
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
# encoding: utf-8
|
# encoding: utf-8
|
||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="ox",
|
name="ox",
|
||||||
version="0.1",
|
version="0.1",
|
||||||
description="collection of scrapers for various websites",
|
description="collection of scrapers for various websites",
|
||||||
author="0x",
|
author="0x",
|
||||||
author_email="code@0xdb.org",
|
author_email="code@0xdb.org",
|
||||||
url="http://code.0xdb.org/ox",
|
url="http://code.0xdb.org/ox",
|
||||||
download_url="http://code.0xdb.org/ox/download",
|
download_url="http://code.0xdb.org/ox/download",
|
||||||
license="GPLv3",
|
license="GPLv3",
|
||||||
packages=find_packages(),
|
packages=find_packages(),
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'oxutils',
|
'oxutils',
|
||||||
'feedparser',
|
'feedparser',
|
||||||
'beautifulsoup',
|
'beautifulsoup',
|
||||||
],
|
],
|
||||||
keywords = [
|
keywords = [
|
||||||
],
|
],
|
||||||
classifiers = [
|
classifiers = [
|
||||||
'Development Status :: 3 - Alpha',
|
'Development Status :: 3 - Alpha',
|
||||||
'Operating System :: OS Independent',
|
'Operating System :: OS Independent',
|
||||||
'Programming Language :: Python',
|
'Programming Language :: Python',
|
||||||
'Topic :: Software Development :: Libraries :: Python Modules',
|
'Topic :: Software Development :: Libraries :: Python Modules',
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue