commit ef20041185
parent 942cb819c7

cleanup

4 changed files with 26 additions and 199 deletions
@@ -1,61 +0,0 @@
-# -*- coding: utf-8 -*-
-# vi:si:et:sw=4:sts=4:ts=4
-import re
-import os
-import string
-
-from oxlib import cache
-from oxlib.html import stripTags, decodeHtml
-from oxlib.text import findRe
-from oxlib.normalize import canonicalName
-import auth
-
-
-def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
-    headers = headers.copy()
-    headers["Cookie"] = auth.get("aaaarg.cookie")
-    return cache.readUrl(url, data, headers, timeout)
-
-def readUrlUnicode(url, timeout=cache.cache_timeout):
-    return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
-
-def downloadText(id, filename=None):
-    #FIXME, what about the cache, this keeps all pdfs in oxcache...
-    url='http://a.aaaarg.org/node/%d/download' % id
-    data = readUrl(url, timeout=-1)
-    headers = cache.getHeaders(url, timeout=-1)
-    if filename:
-        with open(filename, "w") as f:
-            f.write(data)
-        return
-    return data
-
-def getTextByLetter(letter):
-    texts = []
-    url = 'http://a.aaaarg.org/library/%s' % letter
-    data = readUrlUnicode(url)
-    txts = re.compile('<li class="author">(.*?)</li><li class="title"><a href="(.*?)">(.*?)</a></li>').findall(data)
-    author = 'Unknown Author'
-    for r in txts:
-        if r[0] != ' ':
-            author = r[0]
-        link = r[1]
-        id = findRe(link, '/(\d+)')
-        title = decodeHtml(r[2])
-        author_foder = canonicalName(author)
-        author_foder = os.path.join(author_foder[0], author_foder)
-        filename = os.path.join(author_foder, '%s (aaarg %s).pdf' % (title.replace('/', '_'), id))
-        texts.append({
-            'author': author,
-            'title': title,
-            'id': id,
-            'filename': filename,
-        })
-    return texts
-
-def getTexts():
-    texts = []
-    for letter in string.letters[:26]:
-        texts += getTextByLetter(letter)
-    return texts
-
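Both files deleted in this commit (an aaaarg.org scraper above, a karagarga.net scraper below, going by their cookie keys) wrap cache.readUrl the same way: copy the shared default headers, inject a per-site session cookie from the auth store, and fetch. A minimal standalone sketch of that pattern using only the Python 2 stdlib — urllib2 stands in for oxlib.cache here, and the names are illustrative, not the project's API:

import urllib2

DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0'}

def read_url_with_cookie(url, cookie, data=None, headers=DEFAULT_HEADERS):
    # copy first so the shared default dict is never mutated
    headers = headers.copy()
    headers['Cookie'] = cookie
    return urllib2.urlopen(urllib2.Request(url, data, headers)).read()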
@@ -2,7 +2,7 @@
 # vi:si:et:sw=4:sts=4:ts=4
 # GPL 2009
 import os
-import simplejson
+import simplejson as json


 def get(key):
@@ -12,9 +12,22 @@ def get(key):
     f = open(user_auth, "r")
     data = f.read()
     f.close()
-    auth = simplejson.loads(data)
+    auth = json.loads(data)
     if key in auth:
         return auth[key]
     print "please add key %s to json file '%s'" % (key, user_auth)
     return ""

+def update(key, value):
+    user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
+    auth = {}
+    if os.path.exists(user_auth):
+        f = open(user_auth, "r")
+        data = f.read()
+        f.close()
+        auth = json.loads(data)
+    auth[key] = value
+    f = open(user_auth, "w")
+    f.write(json.dumps(auth, indent=2))
+    f.close()
+
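The new update() is the write-side counterpart of get(): both resolve the same path from $oxAUTH (default ~/.ox/auth.json), and update() does a whole-file read-modify-write, pretty-printing the JSON back out. A usage sketch of the round trip — the cookie value is a placeholder, and it assumes the auth file is writable:

import auth

auth.update("karagarga.cookie", "<session cookie copied from the browser>")
print auth.get("karagarga.cookie")   # -> the value just stored
                                     # (get() prints a warning and returns "" for missing keys)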
@@ -368,18 +368,21 @@ def getMoviePosters(imdbId):
 
 def getMovieTrivia(imdbId):
     url = "%strivia" % getUrlBase(imdbId)
-    data = readUrlUnicode(url)
-    data = findRe(data, '<ul class="trivia">(.*?)</ul>')
+    data_ = readUrlUnicode(url)
+    data = findRe(data_, '<ul class="trivia">(.*?)</ul>')
     trivia = re.compile('<li>(.*?)</li>', re.DOTALL).findall(data)
     def clean(t):
         t = decodeHtml(t)
-        t = t.replace(u'', '"')
-        if t.endswith('<br><br>'):
-            t = t[:-8]
-        if t.endswith('<br>\n<br>'):
-            t = t[:-len('<br>\n<br>')]
+        t = t.replace(u'', '"').strip()
+        for s in ('<br><br>', '<br>\n<br>', '<br>'):
+            if t.endswith(s):
+                t = t[:-len(s)].strip()
         return t.strip()
+    if len(trivia) == 0:
+        trivia = re.compile('<div class="sodatext">(.*?)<span', re.DOTALL).findall(data_)
+
     trivia = [clean(t) for t in trivia]
 
     return trivia
 
 def getMovieConnections(imdbId):
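The rewritten clean() folds the two hard-coded suffix checks into one loop over known trailing <br> sequences, and the hunk adds a fallback that re-scans the raw page for div.sodatext blocks when no <li> trivia is found — presumably because IMDb's trivia markup changed. A standalone sketch of just the suffix-trimming logic, with an invented sample string:

def strip_trailing_breaks(t):
    # strip at most one of each known trailing break sequence, longest first
    for s in ('<br><br>', '<br>\n<br>', '<br>'):
        if t.endswith(s):
            t = t[:-len(s)].strip()
    return t.strip()

print strip_trailing_breaks('Shot in only 12 days.<br>\n<br>')
# -> 'Shot in only 12 days.'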
@@ -1,128 +0,0 @@
-# -*- coding: utf-8 -*-
-# vi:si:et:sw=4:sts=4:ts=4
-import re
-from oxlib import cache
-from oxlib.html import stripTags
-from oxlib.text import findRe
-
-import auth
-
-
-def readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
-    headers = headers.copy()
-    headers["Cookie"] = auth.get("karagarga.cookie")
-    return cache.readUrl(url, data, headers, timeout)
-
-def readUrlUnicode(url, timeout=cache.cache_timeout):
-    return cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
-
-def getData(id):
-    data = {
-        "url": getUrl(id)
-    }
-    html = readUrlUnicode("%s%s" % (data["url"], "&filelist=1"))
-    if 'No torrent with ID' in html:
-        return False
-    data['added'] = stripTags(parseTable(html, 'Added'))
-    data['country'] = findRe(html, 'title="([\w ]*?)" border="0" width="32" height="20"')
-    # data['description'] = parseTable(html, 'Description')
-    data['director'] = stripTags(parseTable(html, 'Director / Artist'))
-    data['files'] = []
-    result = findRe(html, '<table class=main border="1" cellspacing=0 cellpadding="5">(.*?)</table>')
-    results = re.compile('<td>(.*?)</td><td align="right">(.*?)</td>', re.DOTALL).findall(result)
-    for name, size in results:
-        data['files'].append({
-            'name': name,
-            'size': '%s %s' % (size[:-2], size[-2:].strip().upper())
-        })
-    data['format'] = ''
-    if html.find('genreimages/dvdr.png') != -1:
-        data['format'] = 'DVD'
-    elif html.find('genreimages/hdrip.png') != -1:
-        data['format'] = 'HD'
-    data['genre'] = []
-    result = parseTable(html, 'Genres')
-    for string in result.split('\n'):
-        string = stripTags(findRe(string, '<a href="browse.php\?genre=.*?">(.*?)</a>'))
-        if string:
-            data['genre'].append(string)
-    data['id'] = id
-    data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
-    data['language'] = stripTags(parseTable(html, 'Language'))
-    data['leechers'] = int(findRe(html, 'seeder\(s\), (.*?) leecher\(s\)'))
-    data['link'] = stripTags(parseTable(html, 'Internet Link'))
-    data['links'] = []
-    results = re.compile('<a href="(.*?)">(.*?)</a>', re.DOTALL).findall(parseTable(html, 'Description'))
-    for (url, title) in results:
-        if url.find('javascript') == -1:
-            data['links'].append({
-                'title': title,
-                'url': url.replace('http://anonym.to/?', '')
-            })
-    data['people'] = 0
-    result = stripTags(findRe(html, '<a href="top10others.php.*?>(.*?) people')).strip()
-    if result:
-        data['people'] = int(result)
-    data['posters'] = []
-    results = re.compile('<img border=0 src="(http://.*?)"', re.DOTALL).findall(html)
-    for result in results:
-        data['posters'].append(result)
-    data['seeders'] = int(findRe(html, '#seeders" class="sublink".*?colspan=2>(.*?) seeder\(s\)'))
-    data['size'] = int(findRe(parseTable(html, 'Size'), '\((.*?) ').replace(',', ''))
-    data['snatched'] = int(findRe(html, '<a name="snatchers">.*?colspan=2>(.*?) '))
-    data['subtitle'] = findRe(parseTable(html, 'Subtitles'), '>(.*?)<hr>').replace('included: ', '')
-    data['subtitles'] = []
-    results = re.compile('<a href="(.*?)">(.*?)</a>', re.DOTALL).findall(parseTable(html, 'Subtitles'))
-    for (url, language) in results:
-        data['subtitles'].append({
-            'language': language.replace('click here for ', ''),
-            'url': url
-        })
-    data['torrent'] = 'http://karagarga.net/%s' % findRe(html, '(down.php/.*?)"')
-    data['year'] = stripTags(parseTable(html, 'Year'))
-    data['title'] = stripTags(findRe(html, '<h1>(.*?)</h1>')).strip()
-    data['title'] = re.sub('^%s - ' % re.escape(data['director']), '', data['title'])
-    data['title'] = re.sub(' \(%s\)$' % re.escape(data['year']), '', data['title'])
-    return data
-
-def getId(url):
-    return url.split("=")[-1]
-
-def getTorrent(id):
-    return readUrl(getData(id)['torrent'])
-
-def getIds(lastId = 20):
-    lastId = '%s' % lastId
-    ids = []
-    page = 0
-    while True:
-        for id in getIdsByPage(page):
-            if not id in ids:
-                ids.append(id)
-        if lastId in ids:
-            break
-        page += 1
-    return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids))))
-
-def getIdsByPage(page):
-    ids = []
-    url = 'http://karagarga.net/browse.php?page=%s&cat=1&sort=added&d=DESC' % page
-    html = readUrlUnicode(url, timeout = 23*60*60) #get new ids once per day
-    strings = html.split('<td width="42" style="padding:0px;">')
-    strings.pop(0)
-    for string in strings:
-        ids.append(findRe(string, '"details.php\?id=(.*?)"'))
-    return ids
-
-def getUrl(id):
-    return "http://karagarga.net/details.php?id=%s" % id
-
-def parseTable(html, title):
-    if title == 'Genres':
-        return findRe(html, '<td class="heading" [\w=" ]*?>%s</td>(.*?)</table>' % title)
-    else:
-        return findRe(html, '<td class="heading" [\w=" ]*?>%s</td>(.*?)</td>' % title)
-
-if __name__ == "__main__":
-    print getIds("79317")
-    print getData("79317")
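The deleted getIds() shows the incremental-scrape pattern this module relied on: walk the listing pages newest-first, collect ids, and stop as soon as a previously seen id turns up. A minimal standalone sketch of that loop — fetch_page_ids is a hypothetical stand-in for getIdsByPage, and like the original it assumes last_known_id eventually appears:

def collect_ids_until(fetch_page_ids, last_known_id):
    # page through newest-first listings until last_known_id reappears
    ids = []
    page = 0
    while str(last_known_id) not in ids:
        for id in fetch_page_ids(page):
            if id not in ids:
                ids.append(id)
        page += 1
    return sorted(set(ids), key=int)   # ascending numeric order, duplicates dropped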