From 404842f8493c345b20176d49edc4ca44b69584bb Mon Sep 17 00:00:00 2001
From: j
Date: Wed, 3 Feb 2016 01:00:40 +0530
Subject: [PATCH] use old google api

---
 oml/item/api.py      |   6 +--
 oml/item/scan.py     |  13 ++---
 oml/meta/__init__.py |   8 ++--
 oml/meta/google.py   | 128 +++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 135 insertions(+), 20 deletions(-)

diff --git a/oml/item/api.py b/oml/item/api.py
index ebcba08..9b94478 100644
--- a/oml/item/api.py
+++ b/oml/item/api.py
@@ -242,11 +242,7 @@ def findMetadata(data):
         if r:
             response['items'].append(r)
     elif key == 'author,title':
-        value = ' '.join(data.values())
-        for isbn in meta.find(value):
-            r = meta.lookup('isbn', isbn)
-            if r:
-                response['items'].append(r)
+        response['items'] = meta.find(**data)
     elif key == 'id':
         import user.models
         items = {}
diff --git a/oml/item/scan.py b/oml/item/scan.py
index 4b2a41f..90d9c4e 100644
--- a/oml/item/scan.py
+++ b/oml/item/scan.py
@@ -101,12 +101,13 @@ def run_scan():
             return
         position += 1
         with db.session():
-            id = media.get_id(f)
-            file = File.get(id)
-            if not file:
-                file = add_file(id, f, prefix, f)
-                added += 1
-                trigger_event('change', {})
+            if os.path.exists(f):
+                id = media.get_id(f)
+                file = File.get(id)
+                if not file:
+                    file = add_file(id, f, prefix, f)
+                    added += 1
+                    trigger_event('change', {})
 
 def change_path(old, new):
     new_books = os.path.join(new, 'Books')
diff --git a/oml/meta/__init__.py b/oml/meta/__init__.py
index e7b91a8..b606a76 100644
--- a/oml/meta/__init__.py
+++ b/oml/meta/__init__.py
@@ -31,9 +31,9 @@ providers = [
     ('abebooks', 'isbn')
 ]
 
-def find(query):
-    #results = google.find(query)
-    results = duckduckgo.find(query)
+def find(title=None, author=None):
+    results = google.find(title=title, author=author)
+    #results = duckduckgo.find(query)
     '''
     results = openlibrary.find(query)
     for r in results:
@@ -55,7 +55,7 @@ def lookup(key, value):
         return {}
     if key == 'isbn':
         try:
-            data = google.info(key, value)
+            data = google.info(value)
         except:
             logger.debug('google.info failed %s=%s', key, value, exc_info=True)
             data = {}
diff --git a/oml/meta/google.py b/oml/meta/google.py
index 3d79b1f..7bcbf42 100644
--- a/oml/meta/google.py
+++ b/oml/meta/google.py
@@ -2,10 +2,14 @@
 # vi:si:et:sw=4:sts=4:ts=4
 
 from time import time, sleep
+from urllib.parse import urlencode
+import re
+from functools import partial
 
-from ox.cache import get_json, store
+from ox.cache import get_json, store, read_url
 import ox.web.google
 import stdnum.isbn
+from lxml import etree
 
 from .utils import find_isbns, get_language, decode_html_data, to_isbn13
 import settings
@@ -13,8 +17,16 @@ import settings
 import logging
 logger = logging.getLogger(__name__)
 
+NAMESPACES = {
+    'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
+    'atom': 'http://www.w3.org/2005/Atom',
+    'dc': 'http://purl.org/dc/terms',
+    'gbs': 'http://schemas.google.com/books/2008',
+    'gd': 'http://schemas.google.com/g/2005'
+}
+XPath = partial(etree.XPath, namespaces=NAMESPACES)
 
-def find(query):
+def find_(query):
     logger.debug('find %s', query)
     query += ' isbn'
     isbns = []
@@ -37,9 +49,115 @@ def find(query):
                 done.add(stdnum.isbn.to_isbn10(isbn))
     return results
 
-def info(key, value):
-    if key not in ('isbn', 'lccn', 'oclc'):
-        raise IOError('unknwon key %s' % key)
+def parse_entry(entry_):
+    '''
+    Turn one atom:entry element of a Google Books volumes feed into a
+    metadata dict. title, description, date, categories, publisher and
+    language are always set; author, cover, pages and isbn only when
+    the entry provides them.
+    '''
+    entry_id = XPath('descendant::atom:id')
+    creator = XPath('descendant::dc:creator')
+    date = XPath('descendant::dc:date')
+    description = XPath('descendant::dc:description')
+    _format = XPath('descendant::dc:format')
+    identifier = XPath('descendant::dc:identifier')
+    language = XPath('descendant::dc:language')
+    publisher = XPath('descendant::dc:publisher')
+    subject = XPath('descendant::dc:subject')
+    title = XPath('descendant::dc:title')
+    viewability = XPath('descendant::gbs:viewability')
+    id_url = entry_id(entry_)[0].text
+    _id = id_url.split('/')[-1]
+    info = {}
+    info['title'] = ': '.join([x.text for x in title(entry_)]).strip()
+    authors = [x.text.strip() for x in creator(entry_) if x.text]
+    if authors:
+        info['author'] = authors
+    info['description'] = '\n\n'.join([x.text for x in description(entry_)]).strip()
+    info['date'] = ''.join([x.text for x in date(entry_)]).strip()
+    info['categories'] = [x.text for x in subject(entry_)]
+    info['publisher'] = [x.text for x in publisher(entry_)]
+    info['language'] = [get_language(x.text) for x in language(entry_)]
+    v = viewability(entry_)
+    if v and v[0].attrib.get('value') != 'http://schemas.google.com/books/2008#view_no_pages':
+        info['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id
+    format_ = ''.join([x.text for x in _format(entry_)])
+    if format_:
+        pages = re.compile(r'\d+').findall(format_)
+        if pages:
+            info['pages'] = int(pages[0])
+    for x in identifier(entry_):
+        t = str(x.text).strip()
+        if t[:5].upper() == 'ISBN:':
+            t = to_isbn13(t[5:])
+            if t:
+                info['isbn'] = t
+                break
+    info = decode_html_data(info)
+    return info
+
+def find(title=None, author=None):
+    '''
+    Search the Google Books volumes feed for title and/or author and
+    return the parsed entries that have an ISBN, one entry per ISBN.
+    '''
+    '''
+    parts = []
+    if title:
+        parts.append(' '.join(['intitle:%s' % p for p in title.split(' ')]))
+    if author:
+        parts.append(' '.join(['inauthor:%s' % p for p in author.split(' ')]))
+    q = '+'.join(parts)
+    '''
+    q = ''
+    if title:
+        q += title + ' '
+    if author:
+        q += author
+    url = 'http://books.google.com/books/feeds/volumes?' + urlencode({
+        'q': q.strip(),
+        'max-results': 20,
+        'start-index': 1,
+        'min-viewability': 'none',
+    })
+    logger.debug('find %s', url)
+    data = read_url(url)
+    feed = etree.fromstring(data,
+        parser=etree.XMLParser(recover=True, no_network=True))
+    results = []
+    isbns = set()
+    for entry_ in XPath('//atom:entry')(feed):
+        info = parse_entry(entry_)
+        # skip entries without an ISBN and ISBNs that were already returned
+        if 'isbn' in info and info['isbn'] not in isbns:
+            results.append(info)
+            isbns.add(info['isbn'])
+    return results
+
+def info(isbn):
+    '''
+    Query the Google Books volumes feed for isbn and return the metadata
+    of the first entry, or {} if the feed contains no entry.
+    '''
+    url = 'http://books.google.com/books/feeds/volumes?' + urlencode({
+        'q': 'isbn:' + isbn,
+        'max-results': 1,
+        'start-index': 1,
+        'min-viewability': 'none',
+    })
+    data = read_url(url)
+    feed = etree.fromstring(data,
+        parser=etree.XMLParser(recover=True, no_network=True))
+    for entry_ in XPath('//atom:entry')(feed):
+        info = parse_entry(entry_)
+        # report the ISBN that was asked for, not the one found in the feed
+        info['isbn'] = isbn
+        return info
+    return {}
+
+def info_newapi(value):
+    key = 'isbn'
     url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)
     api_key = settings.server.get('google_api_key')
     if api_key: