diff --git a/oml/meta/google.py b/oml/meta/google.py
index 029124a..713647d 100644
--- a/oml/meta/google.py
+++ b/oml/meta/google.py
@@ -17,7 +17,7 @@ import logging
 logger = logging.getLogger(__name__)

 NAMESPACES = {
-    'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
+    'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
     'atom' : 'http://www.w3.org/2005/Atom',
     'dc' : 'http://purl.org/dc/terms',
     'gbs' : 'http://schemas.google.com/books/2008',
@@ -25,6 +25,29 @@ NAMESPACES = {
 }
 XPath = partial(etree.XPath, namespaces=NAMESPACES)

+def find_(query):
+    logger.debug('find %s', query)
+    query += ' isbn'
+    isbns = []
+    for r in ox.web.google.find(query):
+        isbns += find_isbns(' '.join(r))
+    logger.debug('isbns: %s', isbns)
+    results = []
+    done = set()
+    for isbn in isbns:
+        if isbn not in done:
+            r = {
+                'isbn': isbn,
+                'primaryid': ['isbn', isbn]
+            }
+            results.append(r)
+            done.add(isbn)
+            if len(isbn) == 10:
+                done.add(stdnum.isbn.to_isbn13(isbn))
+            if len(isbn) == 13 and isbn.startswith('978'):
+                done.add(stdnum.isbn.to_isbn10(isbn))
+    return results
+
 def parse_entry(entry_):
     entry_id = XPath('descendant::atom:id')
     creator = XPath('descendant::dc:creator')
@@ -67,7 +90,7 @@ def parse_entry(entry_):
     info = decode_html_data(info)
     return info

-def find_feeds(title=None, author=None):
+def find(title=None, author=None):
     '''
     parts = []
     if title:
@@ -84,48 +107,36 @@ def find_feeds(title=None, author=None):
     url = 'http://books.google.com/books/feeds/volumes?' + urlencode({
         'q': q.strip(),
         'max-results': 20,
-        'start-index': 1,
-        'min-viewability': 'none',
+        'start-index':1,
+        'min-viewability':'none',
     })
     data = read_url(url)
-    feed = etree.fromstring(data, parser=etree.XMLParser(recover=True, no_network=True))
+    feed = etree.fromstring(data,
+        parser=etree.XMLParser(recover=True, no_network=True))
     results = []
     isbns = set()
     for entry_ in XPath('//atom:entry')(feed):
-        entry = parse_entry(entry_)
-        if 'isbn' in entry and 'isbn' not in isbns:
-            results.append(info(entry['isbn']))
+        info = parse_entry(entry_)
+        if 'isbn' in info and not 'isbn' in isbns:
+            results.append(info)
             isbns.add(info['isbn'])
     return results

-def find(title=None, author=None):
-    q = ''
-    if title:
-        q += title + ' '
-    if author:
-        q += author
-    url = 'https://www.googleapis.com/books/v1/volumes?q=%s' % q
-    api_key = settings.server.get('google_api_key')
-    if api_key:
-        url += '&key=' + api_key
-    if api_limit.error:
-        raise IOError(url)
-    while not api_limit.consume(1):
-        logger.debug('hitting google api to fast, waiting 1 second')
-        sleep(1)
-    r = get_json(url, timeout=-1)
-    if 'error' in r:
-        logger.debug('got google api error, dont call for 10 minutes')
-        store.delete(url)
-        api_limit.error = True
-        raise IOError(url, r)
-    if 'items' not in r:
-        logger.debug('unknown %s: %s [%s]', key, value, r)
-        return []
-    results = []
-    for item in r['items']:
-        results.append(parse_item(item))
-    return results
+def info_old(isbn):
+    url = 'http://books.google.com/books/feeds/volumes?' + urlencode({
+        'q': 'isbn:' + isbn,
+        'max-results':1,
+        'start-index':1,
+        'min-viewability':'none',
+    })
+    data = read_url(url)
+    feed = etree.fromstring(data,
+        parser=etree.XMLParser(recover=True, no_network=True))
+    for entry_ in XPath('//atom:entry')(feed):
+        info = parse_entry(entry_)
+        info['isbn'] = isbn
+        return info
+    return {}

 def info(value):
     key = 'isbn'
@@ -144,14 +155,11 @@ def info(value):
         store.delete(url)
         api_limit.error = True
         raise IOError(url, r)
-    if 'items' not in r:
+    if not 'items' in r:
         logger.debug('unknown %s: %s [%s]', key, value, r)
         return {}
-    return parse_item(r['items'][0])
-
-def parse_item(item):
-    _data = item['volumeInfo']
-    _id = item['id']
+    _data = r['items'][0]['volumeInfo']
+    _id = r['items'][0]['id']
     data = {}
     for key in [
         'authors',
@@ -161,17 +169,17 @@ def info(value):
         'publishedDate',
         'publisher',
         'title',
-    ]:
+        ]:
         if key in _data:
             data[{
                 'authors': 'author',
                 'pageCount': 'pages',
                 'publishedDate': 'date',
-            }.get(key, key)] = _data[key]
+            }.get(key,key)] = _data[key]
     if 'subtitle' in _data and _data['subtitle'].strip():
         data['title'] = '{title}: {subtitle}'.format(**_data)

-    if item['accessInfo']['viewability'] != 'NO_PAGES':
+    if r['items'][0]['accessInfo']['viewability'] != 'NO_PAGES':
         #data['cover'] = 'https://books.google.com/books?id=%s&pg=PP1&img=1&zoom=0&hl=en' % _id
         data['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id

@@ -183,7 +191,7 @@ def info(value):
     if 'industryIdentifiers' in _data:
         for k in _data['industryIdentifiers']:
             if k['type'].startswith('ISBN'):
-                if 'isbn' not in data:
+                if not 'isbn' in data:
                     data['isbn'] = []
                 data['isbn'].append(k['identifier'])
             else:
diff --git a/oml/nodes.py b/oml/nodes.py
index e3621d9..9dda107 100644
--- a/oml/nodes.py
+++ b/oml/nodes.py
@@ -504,7 +504,7 @@ class Nodes(Thread):
         while not state.shutdown:
             args = self._q.get()
             if args:
-                logger.debug('processing nodes queue: next: "%s", %s entries in queue', args[0], self._q.qsize())
+                logger.debug('processing nodes queue: next: %s, %s entries in queue', args[0], self._q.qsize())
                 if args[0] == 'add':
                     self._add(*args[1:])
                 elif args[0] == 'pull':
@@ -514,7 +514,7 @@

     def queue(self, *args):
         if args:
-            logger.debug('queue "%s", %s entries in queue', args[0], self._q.qsize())
+            logger.debug('add %s to queue, queue currently has %s entries', args[0], self._q.qsize())
             self._q.put(list(args))

     def is_online(self, id):
@@ -568,7 +568,7 @@ class Nodes(Thread):
         users = []
         with db.session():
             from user.models import User
-            for u in User.query.filter(User.id != settings.USER_ID).filter_by(peered=True).all():
+            for u in User.query.filter(User.id!=settings.USER_ID).filter_by(peered=True).all():
                 users.append(u.json(['id', 'index', 'name']))
         users.sort(key=user_sort_key)
         for u in users:
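Note (commentary, not part of the patch): the dedup loop in the new find_() relies on python-stdnum mapping between the two ISBN forms, so an edition that Google returns once as ISBN-10 and once as ISBN-13 is only reported once. Below is a minimal standalone sketch of that idea; the function name and sample numbers are illustrative, only the stdnum.isbn calls come from the patch itself:

    import stdnum.isbn

    def dedup_isbns(isbns):
        # keep the first occurrence of each edition, in either ISBN form
        results = []
        done = set()
        for isbn in isbns:
            if isbn in done:
                continue
            results.append(isbn)
            done.add(isbn)
            if len(isbn) == 10:
                # every ISBN-10 has a 978-prefixed ISBN-13 twin
                done.add(stdnum.isbn.to_isbn13(isbn))
            elif len(isbn) == 13 and isbn.startswith('978'):
                # only 978- ISBN-13s can be converted back to ISBN-10
                done.add(stdnum.isbn.to_isbn10(isbn))
        return results

    # '9780306406157' is the ISBN-13 form of '0306406152',
    # so only the first spelling survives:
    print(dedup_isbns(['0306406152', '9780306406157']))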