catch more 404s

This commit is contained in:
j 2016-01-05 14:00:25 +05:30
parent 9996c6b603
commit 9aff4766e5

View file

@@ -13,13 +13,14 @@ def info(key, value):
value = stdnum.isbn.to_isbn10(value) value = stdnum.isbn.to_isbn10(value)
if len(value) != 10: if len(value) != 10:
raise IOError('invalid isbn %s' % value) raise IOError('invalid isbn %s' % value)
url = 'http://www.amazon.com/dp/' + value url = 'http://www.amazon.com/dp/' + value
data = read_url(url).decode() data = read_url(url).decode()
doc = lxml.html.document_fromstring(data) doc = lxml.html.document_fromstring(data)
info = {} info = {}
if '<title>404 - Document Not Found</title>' in data: if '<title>404 - Document Not Found</title>' in data:
return info return info
if 'To discuss automated access to Amazon data please' in data:
return info
for l in doc.xpath('//link[@rel="canonical" and @href]'): for l in doc.xpath('//link[@rel="canonical" and @href]'):
info['asin'] = [l.get('href').rpartition('/')[-1]] info['asin'] = [l.get('href').rpartition('/')[-1]]
break break