Don't include editors in authors
This commit is contained in:
parent
c3de15587c
commit
a350831792
1 changed files with 15 additions and 9 deletions
|
@ -1,5 +1,5 @@
|
||||||
from ox.cache import read_url
|
from ox.cache import read_url
|
||||||
from ox import decode_html, strip_tags, find_re
|
from ox import decode_html, strip_tags, find_re, fix_bad_unicode
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from urllib.parse import unquote
|
from urllib.parse import unquote
|
||||||
|
@ -24,7 +24,10 @@ def info(key, value):
|
||||||
info['asin'] = [l.get('href').rpartition('/')[-1]]
|
info['asin'] = [l.get('href').rpartition('/')[-1]]
|
||||||
break
|
break
|
||||||
info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
|
info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
|
||||||
|
info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])
|
||||||
|
info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])
|
||||||
info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))
|
info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))
|
||||||
|
info['description'] = fix_bad_unicode(info['description'])
|
||||||
content = doc.xpath('//div[@class="content"]')[0]
|
content = doc.xpath('//div[@class="content"]')[0]
|
||||||
content_info = {}
|
content_info = {}
|
||||||
for li in content.xpath('.//li'):
|
for li in content.xpath('.//li'):
|
||||||
|
@ -52,19 +55,13 @@ def info(key, value):
|
||||||
if a:
|
if a:
|
||||||
for span in a:
|
for span in a:
|
||||||
r = span.getchildren()[0].text.strip()
|
r = span.getchildren()[0].text.strip()
|
||||||
if 'Translator' in r:
|
role = get_role(r)
|
||||||
role = 'translator'
|
|
||||||
else:
|
|
||||||
role = 'author'
|
|
||||||
if not role in info: info[role] = []
|
if not role in info: info[role] = []
|
||||||
info[role].append(span.text.strip())
|
info[role].append(span.text.strip())
|
||||||
else:
|
else:
|
||||||
for span in doc.xpath('//span[@class="author notFaded"]'):
|
for span in doc.xpath('//span[@class="author notFaded"]'):
|
||||||
author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
|
author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
|
||||||
if 'Translator' in author[-1]:
|
role = get_role(author[-1])
|
||||||
role = 'translator'
|
|
||||||
else:
|
|
||||||
role = 'author'
|
|
||||||
if not role in info: info[role] = []
|
if not role in info: info[role] = []
|
||||||
info[role].append(author[0])
|
info[role].append(author[0])
|
||||||
|
|
||||||
|
@ -76,3 +73,12 @@ def info(key, value):
|
||||||
last = covers[url]
|
last = covers[url]
|
||||||
info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
|
info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
def get_role(value):
    """Return the info key a contributor should be filed under.

    ``value`` is the contributor label scraped from the page. It is checked
    for the case-sensitive substrings ``'Translator'`` and ``'Editor'``, in
    that order; the first one present selects the role. A label containing
    neither is treated as an author.
    """
    # Ordered pairs: a label containing both markers counts as a translator.
    markers = (
        ('Translator', 'translator'),
        ('Editor', 'editor'),
    )
    for marker, role in markers:
        if marker in value:
            return role
    return 'author'
|
||||||
|
|
Loading…
Reference in a new issue