Don't include editors in authors

This commit is contained in:
j 2016-01-05 13:16:30 +05:30
parent c3de15587c
commit a350831792

View file

@ -1,5 +1,5 @@
from ox.cache import read_url from ox.cache import read_url
from ox import decode_html, strip_tags, find_re from ox import decode_html, strip_tags, find_re, fix_bad_unicode
import json import json
import re import re
from urllib.parse import unquote from urllib.parse import unquote
@ -24,7 +24,10 @@ def info(key, value):
info['asin'] = [l.get('href').rpartition('/')[-1]] info['asin'] = [l.get('href').rpartition('/')[-1]]
break break
info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text)) info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])
info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])
info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0]))) info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))
info['description'] = fix_bad_unicode(info['description'])
content = doc.xpath('//div[@class="content"]')[0] content = doc.xpath('//div[@class="content"]')[0]
content_info = {} content_info = {}
for li in content.xpath('.//li'): for li in content.xpath('.//li'):
@ -52,19 +55,13 @@ def info(key, value):
if a: if a:
for span in a: for span in a:
r = span.getchildren()[0].text.strip() r = span.getchildren()[0].text.strip()
if 'Translator' in r: role = get_role(r)
role = 'translator'
else:
role = 'author'
if not role in info: info[role] = [] if not role in info: info[role] = []
info[role].append(span.text.strip()) info[role].append(span.text.strip())
else: else:
for span in doc.xpath('//span[@class="author notFaded"]'): for span in doc.xpath('//span[@class="author notFaded"]'):
author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()] author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
if 'Translator' in author[-1]: role = get_role(author[-1])
role = 'translator'
else:
role = 'author'
if not role in info: info[role] = [] if not role in info: info[role] = []
info[role].append(author[0]) info[role].append(author[0])
@ -76,3 +73,12 @@ def info(key, value):
last = covers[url] last = covers[url]
info['cover'] = re.sub('(\._SX.+?_\.)', '.', url) info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
return info return info
def get_role(value):
    """Map a contributor label to the role key it should be filed under.

    The check is a case-sensitive substring match, so a label such as
    ``'(Translator)'`` or ``'Jane Doe (Editor)'`` is recognised wherever
    the marker word appears in the string.

    Args:
        value: Contributor label text to classify (a string).

    Returns:
        ``'translator'`` if ``value`` contains ``'Translator'``,
        ``'editor'`` if it contains ``'Editor'``, otherwise ``'author'``.
        When both markers are present, ``'translator'`` wins because it
        is tested first.
    """
    if 'Translator' in value:
        return 'translator'
    if 'Editor' in value:
        return 'editor'
    # Anything without an explicit marker is treated as an author.
    return 'author'