normalize language

This commit is contained in:
j 2015-12-25 19:40:49 +05:30
parent c5afc46af1
commit f8c09226de
4 changed files with 10 additions and 6 deletions

View file

@ -12,7 +12,7 @@ from urllib.parse import unquote
from PIL import Image from PIL import Image
import stdnum.isbn import stdnum.isbn
from utils import normalize_isbn, find_isbns from utils import normalize_isbn, find_isbns, get_language
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -99,6 +99,8 @@ def info(epub):
data['isbn'] = [isbn] data['isbn'] = [isbn]
if 'date' in data and 'T' in data['date']: if 'date' in data and 'T' in data['date']:
data['date'] = data['date'].split('T')[0] data['date'] = data['date'].split('T')[0]
if 'language' in data and isinstance(data['language'], str):
data['language'] = get_language(data['language'])
return data return data
def extract_text(path): def extract_text(path):

View file

@ -6,7 +6,7 @@ import xml.etree.ElementTree as ET
import stdnum.isbn import stdnum.isbn
from utils import normalize_isbn from utils import normalize_isbn, get_language
from ox import strip_tags from ox import strip_tags
import ox.iso import ox.iso
@ -48,5 +48,5 @@ def info(opf):
if 'date' in data and len(data['date']) > 10: if 'date' in data and len(data['date']) > 10:
data['date'] =data['date'][:10] data['date'] =data['date'][:10]
if 'language' in data: if 'language' in data:
data['language'] = ox.iso.codeToLang(data['language']) data['language'] = get_language(data['language'])
return data return data

View file

@ -10,12 +10,11 @@ import shutil
from glob import glob from glob import glob
from datetime import datetime from datetime import datetime
import ox
from PyPDF2 import PdfFileReader from PyPDF2 import PdfFileReader
import stdnum.isbn import stdnum.isbn
import settings import settings
from utils import normalize_isbn, find_isbns from utils import normalize_isbn, find_isbns, get_language
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -163,7 +162,7 @@ def info(pdf):
if data[key] in ('Unknown',): if data[key] in ('Unknown',):
del data[key] del data[key]
if key == 'language': if key == 'language':
data[key] = ox.iso.codeToLang(data[key]) data[key] = get_language(data[key])
text = extract_text(pdf) text = extract_text(pdf)
data['textsize'] = len(text) data['textsize'] = len(text)
if settings.server['extract_text']: if settings.server['extract_text']:

View file

@ -121,6 +121,9 @@ def get_position_by_id(list, key):
return i return i
return -1 return -1
def get_language(lang):
    '''
    resolve a language code to the value ox.iso.codeToLang gives for it

    only the part of lang before the first '-' is looked up, so a
    region-qualified tag is resolved by its leading subtag.
    if the lookup returns nothing truthy, lang is returned unchanged.
    '''
    primary = lang.split('-')[0]
    resolved = ox.iso.codeToLang(primary)
    if resolved:
        return resolved
    # unknown code: keep whatever the metadata contained
    return lang
def valid(key, value, sig): def valid(key, value, sig):
''' '''
validate that value was signed by key validate that value was signed by key