normalize language

This commit is contained in:
j 2015-12-25 19:40:49 +05:30
parent c5afc46af1
commit f8c09226de
4 changed files with 10 additions and 6 deletions

View file

@ -12,7 +12,7 @@ from urllib.parse import unquote
from PIL import Image
import stdnum.isbn
from utils import normalize_isbn, find_isbns
from utils import normalize_isbn, find_isbns, get_language
import logging
logger = logging.getLogger(__name__)
@ -99,6 +99,8 @@ def info(epub):
data['isbn'] = [isbn]
if 'date' in data and 'T' in data['date']:
data['date'] = data['date'].split('T')[0]
if 'language' in data and isinstance(data['language'], str):
data['language'] = get_language(data['language'])
return data
def extract_text(path):

View file

@ -6,7 +6,7 @@ import xml.etree.ElementTree as ET
import stdnum.isbn
from utils import normalize_isbn
from utils import normalize_isbn, get_language
from ox import strip_tags
import ox.iso
@ -48,5 +48,5 @@ def info(opf):
if 'date' in data and len(data['date']) > 10:
data['date'] =data['date'][:10]
if 'language' in data:
data['language'] = ox.iso.codeToLang(data['language'])
data['language'] = get_language(data['language'])
return data

View file

@ -10,12 +10,11 @@ import shutil
from glob import glob
from datetime import datetime
import ox
from PyPDF2 import PdfFileReader
import stdnum.isbn
import settings
from utils import normalize_isbn, find_isbns
from utils import normalize_isbn, find_isbns, get_language
import logging
logger = logging.getLogger(__name__)
@ -163,7 +162,7 @@ def info(pdf):
if data[key] in ('Unknown',):
del data[key]
if key == 'language':
data[key] = ox.iso.codeToLang(data[key])
data[key] = get_language(data[key])
text = extract_text(pdf)
data['textsize'] = len(text)
if settings.server['extract_text']:

View file

@ -121,6 +121,9 @@ def get_position_by_id(list, key):
return i
return -1
def get_language(lang):
    '''
        normalize a language code or tag

        only the part of lang before the first '-' is looked up
        via ox.iso.codeToLang; if that lookup returns something
        truthy it is returned, otherwise lang is returned unchanged.

        NOTE(review): presumably codeToLang maps an ISO 639 code to a
        language name and returns a falsy value for unknown codes -
        confirm against ox.iso.
    '''
    primary = lang.split('-')[0]
    resolved = ox.iso.codeToLang(primary)
    if resolved:
        return resolved
    # lookup produced nothing usable, keep the value we were given
    return lang
def valid(key, value, sig):
'''
validate that value was signed by key