Store metadata per user; remove primaryid; store only ISBN-13.

This commit is contained in:
j 2016-01-11 19:13:54 +05:30
parent 90648f9e65
commit 02e040d9f5
16 changed files with 245 additions and 192 deletions

View file

@ -159,44 +159,20 @@ class Changelog(db.Model):
return True
def action_edititem(self, user, timestamp, itemid, meta):
from user.models import Metadata
m = Metadata.get_or_create(user.id, itemid)
m.edit(meta)
'''
FIXME: "sometimes" update item too...
from item.models import Item
i = Item.get(itemid)
if itemid == 'RDWQN35QAY6VW2UQEINOTQXLUCOFKIZK':
return True
if not i:
logger.debug('ignore edititem for unknown item %s %s', timestamp, itemid)
return True
if i.timestamp > timestamp:
logger.debug('ignore edititem change %s %s %s', timestamp, itemid, meta)
return True
if state.user() in i.users:
logger.debug('ignore edititem events for own items %s %s %s', timestamp, itemid, meta)
logger.debug('----------------------item.modified: %s', i.modified)
return True
primary = None
if 'primaryid' in meta:
primary = meta['primaryid']
key = primary[0]
else:
keys = [k for k in meta if k in Item.id_keys]
if keys:
key = keys[0]
primary = [key, meta[key]]
i.modified = ts2datetime(timestamp)
if primary:
if not meta[key] and i.meta.get('primaryid', [''])[0] == key:
logger.debug('remove id mapping %s %s', i.id, primary)
i.update_primaryid(*primary, scrape=False, modified=i.modified)
elif meta[key] and i.meta.get('primaryid') != primary:
logger.debug('edit mapping %s %s', i.id, primary)
i.update_primaryid(*primary, scrape=False, modified=i.modified)
else:
i.update_meta(meta, modified=i.modified)
i.save()
i.edit(meta, ts2datetime(timestamp))
'''
return True
def action_removeitem(self, user, timestamp, itemid):
from item.models import Item
from user.models import Metadata
i = Item.get(itemid)
if i:
if user in i.users:
@ -205,6 +181,7 @@ class Changelog(db.Model):
i.update()
else:
i.delete()
Metadata.query(user_id=user.id, item_id=itemid).delete()
return True
def action_addlist(self, user, timestamp, name, query=None):
@ -289,6 +266,8 @@ class Changelog(db.Model):
return True
def action_editmeta(self, user, timestamp, key, value, data):
return True
'''>> Metadata no longer tracked per isbn'''
from item.models import Metadata
m = Metadata.get(key, value)
if not m or m.timestamp < timestamp:
@ -299,6 +278,7 @@ class Changelog(db.Model):
return True
def action_resetmeta(self, user, timestamp, key, value):
return True
from item.models import Metadata
m = Metadata.get(key, value)
if m and m.timestamp < timestamp:

View file

@ -132,18 +132,8 @@ def edit(data):
for id in ids:
item = models.Item.get(id)
if item and item.json()['mediastate'] == 'available':
if 'primaryid' in data:
if data['primaryid']:
key, value = data['primaryid']
logger.debug('update primaryid %s %s', key, value)
value = cleanup_id(key, value)
item.update_primaryid(key, value)
else:
item.update_primaryid()
response = item.json()
else:
item.edit_metadata(data)
response = item.json()
item.edit(data)
response = item.json()
edited.append(id)
else:
logger.info('can only edit available items %s', id)
@ -264,8 +254,6 @@ def getMetadata(data):
for key in [k['id'] for k in settings.config['itemKeys'] if isinstance(k['type'], list)]:
if key in response and not isinstance(response[key], list):
response[key] = [response[key]]
if response:
response['primaryid'] = [key, value]
return response
actions.register(getMetadata)

View file

@ -121,11 +121,14 @@ class Item(db.Model):
#j['users'] = list(map(str, list(self.users)))
if self.info:
j.update(self.info)
for key in self.info:
if (not keys or key in keys) and key not in self.meta_keys:
j[key] = self.info[key]
if self.meta:
j.update(self.meta)
for key in self.id_keys + ['primaryid']:
for key in self.id_keys:
if key not in self.meta and key in j:
del j[key]
if keys:
@ -220,7 +223,7 @@ class Item(db.Model):
for f in Find.query.filter_by(item_id=self.id).filter(Find.key.notin_(keys)):
state.db.session.delete(f)
def update(self):
def update(self, modified=None):
for key in ('mediastate', 'coverRatio', 'previewRatio'):
if key in self.meta:
if key not in self.info:
@ -233,17 +236,12 @@ class Item(db.Model):
self.info['mediastate'] = 'transferring'
else:
self.info['mediastate'] = 'available' if settings.USER_ID in users else 'unavailable'
if 'primaryid' in self.meta:
# self.meta.update does not trigger db update!
m = Metadata.load(*self.meta['primaryid'])
for key in m:
if key == 'id':
continue
self.meta[key] = m[key]
self.modified = datetime.utcnow()
if modified:
self.modified = modified
else:
self.modified = datetime.utcnow()
self.update_sort()
self.update_find()
#self.modified = datetime.utcnow()
self.save()
def save(self):
@ -260,14 +258,18 @@ class Item(db.Model):
meta_keys = (
'author',
'classification',
'categories',
'cover',
'date',
'description',
'edition',
'isbn',
'language',
'pages',
'place',
'publisher',
'series',
'tableofcontents',
'title'
)
@ -285,64 +287,17 @@ class Item(db.Model):
del self.meta[key]
update = True
if update:
self.update()
if not modified:
modified = datetime.utcnow()
self.modified = modified
self.update(modified)
self.save()
if 'cover' in record:
self.update_icons()
user = state.user()
if record and user in self.users:
Changelog.record_ts(user, modified, 'edititem', self.id, record)
def update_primaryid(self, key=None, id=None, scrape=True, modified=None):
if key is None and id is None:
if 'primaryid' not in self.meta:
return
else:
key = self.meta['primaryid'][0]
record = {}
if id:
if not key in self.meta or not key in self.meta[key]:
self.meta[key] = list(set([id] + self.meta.get(key, [])))
self.meta['primaryid'] = [key, id]
record[key] = id
else:
if key in self.meta:
del self.meta[key]
if 'primaryid' in self.meta:
del self.meta['primaryid']
record[key] = ''
for k in self.id_keys:
if k != key:
if k in self.meta:
del self.meta[k]
logger.debug('set primaryid %s %s', key, id)
# get metadata from external resources
if scrape:
self.scrape()
self.update_icons()
if not modified:
modified = datetime.utcnow()
self.modified = modified
self.save()
#if not scrape:
# Scrape.get_or_create(self.id)
for f in self.files.all():
f.move()
user = state.user()
if user in self.users:
Changelog.record_ts(user, modified, 'edititem', self.id, record)
def edit_metadata(self, data):
def edit(self, data, modified=None):
Scrape.query.filter_by(item_id=self.id).delete()
if 'primaryid' in self.meta:
logger.debug('m: %s', self.meta['primaryid'])
m = Metadata.get_or_create(*self.meta['primaryid'])
if m.edit(data):
self.update()
else:
self.update_meta(data)
self.update_meta(data, modified)
for f in self.files.all():
f.move()
@ -388,23 +343,19 @@ class Item(db.Model):
for resolution in (128, 256, 512):
del icons['%s:%s' % (key, resolution)]
def scrape(self):
primaryid = self.meta.get('primaryid')
logger.debug('scrape %s', primaryid)
if primaryid:
try:
m = meta.lookup(*primaryid)
except:
logger.debug('meta.lookup %s failed:', primaryid, exc_info=True)
m = None
if m:
m['primaryid'] = primaryid
self.meta = m
self.modified = datetime.utcnow()
self.update()
return True
return False
return True
def load_metadata(self):
'''
load metadata from user_metadata or get via isbn?
'''
for key in self.meta_keys:
if key not in self.meta and key in self.info:
self.meta[key] = self.info[key]
#FIXME get from user_meta
if state.online:
if 'isbn' in self.meta:
data = meta.lookup('isbn', self.meta['isbn'])
if data:
self.meta.update(data)
def queue_download(self):
u = state.user()

View file

@ -46,23 +46,14 @@ def add_file(id, f, prefix, from_=None):
data = media.metadata(f, from_)
file = File.get_or_create(id, data, path)
item = file.item
if 'primaryid' in file.info:
del file.info['primaryid']
state.db.session.add(file)
if 'primaryid' in item.info:
item.meta['primaryid'] = item.info.pop('primaryid')
state.db.session.add(item)
item.add_user(user)
Changelog.record(user, 'additem', item.id, file.info)
item.added = datetime.utcnow()
if state.online:
item.scrape()
#Changelog.record(user, 'edititem', item.id, dict([item.meta['primaryid']]))
item.load_metadata()
Changelog.record(user, 'additem', item.id, file.info)
Changelog.record(user, 'edititem', item.id, item.meta)
item.update_icons()
item.modified = datetime.utcnow()
item.update()
#Scrape.get_or_create(item.id)
return file
def run_scan():

View file

@ -15,7 +15,7 @@ from . import epub
from . import txt
from . import opf
from meta.utils import decode_html_data
from meta.utils import decode_html_data, to_isbn13
def get_id(f=None, data=None):
if data:
@ -23,7 +23,6 @@ def get_id(f=None, data=None):
else:
return base64.b32encode(codecs.decode(ox.sha1sum(f, cached=True), 'hex')).decode()
def metadata(f, from_=None):
ext = f.split('.')[-1]
data = {}
@ -64,10 +63,17 @@ def metadata(f, from_=None):
data[key] = data[key].replace('\x00', '')
elif isinstance(data[key], list):
data[key] = [e.replace('\x00', '') if isinstance(e, str) else e for e in data[key]]
if 'isbn' in data:
data['primaryid'] = ['isbn', data['isbn'][0]]
elif 'asin' in data:
data['primaryid'] = ['asin', data['asin'][0]]
if 'isbn' in data and isinstance(data['isbn'], list):
isbns = set()
for i in data['isbn']:
i = to_isbn13(i)
if i:
isbns.add(i)
if isbns:
data['isbn'] = list(isbns)[0]
else:
del data['isbn']
if 'author' in data:
if isinstance(data['author'], str):
if data['author'].strip():

View file

@ -5,14 +5,13 @@
import os
import xml.etree.ElementTree as ET
import zipfile
from io import BytesIO
import re
from urllib.parse import unquote
import lxml.html
import stdnum.isbn
from ox import strip_tags, decode_html
from utils import normalize_isbn, find_isbns, get_language
from utils import find_isbns, get_language, to_isbn13
import logging
logger = logging.getLogger(__name__)
@ -104,13 +103,24 @@ def info(epub):
}.get(key, key)
value = e.text.strip()
if key == 'identifier':
value = normalize_isbn(value)
if stdnum.isbn.is_valid(value):
data['isbn'] = [value]
value = to_isbn13(value)
if value:
data['isbn'] = value
elif key == 'author':
data[key] = value.split(', ')
else:
data[key] = value
guide = info.findall('{http://www.idpf.org/2007/opf}guide')
if guide:
for ref in guide[0].findall('{http://www.idpf.org/2007/opf}reference'):
if ref.attrib.get('type') == 'toc':
filename = unquote(ref.attrib['href'])
filename = os.path.normpath(os.path.join(os.path.dirname(opf[0]), filename))
toc = z.read(filename)
if toc:
doc = lxml.html.document_fromstring(toc)
data['tableofcontents'] = '\n'.join([a.text_content() for a in doc.xpath('//a')])
if 'description' in data:
data['description'] = strip_tags(decode_html(data['description']))
text = extract_text(epub)
@ -118,7 +128,7 @@ def info(epub):
if not 'isbn' in data:
isbn = extract_isbn(text)
if isbn:
data['isbn'] = [isbn]
data['isbn'] = isbn
if 'date' in data and 'T' in data['date']:
data['date'] = data['date'].split('T')[0]
if 'language' in data and isinstance(data['language'], str):
@ -139,4 +149,3 @@ def extract_isbn(data):
isbns = find_isbns(data)
if isbns:
return isbns[0]

View file

@ -4,11 +4,9 @@
import xml.etree.ElementTree as ET
import stdnum.isbn
from utils import normalize_isbn, get_language
from utils import get_language, to_isbn13
from ox import strip_tags
import ox.iso
import logging
logger = logging.getLogger(__name__)
@ -31,12 +29,9 @@ def info(opf):
}.get(key, key)
value = e.text
if key == 'identifier':
isbn = normalize_isbn(value)
if stdnum.isbn.is_valid(isbn):
if not 'isbn' in data:
data['isbn'] = [isbn]
else:
data['isbn'].append(isbn)
isbn = to_isbn13(value)
if isbn:
data['isbn'] = isbn
if e.attrib.get(ns + 'scheme') == 'AMAZON':
if not 'asin' in data:
data['asin'] = [value]

View file

@ -11,11 +11,10 @@ from glob import glob
from datetime import datetime
from PyPDF2 import PdfFileReader
import stdnum.isbn
import ox
import settings
from utils import normalize_isbn, find_isbns, get_language
from utils import get_language, to_isbn13, find_isbns
import logging
logger = logging.getLogger(__name__)
@ -151,9 +150,9 @@ def info(pdf):
del data[key]
'''
if 'identifier' in data:
value = normalize_isbn(data['identifier'])
if stdnum.isbn.is_valid(value):
data['isbn'] = [value]
value = to_isbn13(data['identifier'])
if value:
data['isbn'] = value
del data['identifier']
for key, value in data.items():
if isinstance(value, dict):
@ -170,9 +169,7 @@ def info(pdf):
if not 'isbn' in data:
isbn = extract_isbn(text)
if isbn:
data['isbn'] = [isbn]
if 'isbn' in data and isinstance(data['isbn'], str):
data['isbn'] = [data['isbn']]
data['isbn'] = isbn
if 'date' in data and len(data['date']) == 8 and data['date'].isdigit():
d = data['date']
data['date'] = '%s-%s-%s' % (d[:4], d[4:6], d[6:])

View file

@ -23,7 +23,7 @@ def info(path):
text = extract_text(path)
isbn = extract_isbn(text)
if isbn:
data['isbn'] = [isbn]
data['isbn'] = isbn
data['textsize'] = len(text)
return data

View file

@ -46,11 +46,10 @@ def info(key, value):
info['publisher'], info['edition'] = info['publisher'].split('; ', 1)
if 'ISBN-13' in content_info:
if not 'isbn' in info: info['isbn'] = []
info['isbn'] = content_info['ISBN-13'].replace('-', '')
info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
if 'ISBN-10' in content_info:
if not 'isbn' in info: info['isbn'] = []
info['isbn'].append(content_info['ISBN-10'])
elif 'ISBN-10' in content_info:
info['isbn'] = stdnum.isbn.to_isbn13(content_info['ISBN-10'])
a = doc.xpath('//span[@class="a-size-medium"]')
if a:

View file

@ -21,14 +21,13 @@ def find(query):
done = set()
for isbn in isbns:
if isbn not in done:
isbn = stdnum.isbn.to_isbn13(isbn)
r = {
'isbn': [isbn],
'primaryid': ['isbn', isbn]
}
results.append(r)
done.add(isbn)
if len(isbn) == 10:
done.add(stdnum.isbn.to_isbn13(isbn))
if len(isbn) == 13 and isbn.startswith('978'):
done.add(stdnum.isbn.to_isbn10(isbn))
return results

View file

@ -6,7 +6,7 @@ from ox.cache import get_json, store
import ox.web.google
import stdnum.isbn
from .utils import find_isbns, get_language, decode_html_data
from .utils import find_isbns, get_language, decode_html_data, to_isbn13
import logging
logger = logging.getLogger(__name__)
@ -51,6 +51,7 @@ def info(key, value):
data = {}
for key in [
'authors',
'categories',
'description',
'pageCount',
'publishedDate',
@ -83,6 +84,9 @@ def info(key, value):
data['isbn'].append(k['identifier'])
else:
print('unknown identifier', k)
if 'isbn' in data:
data['isbn'] = [to_isbn13(i) for i in data['isbn']][0]
if 'publisher' in data and isinstance(data['publisher'], str):
data['publisher'] = [data['publisher']]
if 'language' in _data:

View file

@ -6,6 +6,16 @@ import re
import stdnum.isbn
import ox
import ox.iso
def to_isbn13(isbn):
    """Return `isbn` normalized to an ISBN-13 string, or None if invalid.

    Accepts ISBN-10 or ISBN-13 input (with or without separators);
    `stdnum.isbn.validate(..., convert=True)` converts ISBN-10 to ISBN-13.
    Values that fail validation, or whose converted form does not start
    with the '97' Bookland prefix, yield None.
    """
    try:
        isbn = stdnum.isbn.validate(isbn, convert=True)
        if isbn[:2] != '97':
            isbn = None
    # was a bare `except:`, which also swallows SystemExit/KeyboardInterrupt;
    # stdnum raises ValidationError subclasses (InvalidFormat, InvalidChecksum, ...)
    except Exception:
        isbn = None
    return isbn
def normalize_isbn(value):
    """Drop everything from *value* except digits and the ISBN-10 check character 'X'."""
    def _keep(ch):
        return ch.isdigit() or ch == 'X'
    return ''.join(filter(_keep, value))
@ -13,14 +23,11 @@ def normalize_isbn(value):
def find_isbns(text):
if isinstance(text, bytes):
text = text.decode()
matches = re.compile('\d[\d\-X\ ]+').findall(text)
matches = re.compile('\d[\d\-X\u2013\ ]+').findall(text)
matches = [normalize_isbn(value) for value in matches]
return [isbn for isbn in matches if stdnum.isbn.is_valid(isbn)
and len(isbn) in (10, 13)
and isbn not in (
'0' * 10,
'0' * 13,
)]
matches = [to_isbn13(value) for value in matches]
matches = list(set([value for value in matches if value]))
return matches
def get_language(lang):
return ox.iso.codeToLang(lang.split('-')[0]) or lang

View file

@ -111,7 +111,6 @@ CREATE TABLE sort (
date VARCHAR(1000),
language VARCHAR(1000),
pages BIGINT,
classification VARCHAR(1000),
extension VARCHAR(1000),
size BIGINT,
created DATETIME,
@ -136,7 +135,6 @@ CREATE INDEX ix_sort_accessed ON sort (accessed);
CREATE INDEX ix_sort_added ON sort (added);
CREATE INDEX ix_sort_asin ON sort (asin);
CREATE INDEX ix_sort_author ON sort (author);
CREATE INDEX ix_sort_classification ON sort (classification);
CREATE INDEX ix_sort_country ON sort (country);
CREATE INDEX ix_sort_created ON sort (created);
CREATE INDEX ix_sort_date ON sort (date);
@ -274,7 +272,64 @@ def upgrade_db(old, new=None):
i.update_sort()
i.update_find()
session.commit()
if old <= '20160111-603-90648f9' and new > '20160111-603-90648f9':
for f in settings.ui['filters']:
if f['id'] == 'classification':
f['id'] = 'categories'
settings.ui._save()
run_sql('ALTER TABLE sort ADD categories VARCHAR(1000)')
run_sql('ALTER TABLE sort ADD series VARCHAR(1000)')
run_sql('CREATE INDEX ix_sort_categories ON sort (categories)')
run_sql('''CREATE TABLE user_metadata (
created DATETIME,
modified DATETIME,
id INTEGER NOT NULL,
item_id VARCHAR(32),
user_id VARCHAR(43),
data_hash VARCHAR(40),
data BLOB,
PRIMARY KEY (id),
FOREIGN KEY(user_id) REFERENCES user (id)
)''')
run_sql('CREATE UNIQUE INDEX IF NOT EXISTS user_metadata_index ON user_metadata(id, user_id)')
run_sql('CREATE INDEX ix_user_metadata_data_hash ON user_metadata (data_hash)')
from meta.utils import to_isbn13
from item.models import Item
from user.models import Metadata
with db.session() as session:
for i in Item.query:
update = False
if 'primaryid' in i.meta:
del i.meta['primaryid']
update = True
if 'primaryid' in i.info:
del i.info['primaryid']
update = True
for key in i.meta_keys:
if key not in i.meta and key in i.info:
i.meta[key] = i.info[key]
update = True
if 'isbn' in i.meta and isinstance(i.meta['isbn'], list):
isbns = [to_isbn13(isbn) for isbn in i.meta['isbn']]
isbns = [isbn for isbn in isbns if isbn]
if isbns:
i.meta['isbn'] = isbns[0]
if 'isbn' in i.info:
i.info['isbn'] = i.meta['isbn']
else:
del i.meta['isbn']
if 'isbn' in i.info:
del i.info['isbn']
update = True
if 'isbn' in i.meta and not i.meta['isbn']:
del i.meta['isbn']
update = True
if update:
session.add(i)
for u in i.users:
if u.id != settings.USER_ID:
Metadata.get_or_create(u.id, i.id, i.meta, False)
session.commit()
if old <= '20140527-120-3cb9819':
run_sql('CREATE INDEX ix_find_findvalue ON find (findvalue)')

View file

@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import json
import hashlib
import sqlalchemy as sa
@ -11,6 +12,7 @@ import db
import json_pickler
import settings
import state
import utils
import logging
logger = logging.getLogger(__name__)
@ -118,8 +120,10 @@ class User(db.Model):
if not i.users:
i.delete()
Changelog.query.filter_by(user_id=self.id).delete()
if self.id in settings.ui['showFolder']:
del settings.ui['showFolder'][self.id]
Metadata.query.filter_by(user_id=self.id).delete()
if self.name in settings.ui['showFolder']:
del settings.ui['showFolder'][self.name]
settings.ui._save()
self.save()
if was_peering:
Changelog.record(state.user(), 'removepeer', self.id)
@ -318,3 +322,71 @@ class List(db.Model):
def save(self):
state.db.session.add(self)
state.db.session.commit()
class Metadata(db.Model):
    """Per-user item metadata, stored one row per (user_id, item_id) pair."""
    __tablename__ = 'user_metadata'

    created = sa.Column(sa.DateTime())
    modified = sa.Column(sa.DateTime())
    id = sa.Column(sa.Integer(), primary_key=True)
    item_id = sa.Column(sa.String(32))
    user_id = sa.Column(sa.String(43), sa.ForeignKey('user.id'))
    # sha1 of the canonical JSON serialization of `data` (see save())
    data_hash = sa.Column(sa.String(40), index=True)
    data = sa.Column(MutableDict.as_mutable(sa.PickleType(pickler=json_pickler)))

    def __repr__(self):
        return '{item}/{user}'.format(item=self.item_id, user=self.user_id)

    @property
    def timestamp(self):
        # `modified` expressed as a numeric timestamp for changelog ordering
        return utils.datetime2ts(self.modified)

    @classmethod
    def get(cls, user_id, item_id):
        """Return the Metadata row for (user_id, item_id), or None."""
        return cls.query.filter_by(user_id=user_id, item_id=item_id).first()

    @classmethod
    def get_or_create(cls, user_id, item_id, data=None, commit=True):
        """Fetch or create the row; if it already exists and `data` is given, merge via edit()."""
        m = cls.get(user_id=user_id, item_id=item_id)
        if not m:
            m = cls(user_id=user_id, item_id=item_id)
            m.created = datetime.utcnow()
            if data:
                m.data = data
            else:
                m.data = {}
            m.save(commit)
        elif data:
            m.edit(data, commit)
        return m

    def save(self, commit=True):
        """Persist the row, refreshing `modified` and the content hash of `data`."""
        self.modified = datetime.utcnow()
        self.data_hash = hashlib.sha1(json.dumps(self.data, ensure_ascii=False, sort_keys=True).encode()).hexdigest()
        state.db.session.add(self)
        if commit:
            state.db.session.commit()

    def edit(self, data, commit=True):
        """Merge `data` into self.data, returning the dict of changed keys.

        An 'isbn' list is normalized to a single ISBN-13 string (invalid
        entries discarded; key removed if none survive), mirroring the
        single-isbn13 storage used elsewhere in this commit.
        """
        changes = {}
        if 'isbn' in data and isinstance(data['isbn'], list):
            isbns = [utils.to_isbn13(isbn) for isbn in data['isbn']]
            isbns = [isbn for isbn in isbns if isbn]
            # FIX: original read `if isbn: data['isbn'] = isbn`, but `isbn`
            # is undefined here (comprehension vars don't leak in Python 3)
            # -> NameError. Use the filtered list, keeping the first value
            # as the migration code in upgrade_db does.
            if isbns:
                data['isbn'] = isbns[0]
            else:
                del data['isbn']
        for key in data:
            if key == 'id':
                continue
            if data[key] != self.data.get(key):
                self.data[key] = data[key]
                changes[key] = data[key]
        if changes:
            self.save(commit)
        return changes

    def delete(self):
        state.db.session.delete(self)
        state.db.session.commit()

View file

@ -29,7 +29,7 @@ from Crypto.PublicKey import RSA
from Crypto.Util.asn1 import DerSequence
from meta.utils import normalize_isbn, find_isbns, get_language
from meta.utils import normalize_isbn, find_isbns, get_language, to_isbn13
import logging
logger = logging.getLogger(__name__)