# dd-re/recommendation_engine.py
#
# 285 lines
# 10 KiB
# Python

'''
Recommendation Engine Example
1 Nov 2017, 0x2620
'''
from collections import defaultdict
import json
import logging
import os
import random
import time
import ox
from utils import run_async
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class Engine:
    """Recommendation engine that builds playlists from pan.do/ra data.

    Playlists, engine state and video metadata are cached as JSON files
    (``playlists.json``, ``state.json``, ``videos.json``) inside ``path``.
    """

    # Lazily created Pandora API client; see the ``pandora`` property.
    _pandora = None
    # Total number of playlists ``get_videos`` tries to return.
    VIDEO_COUNT = 16

    def __init__(self, path, **kwargs):
        """Load cached playlists and state from ``path``.

        Args:
            path: Directory holding the JSON cache files.
            **kwargs: Optional ``pandora`` (API url), ``username`` and
                ``password`` used when connecting to pan.do/ra.

        May write ``state.json`` if syncing the keywords changes the state.
        """
        self.path = path
        self.pandora_args = dict(
            url=kwargs.get('pandora', 'http://pandora.dmp/api/'),
            username=kwargs.get('username', 'dd.re'),
            password=kwargs.get('password', 'dd.re')
        )
        filename = os.path.join(self.path, 'playlists.json')
        if os.path.exists(filename):
            with open(filename, encoding='utf-8') as f:
                self.playlists = json.load(f)
        else:
            self.playlists = []
        filename = os.path.join(self.path, 'state.json')
        if os.path.exists(filename):
            with open(filename, encoding='utf-8') as f:
                self.state = json.load(f)
        else:
            self.state = {
                'channels': {
                    'globalKeywords': {'locked': False, 'value': 7},
                    'userKeywords': {'locked': False, 'value': 7},
                    'screenings': {'locked': True, 'value': 2}
                },
                'globalKeywords': {},
            }
        self.update_keywords()

    @property
    def pandora(self):
        """Return the Pandora client, connecting on first access.

        Blocks and retries every 10 seconds until a connection succeeds.
        """
        while not self._pandora:
            try:
                self._pandora = Pandora(**self.pandora_args)
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt / SystemExit can still stop the loop;
                # exc_info records why the connection attempt failed.
                logger.error(
                    'failed to connect to pandora, retry in 10 seconds',
                    exc_info=True
                )
                time.sleep(10)
        return self._pandora

    def _patch_clips(self, clips):
        """Set each clip's ``out`` to the next in-point of the same video.

        Clips are grouped by video id (the part of ``clip['id']`` before the
        first ``/``) and ordered by ``in``. The last clip of every video gets
        the video's duration, fetched from pan.do/ra, as its ``out``.

        Args:
            clips: List of clip dicts with ``id`` and ``in`` keys; modified
                in place.

        Returns:
            The same ``clips`` list.
        """
        inpoints = defaultdict(list)
        for index, clip in enumerate(clips):
            video_id = clip['id'].split('/')[0]
            inpoints[video_id].append({
                'index': index,
                'position': clip['in']
            })
        for video_id, points in inpoints.items():
            # Sort first, then pair neighbours of the *sorted* list so that
            # "next" really is the next in-point by position.
            points.sort(key=lambda point: point['position'])
            for current, following in zip(points, points[1:]):
                clips[current['index']]['out'] = following['position']
            clips[points[-1]['index']]['out'] = self.pandora.get(
                video_id, ['duration']
            )['duration']
        return clips

    def get_videos(self, user):
        """Select and shuffle playlists for ``user``.

        Selection happens in three stages, each using a random base score
        per playlist plus a bonus for matching tags:

        1. ``channels['userKeywords']`` playlists scored by the user's
           keyword weights.
        2. ``channels['globalKeywords']`` of the remaining playlists scored
           by the global keyword sliders.
        3. The rest, up to ``VIDEO_COUNT`` in total, scored by how often the
           user's events reference a product that appears in the tags.

        Args:
            user: Dict with optional ``keywords`` (tag -> weight) and
                ``events`` (list of dicts with an optional
                ``data.product``).

        Returns:
            List of dicts with ``clips``, ``position`` (random start index
            into ``clips``), ``name`` and ``tags``.
        """
        channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()}
        sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()}
        user_count = channels['userKeywords']
        global_count = channels['globalKeywords']

        # For each playlist, compute user keyword score
        user_keywords = user.get('keywords', {})
        score = {}
        for playlist in self.playlists:
            score[playlist['name']] = random.random()
            for tag in playlist['tags']:
                if tag in user_keywords:
                    score[playlist['name']] += user_keywords[tag]
        # Select highest scoring playlists
        playlists = sorted(
            self.playlists,
            key=lambda playlist: -score[playlist['name']]
        )
        videos = playlists[:user_count]
        playlists = playlists[user_count:]

        # For each playlist, compute global keyword score
        score = {}
        for playlist in playlists:
            score[playlist['name']] = random.random()
            for tag in playlist['tags']:
                if tag in sliders:
                    score[playlist['name']] += sliders[tag]
        # Select highest scoring playlists
        playlists = sorted(
            playlists,
            key=lambda playlist: -score[playlist['name']]
        )
        videos += playlists[:global_count]
        playlists = playlists[global_count:]

        # Count products the user has seen
        count = defaultdict(int)
        for event in user.get('events', []):
            if event.get('data', {}).get('product'):
                count[event['data']['product']] += 1
        # For each product in playlist tags, increment score by count
        for playlist in playlists:
            score[playlist['name']] = random.random()
            for tag in set(playlist['tags']) & set(count):
                score[playlist['name']] += count[tag]
        # Select highest scoring playlists. Clamp at zero: a negative slice
        # bound would drop items from the end instead of selecting none.
        remaining = max(0, self.VIDEO_COUNT - user_count - global_count)
        videos += sorted(
            playlists,
            key=lambda playlist: -score[playlist['name']]
        )[:remaining]

        # Shuffle playlists (randomize layout) and shift clips (randomize start)
        random.shuffle(videos)
        return [{
            'clips': video['clips'],
            'position': random.randrange(len(video['clips'])),
            'name': video['name'],
            'tags': video['tags'],
        } for video in videos]

    def get_next(self, user, position):
        """Return the entry at ``position`` of a fresh ``get_videos`` result.

        The selection is recomputed (and reshuffled) on every call.
        """
        video = self.get_videos(user)[position]
        return video

    def update_state(self, data):
        """Merge ``data`` into the engine state and persist it.

        Keys whose current value is a dict are updated in place; any other
        key is set to the new value.

        Returns:
            The updated state dict.
        """
        for key, value in data.items():
            current = self.state.get(key)
            if key in self.state and isinstance(current, dict):
                current.update(value)
            else:
                # New key, or an existing non-dict value that cannot be
                # merged: replace it.
                self.state[key] = value
        self.save_state()
        return self.state

    def save_state(self):
        """Write the engine state to ``state.json`` inside ``path``."""
        filename = os.path.join(self.path, 'state.json')
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding must be explicit rather than locale dependent.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.state, f, indent=4, ensure_ascii=False, sort_keys=True)

    def update(self):
        """Rebuild playlists from pan.do/ra and refresh the JSON caches.

        Fetches storyline entities, the items of the ``dau:DD`` list and the
        annotations of the ``storylines`` layer, then writes ``videos.json``
        and ``playlists.json`` and syncs the global keywords.
        """
        # Get all storylines with tags
        storylines = [{
            'id': entity['id'],
            'name': entity['name'],
            'nodename': entity['nodename'],
            'tags': [t.strip() for t in entity['tags']]
        } for entity in self.pandora.find_entities({
            'conditions': [
                {'key': 'type', 'operator': '==', 'value': 'storylines'},
            ],
            'operator': '&'
        }, ['id', 'name', 'tags', 'nodename']) if entity.get('tags', []) and entity.get('nodename')]
        # Get set of storyline names
        names = {storyline['name'] for storyline in storylines}
        # Get set of items to use in DD
        items = {item['id'] for item in self.pandora.find({
            'conditions': [
                {'key': 'list', 'operator': '==', 'value': 'dau:DD'}
            ]
        }, ['id'])}
        # Get all clips annotated with storyline references
        clips = [clip for clip in self.pandora.find_annotations({
            'conditions': [
                {'key': 'layer', 'operator': '==', 'value': 'storylines'}
            ],
            'operator': '&'
        }, ['id', 'in', 'out', 'value']) if clip['value'] in names and clip['id'].split('/')[0] in items]
        # Get ids of videos with clips
        ids = {clip['id'].split('/')[0] for clip in clips}
        # Get and cache video data
        filename = os.path.join(self.path, 'videos.json')
        if os.path.exists(filename):
            with open(filename, encoding='utf-8') as f:
                cached_videos = json.load(f)
            cached_ids = {video['id'] for video in cached_videos}
        else:
            cached_videos, cached_ids = [], set()
        # Only videos missing from the cache are fetched
        videos = sorted(cached_videos + [
            self.pandora.get(video_id, ['code', 'id', 'order', 'title'])
            for video_id in sorted(ids - cached_ids)
        ], key=lambda video: int(video['order']))
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(videos, f, indent=4, sort_keys=True)
        # Get video order
        order = {video['id']: int(video['order']) for video in videos}
        # Sort clips
        clips = sorted(
            clips,
            key=lambda clip: (order[clip['id'].split('/')[0]], clip['in'])
        )
        # Group the sorted clips by storyline name once instead of scanning
        # all clips for every storyline.
        clips_by_name = defaultdict(list)
        for clip in clips:
            clips_by_name[clip['value']].append(clip)
        # Get and cache playlists (storylines without clips are dropped)
        playlists = []
        for storyline in storylines:
            playlist_clips = [{
                'item': clip['id'].split('/')[0],
                'id': clip['id'],
                'in': clip['in'],
                'out': clip['out']
            } for clip in clips_by_name.get(storyline['name'], ())]
            if playlist_clips:
                playlists.append({
                    'id': storyline['id'],
                    'name': storyline['nodename'].strip(),
                    'tags': storyline['tags'],
                    'clips': playlist_clips
                })
        self.playlists = playlists
        with open(os.path.join(self.path, 'playlists.json'), 'w', encoding='utf-8') as f:
            json.dump(self.playlists, f, indent=4, sort_keys=True)
        self.update_keywords()

    def update_keywords(self):
        """Sync ``state['globalKeywords']`` with the current playlist tags.

        Every non-empty tag that is not all upper case gets an entry
        (initial value 0); entries whose tag no longer occurs are removed.
        The state is saved only if something changed.
        """
        changed = False
        if 'globalKeywords' not in self.state:
            self.state['globalKeywords'] = {}
            changed = True
        keywords = self.state['globalKeywords']
        existing_tags = set()
        for playlist in self.playlists:
            for tag in playlist.get('tags', []):
                # Skip empty and all upper-case tags for both bookkeeping
                # steps, so an empty tag is never added and then removed.
                if not tag or tag.isupper():
                    continue
                existing_tags.add(tag)
                if tag not in keywords:
                    keywords[tag] = {'value': 0}
                    changed = True
        for tag in set(keywords) - existing_tags:
            del keywords[tag]
            changed = True
        if changed:
            self.save_state()

    @run_async
    def update_async(self):
        """Run ``update`` through the ``run_async`` decorator."""
        self.update()
class Pandora:
    """pan.do/ra API wrapper."""

    def __init__(self, url, username, password):
        """Open the API at ``url`` and sign in with the given credentials."""
        self.api = ox.API(url)
        self.api.signin(username=username, password=password)

    def _items(self, action, query, keys):
        """Call the list-style API ``action`` and return its result items."""
        request = {
            'keys': keys,
            'query': query,
            'range': [0, 1000000]
        }
        response = action(request)
        return response['data']['items']

    def find(self, query, keys):
        """Return the items matching ``query``, limited to ``keys``."""
        return self._items(self.api.find, query, keys)

    def find_annotations(self, query, keys):
        """Return the annotations matching ``query``, limited to ``keys``."""
        return self._items(self.api.findAnnotations, query, keys)

    def find_entities(self, query, keys):
        """Return the entities matching ``query``, limited to ``keys``."""
        return self._items(self.api.findEntities, query, keys)

    def get(self, id, keys):
        """Return the ``data`` of the ``get`` call for ``id`` and ``keys``."""
        request = {
            'id': id,
            'keys': keys
        }
        response = self.api.get(request)
        return response['data']
if __name__ == '__main__':
    # Run as a script: rebuild the caches in the 'json' directory
    # (relative to the current working directory).
    engine = Engine('json')
    engine.update()