393 lines
16 KiB
Python
393 lines
16 KiB
Python
'''
|
|
Recommendation Engine Example
|
|
1 Nov 2017, 0x2620
|
|
'''
|
|
|
|
from collections import defaultdict
|
|
import json
|
|
import logging
|
|
import os
|
|
import random
|
|
import time
|
|
|
|
import ox
|
|
|
|
from utils import run_async
|
|
|
|
# Module-level logger, named after this module so handlers can filter on it.
logger = logging.getLogger(__name__)

# NOTE(review): not referenced anywhere in the visible part of this file —
# presumably read by importing modules; confirm before removing.
verbose = True
|
|
|
|
|
|
class Engine:
    """Recommendation engine backed by JSON files in a data directory.

    Playlists and tunable state are read from ``playlists.json`` and
    ``state.json`` below ``path``.
    """

    # Cached Pandora client; stays None until a connection has been made.
    _pandora = None

    def __init__(self, path, **kwargs):
        """Load cached playlists and state from ``path``.

        Args:
            path: Directory holding ``playlists.json`` and ``state.json``.
                Missing files are replaced by an empty playlist list and the
                built-in default state.
            **kwargs: Optional ``pandora`` (API URL), ``username`` and
                ``password`` overriding the built-in connection defaults.
        """
        self.path = path
        self.pandora_args = dict(
            url=kwargs.get('pandora', 'http://pandora.dmp/api/'),
            username=kwargs.get('username', 'dd.re'),
            password=kwargs.get('password', 'dd.re')
        )

        playlists_file = os.path.join(self.path, 'playlists.json')
        if os.path.exists(playlists_file):
            with open(playlists_file) as f:
                self.playlists = json.load(f)
        else:
            self.playlists = []

        state_file = os.path.join(self.path, 'state.json')
        if os.path.exists(state_file):
            with open(state_file) as f:
                self.state = json.load(f)
        else:
            self.state = {'globalKeywords': {}}

        # Back-fill sections that an older or hand-edited state.json may
        # lack; the recommendation methods index both of them directly and
        # would otherwise fail with KeyError.
        self.state.setdefault('channels', {
            'globalKeywords': {'locked': False, 'value': 7},
            'userKeywords': {'locked': False, 'value': 7},
            'screenings': {'locked': True, 'value': 2}
        })
        self.state.setdefault('gridChange', {
            'nextClip': {'locked': False, 'value': 4},
            'nextPlaylist': {'locked': False, 'value': 4},
            'staySame': {'locked': False, 'value': 8}
        })
        self.update_keywords()
|
|
|
|
@property
def pandora(self):
    """Return the Pandora client, connecting on first access.

    Blocks until a connection succeeds, retrying every 10 seconds. Only
    ``Exception`` subclasses trigger a retry, so ``KeyboardInterrupt`` and
    ``SystemExit`` still stop the process instead of being swallowed by the
    retry loop.
    """
    while not self._pandora:
        try:
            self._pandora = Pandora(**self.pandora_args)
        except Exception:
            # Log the traceback as well: without it a programming error or
            # bad credentials cannot be told apart from a network outage.
            logger.error(
                'failed to connect to pandora, retry in 10 seconds',
                exc_info=True
            )
            time.sleep(10)
    return self._pandora
|
|
|
|
def _patch_clips(self, clips):
    """Set each clip's ``out`` to the next in-point within the same video.

    Clips are grouped by video (the part of ``clip['id']`` before the first
    ``/``) and ordered by ``in``. Every clip ends where the next clip of the
    same video starts; the last clip of a video ends at the video's
    ``duration`` as reported by pan.do/ra.

    Args:
        clips: List of clip dicts with at least ``id`` and ``in``. The dicts
            are modified in place.

    Returns:
        The same ``clips`` list.
    """
    marks_by_video = {}
    for index, clip in enumerate(clips):
        video_id = clip['id'].split('/')[0]
        marks_by_video.setdefault(video_id, []).append((clip['in'], index))

    for video_id, marks in marks_by_video.items():
        # Sort once and read the successor from the *sorted* sequence;
        # looking it up in input order gives wrong out-points whenever the
        # clips do not already arrive sorted by in-point.
        marks.sort(key=lambda mark: mark[0])
        for (_, index), (next_in, _) in zip(marks, marks[1:]):
            clips[index]['out'] = next_in
        last_index = marks[-1][1]
        clips[last_index]['out'] = self.pandora.get(video_id, ['duration'])['duration']
    return clips
|
|
|
|
def get_videos(self, user):
    """Return the next grid of videos for ``user``.

    The grid size is the sum of the three ``gridChange`` values in the
    state. If the user's events do not contain a grid event for every grid
    slot, a completely fresh grid from ``get_recommendations`` is returned.
    Otherwise the slots are randomly split into three pools:

    * nextClip: same playlist, advanced by one clip (wrapping to 0),
    * nextPlaylist: replaced by a new recommendation,
    * staySame: returned as an empty dict.

    The slot of the latest play event is kept out of the shuffle and put
    last, which lands it in the staySame pool whenever that pool is not
    empty.

    Args:
        user: Dict with an ``events`` list. For each slot the first grid
            event found wins, so events are presumably ordered most recent
            first — confirm against the caller.

    Returns:
        A list with one dict per grid slot, ordered by slot index.
    """
    grid_change = {k: v.get('value', 0) for k, v in self.state['gridChange'].items()}
    # This assumes nextClip + nextPlaylist + staySame equals the number of
    # videos in the grid view (16); the front-end should enforce that.
    next_clip_count = grid_change.get('nextClip', 0)
    next_playlist_count = grid_change.get('nextPlaylist', 0)
    stay_same_count = grid_change.get('staySame', 0)
    video_num = next_clip_count + next_playlist_count + stay_same_count

    # Latest grid event per slot, plus the slot of the latest play event.
    grid_events = {}
    play_index = None
    for event in user.get('events', []):
        kind = event.get('event')
        data = event.get('data') or {}
        index = data.get('index')
        if kind == 'grid' and index is not None and index not in grid_events:
            grid_events[index] = data
        elif kind == 'play' and play_index is None:
            # Compare against None: slot 0 is a valid play index.
            play_index = index
        if len(grid_events) == video_num and play_index is not None:
            break

    # Without a grid event for every slot, initialise the whole grid.
    if len(grid_events) < video_num:
        return self.get_recommendations(user)

    # Shuffle every slot except the played one, then put the played slot
    # last. Without a usable play index all slots take part in the shuffle.
    slots = [i for i in range(video_num) if i != play_index]
    random.shuffle(slots)
    if len(slots) < video_num:
        slots.append(play_index)
    next_clip_slots = slots[:next_clip_count]
    next_playlist_slots = slots[next_clip_count:next_clip_count + next_playlist_count]
    stay_same_slots = slots[next_clip_count + next_playlist_count:]

    # First playlist wins when names are duplicated.
    playlists_by_name = {}
    for playlist in self.playlists:
        playlists_by_name.setdefault(playlist.get('name'), playlist)

    recommendations = {}
    for slot in next_clip_slots:
        previous = grid_events.get(slot, {})
        playlist = playlists_by_name.get(previous.get('playlist'))
        if playlist is None or len(playlist['clips']) <= 1:
            # Nothing to advance to (unknown playlist or a single clip):
            # hand the slot over to the nextPlaylist pool so the grid keeps
            # its full length.
            next_playlist_slots.append(slot)
            continue
        # TODO: discuss whether the end of a playlist should switch to a new
        # playlist instead of wrapping around to its first clip.
        recommendations[slot] = {
            'clips': playlist['clips'],
            'position': (previous.get('playlistPosition', 0) + 1) % len(playlist['clips']),
            'name': playlist['name'],
            'tags': playlist['tags'],
        }

    if next_playlist_slots:
        # Exclude the playlists currently shown in the grid.
        vids_exclude = [grid_events[index].get('playlist') for index in sorted(grid_events)]
        fresh = iter(self.get_recommendations(user, vids_exclude))
        for slot in next_playlist_slots:
            # If there are fewer recommendations than slots, the slot stays
            # unchanged instead of raising IndexError.
            recommendations[slot] = next(fresh, {})

    for slot in stay_same_slots:
        recommendations[slot] = {}

    return [recommendations[slot] for slot in sorted(recommendations)]
|
|
|
|
|
|
# NOTE for future improvement: the unit of vids_exclude could be a clip or an in/out time pair rather than a whole playlist.
# The same playlist could then appear more than once in the grid view, as long as the clips or time ranges differ.
|
|
def get_recommendations(self, user, vids_exclude=None):
    """Pick up to 16 playlists for ``user`` and return them in random order.

    Selection runs in three passes over the playlists not named in
    ``vids_exclude``. Each pass ranks the playlists still available by a
    score plus a random value in [0, 1) and takes the best ones:

    1. ``channels['userKeywords']`` playlists, scored by the user's
       keyword weights,
    2. ``channels['globalKeywords']`` playlists, scored by the global
       keyword sliders,
    3. the remaining slots up to 16, scored by how often the user's events
       mention a product that appears in the playlist's tags.

    ``self.playlists`` is never modified.

    Args:
        user: Dict with optional ``keywords`` (tag -> weight) and ``events``.
        vids_exclude: Playlist names to leave out; ``None`` or empty
            excludes nothing.

    Returns:
        A list of dicts with ``clips``, ``position`` (a random index into
        ``clips``, 0 if there are none), ``name`` and ``tags``.
    """
    channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()}
    sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()}

    def split_best(pool, weights, limit, unique_tags=False):
        """Rank ``pool`` by tag weights and return (best ``limit``, rest)."""
        scores = {}
        for playlist in pool:
            tags = set(playlist['tags']) if unique_tags else playlist['tags']
            # The random part breaks ties and varies otherwise equal grids.
            total = random.random()
            for tag in tags:
                if tag in weights:
                    total += weights[tag]
            scores[playlist['name']] = total
        ranked = sorted(pool, key=lambda playlist: -scores[playlist['name']])
        return ranked[:limit], ranked[limit:]

    # Build a filtered copy. Removing from self.playlists while iterating
    # would both skip neighbouring entries and permanently shrink the
    # engine's catalogue.
    excluded = set(vids_exclude or ())
    remaining = [playlist for playlist in self.playlists if playlist['name'] not in excluded]

    # Pass 1: user keywords.
    videos, remaining = split_best(
        remaining, user.get('keywords', {}), channels['userKeywords']
    )

    # Pass 2: global keywords.
    chosen, remaining = split_best(remaining, sliders, channels['globalKeywords'])
    videos += chosen

    # Pass 3: count the products the user has seen and score by them.
    seen_products = {}
    for event in user.get('events', []):
        product = (event.get('data') or {}).get('product')
        if product:
            seen_products[product] = seen_products.get(product, 0) + 1
    # Clamp at zero: a negative slice bound would select almost everything.
    free_slots = max(0, 16 - channels['userKeywords'] - channels['globalKeywords'])
    chosen, _ = split_best(remaining, seen_products, free_slots, unique_tags=True)
    videos += chosen

    # Shuffle playlists (randomize layout) and shift clips (randomize start).
    random.shuffle(videos)
    return [{
        'clips': video['clips'],
        'position': random.randrange(len(video['clips'])) if video['clips'] else 0,
        'name': video['name'],
        'tags': video['tags'],
    } for video in videos]
|
|
|
|
|
|
def get_next(self, user, position):
    """Return a replacement recommendation for one grid slot.

    The playlists of the user's latest grid events (one per slot, at most
    16 slots) are excluded, and the entry at ``position`` of the resulting
    recommendation list is returned.

    Args:
        user: Dict with an ``events`` list. For each slot the first grid
            event found wins, so events are presumably ordered most recent
            first — confirm against the caller.
        position: Index into the list returned by ``get_recommendations``.

    Returns:
        The recommendation dict at ``position``.

    Raises:
        IndexError: If fewer than ``position + 1`` recommendations exist.
    """
    grid_size = 16
    latest = {}
    for event in user.get('events', []):
        if event.get('event') != 'grid':
            continue
        # Skip malformed grid events (no data or no slot index) instead of
        # failing on them.
        data = event.get('data') or {}
        index = data.get('index')
        if index is None or index in latest:
            continue
        latest[index] = data
        if len(latest) == grid_size:
            break

    vids_exclude = [latest[index].get('playlist') for index in sorted(latest)]
    return self.get_recommendations(user, vids_exclude)[position]
|
|
|
|
def update_state(self, data):
    """Merge ``data`` into the engine state, persist it and return it.

    Sections already present in the state are updated in place with the
    keys from ``data`` (one level deep); unknown sections are added as they
    are. The state is saved after every call, even when ``data`` is empty.

    Args:
        data: Dict mapping section names to their new values.

    Returns:
        The engine's state dict (the same object, not a copy).
    """
    for section, values in data.items():
        try:
            current = self.state[section]
        except KeyError:
            self.state[section] = values
        else:
            current.update(values)
    self.save_state()
    return self.state
|
|
|
|
def save_state(self):
    """Write the engine state to ``state.json`` in the data directory.

    The state is serialised before the file is opened, so a state that
    cannot be serialised raises without truncating the existing file.

    Raises:
        TypeError: If the state contains values JSON cannot represent.
        OSError: If the file cannot be written.
    """
    filename = os.path.join(self.path, 'state.json')
    serialized = json.dumps(self.state, indent=4, ensure_ascii=False, sort_keys=True)
    with open(filename, 'w') as f:
        f.write(serialized)
|
|
|
|
def update(self):
    """Rebuild the playlists from pan.do/ra and cache them on disk.

    Steps:

    1. Fetch all ``storylines`` entities that have tags and a ``nodename``.
    2. Fetch the ids of all items in the list ``dau:DD``.
    3. Fetch all annotations on the ``storylines`` layer and keep those that
       name a known storyline and belong to one of those items.
    4. Fetch ``order`` (and ``code``, ``id``, ``title``) for every video not
       yet present in ``videos.json`` and rewrite that file. Entries already
       cached there are reused as they are and never re-fetched.
    5. Sort the clips by video order, then in-point, and build one playlist
       per storyline that has at least one clip.
    6. Write ``playlists.json`` and refresh the global keywords.
    """
    pandora = self.pandora

    # Get all storylines with tags
    storylines = [
        {
            'id': entity['id'],
            'name': entity['name'],
            'nodename': entity['nodename'],
            'tags': [t.strip() for t in entity['tags']],
        }
        for entity in pandora.find_entities({
            'conditions': [
                {'key': 'type', 'operator': '==', 'value': 'storylines'},
            ],
            'operator': '&'
        }, ['id', 'name', 'tags', 'nodename'])
        if entity.get('tags', []) and entity.get('nodename')
    ]

    # Sets, not lists: both are probed once per annotation below.
    names = {storyline['name'] for storyline in storylines}
    # Items to use in DD
    items = {item['id'] for item in pandora.find({
        'conditions': [
            {'key': 'list', 'operator': '==', 'value': 'dau:DD'}
        ]
    }, ['id'])}

    # Get all clips annotated with storyline references
    clips = [
        clip for clip in pandora.find_annotations({
            'conditions': [
                {'key': 'layer', 'operator': '==', 'value': 'storylines'}
            ],
            'operator': '&'
        }, ['id', 'in', 'out', 'value'])
        if clip['value'] in names and clip['id'].split('/')[0] in items
    ]

    # Ids of videos with clips, de-duplicated in first-seen order so that
    # the fetch order (and the order of equal-'order' videos) is repeatable.
    video_ids = list(dict.fromkeys(clip['id'].split('/')[0] for clip in clips))

    # Get and cache video data
    filename = os.path.join(self.path, 'videos.json')
    if os.path.exists(filename):
        with open(filename) as f:
            cached = json.loads(f.read())
    else:
        cached = []
    known = {video['id'] for video in cached}
    videos = sorted(
        cached + [
            pandora.get(video_id, ['code', 'id', 'order', 'title'])
            for video_id in video_ids if video_id not in known
        ],
        key=lambda video: int(video['order'])
    )
    with open(filename, 'w') as f:
        f.write(json.dumps(videos, indent=4, sort_keys=True))

    # Sort clips by video order, then by in-point
    order = {video['id']: int(video['order']) for video in videos}
    clips = sorted(
        clips,
        key=lambda clip: (order[clip['id'].split('/')[0]], clip['in'])
    )

    # Group the sorted clips by storyline name in a single pass instead of
    # rescanning every clip for every storyline.
    clips_by_name = {}
    for clip in clips:
        clips_by_name.setdefault(clip['value'], []).append(clip)

    # Get and cache playlists; storylines without clips are dropped.
    self.playlists = [
        {
            'id': storyline['id'],
            'name': storyline['nodename'].strip(),
            'tags': storyline['tags'],
            'clips': [{
                'item': clip['id'].split('/')[0],
                'id': clip['id'],
                'in': clip['in'],
                'out': clip['out']
            } for clip in clips_by_name[storyline['name']]]
        }
        for storyline in storylines
        if storyline['name'] in clips_by_name
    ]
    with open(os.path.join(self.path, 'playlists.json'), 'w') as f:
        f.write(json.dumps(self.playlists, indent=4, sort_keys=True))
    self.update_keywords()
|
|
|
|
def update_keywords(self):
    """Synchronise ``state['globalKeywords']`` with the playlist tags.

    Every non-empty playlist tag that is not all upper case becomes a
    global keyword with an initial value of 0; keywords whose tag no longer
    occurs in any playlist are removed. Existing keyword values are kept.
    The state is saved only when something actually changed.
    """
    changed = False
    if 'globalKeywords' not in self.state:
        self.state['globalKeywords'] = {}
        changed = True
    keywords = self.state['globalKeywords']

    existing_tags = set()
    for playlist in self.playlists:
        for tag in playlist.get('tags', []):
            # One shared condition for "counts as a keyword". Checking it
            # separately for adding and for bookkeeping let the empty tag
            # be added here and deleted again below on every call, which
            # forced a needless save each time.
            if not tag or tag.isupper():
                continue
            existing_tags.add(tag)
            if tag not in keywords:
                keywords[tag] = {'value': 0}
                changed = True

    for tag in set(keywords) - existing_tags:
        del keywords[tag]
        changed = True

    if changed:
        self.save_state()
|
|
|
|
@run_async
def update_async(self):
    """Run ``update`` through the ``run_async`` decorator.

    NOTE(review): ``run_async`` comes from the project's ``utils`` module
    and is not visible here — presumably it runs the call in the background
    and returns immediately; confirm there before relying on the return
    value or on ``self.playlists`` being refreshed when this returns.
    """
    self.update()
|
|
|
|
|
|
class Pandora:
    """Thin wrapper around the pan.do/ra API client provided by ``ox``."""

    def __init__(self, url, username, password):
        """Create the API client for ``url`` and sign in."""
        self.api = ox.API(url)
        self.api.signin(username=username, password=password)

    @staticmethod
    def _items(endpoint, query, keys):
        """Call a find-style ``endpoint`` and return the items of its response.

        The request asks for ``keys`` of everything matching ``query`` within
        the range 0 to 1000000.
        """
        request = {
            'keys': keys,
            'query': query,
            'range': [0, 1000000]
        }
        response = endpoint(request)
        return response['data']['items']

    def find(self, query, keys):
        """Return the items matching ``query``, restricted to ``keys``."""
        return self._items(self.api.find, query, keys)

    def find_annotations(self, query, keys):
        """Return the annotations matching ``query``, restricted to ``keys``."""
        return self._items(self.api.findAnnotations, query, keys)

    def find_entities(self, query, keys):
        """Return the entities matching ``query``, restricted to ``keys``."""
        return self._items(self.api.findEntities, query, keys)

    def get(self, id, keys):
        """Return the ``keys`` of the single object identified by ``id``."""
        request = {
            'id': id,
            'keys': keys
        }
        return self.api.get(request)['data']
|
|
|
|
|
|
if __name__ == '__main__':
    # Run as a script: refresh the playlist cache kept in the relative
    # directory 'json' from pan.do/ra, using the default connection settings.
    engine = Engine(path='json')
    engine.update()
|