dd-re/recommendation_engine.py

429 lines
17 KiB
Python
Raw Permalink Normal View History

2017-11-01 16:38:58 +00:00
'''
2017-11-01 22:56:33 +00:00
Recommendation Engine Example
1 Nov 2017, 0x2620
2017-11-01 16:38:58 +00:00
'''
2018-01-24 15:48:50 +00:00
from collections import defaultdict
2017-11-01 16:38:58 +00:00
import json
2018-02-05 10:47:38 +00:00
import logging
2017-11-01 16:38:58 +00:00
import os
import random
2018-02-05 10:47:38 +00:00
import time
import copy
2017-11-01 16:38:58 +00:00
import ox
2017-11-02 08:40:02 +00:00
from utils import run_async
2018-02-05 10:47:38 +00:00
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Module-wide verbosity flag. NOTE(review): nothing in the code visible here
# reads it; confirm whether other modules import it before removing.
verbose = True
2018-02-05 10:47:38 +00:00
2017-11-01 16:38:58 +00:00
class Engine:
2018-02-05 10:47:38 +00:00
_pandora = None
2017-11-01 16:38:58 +00:00
2018-01-18 20:32:14 +00:00
def __init__(self, path, **kwargs):
    """Set up the engine from the JSON caches stored under *path*.

    Args:
        path: Directory that holds ``playlists.json`` and ``state.json``.
        **kwargs: Optional connection settings: ``pandora`` (API URL),
            ``username`` and ``password``. Each falls back to the default
            literal below when not supplied.

    Side effects:
        Reads both JSON files when they exist and finally calls
        ``self.update_keywords()``.
    """
    self.path = path
    # Only stored here; the connection itself is opened by the ``pandora``
    # property.
    self.pandora_args = {
        'url': kwargs.get('pandora', 'http://pandora.dmp/api/'),
        'username': kwargs.get('username', 'dd.re'),
        'password': kwargs.get('password', 'dd.re'),
    }

    # Cached playlists: an empty list until a cache file exists.
    playlists_file = os.path.join(self.path, 'playlists.json')
    if not os.path.exists(playlists_file):
        self.playlists = []
    else:
        with open(playlists_file) as handle:
            self.playlists = json.load(handle)

    # Persisted state, or the built-in defaults when nothing was saved yet.
    state_file = os.path.join(self.path, 'state.json')
    if not os.path.exists(state_file):
        self.state = {
            'channels': {
                'globalKeywords': {'locked': False, 'value': 8},
                'userKeywords': {'locked': False, 'value': 8}
            },
            'globalKeywords': {},
        }
    else:
        with open(state_file) as handle:
            self.state = json.load(handle)

    # Back-fill sections that a previously saved state file may not contain.
    self.state.setdefault('gridChange', {
        'nextClip': {'locked': True, 'value': 4},
        'nextPlaylist': {'locked': False, 'value': 4},
        'staySame': {'locked': False, 'value': 8}
    })
    self.state.setdefault('userKeywordsWeights', {
        'themeTags': {'locked': False, 'value': 0.3},
        'characterTags': {'locked': False, 'value': 0.7}
    })

    self.update_keywords()
2018-02-05 10:47:38 +00:00
@property
def pandora(self):
    """Return the shared ``Pandora`` client, connecting on first use.

    The client is built from ``self.pandora_args`` and cached in
    ``self._pandora``. While connecting fails, the error is logged and the
    attempt is repeated every 10 seconds, so this property blocks until a
    connection succeeds.
    """
    while not self._pandora:
        try:
            self._pandora = Pandora(**self.pandora_args)
        except Exception:
            # Only ordinary errors are retried; a bare ``except`` would also
            # swallow KeyboardInterrupt/SystemExit and make this loop
            # impossible to stop. The traceback is logged so the cause of a
            # failed connection is visible.
            logger.error(
                'failed to connect to pandora, retry in 10 seconds',
                exc_info=True
            )
            time.sleep(10)
    return self._pandora
def _patch_clips(self, clips):
    """Recompute the ``out`` point of every clip, in place.

    Clips are grouped by video (the part of ``clip['id']`` before the first
    ``/``). Within a video, each clip ends where the next clip (ordered by
    ``in`` point) starts; the last clip ends at the video's duration, which
    is fetched through ``self.pandora``.

    Args:
        clips: List of dicts with at least ``id`` and ``in`` keys.

    Returns:
        The same list object, with ``out`` set on every clip.
    """
    # (in point, index into ``clips``) per video
    by_video = defaultdict(list)
    for index, clip in enumerate(clips):
        by_video[clip['id'].split('/')[0]].append((clip['in'], index))

    for video_id, entries in by_video.items():
        # Sorting the (position, index) tuples orders by in point and keeps
        # the original order for equal positions.
        entries.sort()
        # The successor must come from the *sorted* sequence; looking it up
        # in the unsorted list yields wrong out points whenever the clips
        # are not already ordered by position.
        for (_, index), (next_position, _) in zip(entries, entries[1:]):
            clips[index]['out'] = next_position
        last_index = entries[-1][1]
        clips[last_index]['out'] = self.pandora.get(video_id, ['duration'])['duration']
    return clips
2017-11-01 16:38:58 +00:00
def get_videos(self, user):
    """Build the next grid of videos for *user*.

    If the first event is a ``login``, or the recorded ``grid`` events do not
    cover every grid cell, a completely fresh grid is returned from
    ``get_recommendations``. Otherwise the cells are split at random into
    three pools whose sizes come from ``self.state['gridChange']``:

    * ``nextClip``: the cell keeps its playlist and advances one clip
      (wrapping to the first clip at the end);
    * ``nextPlaylist``: the cell gets a new recommended playlist;
    * ``staySame``: the cell is left unchanged, reported as ``{}``.

    The cell of the most recent ``play`` event is kept out of the shuffle and
    placed last, so it lands in the ``staySame`` pool whenever that pool is
    not empty.

    Args:
        user: Dict with optional ``keywords`` and ``events``. For each grid
            index the first event found wins (presumably events are ordered
            most recent first - confirm against the caller).

    Returns:
        ``{'user': {'keywords': ...}, 'videos': [...]}`` with one entry per
        grid cell, ordered by grid index.
    """
    keywords = user.get('keywords', {})
    events = user.get('events') or []

    def respond(videos):
        return {'user': {'keywords': keywords}, 'videos': videos}

    # An empty event list must not raise; treat it like "no login event".
    first_event = events[0] if events else {}
    if first_event.get('event') == 'login':
        return respond(self.get_recommendations(user))

    grid_change = {k: v.get('value', 0) for k, v in self.state['gridChange'].items()}
    n_next_clip = grid_change.get('nextClip', 0)
    n_next_playlist = grid_change.get('nextPlaylist', 0)
    n_stay_same = grid_change.get('staySame', 0)
    # The three pool sizes are assumed to add up to the number of grid cells.
    video_num = n_next_clip + n_next_playlist + n_stay_same

    # First grid event per index, plus the index of the first play event.
    grid_events = {}
    play_index = None
    for event in events:
        data = event.get('data') or {}
        index = data.get('index')
        kind = event.get('event')
        if kind == 'grid' and index is not None and index not in grid_events:
            grid_events[index] = data
        elif kind == 'play' and play_index is None:
            # ``is None`` rather than truthiness: grid index 0 is valid.
            play_index = index
        if len(grid_events) == video_num and play_index is not None:
            break

    # Without a recorded event for every cell there is nothing to evolve.
    if any(index not in grid_events for index in range(video_num)):
        return respond(self.get_recommendations(user))

    # A play index outside the grid cannot be pinned to a cell.
    if play_index not in range(video_num):
        play_index = None

    order = [index for index in range(video_num) if index != play_index]
    random.shuffle(order)
    if play_index is not None:
        order.append(play_index)
    next_clip_index = order[:n_next_clip]
    next_playlist_index = order[n_next_clip:n_next_clip + n_next_playlist]
    stay_same_index = order[n_next_clip + n_next_playlist:]

    # First playlist per name, mirroring a first-match scan.
    playlists_by_name = {}
    for playlist in self.playlists:
        playlists_by_name.setdefault(playlist.get('name'), playlist)

    rec_list = []
    for index in next_clip_index:
        previous = grid_events[index]
        name = previous.get('playlist')
        playlist = playlists_by_name.get(name) if name is not None else None
        # Cells that cannot advance (old events without playlist data, a
        # playlist that no longer exists, or one with a single clip) get a
        # new playlist instead of being dropped from the grid.
        if playlist is None or len(playlist['clips']) <= 1:
            next_playlist_index.append(index)
            continue
        # Advance by one clip; the modulo wraps at the end of the playlist
        # and keeps a stale position inside the valid range.
        position = (previous.get('playlistPosition', 0) + 1) % len(playlist['clips'])
        rec_list.append((index, {
            'clips': playlist['clips'],
            'position': position,
            'name': playlist['name'],
            'tags': playlist['tags'],
        }))

    # New playlists must differ from everything currently on the grid.
    vids_exclude = [
        data['playlist'] for data in grid_events.values()
        if data.get('playlist') is not None
    ]
    recommendations = self.get_recommendations(user, vids_exclude)
    for index in next_playlist_index:
        # With few playlists left after exclusion there may be no entry for
        # this cell; report it as unchanged rather than raising IndexError.
        video = recommendations[index] if index < len(recommendations) else {}
        rec_list.append((index, video))

    rec_list.extend((index, {}) for index in stay_same_index)
    rec_list.sort(key=lambda pair: pair[0])
    return respond([video for _, video in rec_list])
def get_recommendations(self, user, vids_exclude=None):
    """Pick up to 16 playlists for *user* and return them in random order.

    The first ``channels['userKeywords']`` slots go to the playlists that
    score highest against the user's keywords (theme and character tags,
    weighted by ``state['userKeywordsWeights']``). The remaining slots, up to
    16 in total, go to the best of the rest scored against the global
    keyword sliders plus a random component.

    Args:
        user: Dict with an optional ``keywords`` mapping of keyword to value.
            Keys that are entirely upper case are treated as character tags,
            all others as theme tags.
        vids_exclude: Optional iterable of playlist names that must not be
            recommended.

    Returns:
        List of dicts with ``clips``, ``position`` (random start clip),
        ``name`` and ``tags``. The data is a deep copy of ``self.playlists``.
    """
    channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()}
    sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()}
    weights = {k: v.get('value', 1) for k, v in self.state['userKeywordsWeights'].items()}

    # Filter into a new list: removing entries from a list while iterating
    # over it skips the element after each removal, which let consecutive
    # excluded playlists slip through.
    excluded = set(vids_exclude or ())
    playlists = copy.deepcopy([
        playlist for playlist in self.playlists
        if playlist['name'] not in excluded
    ])

    # Split the user's keywords into theme and character tags.
    user_keywords = user.get('keywords', {})
    theme_tags = {k.lower(): v for k, v in user_keywords.items() if not k.isupper()}
    character_tags = {k: v for k, v in user_keywords.items() if k.isupper()}
    # Manually map some user keywords onto the spelling used in playlist tags.
    for alias, source in (
        ("god", "god - gods"),
        ("visionary", "visionary - enlightenment"),
        ("enlightenment", "visionary - enlightenment"),
    ):
        theme_tags[alias] = theme_tags.get(source, 0)
    for alias, source in (
        ("FEDOR MIKHAILOVICH SOFRONOV", "FYODOR MIKHAILOVICH SOFRONOV"),
        ("SHKABARNYA OLGA SERGEEVNA", "OLGA SERGEEVNA SHKABARNYA"),
        ("VICTORIA OLEGOVNA SKITSKAYA", "VIKTORIA OLEGOVNA SKITSKAYA"),
    ):
        character_tags[alias] = character_tags.get(source, 0)

    # User keyword score; the tiny random term only breaks ties.
    score = {}
    for playlist in playlists:
        total = random.random() * 0.001
        for tag in playlist['tags']:
            if tag in theme_tags:
                total += theme_tags[tag] * weights["themeTags"]
            elif tag in character_tags:
                total += character_tags[tag] * weights["characterTags"]
        score[playlist['name']] = total
    playlists.sort(key=lambda playlist: -score[playlist['name']])

    user_slots = channels['userKeywords']
    videos = playlists[:user_slots]
    playlists = playlists[user_slots:]

    # Global keyword score for the remaining playlists.
    score = {}
    for playlist in playlists:
        score[playlist['name']] = random.random() + sum(
            sliders[tag] for tag in playlist['tags'] if tag in sliders
        )
    playlists.sort(key=lambda playlist: -score[playlist['name']])
    # Never let the remainder go negative: a negative slice bound would
    # select almost the whole list instead of nothing.
    videos += playlists[:max(0, 16 - user_slots)]

    # Shuffle playlists (randomize layout) and pick a random start clip.
    random.shuffle(videos)
    return [{
        'clips': video['clips'],
        'position': random.randrange(len(video['clips'])),
        'name': video['name'],
        'tags': video['tags'],
    } for video in videos]
2017-11-01 16:38:58 +00:00
2018-05-19 11:51:02 +00:00
def get_next(self, user, position):
    """Return one new recommendation for the grid cell at *position*.

    The playlists named in the user's grid events (first event found per
    grid index, at most 16 indexes) are excluded from the recommendation.

    Args:
        user: Dict with optional ``events`` and ``keywords``.
        position: Integer index into the recommendation list.

    Returns:
        One entry of ``get_recommendations``. When fewer recommendations
        than ``position + 1`` are available the index wraps around; the list
        is shuffled anyway, so any entry is an equally valid pick.

    Raises:
        IndexError: If there is nothing left to recommend.
    """
    video_num = 16
    grid_events = {}
    for event in user.get('events', []):
        if event.get('event') != 'grid':
            continue
        # Skip malformed grid events instead of failing on missing data.
        data = event.get('data') or {}
        index = data.get('index')
        if index is None or index in grid_events:
            continue
        grid_events[index] = data
        if len(grid_events) == video_num:
            break

    # Only real playlist names are worth excluding (old events have none).
    vids_exclude = [
        data['playlist'] for data in grid_events.values()
        if data.get('playlist') is not None
    ]
    recommendations = self.get_recommendations(user, vids_exclude)
    if not recommendations:
        raise IndexError('no playlists available to recommend')
    return recommendations[position % len(recommendations)]
2018-02-05 14:15:39 +00:00
def update_state(self, data):
    """Merge *data* into the engine state, persist it and return it.

    Args:
        data: Mapping of state section name to new content.

    Returns:
        The updated ``self.state``.

    For every key, a dict value is merged into an existing dict section with
    ``dict.update``; in every other case (new key, or either side is not a
    dict) the value replaces whatever was stored. The state is saved through
    ``self.save_state()`` afterwards.
    """
    for key, value in data.items():
        current = self.state.get(key)
        # Merging is only possible between two dicts; anything else used to
        # raise instead of updating the state.
        if isinstance(current, dict) and isinstance(value, dict):
            current.update(value)
        else:
            self.state[key] = value
    self.save_state()
    return self.state
def save_state(self):
    """Write ``self.state`` to ``state.json`` inside ``self.path``.

    The JSON is indented by four spaces, keeps non-ASCII characters as they
    are and has its keys sorted.
    """
    target = os.path.join(self.path, 'state.json')
    with open(target, 'w') as handle:
        handle.write(json.dumps(
            self.state,
            indent=4,
            ensure_ascii=False,
            sort_keys=True
        ))
2017-11-01 16:38:58 +00:00
def update(self):
    """Rebuild the playlist cache from pan.do/ra.

    Steps:
        1. Fetch all entities of type ``storylines`` that have tags and a
           node name.
        2. Fetch the ids of the items in list ``dau:DD``.
        3. Fetch the annotations of layer ``storylines`` and keep those that
           name a known storyline and belong to one of those items.
        4. Fetch metadata for videos not yet cached in ``videos.json`` and
           rewrite that file, sorted by ``order``.
        5. Build one playlist per storyline that has at least one clip
           (clips sorted by video order, then in point), store the result in
           ``self.playlists`` and ``playlists.json``, and refresh the global
           keywords via ``self.update_keywords()``.
    """
    # Get all storylines with tags
    storylines = [{
        'id': entity['id'],
        'name': entity['name'],
        'nodename': entity['nodename'],
        'tags': [t.strip() for t in entity['tags']]
    } for entity in self.pandora.find_entities({
        'conditions': [
            {'key': 'type', 'operator': '==', 'value': 'storylines'},
        ],
        'operator': '&'
    }, ['id', 'name', 'tags', 'nodename']) if entity.get('tags', []) and entity.get('nodename')]
    # Sets make the per-annotation membership tests below O(1).
    names = {storyline['name'] for storyline in storylines}
    # Get the items to use in DD
    items = {item['id'] for item in self.pandora.find({
        'conditions': [
            {'key': 'list', 'operator': '==', 'value': 'dau:DD'}
        ]
    }, ['id'])}
    # Get all clips annotated with storyline references
    clips = [clip for clip in self.pandora.find_annotations({
        'conditions': [
            {'key': 'layer', 'operator': '==', 'value': 'storylines'}
        ],
        'operator': '&'
    }, ['id', 'in', 'out', 'value'])
        if clip['value'] in names and clip['id'].split('/')[0] in items]
    # Ids of the videos with clips, de-duplicated in first-seen order so the
    # fetch order below is deterministic.
    video_ids = list(dict.fromkeys(clip['id'].split('/')[0] for clip in clips))

    # Get and cache video data
    filename = os.path.join(self.path, 'videos.json')
    if os.path.exists(filename):
        with open(filename) as f:
            cached_videos = json.load(f)
    else:
        cached_videos = []
    cached_ids = {video['id'] for video in cached_videos}
    videos = sorted(cached_videos + [
        self.pandora.get(video_id, ['code', 'id', 'order', 'title'])
        for video_id in video_ids if video_id not in cached_ids
    ], key=lambda video: int(video['order']))
    with open(filename, 'w') as f:
        f.write(json.dumps(videos, indent=4, sort_keys=True))

    # Sort clips by video order, then by in point
    order = {video['id']: int(video['order']) for video in videos}
    clips.sort(key=lambda clip: (order[clip['id'].split('/')[0]], clip['in']))

    # Group the sorted clips by storyline name once, instead of scanning all
    # clips again for every storyline.
    clips_by_storyline = defaultdict(list)
    for clip in clips:
        clips_by_storyline[clip['value']].append(clip)

    # Get and cache playlists; storylines without clips are left out
    playlists = []
    for storyline in storylines:
        playlist_clips = [{
            'item': clip['id'].split('/')[0],
            'id': clip['id'],
            'in': clip['in'],
            'out': clip['out']
        } for clip in clips_by_storyline.get(storyline['name'], ())]
        if playlist_clips:
            playlists.append({
                'id': storyline['id'],
                'name': storyline['nodename'].strip(),
                'tags': storyline['tags'],
                'clips': playlist_clips
            })
    self.playlists = playlists
    with open(os.path.join(self.path, 'playlists.json'), 'w') as f:
        f.write(json.dumps(self.playlists, indent=4, sort_keys=True))
    self.update_keywords()
def update_keywords(self):
    """Synchronise ``state['globalKeywords']`` with the playlist tags.

    Every non-empty playlist tag that is not entirely upper case gets an
    entry (initial ``{'value': 0}``) unless it already has one; entries whose
    tag no longer occurs in any playlist are removed. The state is saved via
    ``self.save_state()`` only when something actually changed.
    """
    changed = False
    if 'globalKeywords' not in self.state:
        self.state['globalKeywords'] = {}
        changed = True
    keywords = self.state['globalKeywords']

    existing_tags = set()
    for playlist in self.playlists:
        for tag in playlist.get('tags', []):
            # One shared filter for "exists" and "add": an empty tag used to
            # be added here and deleted again below, which marked the state
            # as changed and rewrote the state file on every call.
            if not tag or tag.isupper():
                continue
            existing_tags.add(tag)
            if tag not in keywords:
                keywords[tag] = {'value': 0}
                changed = True

    # Drop keywords that no playlist uses any more.
    for tag in set(keywords) - existing_tags:
        del keywords[tag]
        changed = True

    if changed:
        self.save_state()
2017-11-02 08:40:02 +00:00
@run_async
def update_async(self):
    """Call :meth:`update`, wrapped by the ``run_async`` decorator from ``utils``.

    NOTE(review): presumably ``run_async`` runs the call in the background;
    its implementation is not part of this file - confirm in ``utils``.
    """
    self.update()
2017-11-01 22:56:33 +00:00
class Pandora:
    """pan.do/ra API wrapper built on ``ox.API``."""

    def __init__(self, url, username, password):
        """Open the API at *url* and sign in with the given credentials."""
        self.api = ox.API(url)
        self.api.signin(username=username, password=password)

    def _items(self, endpoint, query, keys):
        """Call a find-style *endpoint* and return the ``items`` of its data.

        The request always asks for the range ``[0, 1000000]``.
        """
        response = endpoint({
            'keys': keys,
            'query': query,
            'range': [0, 1000000]
        })
        return response['data']['items']

    def find(self, query, keys):
        """Return the items matching *query*, restricted to *keys*."""
        return self._items(self.api.find, query, keys)

    def find_annotations(self, query, keys):
        """Return the annotations matching *query*, restricted to *keys*."""
        return self._items(self.api.findAnnotations, query, keys)

    def find_entities(self, query, keys):
        """Return the entities matching *query*, restricted to *keys*."""
        return self._items(self.api.findEntities, query, keys)

    def get(self, id, keys):
        """Return the ``data`` of the API ``get`` call for *id* and *keys*."""
        request = {'id': id, 'keys': keys}
        return self.api.get(request)['data']
2017-11-01 16:38:58 +00:00
if __name__ == '__main__':
    # Script mode: refresh the caches (videos.json, playlists.json) that
    # Engine.update() writes into the relative directory "json".
    engine = Engine('json')
    engine.update()