Compare commits

..

No commits in common. "master" and "iss1" have entirely different histories.
master ... iss1

2 changed files with 97 additions and 271 deletions

View file

@ -7,7 +7,7 @@ Restart=always
User=dd User=dd
Group=dd Group=dd
WorkingDirectory=/srv/dd/re WorkingDirectory=/srv/dd/re
ExecStart=/srv/dd/re/venv/bin/python3 /srv/dd/re/server.py ExecStart=/srv/dd/re/server.py
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View file

@ -1,6 +1,6 @@
''' '''
Recommendation Engine ver 1 Recommendation Engine Example
30 Nov 2018, 0x2620 1 Nov 2017, 0x2620
''' '''
from collections import defaultdict from collections import defaultdict
@ -9,7 +9,6 @@ import logging
import os import os
import random import random
import time import time
import copy
import ox import ox
@ -33,10 +32,6 @@ class Engine:
if os.path.exists(filename): if os.path.exists(filename):
with open(filename) as f: with open(filename) as f:
self.playlists = json.load(f) self.playlists = json.load(f)
# ## the following is for testing purpose.
# for playlist in self.playlists:
# for clip in playlist["clips"]:
# clip["pass"] = bool(random.getrandbits(1))
else: else:
self.playlists = [] self.playlists = []
@ -47,25 +42,18 @@ class Engine:
else: else:
self.state = { self.state = {
'channels': { 'channels': {
'globalKeywords': {'locked': False, 'value': 8}, 'globalKeywords': {'locked': False, 'value': 7},
'userKeywords': {'locked': False, 'value': 8} 'userKeywords': {'locked': False, 'value': 7},
'screenings': {'locked': True, 'value': 2}
}, },
'globalKeywords': {}, 'globalKeywords': {},
'gridChange': {
'nextClip': {'locked': False, 'value': 4},
'nextPlaylist': {'locked': False, 'value': 4},
'staySame': {'locked': False, 'value': 8}
} }
if 'gridChange' not in self.state:
self.state['gridChange'] = {
'nextClip': {'locked': False, 'value': 5},
'nextPlaylist': {'locked': False, 'value': 8},
'staySame': {'locked': True, 'value': 3}
} }
if 'userKeywordsWeights' not in self.state:
self.state['userKeywordsWeights'] = {
'themeTags': {'locked': False, 'value': 0.3},
'characterTags': {'locked': False, 'value': 0.7},
'random': {'locked': False, 'value': False}
}
if 'random' not in self.state['userKeywordsWeights']:
self.state['userKeywordsWeights']['random'] = {'locked': False, 'value': False}
self.update_keywords() self.update_keywords()
@property @property
@ -96,54 +84,30 @@ class Engine:
clips[inpoint['index']]['out'] = self.pandora.get(video_id, ['duration'])['duration'] clips[inpoint['index']]['out'] = self.pandora.get(video_id, ['duration'])['duration']
return clips return clips
def get_videos(self, user): def get_videos(self, user):
## Output is a dictionary of: user keyword scores, list of videos for each grid index (0-15),
## and parameters to be displayed on debug view.
## It implements "next clip" "next playlist" "stay same" grid allocation for the output video, depending on the user log history.
# Update self_playlists to reflect user log history
playlists = self.update_user_playlists(user)
# Get the user keyword scores for debug view
user_keywords = copy.deepcopy(user.get('keywords', {}))
theme_tags = {k.lower():v for k,v in user_keywords.items() if not k.isupper()}
character_tags = {k:v for k,v in user_keywords.items() if k.isupper()}
top_user_keywords = sorted([(k,v) for (k,v) in theme_tags.items()], key=lambda kv: kv[1])[-10:]
top_user_characters = sorted([(k,v) for (k,v) in character_tags.items()], key=lambda kv: kv[1])[-10:]
debug_index_output = defaultdict(list)
# If the most recent event is "login," initialize grid videos.
if user.get('events', [{}])[0].get("event")=="login":
rec = self.get_recommendations(playlists, user)
return {
'user': {
'keywords': user.get('keywords', {})
},
'videos': rec["videos"],
"_debug": {
"top_user_keywords": top_user_keywords,
"top_user_characters": top_user_characters,
"top_user_playlists": rec["top_user_playlists"],
"top_global_playlists": rec["top_global_playlists"]
}
}
channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()} channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()}
sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()} sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()}
grid_change = {k: v.get('value', 0) for k, v in self.state['gridChange'].items()} grid_change = {k: v.get('value', 0) for k, v in self.state['gridChange'].items()}
# check if there were grid events for all indexes.
grid_events = {} grid_events = {}
(nc, np, ns) = (grid_change.get("nextClip"), grid_change.get("nextPlaylist"), grid_change.get("staySame")) (nc, np, ns) = (grid_change.get("nextClip"), grid_change.get("nextPlaylist"), grid_change.get("staySame"))
# this assumes np + nc + ns = total number of videos in the grid view (16).
# Make sure sanity check exists in front-end (error if it does not add up to 16).
video_num = nc + np + ns video_num = nc + np + ns
# collect the most recent grid event for each grid index and the grid index of the most recent play event. # for event in user.get('events', []):
# the following requires "index" in play event data (previously unavailable) # if event.get('event') == "grid" and event.get('data').get('index') not in grid_events:
# grid_events[event.get('data').get('index')] = event.get('data')
# if len(grid_events) == video_num:
# break
# # The version where the loop also extract play_index (requires "index" in play event data):
play_index = None play_index = None
for event in user.get('events', []): for event in user.get('events', []):
if event.get('event') == "grid" and event.get('data').get('index') not in grid_events: if event.get('event') == "grid" and event.get('data').get('index') not in grid_events:
grid_events[event.get('data').get('index')] = event.get('data') grid_events[event.get('data').get('index')] = event.get('data')
if event.get('event') == "play" and event["data"].get("type") == "video" and not play_index: if event.get('event') == "play" and not play_index:
play_index = event.get('data').get('index') play_index = event.get('data').get('index')
if len(grid_events) == video_num and play_index: if len(grid_events) == video_num and play_index:
break break
@ -152,158 +116,79 @@ class Engine:
# if there were no grid events for all, initialize all grids. # if there were no grid events for all, initialize all grids.
if len(prev_grid_list) < video_num: if len(prev_grid_list) < video_num:
rec = self.get_recommendations(playlists, user) return self.get_recommendations(user)
return {
'user': {
'keywords': user.get('keywords', {})
},
'videos': rec["videos"],
"_debug": {
"top_user_keywords": top_user_keywords,
"top_user_characters": top_user_characters,
"top_user_playlists": rec["top_user_playlists"],
"top_global_playlists": rec["top_global_playlists"]
}
}
else: else:
if play_index is None: # played index is excluded from the random shuffle and deterministically added to staySame pool.
video_indx = list(range(video_num))
random.shuffle(video_indx)
else:
# play index is excluded from the random shuffle and deterministically added to staySame pool.
video_indx = [*range(play_index)]+[*range(play_index+1,video_num)] video_indx = [*range(play_index)]+[*range(play_index+1,video_num)]
# video_indx = list(range(video_num))
random.shuffle(video_indx) random.shuffle(video_indx)
video_indx.append(play_index) video_indx.append(play_index)
next_clip_index = video_indx[:nc] next_clip_index = video_indx[:nc]
next_playlist_index = video_indx[nc:nc+np] next_playlist_index = video_indx[nc:nc+np]
stay_same_index = video_indx[nc+np:] stay_same_index = video_indx[nc+np:]
rec_list = [] rec_list = []
# select next clip for nextClip pool except when the playlist has only one clip.
# nextClip pool: select next clip except when the playlist has only one clip. skip the clip with "pass":True when selecting the next clip.
for i in next_clip_index: for i in next_clip_index:
# add this to deal with the absence of "playlist" data in old grid event or the case where the playlist has been eliminated due to update_user_playlists(). for playlist in self.playlists:
if prev_grid_list[i].get("playlist") not in [playlist["name"] for playlist in playlists]:
next_playlist_index.append(i)
else:
for playlist in playlists:
if playlist.get('name')== prev_grid_list[i].get('playlist'): if playlist.get('name')== prev_grid_list[i].get('playlist'):
unwatched_clips_indx = [j for j in range(len(playlist["clips"])) if not playlist["clips"][j].get("pass")]
if len(playlist["clips"]) == 1: if len(playlist["clips"]) == 1:
next_playlist_index.append(i) next_playlist_index.append(i)
else:
next_unwatched_indx = [j for j in unwatched_clips_indx if j > prev_grid_list[i]['playlistPosition']]
if len(next_unwatched_indx) == 0:
if unwatched_clips_indx[0] != prev_grid_list[i]['playlistPosition']:
playlist_pos = unwatched_clips_indx[0]
else:
next_playlist_index.append(i)
break break
# Discuss how this behavour should be: should it switch to a new playlist if it is the end of the playlist clip sequence already?
elif prev_grid_list[i].get('playlistPosition') + 1 == len(playlist['clips']):
playlist_pos = 0
else: else:
playlist_pos = next_unwatched_indx[0] playlist_pos = prev_grid_list[i].get('playlistPosition') + 1
rec_list.append((i, { rec_list.append((i, {
'clips': playlist['clips'], 'clips': playlist['clips'],
# 'position': random.randrange(len(playlist['clips'])),
'position': playlist_pos, 'position': playlist_pos,
'name': playlist['name'], 'name': playlist['name'],
'tags': playlist['tags'] 'tags': playlist['tags'],
})) }))
debug_index_output["next_clip"].append((i,playlist['name'])) # randomly select playlists (excluding the playlists from the current grid once "playlist" is recorded for grid events)
# for nextPlaylist pool.
vids_exclude = [e.get("playlist") for e in prev_grid_list]
video = self.get_recommendations(user, vids_exclude)
rec_list += [(i, video[i]) for i in next_playlist_index]
#staySame pool #staySame pool
for i in stay_same_index: rec_list += [(i,{}) for i in stay_same_index]
# add this to deal with the absence of "playlist" data in old grid event or the case where the playlist has been eliminated due to update_user_playlists().
if prev_grid_list[i].get("playlist") not in [playlist["name"] for playlist in playlists]:
next_playlist_index.append(i)
else:
rec_list.append((i,{}))
debug_index_output["stay_same"].append(i)
# nextPlaylist pool: randomly select playlists (excluding the playlists from the current grid).
vids_exclude = [e.get("playlist") for e in prev_grid_list]
while None in vids_exclude:
vids_exclude.remove(None)
rec = self.get_recommendations(playlists, user, vids_exclude)
rec_list += [(i, rec['videos'][i]) for i in next_playlist_index]
debug_index_output["new_playlist"] = [(i, rec['videos'][i]["name"]) for i in next_playlist_index]
rec_list = sorted(rec_list, key=lambda k:k[0]) rec_list = sorted(rec_list, key=lambda k:k[0])
videos_ = [e[1] for e in rec_list] return [e[1] for e in rec_list]
return {
'user': {
'keywords': user.get('keywords', {})
},
'videos': videos_,
"_debug": {
"top_user_keywords": top_user_keywords, # list of (keyword, score)
"top_user_characters": top_user_characters, # list of (keyword, score)
"top_user_playlists": rec["top_user_playlists"], # list of (playlist name, score)
"top_global_playlists": rec["top_global_playlists"], # list of (playlist name, score)
"stay_same_index": debug_index_output["stay_same"], # list of integers
"next_clip_index": debug_index_output["next_clip"], # list of (integer, playlist name)
"new_playlist_index": debug_index_output["new_playlist"] # list of (integer, playlist name)
}
}
def get_recommendations(self, playlists, user, vids_exclude = []): # NOTE for future improvement: vids_exclude element unit could be clip or in/out time pairs, rather than playlist.
# The same playlist could be played in the grid view as long as these are differenct clips or separate times.
def get_recommendations(self, user, vids_exclude = []):
channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()} channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()}
sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()} sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()}
gridChange = {k: v.get('value', 0) for k, v in self.state['gridChange'].items()} gridChange = {k: v.get('value', 0) for k, v in self.state['gridChange'].items()}
userKeywordsWeights = {k: v.get('value', 1) for k, v in self.state['userKeywordsWeights'].items()}
# Exclude playlists from the most recent grid # Exclude playlists from the most recent grid
playlists = self.playlists
if len(vids_exclude) > 0: if len(vids_exclude) > 0:
for playlist in playlists: for playlist in playlists:
if playlist["name"] in vids_exclude: if playlist["name"] in vids_exclude:
playlists.remove(playlist) playlists.remove(playlist)
# Generate random weights if random option is chosen in the dashboard: # For each playlist, compute user keyword score
if userKeywordsWeights.get('random'): user_keywords = user.get('keywords', {})
themeWeights = random.random()
charWeights = 1-themeWeights
else:
themeWeights = userKeywordsWeights['themeTags']
charWeights = userKeywordsWeights['characterTags']
# For each playlist, compute user keyword score by theme and character tags
user_keywords = copy.deepcopy(user.get('keywords', {}))
theme_tags = {k.lower():v for k,v in user_keywords.items() if not k.isupper()}
character_tags = {k:v for k,v in user_keywords.items() if k.isupper()}
# manually modify some of the user keywords to match the playlist tags
theme_tags["god"] = theme_tags.get("god - gods",0)
theme_tags["visionary"] = theme_tags.get("visionary - enlightenment",0)
theme_tags["enlightenment"] = theme_tags.get("visionary - enlightenment",0)
character_tags["FEDOR MIKHAILOVICH SOFRONOV"] = character_tags.get("FYODOR MIKHAILOVICH SOFRONOV",0)
character_tags["SHKABARNYA OLGA SERGEEVNA"] = character_tags.get("OLGA SERGEEVNA SHKABARNYA",0)
character_tags["VICTORIA OLEGOVNA SKITSKAYA"] = character_tags.get("VIKTORIA OLEGOVNA SKITSKAYA",0)
score = {} score = {}
for playlist in playlists: for playlist in playlists:
score[playlist['name']] = random.random() * 0.1 score[playlist['name']] = random.random()
for tag in playlist['tags']: for tag in [tag for tag in playlist['tags'] if tag in user_keywords]:
if tag in theme_tags: score[playlist['name']] += user_keywords[tag]
score[playlist['name']] += theme_tags[tag] * themeWeights
elif tag in character_tags:
score[playlist['name']] += character_tags[tag] * charWeights
# Select highest scoring playlists # Select highest scoring playlists
playlists = sorted( playlists = sorted(
playlists, playlists,
key=lambda playlist: -score[playlist['name']] key=lambda playlist: -score[playlist['name']]
) )
# Record the following for debug view input
top_user_playlists = [(playlist['name'], score[playlist['name']]) for playlist in playlists[:channels['userKeywords']]]
# top_user_playlists = [{
# 'name': playlist['name'],
# 'tags': playlist['tags'],
# 'score': score[playlist['name']],
# } for playlist in playlists[:channels['userKeywords']]]
videos = playlists[:channels['userKeywords']] videos = playlists[:channels['userKeywords']]
playlists = playlists[channels['userKeywords']:] playlists = playlists[channels['userKeywords']:]
# For each playlist, compute global keyword score # For each playlist, compute global keyword score
@ -317,88 +202,34 @@ class Engine:
playlists, playlists,
key=lambda playlist: -score[playlist['name']] key=lambda playlist: -score[playlist['name']]
) )
# Record the following for debug view input videos += playlists[:channels['globalKeywords']]
top_global_playlists = [(playlist['name'], score[playlist['name']]) for playlist in playlists[:channels['globalKeywords']]] playlists = playlists[channels['globalKeywords']:]
# top_global_playlists = [{ # Count products the user has seen
# 'name': playlist['name'], count = defaultdict(lambda: 0)
# 'tags': playlist['tags'], for event in user.get('events', []):
# 'score': score[playlist['name']], if event.get('data', {}).get('product'):
# } for playlist in playlists[:channels['globalKeywords']]] count[event['data']['product']] += 1
# For each product in playlist tags, increment score by count
videos += playlists[:16 - channels['userKeywords']] for playlist in playlists:
score[playlist['name']] = random.random()
for tag in set(playlist['tags']) & set(count):
score[playlist['name']] += count[tag]
# Select highest scoring playlists
videos += sorted(
playlists,
key=lambda playlist: -score[playlist['name']]
)[:16 - channels['userKeywords'] - channels['globalKeywords']]
# Shuffle playlists (randomize layout) and shift clips (randomize start) # Shuffle playlists (randomize layout) and shift clips (randomize start)
random.shuffle(videos) random.shuffle(videos)
return { return [{
'videos': [{
'clips': video['clips'], 'clips': video['clips'],
'position': random.choice([i for i in range(len(video["clips"])) if not video["clips"][i].get("pass")]), 'position': random.randrange(len(video['clips'])),
'name': video['name'], 'name': video['name'],
'tags': video['tags'], 'tags': video['tags'],
} for video in videos], } for video in videos]
"top_user_playlists":top_user_playlists,
"top_global_playlists": top_global_playlists
}
def update_user_playlists(self, user, watch_cutoff = 0.9):
# Output: playlists with updated in/out time of clips that have been watched as well as "pass" indicators for the clips that has been watched for more than watch_cutoff.
# Watched is defined as a video being played in full screen.
# "watch_cutoff" parameter: the portion of the clip duration to be determined as watched the whole clip. should be [0,1]
# + check (play, pause) pairs and eliminate unusual cases most likely due to a bug.
# + If (play, pause) pairs exceed XX(80-90?) percent of the clip length, add "pass": True to the clip.
# + Otherwise, find the last pause position of a clip and record it as "in" position of the clip.
# + If clips are all marked as "pass" in a playlist, elliminate the playlist from the user playlists.
playlists = copy.deepcopy(self.playlists)
play = {}
clip_max_dur = 10800 # = 3 hours; arbitrary max duration allowed for (pause time - play time) to detect outlier/bugs
# The current max time of a clip duration is 10379.383333377269 from "DDLaunch: Erik Verlinde, Gravity as an emergent force (1956)"
# A user could potentially spend more than 3 hours if they keep watching after the clip enters into the subsequent "scene"
for event in user.get('events', [])[::-1]:
if event["event"] == "play" and event["data"].get("type") == "video":
play = event
elif event["event"] == "pause" and play!={} and event["data"].get("type") == "video":
if "position" not in play["data"]:
play = {}
break
if play["data"].get("playlist") == event["data"].get("playlist"):
if event["data"]["position"] - play["data"]["position"] > 0 and event["data"]["position"] - play["data"]["position"] < clip_max_dur and event["data"].get("playlistPosition") == play["data"].get("playlistPosition") and event["data"].get("playlistPosition") is not None:
i = event["data"]["playlistPosition"]
for playlist in playlists:
if playlist["name"] == event["data"]["playlist"] and i < len(playlist["clips"]):
if play["data"]["position"] >= max(playlist["clips"][i]["in"] - 15, 0) and event["data"]["position"] <= playlist["clips"][i]["out"] + 15:
# This assumes the (play, pause) fits inside the clip's (in, out) segment with +/- 15secs buffer. There were newer edits of clip positions with 12 seconds difference.
# instances where this might not be the case: clip in/out may be largely edited (before after edit inconsistency); skip may trigger jump to a wrong clip (bug)
if "orig_in" not in playlist["clips"][i]:
cutoff_pos = (playlist["clips"][i]["out"]-playlist["clips"][i]["in"])*watch_cutoff + playlist["clips"][i]["in"]
else:
cutoff_pos = (playlist["clips"][i]["out"]-playlist["clips"][i]["orig_in"])*watch_cutoff + playlist["clips"][i]["orig_in"]
if event["data"]["position"] >= cutoff_pos:
playlist["clips"][i]["pass"] = True
else:
if "orig_in" not in playlist["clips"][i]:
# record the original "in" position to calculate cutoff position in the future
playlist["clips"][i]["orig_in"] = playlist["clips"][i]["in"]
# update "in" position of the clip in the playlist
playlist["clips"][i]["in"] = event["data"]["position"]
break
play = {}
for playlist in playlists.copy():
unwatched = [clip for clip in playlist["clips"] if not clip.get("pass")]
if not unwatched:
playlists.remove(playlist)
# If the number of playlists is reduced to 30, reset it to the original.
if len(playlists) < 30:
playlists = copy.deepcopy(self.playlists)
return(playlists)
def get_next(self, user, position): def get_next(self, user, position):
# Update self_playlists to reflect user log history
playlists = self.update_user_playlists(user)
grid_events = {} grid_events = {}
video_num = 16 video_num = 16
@ -409,8 +240,8 @@ class Engine:
break break
prev_grid_list = sorted([v for v in grid_events.values()], key=lambda k:k['index']) prev_grid_list = sorted([v for v in grid_events.values()], key=lambda k:k['index'])
vids_exclude = [e.get("playlist") for e in prev_grid_list] vids_exclude = [e.get("playlist") for e in prev_grid_list]
rec = self.get_recommendations(playlists, user, vids_exclude) video = self.get_recommendations(user, vids_exclude)[position]
return rec["videos"][position] return video
def update_state(self, data): def update_state(self, data):
for key in data: for key in data:
@ -472,15 +303,10 @@ class Engine:
f.write(json.dumps(videos, indent=4, sort_keys=True)) f.write(json.dumps(videos, indent=4, sort_keys=True))
# Get video order # Get video order
order = {video['id']: int(video['order']) for video in videos} order = {video['id']: int(video['order']) for video in videos}
code = {video['id']: video['code'] for video in videos}
# Sort clips # Sort clips
clips = sorted( clips = sorted(
clips, clips,
key=lambda clip: ( key=lambda clip: (order[clip['id'].split('/')[0]], clip['in'])
order[clip['id'].split('/')[0]],
ox.sort_string(code[clip['id'].split('/')[0]]),
clip['in']
)
) )
# Get and cache playlists # Get and cache playlists
self.playlists = [playlist for playlist in [{ self.playlists = [playlist for playlist in [{
@ -489,13 +315,13 @@ class Engine:
'tags': storyline['tags'], 'tags': storyline['tags'],
'clips': [{ 'clips': [{
'item': clip['id'].split('/')[0], 'item': clip['id'].split('/')[0],
'id': '%s_%0.3f-%0.3f' % (clip['id'].split('/')[0], clip['in'], clip['out']), 'id': clip['id'],
'in': clip['in'], 'in': clip['in'],
'out': clip['out'] 'out': clip['out']
} for clip in clips if clip['value'] == storyline['name']] } for clip in clips if clip['value'] == storyline['name']]
} for storyline in storylines] if playlist['clips']] } for storyline in storylines] if playlist['clips']]
with open(os.path.join(self.path, 'playlists.json'), 'w') as f: with open(os.path.join(self.path, 'playlists.json'), 'w') as f:
f.write(json.dumps(self.playlists, indent=4, sort_keys=True, ensure_ascii=False)) f.write(json.dumps(self.playlists, indent=4, sort_keys=True))
self.update_keywords() self.update_keywords()
def update_keywords(self): def update_keywords(self):