From 59d21527584805440d16f819bccf6f5735159a4b Mon Sep 17 00:00:00 2001 From: pythagoraswitch Date: Thu, 13 Sep 2018 20:07:45 +0200 Subject: [PATCH 1/3] added update_user_playlist function [issue 2] --- recommendation_engine.py | 53 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/recommendation_engine.py b/recommendation_engine.py index 4a8b075..618298f 100644 --- a/recommendation_engine.py +++ b/recommendation_engine.py @@ -112,7 +112,7 @@ class Engine: for event in user.get('events', []): if event.get('event') == "grid" and event.get('data').get('index') not in grid_events: grid_events[event.get('data').get('index')] = event.get('data') - if event.get('event') == "play" and not play_index: + if event.get('event') == "play" and event["data"].get("type") == "video" and not play_index: play_index = event.get('data').get('index') if len(grid_events) == video_num and play_index: break @@ -181,8 +181,7 @@ class Engine: return [e[1] for e in rec_list] -# NOTE for future improvement: vids_exclude element unit could be clip or in/out time pairs, rather than playlist. -# The same playlist could be played in the grid view as long as these are differenct clips or separate times. +# Current assumption: Avoid the same playlist in the grid view. In the future, the same playlist could be played in the grid view as long as these are differenct clips or separate times? def get_recommendations(self, user, vids_exclude = []): channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()} sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()} @@ -247,6 +246,54 @@ class Engine: } for video in videos] +# Output: playlists with updated in/out time of clips that have been watched. +# Watched is defined as a video being played in full screen. +# "watch_cutoff" parameter: the portion of the clip duration to be determined as watched the whole clip. should be [0,1] +# + check (play, pause) pairs and eliminate unusual cases most likely due to a bug. +# + If (play, pause) pairs exceed XX(80-90?) percent of the clip length, eliminate the clip from the playlist. +# + Otherwise, find the last pause position of a clip and record it as "in" position of the clip. +# + If the clips are all eliminated from a playlist, eliminate the playlist. + def update_user_playlists(playlists, user, watch_cutoff = 0.8): + play = {} + watched = [] + clip_max_dur = 10800 # = 3 hours; arbitrary max duration allowed for (pause time - play time) to detect outlier/bugs + # The current max time of a clip duration is 10379.383333377269 from "DDLaunch: Erik Verlinde, Gravity as an emergent force (1956)" + for event in user["events"][::-1]: + if event["event"] == "play" and event["data"].get("type") == "video": + play = event + elif event["event"] == "pause" and play!={} and event["data"].get("type") == "video": + if "position" not in play["data"]: + play = {} + break + if play["data"].get("playlist") == event["data"].get("playlist"): + if event["data"]["position"] - play["data"]["position"] > 0 and event["data"]["position"] - play["data"]["position"] < clip_max_dur and event["data"].get("playlistPosition") == play["data"].get("playlistPosition") and event["data"].get("playlistPosition") is not None: + i = event["data"]["playlistPosition"] + for playlist in playlists: + if playlist["name"] == event["data"]["playlist"] and i < len(playlist["clips"]): + if play["data"]["position"] >= max(playlist["clips"][i]["in"] - 30, 0) and event["data"]["position"] <= playlist["clips"][i]["out"] + 30: + # This assumes the (play, pause) fits inside the clip's (in, out) segment with +/- 30secs buffer. Check if there are instances where this might not be the case. + # i.e. clip in/out may be edited (before after edit inconsistency); skip may trigger jump to a wrong clip (bug) + if event["data"]["position"] >= ((playlist["clips"][i]["out"]-playlist["clips"][i]["in"])*watch_cutoff + playlist["clips"][i]["in"]): + watched.append((playlist["name"],i)) + else: + playlist["clips"][i]["in"] = event["data"]["position"] + break + play = {} + + d_watched = defaultdict(set) + for k, v in watched: + d_watched[k].add(v) + for k, v in d_watched.items(): + for playlist in playlists: + if playlist["name"] == k: + if len(v) == len(playlist["clips"]): + playlists.remove(playlist) + else: + playlist["clips"] = [playlist["clips"][i] for i in range(len(playlist["clips"])) if i not in v] + break + return(playlists) + + def get_next(self, user, position): grid_events = {} video_num = 16 From acb9b499be3f5f5a31c1c23a1f112bcf048a5659 Mon Sep 17 00:00:00 2001 From: pythagoraswitch Date: Wed, 26 Sep 2018 20:24:37 +0200 Subject: [PATCH 2/3] updated parameters and comments --- recommendation_engine.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/recommendation_engine.py b/recommendation_engine.py index 618298f..5cb107c 100644 --- a/recommendation_engine.py +++ b/recommendation_engine.py @@ -85,8 +85,8 @@ class Engine: clips[inpoint['index']]['out'] = self.pandora.get(video_id, ['duration'])['duration'] return clips - def get_videos(self, user): + def get_videos(self, user): if user.get('events', [{}])[0].get("event")=="login": return self.get_recommendations(user) @@ -97,8 +97,6 @@ class Engine: # check if there were grid events for all indexes. grid_events = {} (nc, np, ns) = (grid_change.get("nextClip"), grid_change.get("nextPlaylist"), grid_change.get("staySame")) - # this assumes np + nc + ns = total number of videos in the grid view (16). - # Make sure sanity check exists in front-end (error if it does not add up to 16). video_num = nc + np + ns # for event in user.get('events', []): @@ -181,7 +179,6 @@ class Engine: return [e[1] for e in rec_list] -# Current assumption: Avoid the same playlist in the grid view. In the future, the same playlist could be played in the grid view as long as these are differenct clips or separate times? def get_recommendations(self, user, vids_exclude = []): channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()} sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()} @@ -246,14 +243,14 @@ class Engine: } for video in videos] -# Output: playlists with updated in/out time of clips that have been watched. -# Watched is defined as a video being played in full screen. -# "watch_cutoff" parameter: the portion of the clip duration to be determined as watched the whole clip. should be [0,1] -# + check (play, pause) pairs and eliminate unusual cases most likely due to a bug. -# + If (play, pause) pairs exceed XX(80-90?) percent of the clip length, eliminate the clip from the playlist. -# + Otherwise, find the last pause position of a clip and record it as "in" position of the clip. -# + If the clips are all eliminated from a playlist, eliminate the playlist. - def update_user_playlists(playlists, user, watch_cutoff = 0.8): + def update_user_playlists(playlists, user, watch_cutoff = 0.9): + # Output: playlists with updated in/out time of clips that have been watched. + # Watched is defined as a video being played in full screen. + # "watch_cutoff" parameter: the portion of the clip duration to be determined as watched the whole clip. should be [0,1] + # + check (play, pause) pairs and eliminate unusual cases most likely due to a bug. + # + If (play, pause) pairs exceed XX(80-90?) percent of the clip length, eliminate the clip from the playlist. + # + Otherwise, find the last pause position of a clip and record it as "in" position of the clip. + # + If the clips are all eliminated from a playlist, eliminate the playlist. play = {} watched = [] clip_max_dur = 10800 # = 3 hours; arbitrary max duration allowed for (pause time - play time) to detect outlier/bugs @@ -270,9 +267,9 @@ class Engine: i = event["data"]["playlistPosition"] for playlist in playlists: if playlist["name"] == event["data"]["playlist"] and i < len(playlist["clips"]): - if play["data"]["position"] >= max(playlist["clips"][i]["in"] - 30, 0) and event["data"]["position"] <= playlist["clips"][i]["out"] + 30: - # This assumes the (play, pause) fits inside the clip's (in, out) segment with +/- 30secs buffer. Check if there are instances where this might not be the case. - # i.e. clip in/out may be edited (before after edit inconsistency); skip may trigger jump to a wrong clip (bug) + if play["data"]["position"] >= max(playlist["clips"][i]["in"] - 15, 0) and event["data"]["position"] <= playlist["clips"][i]["out"] + 15: + # This assumes the (play, pause) fits inside the clip's (in, out) segment with +/- 15secs buffer. There were newer edits of clip positions with 12 seconds difference. + # instances where this might not be the case: clip in/out may be largely edited (before after edit inconsistency); skip may trigger jump to a wrong clip (bug) if event["data"]["position"] >= ((playlist["clips"][i]["out"]-playlist["clips"][i]["in"])*watch_cutoff + playlist["clips"][i]["in"]): watched.append((playlist["name"],i)) else: From 834ba630f19b822744058928a0eed6b70c5fc8a8 Mon Sep 17 00:00:00 2001 From: pythagoraswitch Date: Fri, 16 Nov 2018 23:47:02 +0100 Subject: [PATCH 3/3] resolved conflist with master branch, added orig_in element for playlist, added update_user_playlists call in get_videos and get_next. --- recommendation_engine.py | 116 ++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 44 deletions(-) diff --git a/recommendation_engine.py b/recommendation_engine.py index 5cb107c..cb72f83 100644 --- a/recommendation_engine.py +++ b/recommendation_engine.py @@ -43,9 +43,8 @@ class Engine: else: self.state = { 'channels': { - 'globalKeywords': {'locked': False, 'value': 7}, - 'userKeywords': {'locked': False, 'value': 7}, - 'screenings': {'locked': True, 'value': 2} + 'globalKeywords': {'locked': False, 'value': 8}, + 'userKeywords': {'locked': False, 'value': 8} }, 'globalKeywords': {}, } @@ -55,6 +54,12 @@ class Engine: 'nextPlaylist': {'locked': False, 'value': 4}, 'staySame': {'locked': False, 'value': 8} } + if 'userKeywordsWeights' not in self.state: + self.state['userKeywordsWeights'] = { + 'themeTags': {'locked': False, 'value': 0.3}, + 'characterTags': {'locked': False, 'value': 0.7} + + } self.update_keywords() @property @@ -87,8 +92,16 @@ class Engine: def get_videos(self, user): - if user.get('events', [{}])[0].get("event")=="login": - return self.get_recommendations(user) + # Update self_playlists first to reflect changes + update_user_playlists(self.playlists, user) + + if user.get('events', [{}])[0].get("event")=="login": + return { + 'user': { + 'keywords': user.get('keywords', {}) + }, + 'videos': self.get_recommendations(user) + } channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()} sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()} @@ -97,29 +110,28 @@ class Engine: # check if there were grid events for all indexes. grid_events = {} (nc, np, ns) = (grid_change.get("nextClip"), grid_change.get("nextPlaylist"), grid_change.get("staySame")) - video_num = nc + np + ns + video_num = nc + np + ns - # for event in user.get('events', []): - # if event.get('event') == "grid" and event.get('data').get('index') not in grid_events: - # grid_events[event.get('data').get('index')] = event.get('data') - # if len(grid_events) == video_num: - # break - - # # The version where the loop also extract play_index (requires "index" in play event data): + # # The version where the loop also extract play_index; requires "index" in play event data (previously unavailable): play_index = None for event in user.get('events', []): if event.get('event') == "grid" and event.get('data').get('index') not in grid_events: grid_events[event.get('data').get('index')] = event.get('data') if event.get('event') == "play" and event["data"].get("type") == "video" and not play_index: - play_index = event.get('data').get('index') + play_index = event.get('data').get('index') if len(grid_events) == video_num and play_index: - break + break prev_grid_list = sorted([v for v in grid_events.values()], key=lambda k:k['index']) # if there were no grid events for all, initialize all grids. - if len(prev_grid_list) < video_num: - return self.get_recommendations(user) + if len(prev_grid_list) < video_num: + return { + 'user': { + 'keywords': user.get('keywords', {}) + }, + 'videos': self.get_recommendations(user) + } else: if play_index is None: @@ -144,7 +156,6 @@ class Engine: next_playlist_index.append(next_clip_index) break else: - # if "playlist" and "playlistPostion" (if not, default to 0) exists in grid event for playlist in self.playlists: if playlist.get('name')== prev_grid_list[i].get('playlist'): if len(playlist["clips"]) == 1: @@ -176,13 +187,20 @@ class Engine: rec_list += [(i,{}) for i in stay_same_index] rec_list = sorted(rec_list, key=lambda k:k[0]) - return [e[1] for e in rec_list] + videos_ = [e[1] for e in rec_list] + return { + 'user': { + 'keywords': user.get('keywords', {}) + }, + 'videos': videos_ + } def get_recommendations(self, user, vids_exclude = []): channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()} sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()} gridChange = {k: v.get('value', 0) for k, v in self.state['gridChange'].items()} + userKeywordsWeights = {k: v.get('value', 1) for k, v in self.state['userKeywordsWeights'].items()} # Exclude playlists from the most recent grid playlists = copy.deepcopy(self.playlists) @@ -191,13 +209,26 @@ class Engine: if playlist["name"] in vids_exclude: playlists.remove(playlist) - # For each playlist, compute user keyword score - user_keywords = user.get('keywords', {}) + # For each playlist, compute user keyword score by theme and character tags + user_keywords = copy.deepcopy(user.get('keywords', {})) + theme_tags = {k.lower():v for k,v in user_keywords.items() if not k.isupper()} + character_tags = {k:v for k,v in user_keywords.items() if k.isupper()} + # manually modify some of the user keywords to match the playlist tags + theme_tags["god"] = theme_tags.get("god - gods",0) + theme_tags["visionary"] = theme_tags.get("visionary - enlightenment",0) + theme_tags["enlightenment"] = theme_tags.get("visionary - enlightenment",0) + character_tags["FEDOR MIKHAILOVICH SOFRONOV"] = character_tags.get("FYODOR MIKHAILOVICH SOFRONOV",0) + character_tags["SHKABARNYA OLGA SERGEEVNA"] = character_tags.get("OLGA SERGEEVNA SHKABARNYA",0) + character_tags["VICTORIA OLEGOVNA SKITSKAYA"] = character_tags.get("VIKTORIA OLEGOVNA SKITSKAYA",0) + score = {} for playlist in playlists: - score[playlist['name']] = random.random() - for tag in [tag for tag in playlist['tags'] if tag in user_keywords]: - score[playlist['name']] += user_keywords[tag] + score[playlist['name']] = random.random() * 0.001 + for tag in playlist['tags']: + if tag in theme_tags: + score[playlist['name']] += theme_tags[tag] * userKeywordsWeights["themeTags"] + elif tag in character_tags: + score[playlist['name']] += character_tags[tag] * userKeywordsWeights["characterTags"] # Select highest scoring playlists playlists = sorted( playlists, @@ -216,23 +247,7 @@ class Engine: playlists, key=lambda playlist: -score[playlist['name']] ) - videos += playlists[:channels['globalKeywords']] - playlists = playlists[channels['globalKeywords']:] - # Count products the user has seen - count = defaultdict(lambda: 0) - for event in user.get('events', []): - if event.get('data', {}).get('product'): - count[event['data']['product']] += 1 - # For each product in playlist tags, increment score by count - for playlist in playlists: - score[playlist['name']] = random.random() - for tag in set(playlist['tags']) & set(count): - score[playlist['name']] += count[tag] - # Select highest scoring playlists - videos += sorted( - playlists, - key=lambda playlist: -score[playlist['name']] - )[:16 - channels['userKeywords'] - channels['globalKeywords']] + videos += playlists[:16 - channels['userKeywords']] # Shuffle playlists (randomize layout) and shift clips (randomize start) random.shuffle(videos) return [{ @@ -255,7 +270,8 @@ class Engine: watched = [] clip_max_dur = 10800 # = 3 hours; arbitrary max duration allowed for (pause time - play time) to detect outlier/bugs # The current max time of a clip duration is 10379.383333377269 from "DDLaunch: Erik Verlinde, Gravity as an emergent force (1956)" - for event in user["events"][::-1]: + # A user could potentially spend more than 3 hours if they keep watching after the clip enters into the subsequent "scene" + for event in user.get('events', [])[::-1]: if event["event"] == "play" and event["data"].get("type") == "video": play = event elif event["event"] == "pause" and play!={} and event["data"].get("type") == "video": @@ -270,9 +286,18 @@ class Engine: if play["data"]["position"] >= max(playlist["clips"][i]["in"] - 15, 0) and event["data"]["position"] <= playlist["clips"][i]["out"] + 15: # This assumes the (play, pause) fits inside the clip's (in, out) segment with +/- 15secs buffer. There were newer edits of clip positions with 12 seconds difference. # instances where this might not be the case: clip in/out may be largely edited (before after edit inconsistency); skip may trigger jump to a wrong clip (bug) - if event["data"]["position"] >= ((playlist["clips"][i]["out"]-playlist["clips"][i]["in"])*watch_cutoff + playlist["clips"][i]["in"]): + if "orig_in" not in playlist["clips"][i]: + cutoff_pos = (playlist["clips"][i]["out"]-playlist["clips"][i]["in"])*watch_cutoff + playlist["clips"][i]["in"] + else: + cutoff_pos = (playlist["clips"][i]["out"]-playlist["clips"][i]["orig_in"])*watch_cutoff + playlist["clips"][i]["orig_in"] + + if event["data"]["position"] >= cutoff_pos: watched.append((playlist["name"],i)) - else: + else: + if "orig_in" not in playlist["clips"][i]: + # record the original "in" position to calculate cutoff position in the future + playlist["clips"][i]["orig_in"] = playlist["clips"][i]["in"] + # update "in" position of the clip in the playlist playlist["clips"][i]["in"] = event["data"]["position"] break play = {} @@ -292,6 +317,9 @@ class Engine: def get_next(self, user, position): + # Update self_playlists first to reflect changes + update_user_playlists(self.playlists, user) + grid_events = {} video_num = 16