dont default to random and don't fail if it is missing

typo
use code as secondary sort
2018-12-03 21:53:13 +01:00 · 2018-12-01 19:47:06 +01:00 · 2018-12-01 19:47:06 +01:00 · 2018-12-01 11:02:36 +01:00 · 2018-12-01 11:00:46 +01:00 · 2018-12-01 10:11:16 +01:00
1 changed file with 220 additions and 108 deletions
--- a/recommendation_engine.py
+++ b/recommendation_engine.py
@ -1,6 +1,6 @@
 '''
-Recommendation Engine Example
+Recommendation Engine ver 1
-1 Nov 2017, 0x2620
+30 Nov 2018, 0x2620
 '''
 from collections import defaultdict
@ -33,6 +33,10 @@ class Engine:
        if os.path.exists(filename):
            with open(filename) as f:
                self.playlists = json.load(f)
            # ## the following is for testing purpose.
            # for playlist in self.playlists:
            #     for clip in playlist["clips"]:
            #         clip["pass"] = bool(random.getrandbits(1))
        else:
            self.playlists = []
@ -43,18 +47,25 @@ class Engine:
        else:
            self.state = {
                'channels': {
-                    'globalKeywords': {'locked': False, 'value': 7},
+                    'globalKeywords': {'locked': False, 'value': 8},
-                    'userKeywords': {'locked': False, 'value': 7},
+                    'userKeywords': {'locked': False, 'value': 8}
                    'screenings': {'locked': True, 'value': 2}
                },
                'globalKeywords': {},
            }
        if 'gridChange' not in self.state:
            self.state['gridChange'] = {
-                'nextClip': {'locked': True, 'value': 4},
+                'nextClip': {'locked': False, 'value': 5},
-                'nextPlaylist': {'locked': False, 'value': 4},
+                'nextPlaylist': {'locked': False, 'value': 8},
-                'staySame': {'locked': False, 'value': 8}
+                'staySame': {'locked': True, 'value': 3}
            }
        if 'userKeywordsWeights' not in self.state:
            self.state['userKeywordsWeights'] = {
                'themeTags': {'locked': False, 'value': 0.3},
                'characterTags': {'locked': False, 'value': 0.7},
                'random': {'locked': False, 'value': False}
            }
        if 'random' not in self.state['userKeywordsWeights']:
            self.state['userKeywordsWeights']['random'] = {'locked': False, 'value': False}
        self.update_keywords()
    @property
@ -87,25 +98,47 @@ class Engine:
    def get_videos(self, user):
        ## Output is a dictionary of: user keyword scores, list of videos for each grid index (0-15),
        ## and parameters to be displayed on debug view.
        ## It implements "next clip" "next playlist" "stay same" grid allocation for the output video, depending on the user log history.
        # Update self_playlists to reflect user log history
        playlists = self.update_user_playlists(user)
        # Get the user keyword scores for debug view
        user_keywords = copy.deepcopy(user.get('keywords', {}))
        theme_tags = {k.lower():v for k,v in user_keywords.items() if not k.isupper()}
        character_tags = {k:v for k,v in user_keywords.items() if k.isupper()}
        top_user_keywords = sorted([(k,v) for (k,v) in theme_tags.items()], key=lambda kv: kv[1])[-10:]
        top_user_characters = sorted([(k,v) for (k,v) in character_tags.items()], key=lambda kv: kv[1])[-10:]
        debug_index_output = defaultdict(list)
        # If the most recent event is "login," initialize grid videos.
        if user.get('events', [{}])[0].get("event")=="login":
-            return self.get_recommendations(user) 
+            rec = self.get_recommendations(playlists, user)
            return {
                'user': {
                    'keywords': user.get('keywords', {})
                },
                'videos': rec["videos"],
                "_debug": {
                    "top_user_keywords": top_user_keywords,
                    "top_user_characters": top_user_characters,
                    "top_user_playlists": rec["top_user_playlists"],
                    "top_global_playlists": rec["top_global_playlists"]
                }
            }
        channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()}
        sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()}
        grid_change = {k: v.get('value', 0) for k, v in self.state['gridChange'].items()}
        # check if there were grid events for all indexes.
        grid_events = {}
        (nc, np, ns) = (grid_change.get("nextClip"), grid_change.get("nextPlaylist"), grid_change.get("staySame"))
        video_num = nc + np + ns
-        # for event in user.get('events', []):
+        # collect the most recent grid event for each grid index and the grid index of the most recent play event.
-        #     if event.get('event') == "grid" and event.get('data').get('index') not in grid_events:
+        # the following requires "index" in play event data (previously unavailable)
        #         grid_events[event.get('data').get('index')] = event.get('data')
        #     if len(grid_events) == video_num:
        #         break
        # # The version where the loop also extract play_index (requires "index" in play event data):
        play_index = None
        for event in user.get('events', []):
            if event.get('event') == "grid" and event.get('data').get('index') not in grid_events:
@ -119,14 +152,26 @@ class Engine:
        # if there were no grid events for all, initialize all grids.
        if len(prev_grid_list) < video_num:
-            return self.get_recommendations(user)   
+            rec = self.get_recommendations(playlists, user)
            return {
                'user': {
                    'keywords': user.get('keywords', {})
                },
                'videos': rec["videos"],
                "_debug": {
                    "top_user_keywords": top_user_keywords,
                    "top_user_characters": top_user_characters,
                    "top_user_playlists": rec["top_user_playlists"],
                    "top_global_playlists": rec["top_global_playlists"]
                }
            }
        else:
            if play_index is None:
                video_indx = list(range(video_num))
                random.shuffle(video_indx)
            else:
-            # played index is excluded from the random shuffle and deterministically added to staySame pool.
+            # play index is excluded from the random shuffle and deterministically added to staySame pool.
                video_indx = [*range(play_index)]+[*range(play_index+1,video_num)]
                random.shuffle(video_indx)
                video_indx.append(play_index)
@ -136,76 +181,132 @@ class Engine:
            stay_same_index = video_indx[nc+np:]
            rec_list = []
-            # select next clip for nextClip pool except when the playlist has only one clip. 
+
            # nextClip pool: select next clip except when the playlist has only one clip. skip the clip with "pass":True when selecting the next clip.
            for i in next_clip_index:
-                if prev_grid_list[i].get('playlist') is None:
+                # add this to deal with the absence of "playlist" data in old grid event or the case where the playlist has been eliminated due to update_user_playlists().
-                # add this to deal with the absence of "playlist" data in old grid event.
+                if prev_grid_list[i].get("playlist") not in [playlist["name"] for playlist in playlists]:
-                # If there's no playlist data recorded, add the nextClip pool to nextPlaylist pool.
+                    next_playlist_index.append(i)
                    next_playlist_index.append(next_clip_index)
                    break
                else:
-                # if "playlist" and "playlistPostion" (if not, default to 0) exists in grid event
+                    for playlist in playlists:
                    for playlist in self.playlists:
                        if playlist.get('name')== prev_grid_list[i].get('playlist'):
                            unwatched_clips_indx = [j for j in range(len(playlist["clips"])) if not playlist["clips"][j].get("pass")]
                            if len(playlist["clips"]) == 1:
                                next_playlist_index.append(i)
                                break
                            # Discuss what this behaviour should be: should it switch to a new playlist if it is already at the end of the playlist clip sequence?
                            elif prev_grid_list[i].get('playlistPosition', 0) + 1 == len(playlist['clips']):
                                playlist_pos = 0
                            else:
-                                playlist_pos = prev_grid_list[i].get('playlistPosition', 0) + 1
+                                next_unwatched_indx = [j for j in unwatched_clips_indx if j > prev_grid_list[i]['playlistPosition']]
                                if len(next_unwatched_indx) == 0:
                                    if unwatched_clips_indx[0] != prev_grid_list[i]['playlistPosition']:
                                        playlist_pos = unwatched_clips_indx[0]
                                    else:
                                        next_playlist_index.append(i)
                                        break
                                else:
                                    playlist_pos = next_unwatched_indx[0]
-                            rec_list.append((i, {
+                                rec_list.append((i, {
-                                'clips': playlist['clips'],
+                                    'clips': playlist['clips'],
-                                # 'position': random.randrange(len(playlist['clips'])),
+                                    'position': playlist_pos,
-                                'position': playlist_pos,
+                                    'name': playlist['name'],
-                                'name': playlist['name'],
+                                    'tags': playlist['tags']
-                                'tags': playlist['tags'],
+                                }))
                            }))
-            # randomly select playlists (excluding the playlists from the current grid once "playlist" is recorded for grid events)
+                                debug_index_output["next_clip"].append((i,playlist['name']))
-            # for nextPlaylist pool.
+
            #staySame pool
            for i in stay_same_index:
                # add this to deal with the absence of "playlist" data in old grid event or the case where the playlist has been eliminated due to update_user_playlists().
                if prev_grid_list[i].get("playlist") not in [playlist["name"] for playlist in playlists]:
                    next_playlist_index.append(i)
                else:
                    rec_list.append((i,{}))
                    debug_index_output["stay_same"].append(i)
            # nextPlaylist pool: randomly select playlists (excluding the playlists from the current grid).
            vids_exclude = [e.get("playlist") for e in prev_grid_list]
            while None in vids_exclude:
                vids_exclude.remove(None)
-            video = self.get_recommendations(user, vids_exclude)
+            rec = self.get_recommendations(playlists, user, vids_exclude)
-            rec_list += [(i, video[i]) for i in next_playlist_index]
+            rec_list += [(i, rec['videos'][i]) for i in next_playlist_index]
            debug_index_output["new_playlist"] = [(i, rec['videos'][i]["name"]) for i in next_playlist_index]
            #staySame pool
            rec_list += [(i,{}) for i in stay_same_index]
            rec_list = sorted(rec_list, key=lambda k:k[0])
-            return [e[1] for e in rec_list]
+            videos_ = [e[1] for e in rec_list]
            return {
                'user': {
                    'keywords': user.get('keywords', {})
                },
                'videos': videos_,
                "_debug": {
                    "top_user_keywords": top_user_keywords, # list of (keyword, score)
                    "top_user_characters": top_user_characters, # list of (keyword, score)
                    "top_user_playlists": rec["top_user_playlists"], # list of (playlist name, score)
                    "top_global_playlists": rec["top_global_playlists"], # list of (playlist name, score)
                    "stay_same_index": debug_index_output["stay_same"], # list of integers
                    "next_clip_index": debug_index_output["next_clip"], # list of (integer, playlist name)
                    "new_playlist_index": debug_index_output["new_playlist"] # list of (integer, playlist name)
                }
            }
-    def get_recommendations(self, user, vids_exclude = []):
+    def get_recommendations(self, playlists, user, vids_exclude = []):
        channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()}
        sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()}
        gridChange = {k: v.get('value', 0) for k, v in self.state['gridChange'].items()}
        userKeywordsWeights = {k: v.get('value', 1) for k, v in self.state['userKeywordsWeights'].items()}
        # Exclude playlists from the most recent grid 
        playlists = copy.deepcopy(self.playlists)
        if len(vids_exclude) > 0:
            for playlist in playlists:
                if playlist["name"] in vids_exclude:
                    playlists.remove(playlist)
-        # For each playlist, compute user keyword score
+        # Generate random weights if random option is chosen in the dashboard:
-        user_keywords = user.get('keywords', {})
+        if userKeywordsWeights.get('random'):
            themeWeights = random.random()
            charWeights = 1-themeWeights
        else:
            themeWeights = userKeywordsWeights['themeTags']
            charWeights = userKeywordsWeights['characterTags']
        # For each playlist, compute user keyword score by theme and character tags
        user_keywords = copy.deepcopy(user.get('keywords', {}))
        theme_tags = {k.lower():v for k,v in user_keywords.items() if not k.isupper()}
        character_tags = {k:v for k,v in user_keywords.items() if k.isupper()}
        # manually modify some of the user keywords to match the playlist tags
        theme_tags["god"] = theme_tags.get("god - gods",0)
        theme_tags["visionary"] = theme_tags.get("visionary - enlightenment",0)
        theme_tags["enlightenment"] = theme_tags.get("visionary - enlightenment",0)
        character_tags["FEDOR MIKHAILOVICH SOFRONOV"] = character_tags.get("FYODOR MIKHAILOVICH SOFRONOV",0)
        character_tags["SHKABARNYA OLGA SERGEEVNA"] = character_tags.get("OLGA SERGEEVNA SHKABARNYA",0)
        character_tags["VICTORIA OLEGOVNA SKITSKAYA"] = character_tags.get("VIKTORIA OLEGOVNA SKITSKAYA",0)
        score = {}
        for playlist in playlists:
-            score[playlist['name']] = random.random()
+            score[playlist['name']] = random.random() * 0.1
-            for tag in [tag for tag in playlist['tags'] if tag in user_keywords]:
+            for tag in playlist['tags']:
-                score[playlist['name']] += user_keywords[tag]
+                if tag in theme_tags:
                    score[playlist['name']] += theme_tags[tag] * themeWeights
                elif tag in character_tags:
                    score[playlist['name']] += character_tags[tag] * charWeights
        # Select highest scoring playlists
        playlists = sorted(
            playlists,
            key=lambda playlist: -score[playlist['name']]
        )
        # Record the following for debug view input
        top_user_playlists = [(playlist['name'], score[playlist['name']]) for playlist in playlists[:channels['userKeywords']]]
        # top_user_playlists = [{
        #     'name': playlist['name'],
        #     'tags': playlist['tags'],
        #     'score': score[playlist['name']],
        #     } for playlist in playlists[:channels['userKeywords']]]
        videos = playlists[:channels['userKeywords']]
        playlists = playlists[channels['userKeywords']:]
-        # For each playlist, compute global keyword score
+       # For each playlist, compute global keyword score
        score = {}
        for playlist in playlists:
            score[playlist['name']] = random.random()
@ -216,46 +317,43 @@ class Engine:
            playlists,
            key=lambda playlist: -score[playlist['name']]
        )
-        videos += playlists[:channels['globalKeywords']]
+        # Record the following for debug view input
-        playlists = playlists[channels['globalKeywords']:]
+        top_global_playlists = [(playlist['name'], score[playlist['name']]) for playlist in playlists[:channels['globalKeywords']]]
-        # Count products the user has seen
+        # top_global_playlists = [{
-        count = defaultdict(lambda: 0)
+        #     'name': playlist['name'],
-        for event in user.get('events', []):
+        #     'tags': playlist['tags'],
-            if event.get('data', {}).get('product'):
+        #     'score': score[playlist['name']],
-                count[event['data']['product']] += 1
+        #     } for playlist in playlists[:channels['globalKeywords']]]
-        # For each product in playlist tags, increment score by count
+
-        for playlist in playlists:
+        videos += playlists[:16 - channels['userKeywords']]
            score[playlist['name']] = random.random()
            for tag in set(playlist['tags']) & set(count):
                score[playlist['name']] += count[tag]
        # Select highest scoring playlists
        videos += sorted(
            playlists,
            key=lambda playlist: -score[playlist['name']]
        )[:16 - channels['userKeywords'] - channels['globalKeywords']]
        # Shuffle playlists (randomize layout) and shift clips (randomize start)
        random.shuffle(videos)
-        return [{
+        return {
-            'clips': video['clips'],
+            'videos': [{
-            'position': random.randrange(len(video['clips'])),
+                'clips': video['clips'],
-            'name': video['name'],
+                'position': random.choice([i for i in range(len(video["clips"])) if not video["clips"][i].get("pass")]),
-            'tags': video['tags'],
+                'name': video['name'],
-        } for video in videos]
+                'tags': video['tags'],
            } for video in videos],
            "top_user_playlists":top_user_playlists, 
            "top_global_playlists": top_global_playlists
        }
-    def update_user_playlists(playlists, user, watch_cutoff = 0.9):
+    def update_user_playlists(self, user, watch_cutoff = 0.9):
-    # Output: playlists with updated in/out time of clips that have been watched.
+    # Output: playlists with updated in/out time of clips that have been watched as well as "pass" indicators for the clips that have been watched for more than watch_cutoff.
    # Watched is defined as a video being played in full screen.
    # "watch_cutoff" parameter: the fraction of the clip duration beyond which the whole clip counts as watched. Should be in [0,1].
    # + check (play, pause) pairs and eliminate unusual cases most likely due to a bug.
-    # + If (play, pause) pairs exceed XX(80-90?) percent of the clip length, eliminate the clip from the playlist.
+    # + If (play, pause) pairs exceed XX(80-90?) percent of the clip length, add "pass": True to the clip.
    # + Otherwise, find the last pause position of a clip and record it as "in" position of the clip.
-    # + If the clips are all eliminated from a playlist, eliminate the playlist.
+    # + If clips are all marked as "pass" in a playlist, eliminate the playlist from the user playlists.
        playlists = copy.deepcopy(self.playlists)
        play = {}
        watched = []
        clip_max_dur = 10800 # = 3 hours; arbitrary max duration allowed for (pause time - play time) to detect outlier/bugs
        # The current max time of a clip duration is 10379.383333377269 from "DDLaunch: Erik Verlinde, Gravity as an emergent force (1956)"
-        for event in user["events"][::-1]:
+        # A user could potentially spend more than 3 hours if they keep watching after the clip enters into the subsequent "scene"
        for event in user.get('events', [])[::-1]:
            if event["event"] == "play" and event["data"].get("type") == "video":
                play = event
            elif event["event"] == "pause" and play!={} and event["data"].get("type") == "video":
@ -270,28 +368,37 @@ class Engine:
                                if play["data"]["position"] >= max(playlist["clips"][i]["in"] - 15, 0) and event["data"]["position"] <= playlist["clips"][i]["out"] + 15:
                                    # This assumes the (play, pause) fits inside the clip's (in, out) segment with +/- 15secs buffer. There were newer edits of clip positions with 12 seconds difference.
                                    # instances where this might not be the case: clip in/out may be largely edited (before after edit inconsistency); skip may trigger jump to a wrong clip (bug)
-                                    if event["data"]["position"] >= ((playlist["clips"][i]["out"]-playlist["clips"][i]["in"])*watch_cutoff + playlist["clips"][i]["in"]):
+                                    if "orig_in" not in playlist["clips"][i]:
-                                        watched.append((playlist["name"],i))
+                                        cutoff_pos = (playlist["clips"][i]["out"]-playlist["clips"][i]["in"])*watch_cutoff + playlist["clips"][i]["in"]
                                    else:
                                        cutoff_pos = (playlist["clips"][i]["out"]-playlist["clips"][i]["orig_in"])*watch_cutoff + playlist["clips"][i]["orig_in"]
                                    if event["data"]["position"] >= cutoff_pos:
                                        playlist["clips"][i]["pass"] = True
                                    else:
                                        if "orig_in" not in playlist["clips"][i]:
                                            # record the original "in" position to calculate cutoff position in the future
                                            playlist["clips"][i]["orig_in"] = playlist["clips"][i]["in"]
                                        # update "in" position of the clip in the playlist
                                        playlist["clips"][i]["in"] = event["data"]["position"]
                                break
                play = {}
-        d_watched = defaultdict(set)
+        for playlist in playlists.copy():
-        for k, v in watched:
+            unwatched = [clip for clip in playlist["clips"] if not clip.get("pass")]
-            d_watched[k].add(v)
+            if not unwatched:
-        for k, v in d_watched.items():
+                playlists.remove(playlist)
-            for playlist in playlists:
+        # If fewer than 30 playlists remain, reset to the original playlists.
-                if playlist["name"] == k:
+        if len(playlists) < 30:
-                    if len(v) == len(playlist["clips"]):
+            playlists = copy.deepcopy(self.playlists)
                        playlists.remove(playlist)
                    else:
                        playlist["clips"] = [playlist["clips"][i] for i in range(len(playlist["clips"])) if i not in v]
                    break
        return(playlists)
    def get_next(self, user, position):
        # Update self_playlists to reflect user log history
        playlists = self.update_user_playlists(user)
        grid_events = {}
        video_num = 16 
@ -302,8 +409,8 @@ class Engine:
                break
        prev_grid_list = sorted([v for v in grid_events.values()], key=lambda k:k['index'])
        vids_exclude = [e.get("playlist") for e in prev_grid_list]
-        video = self.get_recommendations(user, vids_exclude)[position]
+        rec = self.get_recommendations(playlists, user, vids_exclude)
-        return video
+        return rec["videos"][position]
    def update_state(self, data):
        for key in data:
@ -365,10 +472,15 @@ class Engine:
            f.write(json.dumps(videos, indent=4, sort_keys=True))
        # Get video order
        order = {video['id']: int(video['order']) for video in videos}
        code = {video['id']: video['code'] for video in videos}
        # Sort clips
        clips = sorted(
            clips,
-            key=lambda clip: (order[clip['id'].split('/')[0]], clip['in'])
+            key=lambda clip: (
                order[clip['id'].split('/')[0]],
                ox.sort_string(code[clip['id'].split('/')[0]]),
                clip['in']
            )
        )
        # Get and cache playlists
        self.playlists = [playlist for playlist in [{
@ -377,13 +489,13 @@ class Engine:
            'tags': storyline['tags'],
            'clips': [{
                'item': clip['id'].split('/')[0],
-                'id': clip['id'],
+                'id': '%s_%0.3f-%0.3f' % (clip['id'].split('/')[0], clip['in'], clip['out']),
                'in': clip['in'],
                'out': clip['out']
            } for clip in clips if clip['value'] == storyline['name']]
        } for storyline in storylines] if playlist['clips']]
        with open(os.path.join(self.path, 'playlists.json'), 'w') as f:
-            f.write(json.dumps(self.playlists, indent=4, sort_keys=True))
+            f.write(json.dumps(self.playlists, indent=4, sort_keys=True, ensure_ascii=False))
        self.update_keywords()
    def update_keywords(self):
Author	SHA1	Message	Date
j	d3f1a95c5f	dont default to random and don't fail if it is missing	2018-12-03 21:53:13 +01:00
j	3c80c721d3	typo	2018-12-01 19:47:06 +01:00
j	97a65ad52d	use code as secondary sort	2018-12-01 19:47:06 +01:00
pythagoraswitch	8c618ab988	changed the magnitude of random bias in user keyword ranking	2018-12-01 11:02:36 +01:00
pythagoraswitch	d96edf480f	changed debug output for dd-re; added stay_same_index etc	2018-12-01 11:00:46 +01:00
pythagoraswitch	6899cc5d37	added random option for userKeywordWeights	2018-12-01 10:11:16 +01:00
j	b749d66bac	typos	2018-11-30 08:52:42 +01:00
j	b41b56941c	use time based id	2018-11-30 08:50:49 +01:00
pythagoraswitch	e3c61853c3	added top scoring playlists for user and global keywords to dd-re debug view	2018-11-29 21:05:59 +01:00
pythagoraswitch	319e4d384e	updated comments, cleaned up nextClip and staySame process in get_videos, reflecting recent change from update_user_play().	2018-11-29 15:27:21 +01:00
pythagoraswitch	2405b8b7c7	added top scoring keywords to debug view output	2018-11-20 02:22:09 +01:00
pythagoraswitch	001e377003	bug fixes	2018-11-20 01:41:37 +01:00
pythagoraswitch	21781747a2	adjusted to the variable 'pass' for nextClip and staySame cases in get_videos	2018-11-19 22:24:41 +01:00
pythagoraswitch	4c2cb60d9c	adjusted to add the variable 'pass' for the clips that has been watched more than cutoff time	2018-11-19 21:07:04 +01:00
j	b5172bbc96	pass self to update_user_playlists	2018-11-19 20:32:54 +01:00
j	4fadcf7927	use update_user_playlists to limit playlist per user	2018-11-19 19:52:00 +01:00
j	d51748b1d0	fix merge conflicts	2018-11-19 19:51:21 +01:00
pythagoraswitch	b3d727a9d0	Merge branch 'iss2'	2018-11-16 23:57:31 +01:00
pythagoraswitch	834ba630f1	resolved conflist with master branch, added orig_in element for playlist, added update_user_playlists call in get_videos and get_next.	2018-11-16 23:47:02 +01:00
pythagoraswitch	81cfe9c9d8	removed the outdated portion: screening parameter etc	2018-09-26 23:56:29 +02:00
pythagoraswitch	024c1008fb	implemented user keyword score ranking	2018-09-26 22:14:58 +02:00
j	4bd4af703e	only change return signature of get_videos	2018-09-25 17:50:52 +02:00
j	0fe5752db3	return user keywords	2018-09-21 22:09:45 +02:00