'''
Recommendation Engine ver 1
30 Nov 2018, 0x2620
'''
from collections import defaultdict
import copy
import json
import logging
import os
import random
import time

import ox

from utils import run_async

logger = logging.getLogger(__name__)
verbose = True


class Engine:
    '''
    Recommendation engine that fills a grid of videos for a user.

    Playlists and dashboard state are cached as JSON files
    (playlists.json, state.json, videos.json) below ``path``; the
    playlists themselves are built from a pan.do/ra instance (see
    ``update``).
    '''

    # Lazily created Pandora API wrapper, see the ``pandora`` property.
    _pandora = None

    def __init__(self, path, **kwargs):
        '''
        Load cached playlists and state from ``path``.

        Optional keyword arguments ``pandora`` (API url), ``username`` and
        ``password`` override the default pan.do/ra credentials.
        '''
        self.path = path
        self.pandora_args = dict(
            url=kwargs.get('pandora', 'http://pandora.dmp/api/'),
            username=kwargs.get('username', 'dd.re'),
            password=kwargs.get('password', 'dd.re')
        )
        filename = os.path.join(self.path, 'playlists.json')
        if os.path.exists(filename):
            with open(filename) as f:
                self.playlists = json.load(f)
            # ## the following is for testing purpose.
            # for playlist in self.playlists:
            #     for clip in playlist["clips"]:
            #         clip["pass"] = bool(random.getrandbits(1))
        else:
            self.playlists = []
        filename = os.path.join(self.path, 'state.json')
        if os.path.exists(filename):
            with open(filename) as f:
                self.state = json.load(f)
        else:
            self.state = {
                'channels': {
                    'globalKeywords': {'locked': False, 'value': 8},
                    'userKeywords': {'locked': False, 'value': 8}
                },
                'globalKeywords': {},
            }
        # Older state files may predate these sections; add the defaults.
        if 'gridChange' not in self.state:
            self.state['gridChange'] = {
                'nextClip': {'locked': False, 'value': 5},
                'nextPlaylist': {'locked': False, 'value': 8},
                'staySame': {'locked': True, 'value': 3}
            }
        if 'userKeywordsWeights' not in self.state:
            self.state['userKeywordsWeights'] = {
                'themeTags': {'locked': False, 'value': 0.3},
                'characterTags': {'locked': False, 'value': 0.7},
                'random': {'locked': False, 'value': True}
            }
        self.update_keywords()

    @property
    def pandora(self):
        '''
        Return the Pandora API wrapper, connecting on first use.

        Blocks and retries every 10 seconds until the connection succeeds.
        '''
        while not self._pandora:
            try:
                self._pandora = Pandora(**self.pandora_args)
            except Exception:
                # Not a bare ``except``: KeyboardInterrupt / SystemExit must
                # still be able to end the retry loop.
                logger.error('failed to connect to pandora, retry in 10 seconds')
                time.sleep(10)
        return self._pandora

    def _patch_clips(self, clips):
        '''
        Set each clip's 'out' to the next in-point within the same video.

        The last clip of a video gets the video's duration as 'out'.
        ``clips`` is modified in place and returned.
        '''
        inpoints = defaultdict(list)
        for index, clip in enumerate(clips):
            video_id = clip['id'].split('/')[0]
            inpoints[video_id].append({
                'index': index,
                'position': clip['in']
            })
        for video_id, points in inpoints.items():
            # Sort once and take the successor from the *sorted* list, so the
            # "next in-point" really is the next one by position.
            ordered = sorted(points, key=lambda inpoint: inpoint['position'])
            for i, inpoint in enumerate(ordered):
                if i < len(ordered) - 1:
                    clips[inpoint['index']]['out'] = ordered[i + 1]['position']
                else:
                    clips[inpoint['index']]['out'] = self.pandora.get(video_id, ['duration'])['duration']
        return clips

    @staticmethod
    def _scan_events(events, video_num, track_play=False):
        '''
        Collect the first grid event found for each grid index.

        ``events`` is scanned front to back (callers treat the first entry
        as the most recent event). With ``track_play`` the 'index' of the
        first video play event carrying one is returned as well.

        Returns ``(grid_events, play_index)``; ``grid_events`` maps grid
        index to the event's data, ``play_index`` is None if not found.
        '''
        grid_events = {}
        play_index = None
        for event in events:
            kind = event.get('event')
            data = event.get('data') or {}
            if kind == "grid":
                index = data.get('index')
                # Grid events without an index cannot be placed; skip them.
                if index is not None and index not in grid_events:
                    grid_events[index] = data
            elif (track_play and kind == "play" and play_index is None
                    and data.get("type") == "video"):
                # ``is None`` rather than truthiness: grid index 0 is valid.
                play_index = data.get('index')
            if len(grid_events) == video_num and (not track_play or play_index is not None):
                break
        return grid_events, play_index

    @staticmethod
    def _next_clip_position(playlist, previous):
        '''
        Return the position of the next unwatched clip of ``playlist``.

        ``previous`` is the data of the grid event currently shown in the
        cell. Returns None when the cell should get a new playlist instead:
        unknown playlist, single-clip playlist, unknown current position, or
        no unwatched clip other than the current one.
        '''
        if playlist is None or len(playlist["clips"]) == 1:
            return None
        current = previous.get('playlistPosition')
        if current is None:
            return None
        unwatched = [j for j, clip in enumerate(playlist["clips"]) if not clip.get("pass")]
        for j in unwatched:
            if j > current:
                return j
        # Nothing unwatched after the current clip: wrap around to the first
        # unwatched clip unless that is the current clip itself.
        if unwatched and unwatched[0] != current:
            return unwatched[0]
        return None

    def _initial_grid(self, playlists, user, top_user_keywords, top_user_characters):
        '''
        Build the response for a grid that is filled from scratch.
        '''
        rec = self.get_recommendations(playlists, user)
        return {
            'user': {
                'keywords': user.get('keywords', {})
            },
            'videos': rec["videos"],
            "_debug": {
                "top_user_keywords": top_user_keywords,
                "top_user_characters": top_user_characters,
                "top_user_playlists": rec["top_user_playlists"],
                "top_global_playlists": rec["top_global_playlists"]
            }
        }

    def get_videos(self, user):
        '''
        Return the videos for each grid index plus debug information.

        Output is a dictionary of: user keyword scores, list of videos for
        each grid index, and parameters to be displayed on debug view.
        Depending on the user log history, each grid cell is assigned to
        "next clip", "next playlist" or "stay same"; the pool sizes come
        from ``self.state['gridChange']``. Cells in the "stay same" pool are
        returned as empty dicts.
        '''
        # Update playlists to reflect user log history
        playlists = self.update_user_playlists(user)

        # Get the user keyword scores for debug view.
        # All-uppercase keywords are character tags, the rest theme tags.
        user_keywords = user.get('keywords', {})
        theme_tags = {k.lower(): v for k, v in user_keywords.items() if not k.isupper()}
        character_tags = {k: v for k, v in user_keywords.items() if k.isupper()}
        top_user_keywords = sorted(theme_tags.items(), key=lambda kv: kv[1])[-10:]
        top_user_characters = sorted(character_tags.items(), key=lambda kv: kv[1])[-10:]
        debug_index_output = defaultdict(list)

        events = user.get('events') or []

        # If the most recent event is "login," initialize grid videos.
        if events and events[0].get("event") == "login":
            return self._initial_grid(playlists, user, top_user_keywords, top_user_characters)

        grid_change = {k: v.get('value', 0) for k, v in self.state['gridChange'].items()}
        n_next_clip = grid_change.get("nextClip", 0)
        n_next_playlist = grid_change.get("nextPlaylist", 0)
        n_stay_same = grid_change.get("staySame", 0)
        video_num = n_next_clip + n_next_playlist + n_stay_same

        # collect the most recent grid event for each grid index and the grid
        # index of the most recent play event.
        # the following requires "index" in play event data (previously unavailable)
        grid_events, play_index = self._scan_events(events, video_num, track_play=True)
        prev_grid_list = sorted(grid_events.values(), key=lambda k: k['index'])

        # if there were no grid events for all, initialize all grids.
        if len(prev_grid_list) < video_num:
            return self._initial_grid(playlists, user, top_user_keywords, top_user_characters)

        if play_index is None:
            video_indx = list(range(video_num))
            random.shuffle(video_indx)
        else:
            # play index is excluded from the random shuffle and
            # deterministically added to staySame pool.
            video_indx = [*range(play_index)] + [*range(play_index + 1, video_num)]
            random.shuffle(video_indx)
            video_indx.append(play_index)

        next_clip_index = video_indx[:n_next_clip]
        next_playlist_index = video_indx[n_next_clip:n_next_clip + n_next_playlist]
        stay_same_index = video_indx[n_next_clip + n_next_playlist:]

        # Name lookup built once; on duplicate names the first playlist wins.
        by_name = {}
        for playlist in playlists:
            by_name.setdefault(playlist["name"], playlist)

        rec_list = []

        # nextClip pool: select next clip except when the playlist has only
        # one clip. skip the clip with "pass":True when selecting the next clip.
        # Cells whose playlist is missing (old grid events without "playlist",
        # or playlists eliminated by update_user_playlists()) move to the
        # nextPlaylist pool.
        for i in next_clip_index:
            previous = prev_grid_list[i]
            playlist = by_name.get(previous.get("playlist"))
            playlist_pos = self._next_clip_position(playlist, previous)
            if playlist_pos is None:
                next_playlist_index.append(i)
                continue
            rec_list.append((i, {
                'clips': playlist['clips'],
                'position': playlist_pos,
                'name': playlist['name'],
                'tags': playlist['tags']
            }))
            debug_index_output["next_clip"].append((i, playlist['name']))

        # staySame pool (same fallback for missing playlists as above)
        for i in stay_same_index:
            if prev_grid_list[i].get("playlist") not in by_name:
                next_playlist_index.append(i)
            else:
                rec_list.append((i, {}))
                debug_index_output["stay_same"].append(i)

        # nextPlaylist pool: randomly select playlists (excluding the
        # playlists from the current grid).
        vids_exclude = [
            e.get("playlist") for e in prev_grid_list
            if e.get("playlist") is not None
        ]
        rec = self.get_recommendations(playlists, user, vids_exclude)
        rec_list += [(i, rec['videos'][i]) for i in next_playlist_index]
        debug_index_output["new_playlist"] = [
            (i, rec['videos'][i]["name"]) for i in next_playlist_index
        ]

        rec_list = sorted(rec_list, key=lambda k: k[0])
        videos_ = [e[1] for e in rec_list]

        return {
            'user': {
                'keywords': user.get('keywords', {})
            },
            'videos': videos_,
            "_debug": {
                "top_user_keywords": top_user_keywords,  # list of (keyword, score)
                "top_user_characters": top_user_characters,  # list of (keyword, score)
                "top_user_playlists": rec["top_user_playlists"],  # list of (playlist name, score)
                "top_global_playlists": rec["top_global_playlists"],  # list of (playlist name, score)
                "stay_same_index": debug_index_output["stay_same"],  # list of integers
                "next_clip_index": debug_index_output["next_clip"],  # list of (integer, playlist name)
                "new_playlist_index": debug_index_output["new_playlist"]  # list of (integer, playlist name)
            }
        }

    def get_recommendations(self, playlists, user, vids_exclude=None):
        '''
        Score ``playlists`` and pick the ones to show.

        The top ``channels['userKeywords']`` playlists by user keyword score
        are taken first; the rest is filled (up to 16 in total) by global
        keyword (slider) score. Playlists named in ``vids_exclude`` are
        ignored. The passed-in ``playlists`` list is not modified.

        Returns a dict with 'videos' (shuffled list of playlists, each with
        a random unwatched start 'position'), 'top_user_playlists' and
        'top_global_playlists' (lists of (playlist name, score)).
        '''
        channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()}
        sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()}
        userKeywordsWeights = {k: v.get('value', 1) for k, v in self.state['userKeywordsWeights'].items()}

        # Exclude playlists from the most recent grid. Build a new list
        # instead of removing while iterating, which skips elements.
        if vids_exclude:
            excluded = set(vids_exclude)
            playlists = [playlist for playlist in playlists if playlist["name"] not in excluded]

        # Generate random weights if random option is chosen in the dashboard:
        if userKeywordsWeights['random']:
            themeWeights = random.random()
            charWeights = 1 - themeWeights
        else:
            themeWeights = userKeywordsWeights['themeTags']
            charWeights = userKeywordsWeights['characterTags']

        # For each playlist, compute user keyword score by theme and character tags
        user_keywords = user.get('keywords', {})
        theme_tags = {k.lower(): v for k, v in user_keywords.items() if not k.isupper()}
        character_tags = {k: v for k, v in user_keywords.items() if k.isupper()}
        # manually modify some of the user keywords to match the playlist tags
        theme_tags["god"] = theme_tags.get("god - gods", 0)
        theme_tags["visionary"] = theme_tags.get("visionary - enlightenment", 0)
        theme_tags["enlightenment"] = theme_tags.get("visionary - enlightenment", 0)
        character_tags["FEDOR MIKHAILOVICH SOFRONOV"] = character_tags.get("FYODOR MIKHAILOVICH SOFRONOV", 0)
        character_tags["SHKABARNYA OLGA SERGEEVNA"] = character_tags.get("OLGA SERGEEVNA SHKABARNYA", 0)
        character_tags["VICTORIA OLEGOVNA SKITSKAYA"] = character_tags.get("VIKTORIA OLEGOVNA SKITSKAYA", 0)

        score = {}
        for playlist in playlists:
            # Small random term so equally scored playlists vary in order.
            score[playlist['name']] = random.random() * 0.1
            for tag in playlist['tags']:
                if tag in theme_tags:
                    score[playlist['name']] += theme_tags[tag] * themeWeights
                elif tag in character_tags:
                    score[playlist['name']] += character_tags[tag] * charWeights

        # Select highest scoring playlists
        playlists = sorted(
            playlists,
            key=lambda playlist: -score[playlist['name']]
        )
        # Record the following for debug view input
        top_user_playlists = [
            (playlist['name'], score[playlist['name']])
            for playlist in playlists[:channels['userKeywords']]
        ]
        videos = playlists[:channels['userKeywords']]
        playlists = playlists[channels['userKeywords']:]

        # For each playlist, compute global keyword score
        score = {}
        for playlist in playlists:
            score[playlist['name']] = random.random()
            for tag in playlist['tags']:
                if tag in sliders:
                    score[playlist['name']] += sliders[tag]

        # Select highest scoring playlists
        playlists = sorted(
            playlists,
            key=lambda playlist: -score[playlist['name']]
        )
        # Record the following for debug view input
        top_global_playlists = [
            (playlist['name'], score[playlist['name']])
            for playlist in playlists[:channels['globalKeywords']]
        ]
        videos += playlists[:16 - channels['userKeywords']]

        # Shuffle playlists (randomize layout) and shift clips (randomize start)
        random.shuffle(videos)
        return {
            'videos': [{
                'clips': video['clips'],
                # Start at a random clip that is not marked as watched.
                'position': random.choice([
                    i for i, clip in enumerate(video["clips"]) if not clip.get("pass")
                ]),
                'name': video['name'],
                'tags': video['tags'],
            } for video in videos],
            "top_user_playlists": top_user_playlists,
            "top_global_playlists": top_global_playlists
        }

    def update_user_playlists(self, user, watch_cutoff=0.9, min_playlists=30):
        '''
        Return a copy of the playlists adjusted to the user's watch history.

        Output: playlists with updated "in" time of clips that have been
        partly watched as well as "pass" indicators for clips that have been
        watched for more than ``watch_cutoff``.

        ``watch_cutoff``: the portion of the clip duration after which the
        whole clip counts as watched. Should be in [0, 1].
        ``min_playlists``: if fewer playlists than this remain after removing
        fully watched ones, the original playlists are returned instead.

        + check (play, pause) pairs and eliminate unusual cases most likely
          due to a bug.
        + If a (play, pause) pair passes the cutoff position of the clip, add
          "pass": True to the clip.
        + Otherwise, record the pause position as "in" position of the clip
          (the original value is kept as "orig_in").
        + If clips are all marked as "pass" in a playlist, eliminate the
          playlist from the user playlists.
        '''
        playlists = copy.deepcopy(self.playlists)
        play = {}
        # = 3 hours; arbitrary max duration allowed for (pause time - play
        # time) to detect outlier/bugs.
        # The current max time of a clip duration is 10379.383333377269 from
        # "DDLaunch: Erik Verlinde, Gravity as an emergent force (1956)".
        # A user could potentially spend more than 3 hours if they keep
        # watching after the clip enters into the subsequent "scene".
        clip_max_dur = 10800

        # Events are walked in reverse list order so each pause is paired
        # with the play event that precedes it in this traversal.
        for event in user.get('events', [])[::-1]:
            if event["event"] == "play" and event["data"].get("type") == "video":
                play = event
            elif event["event"] == "pause" and play != {} and event["data"].get("type") == "video":
                if "position" not in play["data"]:
                    # NOTE(review): this stops processing ALL remaining
                    # events, not just this pair; ``continue`` may have been
                    # intended. Behavior kept as is - confirm.
                    play = {}
                    break
                if play["data"].get("playlist") == event["data"].get("playlist"):
                    start = play["data"]["position"]
                    end = event["data"]["position"]
                    i = event["data"].get("playlistPosition")
                    if (0 < end - start < clip_max_dur
                            and i is not None
                            and i == play["data"].get("playlistPosition")):
                        for playlist in playlists:
                            if playlist["name"] == event["data"].get("playlist") and i < len(playlist["clips"]):
                                clip = playlist["clips"][i]
                                # This assumes the (play, pause) fits inside
                                # the clip's (in, out) segment with +/- 15secs
                                # buffer. There were newer edits of clip
                                # positions with 12 seconds difference.
                                # instances where this might not be the case:
                                # clip in/out may be largely edited (before
                                # after edit inconsistency); skip may trigger
                                # jump to a wrong clip (bug)
                                if start >= max(clip["in"] - 15, 0) and end <= clip["out"] + 15:
                                    # The cutoff is always measured from the
                                    # clip's original "in" position.
                                    origin = clip.get("orig_in", clip["in"])
                                    cutoff_pos = (clip["out"] - origin) * watch_cutoff + origin
                                    if end >= cutoff_pos:
                                        clip["pass"] = True
                                    else:
                                        # record the original "in" position to
                                        # calculate cutoff position in the future
                                        clip.setdefault("orig_in", clip["in"])
                                        # update "in" position of the clip
                                        clip["in"] = end
                                break
                play = {}

        # Drop playlists in which every clip has been watched.
        playlists = [
            playlist for playlist in playlists
            if any(not clip.get("pass") for clip in playlist["clips"])
        ]
        # If too few playlists are left, reset to the original.
        if len(playlists) < min_playlists:
            playlists = copy.deepcopy(self.playlists)
        return playlists

    def get_next(self, user, position):
        '''
        Return a single recommended video for grid index ``position``.

        Playlists shown in the most recent grid (up to 16 cells) are
        excluded from the recommendation.
        '''
        # Update playlists to reflect user log history
        playlists = self.update_user_playlists(user)
        video_num = 16
        grid_events, _ = self._scan_events(user.get('events', []), video_num)
        vids_exclude = [
            data.get("playlist") for data in grid_events.values()
            if data.get("playlist") is not None
        ]
        rec = self.get_recommendations(playlists, user, vids_exclude)
        return rec["videos"][position]

    def update_state(self, data):
        '''
        Merge ``data`` into the state, save it and return the new state.

        Existing top-level keys are updated (dict.update), new keys are added.
        '''
        for key in data:
            if key in self.state:
                self.state[key].update(data[key])
            else:
                self.state[key] = data[key]
        self.save_state()
        return self.state

    def save_state(self):
        '''
        Write the current state to state.json below ``self.path``.
        '''
        filename = os.path.join(self.path, 'state.json')
        with open(filename, 'w') as f:
            json.dump(self.state, f, indent=4, ensure_ascii=False, sort_keys=True)

    def update(self):
        '''
        Rebuild playlists from pan.do/ra and cache them on disk.

        Writes videos.json and playlists.json below ``self.path`` and
        refreshes the global keywords afterwards.
        '''
        # Get all storylines with tags
        storylines = [{
            'id': entity['id'],
            'name': entity['name'],
            'nodename': entity['nodename'],
            'tags': [t.strip() for t in entity['tags']]
        } for entity in self.pandora.find_entities({
            'conditions': [
                {'key': 'type', 'operator': '==', 'value': 'storylines'},
            ],
            'operator': '&'
        }, ['id', 'name', 'tags', 'nodename']) if entity.get('tags', []) and entity.get('nodename')]
        # Get set of storyline names
        names = {storyline['name'] for storyline in storylines}
        # Get set of items to use in DD
        items = {item['id'] for item in self.pandora.find({
            'conditions': [
                {'key': 'list', 'operator': '==', 'value': 'dau:DD'}
            ]
        }, ['id'])}
        # Get all clips annotated with storyline references
        clips = [clip for clip in self.pandora.find_annotations({
            'conditions': [
                {'key': 'layer', 'operator': '==', 'value': 'storylines'}
            ],
            'operator': '&'
        }, ['id', 'in', 'out', 'value']) if clip['value'] in names and clip['id'].split('/')[0] in items]
        # Get ids of videos with clips
        ids = {clip['id'].split('/')[0] for clip in clips}
        # Get and cache video data; only videos not cached yet are fetched
        filename = os.path.join(self.path, 'videos.json')
        if os.path.exists(filename):
            with open(filename) as f:
                videos_ = json.load(f)
            ids_ = {video['id'] for video in videos_}
        else:
            videos_, ids_ = [], set()
        videos = sorted(videos_ + [
            self.pandora.get(video_id, ['code', 'id', 'order', 'title'])
            for video_id in ids if video_id not in ids_
        ], key=lambda video: int(video['order']))
        with open(filename, 'w') as f:
            json.dump(videos, f, indent=4, sort_keys=True)
        # Get video order
        order = {video['id']: int(video['order']) for video in videos}
        code = {video['id']: video['code'] for video in videos}
        # Sort clips by video order, then video code, then in-point
        clips = sorted(
            clips,
            key=lambda clip: (
                order[clip['id'].split('/')[0]],
                ox.sort_string(code[clip['id'].split('/')[0]]),
                clip['in']
            )
        )
        # Get and cache playlists (storylines without clips are dropped)
        self.playlists = [playlist for playlist in [{
            'id': storyline['id'],
            'name': storyline['nodename'].strip(),
            'tags': storyline['tags'],
            'clips': [{
                'item': clip['id'].split('/')[0],
                'id': '%s_%0.3f-%0.3f' % (clip['id'].split('/')[0], clip['in'], clip['out']),
                'in': clip['in'],
                'out': clip['out']
            } for clip in clips if clip['value'] == storyline['name']]
        } for storyline in storylines] if playlist['clips']]
        with open(os.path.join(self.path, 'playlists.json'), 'w') as f:
            json.dump(self.playlists, f, indent=4, sort_keys=True)
        self.update_keywords()

    def update_keywords(self):
        '''
        Sync ``state['globalKeywords']`` with the tags used by the playlists.

        Non-empty tags that are not all-uppercase get an entry with value 0
        if missing; entries whose tag is no longer used are removed. The
        state is saved only if something changed.
        '''
        changed = False
        if 'globalKeywords' not in self.state:
            self.state['globalKeywords'] = {}
            changed = True
        existing_tags = set()
        for playlist in self.playlists:
            for tag in playlist.get('tags', []):
                # Empty tags are ignored; adding them would only have them
                # removed again below and force a save on every call.
                if not tag or tag.isupper():
                    continue
                existing_tags.add(tag)
                if tag not in self.state['globalKeywords']:
                    self.state['globalKeywords'][tag] = {'value': 0}
                    changed = True
        for tag in set(self.state['globalKeywords']) - existing_tags:
            del self.state['globalKeywords'][tag]
            changed = True
        if changed:
            self.save_state()

    @run_async
    def update_async(self):
        '''
        Run ``update`` through the ``run_async`` decorator.
        '''
        self.update()


class Pandora:
    '''
    pan.do/ra API wrapper
    '''

    def __init__(self, url, username, password):
        '''
        Connect to the API at ``url`` and sign in.
        '''
        self.api = ox.API(url)
        self.api.signin(username=username, password=password)

    def find(self, query, keys):
        '''
        Return the items matching ``query`` with the given ``keys``.
        '''
        # print('FIND', query, keys)
        return self.api.find({
            'keys': keys,
            'query': query,
            'range': [0, 1000000]
        })['data']['items']

    def find_annotations(self, query, keys):
        '''
        Return the annotations matching ``query`` with the given ``keys``.
        '''
        # print('FIND ANNOTATIONS', query, keys)
        return self.api.findAnnotations({
            'keys': keys,
            'query': query,
            'range': [0, 1000000]
        })['data']['items']

    def find_entities(self, query, keys):
        '''
        Return the entities matching ``query`` with the given ``keys``.
        '''
        # print('FIND ENTITIES', query, keys)
        return self.api.findEntities({
            'keys': keys,
            'query': query,
            'range': [0, 1000000]
        })['data']['items']

    def get(self, id, keys):
        '''
        Return the ``keys`` of the item with the given ``id``.
        '''
        # print('GET', id, keys)
        return self.api.get({
            'id': id,
            'keys': keys
        })['data']


if __name__ == '__main__':
    engine = Engine('json')
    engine.update()