'''
Recommendation Engine Example
1 Nov 2017, 0x2620
'''

from collections import defaultdict
import json
import logging
import os
import random
import time

import ox

from utils import run_async

logger = logging.getLogger(__name__)

verbose = True


class Engine:
    '''
    Playlist recommendation engine backed by a pan.do/ra instance.

    Playlists and slider state are cached as JSON files (playlists.json,
    state.json, videos.json) inside the directory given as ``path``.
    '''

    # Lazily created Pandora API client, see the ``pandora`` property.
    _pandora = None

    def __init__(self, path, **kwargs):
        '''
        Load cached playlists and state from ``path``.

        Optional keyword arguments ``pandora`` (API url), ``username`` and
        ``password`` override the default connection settings.
        '''
        self.path = path
        self.pandora_args = dict(
            url=kwargs.get('pandora', 'http://pandora.dmp/api/'),
            username=kwargs.get('username', 'dd.re'),
            password=kwargs.get('password', 'dd.re')
        )
        filename = os.path.join(self.path, 'playlists.json')
        if os.path.exists(filename):
            with open(filename) as f:
                self.playlists = json.load(f)
        else:
            self.playlists = []
        filename = os.path.join(self.path, 'state.json')
        if os.path.exists(filename):
            with open(filename) as f:
                self.state = json.load(f)
        else:
            self.state = {
                'channels': {
                    'globalKeywords': {'locked': False, 'value': 7},
                    'userKeywords': {'locked': False, 'value': 7},
                    'screenings': {'locked': True, 'value': 2}
                },
                'globalKeywords': {},
                'gridChange': {
                    'nextClip': {'locked': False, 'value': 4},
                    'nextPlaylist': {'locked': False, 'value': 4},
                    'staySame': {'locked': False, 'value': 8}
                }
            }
        self.update_keywords()

    @property
    def pandora(self):
        '''
        Return the Pandora client, connecting on first use.

        Blocks and retries every 10 seconds until the connection succeeds.
        '''
        while not self._pandora:
            try:
                self._pandora = Pandora(**self.pandora_args)
            except Exception:
                # Only retry on real errors; a bare ``except`` would also
                # swallow KeyboardInterrupt/SystemExit and make this loop
                # impossible to interrupt.
                logger.error('failed to connect to pandora, retry in 10 seconds')
                time.sleep(10)
        return self._pandora

    def _patch_clips(self, clips):
        '''
        Set each clip's 'out' to the next in-point within the same video.

        The last clip of every video gets the video duration (fetched from
        pandora) as its 'out'. ``clips`` is modified in place and returned.
        '''
        inpoints = defaultdict(list)
        for index, clip in enumerate(clips):
            video_id = clip['id'].split('/')[0]
            inpoints[video_id].append({
                'index': index,
                'position': clip['in']
            })
        for video_id, points in inpoints.items():
            ordered = sorted(points, key=lambda inpoint: inpoint['position'])
            for i, inpoint in enumerate(ordered):
                if i < len(ordered) - 1:
                    # The successor must come from the *sorted* list, otherwise
                    # unsorted input yields an unrelated in-point.
                    out = ordered[i + 1]['position']
                else:
                    out = self.pandora.get(video_id, ['duration'])['duration']
                clips[inpoint['index']]['out'] = out
        return clips

    @staticmethod
    def _latest_grid_events(user, limit):
        '''
        Collect the first "grid" event data found per grid index.

        Scans ``user['events']`` in order and stops once ``limit`` distinct
        indexes were seen. Events without data or without an index are
        ignored. Returns the event data dicts sorted by 'index'.
        NOTE(review): taking the first event per index presumably relies on
        events being ordered newest first - confirm against the caller.
        '''
        grid_events = {}
        for event in user.get('events', []):
            if event.get('event') == 'grid':
                data = event.get('data') or {}
                index = data.get('index')
                if index is not None and index not in grid_events:
                    grid_events[index] = data
            if len(grid_events) == limit:
                break
        return sorted(grid_events.values(), key=lambda data: data['index'])

    def get_videos(self, user):
        '''
        Return the list of videos for the next grid view of ``user``.

        The 'gridChange' state decides how many grid cells switch to another
        clip of their current playlist (nextClip), how many switch to a newly
        recommended playlist (nextPlaylist) and how many stay the same
        (staySame, returned as an empty dict). If there is no previous grid
        event for every cell, a fresh set of recommendations is returned.
        An IndexError propagates if get_recommendations returns fewer videos
        than a nextPlaylist cell index requires.
        '''
        grid_change = {k: v.get('value', 0) for k, v in self.state['gridChange'].items()}
        next_clip = grid_change.get('nextClip', 0)
        next_playlist = grid_change.get('nextPlaylist', 0)
        stay_same = grid_change.get('staySame', 0)
        # This assumes nextClip + nextPlaylist + staySame equals the total
        # number of videos in the grid view (16). Make sure a sanity check
        # exists in the front-end (error if it does not add up to 16).
        video_num = next_clip + next_playlist + stay_same

        # Check if there were grid events for all indexes.
        prev_grid_list = self._latest_grid_events(user, video_num)
        # If there were no grid events for all, initialize all grids.
        if len(prev_grid_list) < video_num:
            return self.get_recommendations(user)

        # TODO: once "index" is added to play event data, exclude the played
        # index from the random shuffle and always add it to the staySame pool.
        video_indx = list(range(video_num))
        random.shuffle(video_indx)
        rec_list = []
        # For now, randomly choose a clip in the same playlist instead of
        # switching to the next clip. This will change once the clip position
        # is tracked in the user data.
        for i in video_indx[:next_clip]:
            playlist_name = prev_grid_list[i].get('playlist')
            if playlist_name is None:
                # Temporary handling of grid events without "playlist" data:
                # move the whole nextClip pool into the nextPlaylist pool.
                # Entries collected so far are dropped, since their indexes
                # are now served by the nextPlaylist pool below.
                next_playlist += next_clip
                next_clip = 0
                rec_list = []
                break
            playlist = next(
                (p for p in self.playlists if p.get('name') == playlist_name),
                None
            )
            if playlist is None:
                # Unknown playlist: keep the cell unchanged rather than
                # dropping it, which would misalign the returned grid.
                rec_list.append((i, {}))
                continue
            # TODO: discuss whether this should switch to a new playlist if
            # the end of the playlist clip sequence is reached already.
            rec_list.append((i, {
                'clips': playlist['clips'],
                'position': random.randrange(len(playlist['clips'])),
                'name': playlist['name'],
                'tags': playlist['tags'],
            }))

        # Randomly select playlists for the nextPlaylist pool.
        # TODO: exclude the playlists of the current grid once "playlist" is
        # recorded for grid events:
        # vids_exclude = [e.get("playlist") for e in prev_grid_list]
        vids_exclude = []
        video = self.get_recommendations(user, vids_exclude)
        rec_list += [
            (i, video[i])
            for i in video_indx[next_clip:next_clip + next_playlist]
        ]
        # staySame pool
        rec_list += [(i, {}) for i in video_indx[next_clip + next_playlist:]]
        rec_list.sort(key=lambda entry: entry[0])
        return [entry[1] for entry in rec_list]

    # NOTE for future improvement: vids_exclude element unit could be clip or
    # in/out time pairs, rather than playlist. The same playlist could be
    # played in the grid view as long as these are different clips or
    # separate times.
    def get_recommendations(self, user, vids_exclude=None):
        '''
        Return up to 16 recommended playlists for ``user`` in random order.

        Playlists whose name is in ``vids_exclude`` are skipped. The remaining
        playlists are picked in three rounds: by user keyword score, by global
        keyword (slider) score, and by how often the user has seen a product
        that appears in the playlist tags. Every score gets a random
        component. Each result has the keys 'clips', 'position' (random start
        clip), 'name' and 'tags'.
        '''
        channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()}
        sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()}

        # Exclude playlists from the most recent grid. Build a new list so
        # that the cached self.playlists is never modified.
        excluded = vids_exclude or ()
        playlists = [
            playlist for playlist in self.playlists
            if playlist['name'] not in excluded
        ]

        # For each playlist, compute user keyword score
        user_keywords = user.get('keywords', {})
        score = {}
        for playlist in playlists:
            score[playlist['name']] = random.random()
            for tag in playlist['tags']:
                if tag in user_keywords:
                    score[playlist['name']] += user_keywords[tag]
        # Select highest scoring playlists
        playlists = sorted(
            playlists,
            key=lambda playlist: -score[playlist['name']]
        )
        videos = playlists[:channels['userKeywords']]
        playlists = playlists[channels['userKeywords']:]

        # For each playlist, compute global keyword score
        score = {}
        for playlist in playlists:
            score[playlist['name']] = random.random()
            for tag in playlist['tags']:
                if tag in sliders:
                    score[playlist['name']] += sliders[tag]
        # Select highest scoring playlists
        playlists = sorted(
            playlists,
            key=lambda playlist: -score[playlist['name']]
        )
        videos += playlists[:channels['globalKeywords']]
        playlists = playlists[channels['globalKeywords']:]

        # Count products the user has seen
        count = defaultdict(int)
        for event in user.get('events', []):
            product = (event.get('data') or {}).get('product')
            if product:
                count[product] += 1
        # For each product in playlist tags, increment score by count
        for playlist in playlists:
            score[playlist['name']] = random.random()
            for tag in set(playlist['tags']) & set(count):
                score[playlist['name']] += count[tag]
        # Select highest scoring playlists
        videos += sorted(
            playlists,
            key=lambda playlist: -score[playlist['name']]
        )[:16 - channels['userKeywords'] - channels['globalKeywords']]

        # Shuffle playlists (randomize layout) and shift clips (randomize start)
        random.shuffle(videos)
        return [{
            'clips': video['clips'],
            'position': random.randrange(len(video['clips'])),
            'name': video['name'],
            'tags': video['tags'],
        } for video in videos]

    def get_next(self, user, position):
        '''
        Return a new recommendation for grid cell ``position``.

        Playlists recorded in the most recent grid events (up to 16 cells)
        are excluded from the recommendations.
        '''
        video_num = 16
        prev_grid_list = self._latest_grid_events(user, video_num)
        vids_exclude = [e.get("playlist") for e in prev_grid_list]
        video = self.get_recommendations(user, vids_exclude)[position]
        return video

    def update_state(self, data):
        '''
        Merge ``data`` into the state, save it and return the new state.

        Existing top-level keys are updated in place, new keys are added.
        '''
        for key in data:
            if key in self.state:
                self.state[key].update(data[key])
            else:
                self.state[key] = data[key]
        self.save_state()
        return self.state

    def save_state(self):
        '''Write the current state to state.json.'''
        filename = os.path.join(self.path, 'state.json')
        with open(filename, 'w') as f:
            json.dump(self.state, f, indent=4, ensure_ascii=False, sort_keys=True)

    def update(self):
        '''
        Rebuild the playlists from pandora and cache them on disk.

        Fetches storylines, items and storyline annotations, caches video
        metadata in videos.json, writes playlists.json and finally syncs the
        global keywords with the new playlist tags.
        '''
        # Get all storylines with tags
        storylines = [{
            'id': entity['id'],
            'name': entity['name'],
            'nodename': entity['nodename'],
            'tags': [t.strip() for t in entity['tags']]
        } for entity in self.pandora.find_entities({
            'conditions': [
                {'key': 'type', 'operator': '==', 'value': 'storylines'},
            ],
            'operator': '&'
        }, ['id', 'name', 'tags', 'nodename'])
            if entity.get('tags', []) and entity.get('nodename')]
        # Get set of storyline names
        names = {storyline['name'] for storyline in storylines}
        # Get set of items to use in DD
        items = {item['id'] for item in self.pandora.find({
            'conditions': [
                {'key': 'list', 'operator': '==', 'value': 'dau:DD'}
            ]
        }, ['id'])}
        # Get all clips annotated with storyline references
        clips = [clip for clip in self.pandora.find_annotations({
            'conditions': [
                {'key': 'layer', 'operator': '==', 'value': 'storylines'}
            ],
            'operator': '&'
        }, ['id', 'in', 'out', 'value'])
            if clip['value'] in names and clip['id'].split('/')[0] in items]
        # Get ids of videos with clips
        ids = sorted({clip['id'].split('/')[0] for clip in clips})
        # Get and cache video data
        filename = os.path.join(self.path, 'videos.json')
        if os.path.exists(filename):
            with open(filename) as f:
                cached_videos = json.load(f)
        else:
            cached_videos = []
        cached_ids = {video['id'] for video in cached_videos}
        # Only fetch videos that are not in the cache yet
        videos = sorted(cached_videos + [
            self.pandora.get(video_id, ['code', 'id', 'order', 'title'])
            for video_id in ids if video_id not in cached_ids
        ], key=lambda video: int(video['order']))
        with open(filename, 'w') as f:
            json.dump(videos, f, indent=4, sort_keys=True)
        # Get video order
        order = {video['id']: int(video['order']) for video in videos}
        # Sort clips
        clips = sorted(
            clips,
            key=lambda clip: (order[clip['id'].split('/')[0]], clip['in'])
        )
        # Get and cache playlists, skipping storylines without clips
        self.playlists = [playlist for playlist in [{
            'id': storyline['id'],
            'name': storyline['nodename'].strip(),
            'tags': storyline['tags'],
            'clips': [{
                'item': clip['id'].split('/')[0],
                'id': clip['id'],
                'in': clip['in'],
                'out': clip['out']
            } for clip in clips if clip['value'] == storyline['name']]
        } for storyline in storylines] if playlist['clips']]
        with open(os.path.join(self.path, 'playlists.json'), 'w') as f:
            json.dump(self.playlists, f, indent=4, sort_keys=True)
        self.update_keywords()

    def update_keywords(self):
        '''
        Sync state['globalKeywords'] with the tags of the current playlists.

        Every non-empty tag that is not all upper case gets a slider entry
        (initial value 0); entries without a matching tag are removed. The
        state is saved only if something changed.
        NOTE(review): upper-case tags are presumably product names - confirm.
        '''
        changed = False
        if 'globalKeywords' not in self.state:
            self.state['globalKeywords'] = {}
            changed = True
        keywords = self.state['globalKeywords']
        existing_tags = set()
        for playlist in self.playlists:
            for tag in playlist.get('tags', []):
                # Skip empty tags up front; otherwise they would be added
                # here and removed again below, forcing a save on every call.
                if not tag or tag.isupper():
                    continue
                existing_tags.add(tag)
                if tag not in keywords:
                    keywords[tag] = {'value': 0}
                    changed = True
        for tag in set(keywords) - existing_tags:
            del keywords[tag]
            changed = True
        if changed:
            self.save_state()

    @run_async
    def update_async(self):
        '''Run update() through the run_async decorator.'''
        self.update()


class Pandora:
    '''pan.do/ra API wrapper'''

    def __init__(self, url, username, password):
        '''Connect to the API at ``url`` and sign in.'''
        self.api = ox.API(url)
        self.api.signin(username=username, password=password)

    def find(self, query, keys):
        '''Return all items matching ``query``, limited to ``keys``.'''
        return self.api.find({
            'keys': keys,
            'query': query,
            'range': [0, 1000000]
        })['data']['items']

    def find_annotations(self, query, keys):
        '''Return all annotations matching ``query``, limited to ``keys``.'''
        return self.api.findAnnotations({
            'keys': keys,
            'query': query,
            'range': [0, 1000000]
        })['data']['items']

    def find_entities(self, query, keys):
        '''Return all entities matching ``query``, limited to ``keys``.'''
        return self.api.findEntities({
            'keys': keys,
            'query': query,
            'range': [0, 1000000]
        })['data']['items']

    def get(self, id, keys):
        '''Return the requested ``keys`` of the item with the given ``id``.'''
        return self.api.get({
            'id': id,
            'keys': keys
        })['data']


if __name__ == '__main__':
    engine = Engine('json')
    engine.update()