dd-re/recommendation_engine.py

'''
Recommendation Engine Example
1 Nov 2017, 0x2620
'''

from collections import defaultdict
import json
import logging
import os
import random
import time

import ox

from utils import run_async

logger = logging.getLogger(__name__)


class Engine:
    _pandora = None

    def __init__(self, path, **kwargs):
        self.path = path
        self.pandora_args = dict(
            url=kwargs.get('pandora', 'http://pandora.dmp/api/'),
            username=kwargs.get('username', 'dd.re'),
            password=kwargs.get('password', 'dd.re')
        )
        filename = os.path.join(self.path, 'playlists.json')
        if os.path.exists(filename):
            with open(filename) as f:
                self.playlists = json.load(f)
        else:
            self.playlists = []

        filename = os.path.join(self.path, 'state.json')
        if os.path.exists(filename):
            with open(filename) as f:
                self.state = json.load(f)
        else:
            self.state = {
                'channels': {
                    'globalKeywords': {'locked': False, 'value': 7},
                    'userKeywords': {'locked': False, 'value': 7},
                    'screenings': {'locked': True, 'value': 2}
                },
                'globalKeywords': {},
            }
        self.update_keywords()

    @property
    def pandora(self):
        while not self._pandora:
            try:
                self._pandora = Pandora(**self.pandora_args)
            except:
                logger.error('failed to connect to pandora, retry in 10 seconds')
                time.sleep(10)
        return self._pandora

    def _patch_clips(self, clips):
        inpoints = {}
        for index, clip in enumerate(clips):
            video_id = clip['id'].split('/')[0]
            inpoints[video_id] = inpoints.get(video_id, []) + [{
                'index': index,
                'position': clip['in']
            }]
        for video_id in inpoints:
            for i, inpoint in enumerate(sorted(
                inpoints[video_id], key=lambda inpoint: inpoint['position']
            )):
                if i < len(inpoints[video_id]) - 1:
                    clips[inpoint['index']]['out'] = inpoints[video_id][i + 1]['position']
                else:
                    clips[inpoint['index']]['out'] = self.pandora.get(video_id, ['duration'])['duration']
        return clips

    def get_videos(self, user):
        channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()}
        sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()}
        # For each playlist, compute user keyword score
        user_keywords = user.get('keywords', {})
        score = {}
        for playlist in self.playlists:
            score[playlist['name']] = random.random()
            for tag in [tag for tag in playlist['tags'] if tag in user_keywords]:
                score[playlist['name']] += user_keywords[tag]
        # Select highest scoring playlists
        playlists = sorted(
            self.playlists,
            key=lambda playlist: -score[playlist['name']]
        )
        videos = playlists[:channels['userKeywords']]
        playlists = playlists[channels['userKeywords']:]
        # For each playlist, compute global keyword score
        score = {}
        for playlist in self.playlists:
            score[playlist['name']] = random.random()
            for tag in [tag for tag in playlist['tags'] if tag in sliders]:
                score[playlist['name']] += sliders[tag]
        # Select highest scoring playlists
        playlists = sorted(
            self.playlists,
            key=lambda playlist: -score[playlist['name']]
        )
        videos += playlists[:channels['globalKeywords']]
        playlists = playlists[channels['globalKeywords']:]
        # Count tags for the user
        count = defaultdict(lambda: 0)
        for event in user.get('events', []):
            if event.get('data', {}).get('product'):
                count[event['data']['product']] += 1
        # For each tag in playlist, increment score by count
        for playlist in playlists:
            score[playlist['name']] = random.random()
            for tag in [tag for tag in playlist['tags'] if tag not in sliders]:
                score[playlist['name']] += count[tag]
        # Select highest scoring playlists
        videos += sorted(
            playlists,
            key=lambda playlist: -score[playlist['name']]
        )[:16 - channels['userKeywords'] - channels['globalKeywords']]
        # Shuffle playlists (randomize layout) and shift clips (randomize start)
        random.shuffle(videos)
        return [{
            'clips': video['clips'],
            'position': random.randrange(len(video['clips'])),
            'name': video['name'],
            'tags': video['tags'],
        } for video in videos]

    def update_state(self, data):
        for key in data:
            if key in self.state:
                self.state[key].update(data[key])
            else:
                self.state[key] = data[key]
        self.save_state()
        return self.state

    def save_state(self):
        filename = os.path.join(self.path, 'state.json')
        with open(filename, 'w') as f:
            json.dump(self.state, f, indent=4, ensure_ascii=False, sort_keys=True)

    def update(self):
        # Get all storylines with tags
        storylines = [{
            'id': entity['id'],
            'name': entity['name'],
            'nodename': entity['nodename'],
            'tags': entity['tags']
        } for entity in self.pandora.find_entities({
            'conditions': [
                {'key': 'type', 'operator': '==', 'value': 'storylines'},
            ],
            'operator': '&'
        }, ['id', 'name', 'tags', 'nodename']) if entity.get('tags', []) and entity.get('nodename')]
        # Get list of storyline names
        names = list(set([storyline['name'] for storyline in storylines]))
        # Get all clips annotated with storyline references
        clips = [clip for clip in self.pandora.find_annotations({
            'conditions': [
                {'key': 'layer', 'operator': '==', 'value': 'storylines'}
            ],
            'operator': '&'
        }, ['id', 'in', 'out', 'value']) if clip['value'] in names]
        # Get list of ids for videos with clips
        ids = list(set([clip['id'].split('/')[0] for clip in clips]))
        # Get and cache video data
        filename = os.path.join(self.path, 'videos.json')
        if os.path.exists(filename):
            with open(filename) as f:
                videos_ = json.loads(f.read())
                ids_ = [video['id'] for video in videos_]
        else:
            videos_, ids_ = [], []
        videos = sorted(videos_ + [
            self.pandora.get(id, ['code', 'id', 'order', 'title'])
            for id in ids if not id in ids_
        ], key=lambda video: int(video['order']))
        with open(filename, 'w') as f:
            f.write(json.dumps(videos, indent=4, sort_keys=True))
        # Get video order
        order = {video['id']: int(video['order']) for video in videos}
        # Sort clips
        clips = sorted(
            clips,
            key=lambda clip: (order[clip['id'].split('/')[0]], clip['in'])
        )
        # Get and cache playlists
        self.playlists = [playlist for playlist in [{
            'id': storyline['id'],
            'name': storyline['nodename'].strip(),
            'tags': storyline['tags'],
            'clips': [{
                'item': clip['id'].split('/')[0],
                'id': clip['id'],
                'in': clip['in'],
                'out': clip['out']
            } for clip in clips if clip['value'] == storyline['name']]
        } for storyline in storylines] if playlist['clips']]
        with open(os.path.join(self.path, 'playlists.json'), 'w') as f:
            f.write(json.dumps(self.playlists, indent=4, sort_keys=True))
        self.update_keywords()

    def update_keywords(self):
        changed = False
        if 'globalKeywords' not in self.state:
            self.state['globalKeywords'] = {}
            changed = True
        existing_tags = set()
        for playlist in self.playlists:
            for tag in playlist.get('tags', []):
                if not tag.isupper() and tag:
                    existing_tags.add(tag)
                if not tag.isupper() and tag not in self.state['globalKeywords']:
                    self.state['globalKeywords'][tag] = {'value': 0}
                    changed = True
        for tag in set(self.state['globalKeywords']) - existing_tags:
            del self.state['globalKeywords'][tag]
            changed = True
        if changed:
            self.save_state()

    @run_async
    def update_async(self):
        '''
        Call update() through the run_async decorator imported from utils.

        NOTE(review): presumably run_async runs the call in the background
        without blocking the caller - confirm against utils.run_async.
        '''
        self.update()


class Pandora:
    '''
    pan.do/ra API wrapper

    Signs in once on construction and exposes the three API calls used by
    the engine.
    '''

    def __init__(self, url, username, password):
        '''Create the ox API client for ``url`` and sign in.'''
        self.api = ox.API(url)
        self.api.signin(username=username, password=password)

    @staticmethod
    def _find_request(query, keys):
        # Request body shared by both find calls; always asks for the
        # range [0, 1000000]
        return {
            'keys': keys,
            'query': query,
            'range': [0, 1000000]
        }

    def find_annotations(self, query, keys):
        '''Return the items of a findAnnotations call for ``query``.'''
        response = self.api.findAnnotations(self._find_request(query, keys))
        return response['data']['items']

    def find_entities(self, query, keys):
        '''Return the items of a findEntities call for ``query``.'''
        response = self.api.findEntities(self._find_request(query, keys))
        return response['data']['items']

    def get(self, id, keys):
        '''Return the data of a get call for the item ``id``.'''
        response = self.api.get({
            'id': id,
            'keys': keys
        })
        return response['data']


if __name__ == '__main__':
    # Run as a script: create an engine that keeps its files in the
    # relative directory 'json' and rebuild its playlists once.
    engine = Engine('json')
    engine.update()
rename 2017-11-01 16:38:58 +00:00			`'''`
formatting 2017-11-01 22:56:33 +00:00			`Recommendation Engine Example`
			`1 Nov 2017, 0x2620`
rename 2017-11-01 16:38:58 +00:00			`'''`

use activities for screenings 2018-01-24 15:48:50 +00:00			`from collections import defaultdict`
rename 2017-11-01 16:38:58 +00:00			`import json`
only connect to pandora as needed 2018-02-05 10:47:38 +00:00			`import logging`
rename 2017-11-01 16:38:58 +00:00			`import os`
			`import random`
only connect to pandora as needed 2018-02-05 10:47:38 +00:00			`import time`
rename 2017-11-01 16:38:58 +00:00
			`import ox`

run every 15m 2017-11-02 08:40:02 +00:00			`from utils import run_async`

only connect to pandora as needed 2018-02-05 10:47:38 +00:00			`logger = logging.getLogger(__name__)`


rename 2017-11-01 16:38:58 +00:00			`class Engine:`
only connect to pandora as needed 2018-02-05 10:47:38 +00:00			`_pandora = None`
rename 2017-11-01 16:38:58 +00:00
settings 2018-01-18 20:32:14 +00:00			`def __init__(self, path, **kwargs):`
rename 2017-11-01 16:38:58 +00:00			`self.path = path`
only connect to pandora as needed 2018-02-05 10:47:38 +00:00			`self.pandora_args = dict(`
settings 2018-01-18 20:32:14 +00:00			`url=kwargs.get('pandora', 'http://pandora.dmp/api/'),`
			`username=kwargs.get('username', 'dd.re'),`
			`password=kwargs.get('password', 'dd.re')`
rename 2017-11-01 16:38:58 +00:00			`)`
recommendation state 2018-02-05 14:15:39 +00:00			`filename = os.path.join(self.path, 'playlists.json')`
rename 2017-11-01 16:38:58 +00:00			`if os.path.exists(filename):`
			`with open(filename) as f:`
recommendation state 2018-02-05 14:15:39 +00:00			`self.playlists = json.load(f)`
rename 2017-11-01 16:38:58 +00:00			`else:`
			`self.playlists = []`

recommendation state 2018-02-05 14:15:39 +00:00			`filename = os.path.join(self.path, 'state.json')`
			`if os.path.exists(filename):`
			`with open(filename) as f:`
			`self.state = json.load(f)`
			`else:`
			`self.state = {`
			`'channels': {`
adding user keywords... 2018-02-15 16:06:18 +00:00			`'globalKeywords': {'locked': False, 'value': 7},`
reorder 2018-02-15 16:09:57 +00:00			`'userKeywords': {'locked': False, 'value': 7},`
adding user keywords... 2018-02-15 16:06:18 +00:00			`'screenings': {'locked': True, 'value': 2}`
recommendation state 2018-02-05 14:15:39 +00:00			`},`
defaults 2018-04-21 15:36:23 +00:00			`'globalKeywords': {},`
recommendation state 2018-02-05 14:15:39 +00:00			`}`
			`self.update_keywords()`

only connect to pandora as needed 2018-02-05 10:47:38 +00:00			`@property`
			`def pandora(self):`
			`while not self._pandora:`
			`try:`
			`self._pandora = Pandora(**self.pandora_args)`
			`except:`
			`logger.error('failed to connect to pandora, retry in 10 seconds')`
			`time.sleep(10)`
			`return self._pandora`

patch clip out points (next in point or end of scene) 2018-01-26 10:48:40 +00:00			`def _patch_clips(self, clips):`
			`inpoints = {}`
			`for index, clip in enumerate(clips):`
			`video_id = clip['id'].split('/')[0]`
			`inpoints[video_id] = inpoints.get(video_id, []) + [{`
			`'index': index,`
			`'position': clip['in']`
			`}]`
			`for video_id in inpoints:`
fix clip patch 2018-01-26 11:01:36 +00:00			`for i, inpoint in enumerate(sorted(`
patch clip out points (next in point or end of scene) 2018-01-26 10:48:40 +00:00			`inpoints[video_id], key=lambda inpoint: inpoint['position']`
fix clip patch 2018-01-26 11:01:36 +00:00			`)):`
patch clip out points (next in point or end of scene) 2018-01-26 10:48:40 +00:00			`if i < len(inpoints[video_id]) - 1:`
fix clip patch 2018-01-26 11:01:36 +00:00			`clips[inpoint['index']]['out'] = inpoints[video_id][i + 1]['position']`
patch clip out points (next in point or end of scene) 2018-01-26 10:48:40 +00:00			`else:`
fix clip patch 2018-01-26 11:01:36 +00:00			`clips[inpoint['index']]['out'] = self.pandora.get(video_id, ['duration'])['duration']`
patch clip out points (next in point or end of scene) 2018-01-26 10:48:40 +00:00			`return clips`

rename 2017-11-01 16:38:58 +00:00			`def get_videos(self, user):`
use channel and keyword values 2018-02-05 14:25:46 +00:00			`channels = {k: v.get('value', 0) for k, v in self.state['channels'].items()}`
adding user keywords... 2018-02-15 16:06:18 +00:00			`sliders = {k: v.get('value', 0) for k, v in self.state['globalKeywords'].items()}`
			`# For each playlist, compute user keyword score`
			`user_keywords = user.get('keywords', {})`
			`score = {}`
			`for playlist in self.playlists:`
			`score[playlist['name']] = random.random()`
			`for tag in [tag for tag in playlist['tags'] if tag in user_keywords]:`
			`score[playlist['name']] += user_keywords[tag]`
			`# Select highest scoring playlists`
			`playlists = sorted(`
			`self.playlists,`
			`key=lambda playlist: -score[playlist['name']]`
			`)`
			`videos = playlists[:channels['userKeywords']]`
			`playlists = playlists[channels['userKeywords']:]`
			`# For each playlist, compute global keyword score`
update recommendation engine 2018-01-22 11:55:02 +00:00			`score = {}`
			`for playlist in self.playlists:`
			`score[playlist['name']] = random.random()`
			`for tag in [tag for tag in playlist['tags'] if tag in sliders]:`
			`score[playlist['name']] += sliders[tag]`
			`# Select highest scoring playlists`
			`playlists = sorted(`
			`self.playlists,`
			`key=lambda playlist: -score[playlist['name']]`
			`)`
typo 2018-02-15 16:26:36 +00:00			`videos += playlists[:channels['globalKeywords']]`
adding user keywords... 2018-02-15 16:06:18 +00:00			`playlists = playlists[channels['globalKeywords']:]`
implement get_videos 2017-11-01 18:14:15 +00:00			`# Count tags for the user`
use activities for screenings 2018-01-24 15:48:50 +00:00			`count = defaultdict(lambda: 0)`
use events not activities 2018-01-25 12:49:12 +00:00			`for event in user.get('events', []):`
only count events with product 2018-02-04 15:05:55 +00:00			`if event.get('data', {}).get('product'):`
			`count[event['data']['product']] += 1`
implement get_videos 2017-11-01 18:14:15 +00:00			`# For each tag in playlist, increment score by count`
update recommendation engine 2018-01-22 11:55:02 +00:00			`for playlist in playlists:`
implement get_videos 2017-11-01 18:14:15 +00:00			`score[playlist['name']] = random.random()`
update recommendation engine 2018-01-22 11:55:02 +00:00			`for tag in [tag for tag in playlist['tags'] if tag not in sliders]:`
use activities for screenings 2018-01-24 15:48:50 +00:00			`score[playlist['name']] += count[tag]`
update recommendation engine 2018-01-22 11:55:02 +00:00			`# Select highest scoring playlists`
			`videos += sorted(`
			`playlists,`
implement get_videos 2017-11-01 18:14:15 +00:00			`key=lambda playlist: -score[playlist['name']]`
adding user keywords... 2018-02-15 16:06:18 +00:00			`)[:16 - channels['userKeywords'] - channels['globalKeywords']]`
implement get_videos 2017-11-01 18:14:15 +00:00			`# Shuffle playlists (randomize layout) and shift clips (randomize start)`
update recommendation engine 2018-01-22 11:55:02 +00:00			`random.shuffle(videos)`
implement get_videos 2017-11-01 18:14:15 +00:00			`return [{`
return position 2018-01-27 14:13:28 +00:00			`'clips': video['clips'],`
			`'position': random.randrange(len(video['clips'])),`
remove no longer existing tags 2018-04-21 16:13:02 +00:00			`'name': video['name'],`
			`'tags': video['tags'],`
update recommendation engine 2018-01-22 11:55:02 +00:00			`} for video in videos]`
rename 2017-11-01 16:38:58 +00:00
recommendation state 2018-02-05 14:15:39 +00:00			`def update_state(self, data):`
			`for key in data:`
			`if key in self.state:`
			`self.state[key].update(data[key])`
			`else:`
			`self.state[key] = data[key]`
			`self.save_state()`
			`return self.state`

			`def save_state(self):`
			`filename = os.path.join(self.path, 'state.json')`
			`with open(filename, 'w') as f:`
			`json.dump(self.state, f, indent=4, ensure_ascii=False, sort_keys=True)`

rename 2017-11-01 16:38:58 +00:00			`def update(self):`
			`# Get all storylines with tags`
			`storylines = [{`
use nodename as name 2018-01-25 20:54:38 +00:00			`'id': entity['id'],`
rename 2017-11-01 16:38:58 +00:00			`'name': entity['name'],`
use nodename as name 2018-01-25 20:54:38 +00:00			`'nodename': entity['nodename'],`
rename 2017-11-01 16:38:58 +00:00			`'tags': entity['tags']`
			`} for entity in self.pandora.find_entities({`
			`'conditions': [`
			`{'key': 'type', 'operator': '==', 'value': 'storylines'},`
			`],`
			`'operator': '&'`
ignore storylines without nodename 2018-02-14 16:27:39 +00:00			`}, ['id', 'name', 'tags', 'nodename']) if entity.get('tags', []) and entity.get('nodename')]`
rename 2017-11-01 16:38:58 +00:00			`# Get list of storyline names`
			`names = list(set([storyline['name'] for storyline in storylines]))`
			`# Get all clips annotated with storyline references`
			`clips = [clip for clip in self.pandora.find_annotations({`
			`'conditions': [`
			`{'key': 'layer', 'operator': '==', 'value': 'storylines'}`
			`],`
			`'operator': '&'`
			`}, ['id', 'in', 'out', 'value']) if clip['value'] in names]`
			`# Get list of ids for videos with clips`
			`ids = list(set([clip['id'].split('/')[0] for clip in clips]))`
implement get_videos 2017-11-01 18:14:15 +00:00			`# Get and cache video data`
rename 2017-11-01 16:38:58 +00:00			`filename = os.path.join(self.path, 'videos.json')`
			`if os.path.exists(filename):`
			`with open(filename) as f:`
			`videos_ = json.loads(f.read())`
			`ids_ = [video['id'] for video in videos_]`
			`else:`
			`videos_, ids_ = [], []`
			`videos = sorted(videos_ + [`
			`self.pandora.get(id, ['code', 'id', 'order', 'title'])`
			`for id in ids if not id in ids_`
cast order to int 2018-04-21 16:28:08 +00:00			`], key=lambda video: int(video['order']))`
rename 2017-11-01 16:38:58 +00:00			`with open(filename, 'w') as f:`
			`f.write(json.dumps(videos, indent=4, sort_keys=True))`
implement get_videos 2017-11-01 18:14:15 +00:00			`# Get video order`
cast order to int 2018-04-21 16:28:08 +00:00			`order = {video['id']: int(video['order']) for video in videos}`
rename 2017-11-01 16:38:58 +00:00			`# Sort clips`
			`clips = sorted(`
			`clips,`
disable _patch_clips 2018-01-27 18:40:02 +00:00			`key=lambda clip: (order[clip['id'].split('/')[0]], clip['in'])`
rename 2017-11-01 16:38:58 +00:00			`)`
implement get_videos 2017-11-01 18:14:15 +00:00			`# Get and cache playlists`
rename 2017-11-01 16:38:58 +00:00			`self.playlists = [playlist for playlist in [{`
use nodename as name 2018-01-25 20:54:38 +00:00			`'id': storyline['id'],`
strip whitespace 2018-02-04 18:28:35 +00:00			`'name': storyline['nodename'].strip(),`
rename 2017-11-01 16:38:58 +00:00			`'tags': storyline['tags'],`
disable _patch_clips 2018-01-27 18:40:02 +00:00			`'clips': [{`
			`'item': clip['id'].split('/')[0],`
implement get_videos 2017-11-01 18:14:15 +00:00			`'id': clip['id'],`
			`'in': clip['in'],`
			`'out': clip['out']`
disable _patch_clips 2018-01-27 18:40:02 +00:00			`} for clip in clips if clip['value'] == storyline['name']]`
rename 2017-11-01 16:38:58 +00:00			`} for storyline in storylines] if playlist['clips']]`
			`with open(os.path.join(self.path, 'playlists.json'), 'w') as f:`
			`f.write(json.dumps(self.playlists, indent=4, sort_keys=True))`
recommendation state 2018-02-05 14:15:39 +00:00			`self.update_keywords()`

			`def update_keywords(self):`
			`changed = False`
adding user keywords... 2018-02-15 16:06:18 +00:00			`if 'globalKeywords' not in self.state:`
			`self.state['globalKeywords'] = {}`
recommendation state 2018-02-05 14:15:39 +00:00			`changed = True`
remove no longer existing tags 2018-04-21 16:13:02 +00:00			`existing_tags = set()`
recommendation state 2018-02-05 14:15:39 +00:00			`for playlist in self.playlists:`
			`for tag in playlist.get('tags', []):`
remove no longer existing tags 2018-04-21 16:13:02 +00:00			`if not tag.isupper() and tag:`
			`existing_tags.add(tag)`
adding user keywords... 2018-02-15 16:06:18 +00:00			`if not tag.isupper() and tag not in self.state['globalKeywords']:`
			`self.state['globalKeywords'][tag] = {'value': 0}`
recommendation state 2018-02-05 14:15:39 +00:00			`changed = True`
remove no longer existing tags 2018-04-21 16:13:02 +00:00			`for tag in set(self.state['globalKeywords']) - existing_tags:`
			`del self.state['globalKeywords'][tag]`
			`changed = True`
recommendation state 2018-02-05 14:15:39 +00:00			`if changed:`
			`self.save_state()`

run every 15m 2017-11-02 08:40:02 +00:00			`@run_async`
			`def update_async(self):`
			`self.update()`

formatting 2017-11-01 22:56:33 +00:00
			`class Pandora:`

update comments 2017-11-02 11:12:56 +00:00			`# pan.do/ra API wrapper`

formatting 2017-11-01 22:56:33 +00:00			`def __init__(self, url, username, password):`
			`self.api = ox.API(url)`
			`self.api.signin(username=username, password=password)`

			`def find_annotations(self, query, keys):`
			`# print('FIND ANNOTATIONS', query, keys)`
			`return self.api.findAnnotations({`
			`'keys': keys,`
			`'query': query,`
			`'range': [0, 1000000]`
			`})['data']['items']`

			`def find_entities(self, query, keys):`
			`# print('FIND ENTITIES', query, keys)`
			`return self.api.findEntities({`
			`'keys': keys,`
			`'query': query,`
			`'range': [0, 1000000]`
			`})['data']['items']`

			`def get(self, id, keys):`
			`# print('GET', id, keys)`
			`return self.api.get({`
			`'id': id,`
			`'keys': keys`
			`})['data']`


rename 2017-11-01 16:38:58 +00:00			`if __name__ == '__main__':`
			`engine = Engine('json')`
			`engine.update()`