dd-re/recommendation_engine.py

'''
Recommendation Engine Example
1 Nov 2017, 0x2620
'''
from collections import defaultdict
import json
import os
import random
import ox
from utils import run_async
class Engine:
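    # Recommends video channels for a user: playlists are built from
    # pan.do/ra storylines by update() and scored per user in get_videos().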
def __init__(self, path, **kwargs):
self.path = path
self.pandora = Pandora(
url=kwargs.get('pandora', 'http://pandora.dmp/api/'),
username=kwargs.get('username', 'dd.re'),
password=kwargs.get('password', 'dd.re')
)
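        # Load the cached playlists if present; update() rewrites this file.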
filename = os.path.join(self.path, 'playlists.json')
if os.path.exists(filename):
with open(filename) as f:
self.playlists = json.loads(f.read())
else:
self.playlists = []
    def _shift_clips(self, clips):
        # Rotate the clip list by a random offset so playback starts at a random clip
        index = random.randrange(len(clips))
        return clips[index:] + clips[:index]
def get_videos(self, user):
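        # Builds up to 16 channels: the top slider-scored playlists fill the
        # 'keywords' slots, the remaining slots are ranked by how often each
        # playlist's tags appear in the user's events.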
channels = {'keywords': 7, 'screenings': 7, 'random': 2}
sliders = {'dau': -1, 'physics': 0, 'sex': 1}
        # For each playlist, compute a keyword score: a random base value
        # plus the slider value of each matching tag
score = {}
for playlist in self.playlists:
score[playlist['name']] = random.random()
for tag in [tag for tag in playlist['tags'] if tag in sliders]:
score[playlist['name']] += sliders[tag]
        # The top-scoring playlists fill the keyword channels; the rest are re-scored below
playlists = sorted(
self.playlists,
key=lambda playlist: -score[playlist['name']]
)
videos = playlists[:channels['keywords']]
playlists = playlists[channels['keywords']:]
        # Count product tags from the user's event history
        count = defaultdict(int)
for event in user.get('events', []):
count[event['data']['product']] += 1
# For each tag in playlist, increment score by count
for playlist in playlists:
score[playlist['name']] = random.random()
for tag in [tag for tag in playlist['tags'] if tag not in sliders]:
score[playlist['name']] += count[tag]
# Select highest scoring playlists
videos += sorted(
playlists,
key=lambda playlist: -score[playlist['name']]
)[:16 - channels['keywords']]
# Shuffle playlists (randomize layout) and shift clips (randomize start)
random.shuffle(videos)
return [{
'clips': self._shift_clips(video['clips']),
'name': video['name']
} for video in videos]
def update(self):
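        # Rebuild playlists from pan.do/ra: storyline entities provide names
        # and tags, annotations on the 'storylines' layer provide the clips;
        # results are cached as videos.json and playlists.json under self.path.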
# Get all storylines with tags
storylines = [{
'name': entity['name'],
'tags': entity['tags']
} for entity in self.pandora.find_entities({
'conditions': [
{'key': 'type', 'operator': '==', 'value': 'storylines'},
],
'operator': '&'
}, ['id', 'name', 'tags']) if entity.get('tags', [])]
# Get list of storyline names
names = list(set([storyline['name'] for storyline in storylines]))
# Get all clips annotated with storyline references
clips = [clip for clip in self.pandora.find_annotations({
'conditions': [
{'key': 'layer', 'operator': '==', 'value': 'storylines'}
],
'operator': '&'
}, ['id', 'in', 'out', 'value']) if clip['value'] in names]
# Get list of ids for videos with clips
ids = list(set([clip['id'].split('/')[0] for clip in clips]))
        # Fetch metadata for new video ids and merge with the cached videos.json
filename = os.path.join(self.path, 'videos.json')
if os.path.exists(filename):
with open(filename) as f:
videos_ = json.loads(f.read())
ids_ = [video['id'] for video in videos_]
else:
videos_, ids_ = [], []
videos = sorted(videos_ + [
self.pandora.get(id, ['code', 'id', 'order', 'title'])
            for id in ids if id not in ids_
], key=lambda video: video['order'])
with open(filename, 'w') as f:
f.write(json.dumps(videos, indent=4, sort_keys=True))
# Get video order
order = {video['id']: video['order'] for video in videos}
        # Sort clips by video order, then by in-point within each video
clips = sorted(
clips,
key=lambda clip: order[clip['id'].split('/')[0]] * 1000000 + clip['in']
)
# Get and cache playlists
self.playlists = [playlist for playlist in [{
'name': storyline['name'],
'tags': storyline['tags'],
'clips': [{
'id': clip['id'],
'in': clip['in'],
'out': clip['out']
} for clip in clips if clip['value'] == storyline['name']]
} for storyline in storylines] if playlist['clips']]
with open(os.path.join(self.path, 'playlists.json'), 'w') as f:
f.write(json.dumps(self.playlists, indent=4, sort_keys=True))
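    # run_async (from utils) is assumed to run update() in the background,
    # so callers are not blocked while playlists are rebuilt.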
@run_async
def update_async(self):
self.update()
class Pandora:
# pan.do/ra API wrapper
def __init__(self, url, username, password):
self.api = ox.API(url)
self.api.signin(username=username, password=password)
def find_annotations(self, query, keys):
# print('FIND ANNOTATIONS', query, keys)
return self.api.findAnnotations({
'keys': keys,
'query': query,
'range': [0, 1000000]
})['data']['items']
def find_entities(self, query, keys):
# print('FIND ENTITIES', query, keys)
return self.api.findEntities({
'keys': keys,
'query': query,
'range': [0, 1000000]
})['data']['items']
def get(self, id, keys):
# print('GET', id, keys)
return self.api.get({
'id': id,
'keys': keys
})['data']
if __name__ == '__main__':
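    # Running this module directly rebuilds the cache in the local 'json' directory.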
engine = Engine('json')
engine.update()