Artificial intelligence - Q-learning in game not working as expected
I have attempted to implement Q-learning in a simple game I have written. The game is based around the player having to "jump" to avoid oncoming boxes.
I have designed the system with two actions, jump and do_nothing, and the states are the distances from the next block (divided and floored to ensure that there is not a large number of states).
My issue seems to be that my implementation of the algorithm isn't considering "future reward", and so it ends up jumping at the wrong times.
Here is my implementation of the Q-learning algorithm:
// NOTE(review): identifiers appear lowercased by text extraction. Built-in
// names (Math, hasOwnProperty) are restored here; names that come from the
// game scripts (canjump, playerx, blocks, ...) are left as found - confirm
// their casing against the game source before running.

/**
 * Returns the Q-value row for a state, creating it on first access.
 *
 * @param {number|string} state - Key of the state in the Q-table.
 * @returns {Object} Map from action name to Q-value; a newly created row has
 *     every action in `this.actions` set to 0.
 */
jumpgameaiclass.prototype.getq = function getq(state) {
  if (!this.q.hasOwnProperty(state)) {
    this.q[state] = {};

    for (var actionindex = 0; actionindex < this.actions.length; actionindex++) {
      var action = this.actions[actionindex];
      this.q[state][action] = 0;
    }
  }

  return this.q[state];
};

/**
 * Computes the discretised distance to the nearest block whose x is at or
 * beyond the player's x.
 *
 * @returns {number} floor(distance * resolution), clamped to be >= 0. When
 *     no block qualifies the sentinel -1 is scaled and clamped, giving 0.
 */
jumpgameaiclass.prototype.getblockdistance = function getblockdistance() {
  var closest = -1; // -1 means "no qualifying block seen yet"

  for (var blockindex = 0; blockindex < this.blocks.length; blockindex++) {
    var block = this.blocks[blockindex];
    var distance = block.x - this.playerx;

    if (distance >= 0 && (closest === -1 || distance < closest)) {
      closest = distance;
    }
  }

  return Math.max(0, Math.floor(closest * this.resolution));
};

/**
 * Picks the action with the larger Q-value for the given state.
 *
 * @param {number} distance - Discretised distance used as the state key.
 * @returns {string} `actions[0]` or `actions[1]`. On a tie it returns
 *     `actions[1]` when `canjump()` is false, otherwise a random action.
 */
jumpgameaiclass.prototype.getactionwithhighestq = function getactionwithhighestq(distance) {
  var jumpreward = this.getq(distance)[this.actions[0]];
  var donothingreward = this.getq(distance)[this.actions[1]];

  if (jumpreward > donothingreward) {
    return this.actions[0];
  } else if (donothingreward > jumpreward) {
    return this.actions[1];
  } else {
    if (!this.canjump()) {
      return this.actions[1];
    }

    return this.actions[Math.floor(Math.random() * this.actions.length)];
  }
};

/**
 * Chooses the next action epsilon-greedily for the current block distance.
 *
 * @returns {string} `actions[1]` whenever `canjump()` is false; otherwise a
 *     random action with probability `epsilon`, else the highest-Q action.
 */
jumpgameaiclass.prototype.getactionepsilongreedy = function getactionepsilongreedy() {
  // can't jump while in mid-air
  if (!this.canjump()) {
    return this.actions[1];
  }

  if (Math.random() < this.epsilon) {
    return this.actions[Math.floor(Math.random() * this.actions.length)];
  } else {
    return this.getactionwithhighestq(this.getblockdistance());
  }
};

/**
 * Runs one learning step: derives the reward, updates the Q-value of the
 * previous (lastdistance, lastaction) pair, then selects and performs the
 * next action.
 */
jumpgameaiclass.prototype.think = function think() {
  var reward = this.livereward;

  // A score change takes precedence over death when picking the reward.
  if (this.score !== this.lastscore) {
    this.lastscore = this.score;
    reward = this.scorereward;
  } else if (!this.playeralive) {
    reward = this.deathreward;
  }

  this.drawdistance();

  var distance = this.getblockdistance(),
      maxq = this.getq(distance)[this.getactionwithhighestq(distance)],
      previousq = this.getq(this.lastdistance)[this.lastaction];

  // Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a)),
  // where (s, a) is the state/action remembered from the previous call.
  this.getq(this.lastdistance)[this.lastaction] = previousq + this.alpha * (reward + (this.gamma * maxq) - previousq);

  this.lastaction = this.getactionepsilongreedy();
  this.lastdistance = distance;

  switch (this.lastaction) {
    case this.actions[0]:
      this.jump();
      break;
  }
};

// And here are some of the properties used with it:
epsilon: 0.05, alpha: 1, gamma: 1, resolution: 0.1, actions: [ 'jump', 'do_nothing' ], q: {}, livereward: 0, scorereward: 100, deathreward: -1000, lastaction: 'do_nothing', lastdistance: 0, lastscore: 0
I am having to use lastaction/lastdistance to calculate the Q-value, as I cannot use the current data (it would be acting on the action performed in the frame before).
The think method is called once every frame, after rendering and the game stuff is done (physics, controls, death, etc.).
// NOTE(review): identifiers appear lowercased by text extraction. Built-in
// names (Object, Math, hasOwnProperty, canvas context members) are restored
// here; names that come from the external game scripts (game.jumpgame,
// canjump, playerx, blocks, ...) are left as found - confirm their casing
// against those scripts before running.

/**
 * Q-learning player built on top of `game.jumpgame`.
 *
 * Properties are installed with Object.defineProperties, so any property not
 * marked `writable: true` is read-only after construction.
 *
 * @param {*} canvas - Forwarded unchanged to the `game.jumpgame` constructor.
 */
var jumpgameaiclass = function jumpgame(canvas) {
  game.jumpgame.call(this, canvas);

  Object.defineProperties(this, {
    epsilon: {
      value: 0.05
    },
    alpha: {
      value: 1
    },
    gamma: {
      value: 1
    },
    resolution: {
      value: 0.1
    },
    actions: {
      value: [ 'jump', 'do_nothing' ]
    },
    q: {
      value: { },
      writable: true
    },
    livereward: {
      value: 0
    },
    scorereward: {
      value: 100
    },
    deathreward: {
      value: -1000
    },
    lastaction: {
      value: 'do_nothing',
      writable: true
    },
    lastdistance: {
      value: 0,
      writable: true
    },
    lastscore: {
      value: 0,
      writable: true
    }
  });
};

jumpgameaiclass.prototype = Object.create(game.jumpgame.prototype);

/**
 * Returns the Q-value row for a state, creating it on first access.
 *
 * @param {number|string} state - Key of the state in the Q-table.
 * @returns {Object} Map from action name to Q-value; a newly created row has
 *     every action in `this.actions` set to 0.
 */
jumpgameaiclass.prototype.getq = function getq(state) {
  if (!this.q.hasOwnProperty(state)) {
    this.q[state] = {};

    for (var actionindex = 0; actionindex < this.actions.length; actionindex++) {
      var action = this.actions[actionindex];
      this.q[state][action] = 0;
    }
  }

  return this.q[state];
};

/**
 * Computes the discretised distance to the nearest block whose x is at or
 * beyond the player's x.
 *
 * @returns {number} floor(distance * resolution), clamped to be >= 0. When
 *     no block qualifies the sentinel -1 is scaled and clamped, giving 0.
 */
jumpgameaiclass.prototype.getblockdistance = function getblockdistance() {
  var closest = -1; // -1 means "no qualifying block seen yet"

  for (var blockindex = 0; blockindex < this.blocks.length; blockindex++) {
    var block = this.blocks[blockindex];
    var distance = block.x - this.playerx;

    if (distance >= 0 && (closest === -1 || distance < closest)) {
      closest = distance;
    }
  }

  return Math.max(0, Math.floor(closest * this.resolution));
};

/**
 * Picks the action with the larger Q-value for the given state.
 *
 * @param {number} distance - Discretised distance used as the state key.
 * @returns {string} `actions[0]` or `actions[1]`. On a tie it returns
 *     `actions[1]` when `canjump()` is false, otherwise a random action.
 */
jumpgameaiclass.prototype.getactionwithhighestq = function getactionwithhighestq(distance) {
  var jumpreward = this.getq(distance)[this.actions[0]];
  var donothingreward = this.getq(distance)[this.actions[1]];

  if (jumpreward > donothingreward) {
    return this.actions[0];
  } else if (donothingreward > jumpreward) {
    return this.actions[1];
  } else {
    if (!this.canjump()) {
      return this.actions[1];
    }

    return this.actions[Math.floor(Math.random() * this.actions.length)];
  }
};

/**
 * Chooses the next action epsilon-greedily for the current block distance.
 *
 * @returns {string} `actions[1]` whenever `canjump()` is false; otherwise a
 *     random action with probability `epsilon`, else the highest-Q action.
 */
jumpgameaiclass.prototype.getactionepsilongreedy = function getactionepsilongreedy() {
  if (!this.canjump()) {
    return this.actions[1];
  }

  if (Math.random() < this.epsilon) {
    return this.actions[Math.floor(Math.random() * this.actions.length)];
  } else {
    return this.getactionwithhighestq(this.getblockdistance());
  }
};

/**
 * Death hook: restarts the game by calling `this.restart()`.
 */
jumpgameaiclass.prototype.ondeath = function ondeath() {
  this.restart();
};

/**
 * Runs one learning step: derives the reward, updates the Q-value of the
 * previous (lastdistance, lastaction) pair, then selects and performs the
 * next action.
 */
jumpgameaiclass.prototype.think = function think() {
  var reward = this.livereward;

  // A score change takes precedence over death when picking the reward.
  if (this.score !== this.lastscore) {
    this.lastscore = this.score;
    reward = this.scorereward;
  } else if (!this.playeralive) {
    reward = this.deathreward;
  }

  this.drawdistance();

  var distance = this.getblockdistance(),
      maxq = this.getq(distance)[this.getactionwithhighestq(distance)],
      previousq = this.getq(this.lastdistance)[this.lastaction];

  // Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a)),
  // where (s, a) is the state/action remembered from the previous call.
  this.getq(this.lastdistance)[this.lastaction] = previousq + this.alpha * (reward + (this.gamma * maxq) - previousq);

  this.lastaction = this.getactionepsilongreedy();
  this.lastdistance = distance;

  switch (this.lastaction) {
    case this.actions[0]:
      this.jump();
      break;
  }
};

/**
 * Draws the current and the previous discretised distance as two text lines
 * centred horizontally at a quarter of the canvas height.
 */
jumpgameaiclass.prototype.drawdistance = function drawdistance() {
  this.context.save();

  this.context.textAlign = 'center';

  // Bottom baseline: the text sits above the anchor point.
  this.context.textBaseline = 'bottom';
  this.context.fillText('distance: ' + this.getblockdistance(), this.canvaswidth / 2, this.canvasheight / 4);

  // Top baseline: the text hangs below the same anchor point.
  this.context.textBaseline = 'top';
  this.context.fillText('last distance: ' + this.lastdistance, this.canvaswidth / 2, this.canvasheight / 4);

  this.context.restore();
};

/**
 * Per-frame hook: runs the base game's frame handler with the same
 * arguments, then lets the agent learn and act.
 */
jumpgameaiclass.prototype.onframe = function onframe() {
  game.jumpgame.prototype.onframe.apply(this, arguments);

  this.think();
};

game.jumpgameai = jumpgameaiclass;

/*
 * Stylesheet that accompanies the snippet:
 *
 * body { background-color: #eeeeee; text-align: center; }
 * canvas#game { background-color: #ffffff; border: 1px solid #dddddd; }
 *
 * Markup that accompanies the snippet:
 *
 * <!doctype html>
 * <html lang="en">
 * <head>
 *   <title>jump</title>
 * </head>
 * <body>
 *   <canvas id="game" width="512" height="512">
 *     <h1>your browser doesn't support canvas!</h1>
 *   </canvas>
 *   <script src="https://raw.githubusercontent.com/cagosta/requestanimationframe/master/app/requestanimationframe.js"></script>
 *   <!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
 *   <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
 *   <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
 *   <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
 *   <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
 * </body>
 */
You have a simplified version of:

Source: Flappy Bird RL
I used the values:
epsilon: { value: 0.01 }, alpha: { value: 0.7 }, gamma: { value: 0.9 }, resolution: { value: 0.1 }, livereward: { value: 10 }, scorereward: { value: -100 }, deathreward: { value: 1000 },
It had no trouble getting beyond 100 in the first 20 attempts.
Q-learning can be described with the temporal logic:
$Q(s, a) = r(s, a) + \gamma \max_{a'} Q(s', a')$, where
- $r(s, a) = r$ = the immediate reward
- $\gamma$ = the relative value of delayed vs. immediate rewards (0 to 1)
- $s'$ = the new state after action $a$
- $a$ = the action in state $s$
- $a'$ = the action in state $s'$
You should execute it as follows:
Select an action and execute it:
- For each state-action pair $(s, a)$, initialize the table entry $Q(s, a)$ to zero.
- Observe the current state $s$.
- Do forever:
  - Select an action $a$ and execute it.
  - Receive the immediate reward $r$, aka $r(s, a)$.
  - Observe the new state $s'$.
  - Update the table entry: $Q(s, a) = r(s, a) + \gamma \max_{a'} Q(s', a')$.
  - Set $s = s'$.
Comments
Post a Comment