// Factory for the training loop; an IIFE keeps runEnv private and returns the
// async trainer function. Assumes tf (@tensorflow/tfjs) and currentRunId are
// in scope from the surrounding module.
const trainer = (() => {
  // Sample a batch of particles from the policy, record each sample's
  // deviation from the policy mean, and evaluate the reward on the batch.
  const runEnv = (policy, reward, particles) => {
    const s = policy.sample(particles);
    const deviation = s.sub(policy.mean);
    return [s, deviation, reward.tf(s)];
  };
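
  // A minimal sketch (an assumption, not from the source) of a policy object
  // compatible with the calls above: sample(n), a mean tensor, and getTFParams,
  // which the minimize() call below treats as a list of trainable variables.
  // Here, an isotropic Gaussian with a fixed standard deviation.
  const makeGaussianPolicy = (dim, sigma = 0.1) => {
    const mean = tf.variable(tf.zeros([dim]));
    return {
      get mean() { return mean; },
      // Draw n particles: mean + sigma * standard normal noise.
      sample: (n) => tf.randomNormal([n, dim]).mul(sigma).add(mean),
      getTFParams: [mean],
    };
  };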
  // One gradient step per iteration, with an optional exponential-moving-average
  // reward baseline and optional per-step fit/callback hooks.
  const trainer = async (runInfo, callback) => {
    const optimizer = runInfo.optimizer(runInfo.learningRate);
    const policy = runInfo.policy;
    const reward = runInfo.reward;
    // Running reward baseline and its decay rate.
    const base = tf.variable(tf.tensor1d([0]));
    const beta = tf.scalar(runInfo.baselineDecayRate);
    for (let i = 0; i < runInfo.nbOptSteps; i++) {
      const [states, rewards] = tf.tidy(() => {
        const [states, deviation, rewards] = runEnv(policy, reward, runInfo.nbParticles);
        optimizer.minimize(
          () => runInfo.loss.loss(policy, reward.tf, states, rewards, deviation, base),
          false,              // returnCost: the loss value itself is not needed
          policy.getTFParams  // the trainable variables the optimizer may update
        );
        if (runInfo.useBaseline) {
          // base <- (1 - beta) * base + beta * mean(rewards)
          base.assign(base.mul(beta.sub(1).neg()).add(rewards.mean().mul(beta)));
        }
        // Tensors returned from tf.tidy() survive it; everything else is disposed.
        return [states, rewards];
      });
      // Optional extra fitting step for losses that carry their own trainable state.
      if (runInfo.loss.fit) {
        await runInfo.loss.fit(states, rewards);
      }
      // currentRunId is defined externally; a mismatch means this run was superseded.
      if (runInfo.runId !== currentRunId.value) {
        tf.dispose([states, rewards]);
        break;
      }
      if (typeof callback === 'function') {
        await callback(i, states, rewards, policy, base);
      }
      // Free the per-step tensors kept alive past tf.tidy().
      tf.dispose([states, rewards]);
    }
    base.dispose();
    beta.dispose();
  };
  return trainer;
})();
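
// Usage sketch, commented out because every concrete object here is a
// hypothetical stand-in; only the runInfo field names are taken from the
// trainer above. The loss is a REINFORCE-style surrogate that recomputes
// s - mean inside minimize() so gradients reach the policy mean.
//
// const runInfo = {
//   optimizer: (lr) => tf.train.adam(lr),
//   learningRate: 0.05,
//   policy: makeGaussianPolicy(2),
//   reward: { tf: (s) => s.square().sum(1).neg() },
//   loss: {
//     loss: (policy, rewardFn, states, rewards, deviation, base) =>
//       rewards.sub(base).mul(states.sub(policy.mean).square().sum(1)).mean(),
//   },
//   nbOptSteps: 100,
//   nbParticles: 64,
//   useBaseline: true,
//   baselineDecayRate: 0.1,
//   runId: currentRunId.value,
// };
// await trainer(runInfo, async (step, states, rewards) => {
//   console.log(`step ${step}: mean reward ${(await rewards.mean().data())[0]}`);
// });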