
# Sample code - How to implement Q-learning

```
// discount factor:

const float GAMMA = 0.6;

// Boltzmann temperature range (see control policy below):

const float maxQTEMPERATURE = 1.0 / 2;
const float minQTEMPERATURE = 1.0 / 50;	// still retains *somewhat* stochastic policy

// each (x,a) has its own declining learning rate ALPHAQ,
// where i = number of visits to (x,a) so far:

float alphaQ ( long int i )
{
return ( 1.0 / i );
}

```
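A side note on that learning rate: with ALPHAQ = 1/i, the Q-value for a given (x,a) is simply the running average of the targets it has seen so far. A minimal standalone sketch (my own illustration, not part of the framework below) to check this:

```
// standalone sketch: with ALPHA = 1/i, Q is the running
// average of the targets seen so far

#include <cstdio>

int main ()
{
float Q = 0;
float target[5] = { 10, 0, 10, 0, 10 };

for ( int i = 1; i <= 5; i++ )
{
float ALPHA = 1.0 / i;
Q = ((1-ALPHA) * Q) + (ALPHA * target[i-1]);
printf ( "after %d updates, Q = %f\n", i, Q );
}
// Q finishes at 6.0, the mean of the five targets
return 0;
}
```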

# An Agent is a "behaviour-producing module":

```
class Agent
{
public:
// keep track of Q-values for my actions:

StateActionSpace	Q;

// what really defines me is my reward function
// (defined in subclasses):

virtual float  reward ( state x, state y ) { return 0; }

// noQ(x,a) counts the no. of times we have "visited (x,a)"
// (tried out action a in state x)
// so e.g. we can have declining alphaQ:

StateActionSpace	noQ;

// the action I suggest to execute:

action         ai;

// temporary variable:

action         af;

void alloc ()
{
// allocate the vectors
// (the dimensions cf and df are defined elsewhere):

Q.allocvector ( cf, df );
noQ.allocvector ( cf, df );
ai.allocvector ( df );
af.allocvector ( df );
}

int j;
#define foractions_j           for ( j=0; j<=(ai.no-1); j++ )

int randomAction() { return randomAtoB ( 0, ai.no-1 ); }
};

```
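Since what really defines an Agent is its reward function, a concrete subclass only has to fill that in. A sketch of what one might look like (FoodAgent and foodDistance() are invented names for illustration, not part of the framework):

```
// illustrative subclass (FoodAgent and foodDistance()
// are invented names, not part of the framework):

class FoodAgent : public Agent
{
public:
// rewarded for getting closer to food:

float reward ( state x, state y )
{
if ( foodDistance(y) < foodDistance(x) )
return 1.0;
else
return 0;
}
};
```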

# The Learning algorithm - how we update the Q-values:

```
void Agent :: updateQ ( state x, action a, state y )
{
float r = reward(x,y);			// defined in subclasses
float total = r + (GAMMA * Q.max(y));	// the Q-learning target

noQ.at(x,a)++;
float ALPHA = alphaQ ( noQ.at(x,a) );	// declining learning rate

Q.at(x,a) = ((1-ALPHA) * Q.at(x,a)) + (ALPHA * total);
}

```
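In textbook notation this is the standard Q-learning backup, Q(x,a) ← (1-ALPHA) Q(x,a) + ALPHA ( r + GAMMA max Q(y,b) ). One worked update, with invented numbers:

```
// one worked update (numbers invented for illustration)
// suppose Q(x,a) = 2.0, r = 1.0, max over b of Q(y,b) = 5.0,
// GAMMA = 0.6, and this is visit number 4, so ALPHA = 1/4:
//
// total  = 1.0 + (0.6 * 5.0)           = 4.0
// Q(x,a) = (0.75 * 2.0) + (0.25 * 4.0) = 2.5
```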

# How to use a Boltzmann "soft max" control policy with variable temperature:

```
// first, the sum of the exp(Q/T) terms:

void Agent :: calculateSigma ( state x, float QTEMPERATURE )
{
sigma = 0;	// sigma is a member variable, used again below
foractions_j
{
af.from(j);	// convert index j into action af
sigma = sigma + exp ( Q.at(x,af)/QTEMPERATURE );
}
}

// can show how probable each action is to be tried:

void Agent :: printProb ( ostream& stream, state x, float QTEMPERATURE )
{
calculateSigma ( x, QTEMPERATURE );
foractions_j
{
af.from(j);
double prob = exp ( Q.at(x,af)/QTEMPERATURE ) / sigma;
stream << "P(action " << j << ") = " << prob << "\n";
}
}

// suggest an action ai:

void Agent :: suggestBoltz ( state x, float QTEMPERATURE )
{
calculateSigma ( x, QTEMPERATURE );
float p = random0to1exclusive();
float sum = 0;
j = 0;

// walk through the actions, accumulating their probabilities,
// until we pass the random point p:

while ( sum < p )
{
af.from(j);
double prob = exp ( Q.at(x,af)/QTEMPERATURE ) / sigma;
sum = sum + prob;
j++;
}

// just hit p - af is the chosen action:
ai = af;
}

// suggest action with reasonable (declining) temperature:

void Agent :: suggestReasonable ( state x )
{
suggestBoltz ( x, reasonableTemperature() );
}

// temperature declines linearly from maxQTEMPERATURE to
// minQTEMPERATURE over the first "ceiling" experiences:

float Agent :: reasonableTemperature()
{
long int total = noQ.totalNumberOfExperiences();

if ( total >= ceiling )
return minQTEMPERATURE;
else
{
float e = (float) total / ceiling;	// cast to avoid integer division
return ( minQTEMPERATURE + (1-e)*(maxQTEMPERATURE-minQTEMPERATURE) );
}
}

// (almost) no exploration, demo mode
// (minQTEMPERATURE still retains a slightly stochastic policy):

void Agent :: exploit ( state x )
{
suggestBoltz ( x, minQTEMPERATURE );
}

```
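To see the control policy in isolation, here is a self-contained sketch of Boltzmann selection over a plain array of Q-values (my own illustration, outside the framework above - it uses rand() rather than random0to1exclusive()):

```
#include <cstdlib>
#include <cmath>
#include <cstdio>
#include <ctime>

// pick index j with probability exp(Q[j]/T) / sigma:

int boltzmannPick ( float Q[], int n, float T )
{
double sigma = 0;
for ( int j = 0; j < n; j++ )
sigma = sigma + exp ( Q[j] / T );

double p = (double) rand() / RAND_MAX;
double sum = 0;
for ( int j = 0; j < n; j++ )
{
sum = sum + exp ( Q[j] / T ) / sigma;
if ( sum >= p ) return j;
}
return n-1;	// guard against rounding error
}

int main ()
{
srand ( time(NULL) );
float Q[3] = { 1.0, 1.5, 2.0 };

// high temperature (1/2): picks are spread over all actions
// low temperature (1/50): picks almost always action 2
for ( int i = 0; i < 10; i++ )
printf ( "%d %d\n", boltzmannPick (Q, 3, 1.0/2),
                    boltzmannPick (Q, 3, 1.0/50) );
return 0;
}
```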

# A Creature may contain one or more Agents inside its head:

```
class Creature
{
protected:
state          s;       // temporary variables
state		x;
state		y;

action	ak;		// each Agent suggests an action ai
// one action ak wins and is executed

virtual void   observe() {}          // observe() fills up state s
virtual void   execute ( action a ) {}   // act on the world

AgentArray     A;		// a list of agents 1..n

#define foragents_i            for ( int i=1; i<=A.n; i++ )

int    randomAgent() { return randomAtoB ( 1, A.n ); }

public:

Creature()
{
s.allocvector ( cf );
x.allocvector ( cf );
y.allocvector ( cf );
ak.allocvector ( df );
}
};

```
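A concrete Creature just fills in observe() and execute() for its particular world. A sketch (GridCreature, readSensor() and moveRobot() are invented names for illustration, not part of the framework):

```
// illustrative subclass (GridCreature, readSensor() and
// moveRobot() are invented names, not part of the framework):

class GridCreature : public Creature
{
public:
void observe ()
{
// fill up state s from the world, e.g.:
// s[1] = readSensor(1); ...
}

void execute ( action a )
{
// act on the world, e.g.:
// moveRobot ( a );
}
};
```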

# The interact() function:

```
// interact with the world just once:

void Creature :: interact ( int mode )
{
// observe state, each agent suggests an action:

observe(); x = s;

if (mode == _learnQ)
{
// learning mode: try a random action
// (e.g. agent 1 picks one at random):
ak.from ( A[1]->randomAction() );
}
else if (mode == _exploit)
{
foragents_i
A[i]->exploit(x);
}
else
{
foragents_i
A[i]->suggestReasonable(x);
}

// somehow go through the agents' suggestions ai
// and pick one winning action ak to execute:

execute(ak);

// observe new state, all agents learn:

observe(); y = s;

if (mode == _learnQ)
foragents_i
A[i]->updateQ ( x, ak, y );
}

```
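Putting it together, a top-level run might look like this (a sketch; GridCreature is the invented subclass above, and NUMSTEPS and the mode constants are assumed to be defined elsewhere):

```
// sketch of a top-level run: learn first, then demo

GridCreature c;

for ( long int step = 0; step < NUMSTEPS; step++ )
c.interact ( _learnQ );		// random actions, Q-values updated

for ( long int step = 0; step < NUMSTEPS; step++ )
c.interact ( _exploit );	// demo mode, (almost) greedy policy
```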