Dr. Mark Humphrys

School of Computing. Dublin City University.

Home      Blog      Teaching      Research      Contact

Search:

CA249      CA318      CA425      CA651

w2mind.computing.dcu.ie      w2mind.org

Missing
DCU student

CASE3 student Paul Bunbury has been missing since Thu 2 Feb 2012.
See appeals on crime.ie and garda.ie and facebook.

He is a great coder. See DCU page and boards.ie page.
He won major coding contests in 2010 and 2011.
He is author of the brilliant "FloodItWorld".
DCU can confirm that in Jan 2012 he passed all 6 modules comfortably.


Sample code - How to implement Q-learning


// Discount factor: weight given to the best Q-value of the next state
// when a Q-value is updated.
const float GAMMA = 0.6f;

// Upper and lower bounds of the Boltzmann temperature.
// The lower bound is deliberately kept above zero, so a policy run at
// minQTEMPERATURE still retains a *somewhat* stochastic character.
const float maxQTEMPERATURE = 0.5f;     // 1/2
const float minQTEMPERATURE = 0.02f;    // 1/50


// each (x,a) has its own varying ALPHAQ:
//
// Learning rate for the i-th visit to a state-action pair.
// Returns 1/i, so the first visit (i == 1) gets a rate of 1 and the
// rate declines with every further visit.
//
// i: number of times the pair has been visited so far.
//    Values below 1 are treated as a first visit: dividing by zero would
//    yield an infinite rate and a negative count a negative rate, neither
//    of which is a usable learning rate.

float alphaQ ( long int i )
{
 if ( i < 1 )
  return 1.0f;

 return static_cast<float> ( 1.0 / i );
}

An Agent is a "behaviour-producing module":


// An Agent keeps a table of Q-values and a table of visit counts over
// (state, action) pairs, and proposes an action in the member ai.
//
// NOTE(review): the out-of-class Agent:: member functions in this file
// (and the variables they use that are not declared here) are presumably
// declared in a fuller version of this class - confirm.

class Agent
{
public:
// Agent is a polymorphic base (reward() is meant to be overridden),
// so the destructor is virtual to make deletion through an Agent
// pointer reach the subclass destructor:

 virtual ~Agent() {}

// keep track of Q-values for my actions:

 StateActionSpace Q;

// what really defines me is my reward function
// (defined in subclasses; this base version rewards nothing):

 virtual float reward ( state x, state y ) { return 0; }

// noQ(x,a) counts the no. of times we have "visited (x,a)"
// (tried out action a in state x)
// so e.g. we can have declining alphaQ:

 StateActionSpace noQ;

// the action I suggest to execute:

 action ai;

// temporary variable:

 action af;


// Allocate the two tables and the two action variables.
// cf and df are size parameters defined outside this class
// (presumably the state and action dimensions - confirm).
// Declared void: it returns nothing, and a function without a return
// type is not valid standard C++.

 void alloc()
 {
  Q.allocvector ( cf, df );
  noQ.allocvector ( cf, df );
  ai.allocvector ( df );
  af.allocvector ( df );
 }


// Loop index shared by every use of foractions_j.
// Because it is a member, foractions_j loops must not be nested and
// any function using the macro overwrites j.

 int j;
 #define foractions_j           for ( j=0; j<=(ai.no-1); j++ )

// A random action index in the range 0 .. ai.no-1:

 int randomAction() { return randomAtoB ( 0, ai.no-1 ); }
};



The Learning algorithm - how we update the Q-values:


Agent :: updateQ ( state x, action a, state y )
{
  float r = reward(x,y);		// defined later
  float total = r + (GAMMA * Q.max(y));

  noQ.at(x,a)++;
  float ALPHA = alphaQ ( noQ.at(x,a) );

  Q.at(x,a) = ((1-ALPHA) * Q.at(x,a)) + (ALPHA * total);
}


How to use a Boltzmann "soft max" control policy with variable temperature:


// first, the sum of the exp(Q/T) terms:

Agent :: calculateSigma ( state x, float QTEMPERATURE )
{
 sigma = 0;
 foractions_j
 {
  af.from(j);
  sigma = sigma + exp ( Q.at(x,af)/QTEMPERATURE );
 }
}

// can show how probable each action is to be tried:

Agent :: printProb ( ostream& stream, state x, float QTEMPERATURE )
{
 foractions_j
 {
  af.from(j);
  double prob = exp ( Q.at(x,af)/QTEMPERATURE ) / sigma;   
 }
}


// suggest an action ai:

Agent :: suggestBoltz ( state x, float QTEMPERATURE )
{
 calculateSigma ( x, QTEMPERATURE );
 float p = random0to1exclusive();               
 float sum = 0;
 j = 0;

 while ( sum < p )
 {
  af.from(j);
  double prob = exp ( Q.at(x,af)/QTEMPERATURE ) / sigma;
  sum = sum + prob;
  j++;
 }

 // just hit p
 ai = af;
}



// suggest action with reasonable (declining) temperature:

Agent :: suggestReasonable ( state x )
{
 suggestBoltz ( x, reasonableTemperature() );
}


// Temperature that declines linearly with experience:
// maxQTEMPERATURE when nothing has been experienced yet, falling to
// minQTEMPERATURE once the total number of experiences reaches
// ceiling, and staying there afterwards.

float Agent :: reasonableTemperature()
{
 const long int total = noQ.totalNumberOfExperiences();

 if ( total >= ceiling )
  return minQTEMPERATURE;

 // Fraction of the way to the ceiling, in [0,1).
 // The cast forces a floating-point division: ceiling's type is not
 // visible here, and if it is an integer type then total / ceiling
 // would truncate to 0 for every total below ceiling, so the
 // temperature would never decline.
 const float e = static_cast<float> ( total ) / ceiling;

 return ( minQTEMPERATURE + (1-e)*(maxQTEMPERATURE-minQTEMPERATURE) );
}



// no exploration, demo mode:

Agent :: exploit ( state x )            
{
 suggestBoltz ( x, minQTEMPERATURE );
}

A Creature may contain one or multiple Agents inside its head:


// A Creature holds a list of Agents, observes the world into a state,
// and executes one action.
//
// NOTE(review): the out-of-class Creature:: member functions in this file
// are presumably declared in a fuller version of this class - confirm.

class Creature
{
protected:
 state          s;       // temporary variables
 state          x;
 state          y;

 action         ak;      // each Agent suggests an action ai
                         // one action ak wins and is executed

 // Hooks for subclasses; the base versions do nothing.
 // Declared void: they return nothing, and a function without a return
 // type is not valid standard C++.

 virtual void   observe() {}                 // observe() fills up state s
 virtual void   execute ( action a ) {}


 AgentArray     A;       // a list of agents 1..n

 // Loops i over the agent indices 1..A.n (note: 1-based).
 #define foragents_i            for ( int i=1; i<=A.n; i++ )

 // A random agent index in the range 1 .. A.n:
 int    randomAgent() { return randomAtoB ( 1, A.n ); }

public:

 // Allocates the three state variables and the action variable.
 // cf and df are size parameters defined outside this class
 // (presumably the state and action dimensions - confirm).
 Creature()
 {
  s.allocvector ( cf );
  x.allocvector ( cf );
  y.allocvector ( cf );
  ak.allocvector ( df );
 }

 // Creature is a polymorphic base (observe() and execute() are meant
 // to be overridden), so the destructor is virtual to make deletion
 // through a Creature pointer reach the subclass destructor.
 virtual ~Creature() {}
};

The interact() function:


// interact with the world just once:

Creature :: interact ( int mode )
{
// observe state, each agent suggests an action:

 observe(); x = s;

 if (mode == _learnQ)
 {
  ak[1] = randomAction();
 }
 else if (mode == _exploit)
 {
  foragents_i
   A[i]->exploit(x);
 }
 else
 {
  foragents_i
   A[i]->suggestReasonable(x);
 }


// somehow go through the ai's
// and pick an agent and execute its action:

 execute(ak);


// observe new state, all agents learn:

 observe(); y = s;

 if (mode == _learnQ)
  foragents_i
   A[i]->updateQ ( x, ak, y );                          
}


Feeds      HumphrysFamilyTree.com

Bookmark and Share           On Internet since 1987.