/* rl.cc - methods file for rl.h : classes for models of reinforcement
   learning systems (CAR simulations)                                   */

#include "rl.h"

const char *const SLBK0::ERROR=" >>---> SLBK0 error : " ;
const char *const SLBK1::ERROR=" >>---> SLBK1 error : " ;
const char *const SLBK2::ERROR=" >>---> SLBK2 error : " ;
const char *const CAR0::ERROR=" >>---> CAR0 error : " ;
const char *const CAR1::ERROR=" >>---> CAR1 error : " ;
const char *const CAR2::ERROR=" >>---> CAR2 error : " ;
const char *const CAR3::ERROR=" >>---> CAR3 error : " ;
//------------------------------------------------------------------------------
/// A.a. very basic CAR model of Andrew J Smith et al 05
SLBK0::SLBK0() : V(), r(), s()            // call default ctors.
{
  // Make sure the random seed is initialised for ANSI C rand().
  // Time-dependent random seed; 604800 is the no. of seconds in a week :
  srand( ( time( NULL ) % 604800) ) ;

  int i,j;
  vectVarNo = 2;
  ActNo = 2;                              // 2 actions:
  stay = 1;    shuttle = 2;               // labels for actions
  safetyS = 7; shockS = 6;                // labels for states
  t_start = 0;
  t_end = t_start + StatNo-2 ;            // is StatNo-2 in SLBK, as we go forward
                                          // EITHER to st. 7 or to st. 6
  biasDA  = 0.0 ;                         // additive DA bias
  assoc   = 1.0 ;                         // multiplicative 'associativity'
  lrnR    = 0.5 ;                         // learning rate - huge in basic expt.
  D       = 0.93 ;                        // discount rate
  averAtt = 0.0 ;

  // These are just for sizing and setting to zero V, Pol and Q:
  Vector auxVect(StatNo);
  Vector auxVect2(t_end-t_start+1);
  V = auxVect ;
  s = auxVect2 ;
  r = auxVect ;                           // init to zero; non-zero-return states below:
  r[shockS] = 1;
  // Matrix auxMat(ActNo,StatNo);  Q = auxMat;  Pol = auxMat;
  // for(i=1; i<=StatNo; i++)  Pol(1,i) = 1;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
SLBK0::~SLBK0()
{
  // if (ptrToVecVar)  delete ptrToVecVar ;
  dtor_aux();
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
inline void SLBK0::CheckState( int s)
{
  if (s<1 || s>StatNo)
    Error("%s%s \n", SLBK0::ERROR,"State out of range." );
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
inline void SLBK0::CheckAction( int a)
{
  if (a<1 || a>ActNo)
    Error("%s%s \n", SLBK0::ERROR,"Action out of range." );
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int SLBK0::SmithNextS(int t_now, int action)
{
  /* - - once you jump into the Safety State, that's it - you can't jump back ! - - */
  // Goes :  timestep:           0    1    2    3    4    5
  //         index of vector s:  1    2    3    4    5    6
  //         poss. states:       1   2,7  3,7  4,7  5,7  6,7
  if (t_now<t_start || t_now>t_end-1)
    Error("%s%s \n", SLBK0::ERROR,"init. timestep out of range in next_S(...)" ) ;
  if (action<1 || action>ActNo)
    Error("%s%s \n", SLBK0::ERROR,"input action out of range in next_S(...)" ) ;

  if (action == shuttle)
    return safetyS;                        // next state is Safety even if you're there !
  if (action == stay) {
    if ( roundfl(s[t_now+1]) == safetyS)   // if already in safety ...
      return safetyS ;                     // ... stay there
    else
      return t_now+2 ;                     // labels like indices for rest of states
  }
  return -1;                               // never reached.
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int SLBK0::nextS(int t_now, int action)
{
  // Goes :  timestep:           0    1    2    3    4    5
  //         index of vector s:  1    2    3    4    5    6
  //         poss. states:       1   2,7  3,7  4,7  5,7  6,7
  if (t_now<t_start || t_now>t_end-1)
    Error("%s%s \n", SLBK0::ERROR,"init. timestep out of range in next_S(...)" ) ;
  if (action<1 || action>ActNo)
    Error("%s%s \n", SLBK0::ERROR,"input action out of range in next_S(...)" ) ;

  if (action == shuttle) {
    if ( roundfl(s[t_now+1]) != safetyS)   // if not already in safety
      return safetyS;                      // next state is Safety
    else                                   // but if already in safety ...
      return t_now+2 ;                     // labels one behind time indices for rest of states
  }
  if (action == stay) {
    if ( roundfl(s[t_now+1]) == safetyS)   // if already in safety ...
      return safetyS ;                     // ... stay there
    else
      return t_now+2 ;                     // labels like indices for rest of states
  }
  return -1;                               // never reached.
}
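/* Illustrative sketch, not used by the simulation classes (kept inside #if 0 so
   it does not affect the build) : the one-sided transition rule of SmithNextS /
   nextS above, restated on plain ints under the same labelling (states 1..5 on
   the shock side, 6 = shock, 7 = safety; actions 1 = stay, 2 = shuttle).  The
   function name below is made up for the example.                            */
#if 0
static int smithNextStateSketch(int curState, int action)
{
  const int kSafety = 7, kShuttle = 2;
  if (action == kShuttle)   return kSafety;   // shuttling always reaches safety
  if (curState == kSafety)  return kSafety;   // once safe, staying keeps you safe
  return curState + 1;                        // otherwise advance towards the shock (state 6)
}
#endif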
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float SLBK0::totPEsc_Smith()
{
  int i=1;
  float auxP=1;
  for (i=1; i<=5; i++)
    auxP *= (1 - VtoP( V[i] ));    // total prob. of NOT having escaped by the end
  return 1-auxP ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float SLBK0::delta2(int this_S, int next_S)
{
  float delta = delta0(this_S, next_S);
  float bDA = biasDA;
  float avAt= averAtt;
  // now put some boundaries on the biases, to avoid bizarre effects:
  if (bDA  < -1)  bDA  = -1;
  if (avAt < -1)  avAt = -1;
  return ( (delta < 0) ? (1+bDA)*delta : (1+avAt)*delta ) ;
}
//------------------------------------------------------------------------------
//A.b. CAR model of Andrew J Smith et al 05 with Q values
SLBK1::SLBK1() : SLBK0(), a()
{
  int n=StatNo+1;
  int i;
  eps = 0.1 ;                        // init / default epsilon.
  Vector auxQ( ActNo ) ;
  for (i=1; i<=n; i++)  Q[i] = auxQ ;
  Vector auxVect3(t_end-t_start+1);  // as per Vector s above.
  a = auxVect3 ;
  vectVarNo = 1+1+1+StatNo ;         // 1 each for s, V, a and StatNo for Q
  // if (ptrToVecVar)  delete ptrToVecVar ;
  ptrToVecVar = new Vector* [vectVarNo+1];   // +1 as index goes 1->vectVarNo; 0 wasted
  ptrToVecVar[1]= &s ;  ptrToVecVar[2]= &V ;  ptrToVecVar[3]= &a;
  for (i=4; i<= vectVarNo; i++)  ptrToVecVar[i] = &Q[i-3] ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
SLBK1::~SLBK1()
{
  if (ptrToVecVar) { delete[] ptrToVecVar ;  ptrToVecVar = NULL; }
  // delete[] Q ;   // hopefully this suffices to call the dtor repeatedly !
  Delete(&a) ;
  dtor_aux() ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float SLBK1::grQPol( int s, int a)
{
  int i;
  int nMax = 1;                // no. of actions sharing the maximum Q value.
  int n = Q[s].Size();         // no. of possible actions in this state
  if ((a<1 || a>n) || (s<1 || s>StatNo))
    Error("%s%s \n", SLBK1::ERROR,"arg s or a out of range in fn. grQPol" ) ;

  float Qmax= Q[s].GetValue(1);
  int sMax=1;                  // initialise to state 1
  for (i=2; i<=n; i++) {
    if ( DAbs(Q[s].GetValue(i) - Qmax) < SMALLFLTSIG)    // float 'equality'
      nMax += 1 ;
    else if ( Q[s].GetValue(i) > Qmax ) {   // we have found a new max. Q,
      nMax = 1 ;                            // so need to reset nMax
      Qmax = Q[s].GetValue(i) ;
    }
  }
  if ( DAbs(Q[s].GetValue(a) - Qmax) < SMALLFLTSIG)   // if one of the max-Q actions ...
    return (1.0-eps+eps*nMax/n)/nMax;                 // S&B p. 122
  else
    return eps/n ;
}
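/* Illustrative sketch, not used by the simulation classes (kept inside #if 0 so
   it does not affect the build) : the epsilon-greedy probabilities returned by
   grQPol above.  Each of the nMax joint-maximal actions gets
   (1-eps)/nMax + eps/n and every other action gets eps/n, so the n values sum
   to 1.  The plain-array helper below is made up for the example.            */
#if 0
#include <cmath>
static double epsGreedyProbSketch(const double *Q, int n, int a, double eps)
{
  double Qmax = Q[0];
  int nMax = 0;
  for (int i = 0; i < n; i++)  if (Q[i] > Qmax)  Qmax = Q[i];
  for (int i = 0; i < n; i++)  if (std::fabs(Q[i] - Qmax) < 1e-6)  nMax++;
  return (std::fabs(Q[a] - Qmax) < 1e-6) ? (1.0 - eps)/nMax + eps/n
                                         : eps/n ;
}
// e.g. with Q = {0.3, 0.3, -0.1} and eps = 0.1 the three probabilities are
// 0.45 + 0.0333, 0.45 + 0.0333 and 0.0333, which indeed sum to 1.
#endif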
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
void SLBK1::Reset()
{
  int i;
  s *= 0.0;  s[1] = 1 ;
  V *= 0.0;
  a *= 0.0;
  for (i=1; i<=StatNo; i++)  Q[i] *= 0.0 ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
void SLBK1::reStart()
{
  s *= 0.0;  s[1] = 1 ;
  a *= 0.0;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float SLBK1::Qlearn1d(int thisS, int thisA, int nextS)
{
  CheckState(thisS);  CheckState(nextS);  CheckAction(thisA);
  // first find & remember max_over_a[Q(nextS,a)] :
  int n = Q[nextS].Size();     // no. of possible actions in state nextS
  float Q_a_max = Q[nextS].GetValue(1);
  float Q_aux;
  for (int i=2; i<=n; i++)
    if ( (Q_aux=Q[nextS].GetValue(i)) > Q_a_max)  Q_a_max = Q_aux;
  // Now calculate delta for 1-step Q learning:
  return -r[nextS]+ D*Q_a_max - Q[thisS].GetValue(thisA);
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float SLBK1::Qlearn1d_DAmod(int thisS, int thisA, int nextS)
{
  float delta = Qlearn1d(thisS, thisA, nextS);
  float bDA = biasDA;
  // now put some boundaries on the bias, to avoid bizarre effects:
  if (bDA < -1)  bDA = -1;
  return ( (delta > 0) ? (1+bDA)*delta : delta ) ;
}
//------------------------------------------------------------------------------
//A.c. CAR model of Andrew J Smith et al 05, also w. Actor-Critic
SLBK2::SLBK2() : SLBK1()
{
  int n=StatNo+1;
  int i;
  Vector aux_p( ActNo ) ;
  for (i=1; i<=n; i++)  p[i] = aux_p ;
  vectVarNo = 1+1+1+StatNo +StatNo ;   // 1 each for s, V, a and StatNo for each of Q and p
  if (ptrToVecVar) { delete[] ptrToVecVar ;  ptrToVecVar=NULL; }
  ptrToVecVar = new Vector* [vectVarNo+1];   // +1 as index goes 1->vectVarNo; 0 wasted
  ptrToVecVar[1]= &s ;  ptrToVecVar[2]= &V ;  ptrToVecVar[3]= &a;
  for (i=3+1; i<= 3+StatNo; i++)          ptrToVecVar[i] = &Q[i-3] ;
  for (i=3+StatNo+1; i<= vectVarNo; i++)  ptrToVecVar[i] = &p[i-(3+StatNo)] ;
  // Initially set the learning rate for modifying the policy parameters as the
  // opposite of the one for states etc., as here states take positive values
  // for aversive events:
  lrnRPol = -lrnR ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
SLBK2::~SLBK2()
{
  if (ptrToVecVar) { delete[] ptrToVecVar ;  ptrToVecVar = NULL; }
  // p and Q hopefully automatically deleted when they go out of scope !
  Delete(&a) ;
  dtor_aux() ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
void SLBK2::Reset2()
{
  int i;
  s *= 0.0;  s[1] = 1 ;
  V *= 0.0;
  a *= 0.0;
  for (i=1; i<=StatNo; i++)  Q[i] *= 0.0 ;
  for (i=1; i<=StatNo; i++)  p[i] *= 0.0 ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
double SLBK2::GibbsLen(int state)
{
  if (state<1 || state>StatNo)
    Error("%s%s \n",SLBK2::ERROR,"GibbsLen: input state off range." ) ;
  int n=p[state].Size();
  double Sum_j=0.0;
  for(int i=1; i<=n; i++)
    Sum_j += exp( p[state].GetValue(i) ) ;
  return Sum_j ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
double SLBK2::pGibbs(int state, int action)
{
  int n=p[state].Size();
  if (action<1 || action >n)
    Error("%s%s \n",SLBK2::ERROR,"pGibbs: input action off range." ) ;
  double Denom= GibbsLen(state) ;
  return exp(p[state].GetValue(action))/Denom ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int SLBK2::selGAct(int state)
{
  double rn = rand01() ;           // in interval 0 to 1
  int i=1;
  int N= p[state].Size();
  double denom = GibbsLen(state) ;
  double sum= exp(p[state].GetValue(1))/denom ;
  while ((sum < rn) && (i < N)) {  // roulette-wheel selection over the Gibbs probabilities
    i++ ;
    sum += exp(p[state].GetValue(i))/denom ;
  }
  return i ;
}
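/* Illustrative sketch, not used by the simulation classes (kept inside #if 0 so
   it does not affect the build) : the Gibbs / softmax selection scheme of
   selGAct - turn preferences into probabilities p_i = exp(m_i)/sum_j exp(m_j),
   then walk the cumulative sum until it passes a uniform random number.  The
   helper name and the plain-array interface are made up for the sketch.      */
#if 0
#include <cmath>
#include <cstdlib>
static int softmaxSampleSketch(const double *pref, int n)
{
  double denom = 0.0;
  for (int i = 0; i < n; i++)  denom += std::exp(pref[i]);
  double rn  = std::rand() / (RAND_MAX + 1.0);   // uniform in [0,1)
  double sum = 0.0;
  for (int i = 0; i < n; i++) {
    sum += std::exp(pref[i]) / denom;            // cumulative probability
    if (sum > rn)  return i;                     // first index past rn is selected
  }
  return n - 1;                                  // guard against rounding
}
#endif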
//------------------------------------------------------------------------------
//B.a. basic CAR model
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
inline void CAR0::CheckState( int s)
{
  if (s<1 || s>StatNo)
    Error("%s%s \n", CAR0::ERROR,"State out of range." );
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
inline void CAR0::CheckAction( int a)
{
  if (a<1 || a>ActNo)
    Error("%s%s \n", CAR0::ERROR,"Action out of range." );
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int CAR0::latency()
{
  int i;
  float auxP = 1.0 ;
  for (i=1; i<= sSize(); i++)  auxP *= s[i] ;
  if (auxP < SMALLFLTSIG)
    Error("%s%s \n", CAR0::ERROR,"latency() : Unfinished trial !");
  i=1;
  while( (s[i+1] == s[i]+1) && (i < sSize()-1) )   // advance while still on the shock side
    i++ ;
  return i ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int CAR0::nextS(int t_now, int action)
{
  // Goes :  timestep:           0    1    2    3    4     5
  //         index of vector s:  1    2    3    4    5     6
  //         poss. states:       1   2,7  3,8  4,9  5,10  6,11
  if (t_now<t_start || t_now>t_end-1)   //NB this covers terminal states 6 & 11
    Error("%s%s \n", CAR0::ERROR,"init. timestep out of range in next_S(...)" ) ;
  if (action<1 || action>ActNo)
    Error("%s%s \n", CAR0::ERROR,"input action out of range in next_S(...)" ) ;

  if (action == shuttle) {
    if ( roundfl(s[t_now+1]) < shockS)   // if on the shock side
      return roundfl(s[t_now+1])+6;      // next state is on the Safe side.
    else                                 // but if already on the safe side ...
      return roundfl(s[t_now+1])-4;
  }
  else if (action == stay)
    return roundfl(s[t_now+1])+1 ;
  return -1;                             // never reached.
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// This is the version that doesn't allow a return once the animal has shuttled
// from the unsafe to the safe side.
int CAR0::nextSNoRet(int t_now, int action)
{
  // Goes :  timestep:           0    1    2    3    4     5
  //         index of vector s:  1    2    3    4    5     6
  //         poss. states:       1   2,7  3,8  4,9  5,10  6,11
  if (t_now<t_start || t_now>t_end-1)   //NB this covers terminal states 6 & 11
    Error("%s%s \n", CAR0::ERROR,"init. timestep out of range in next_S(...)" ) ;
  if (action<1 || action>ActNo)
    Error("%s%s \n", CAR0::ERROR,"input action out of range in next_S(...)" ) ;

  if (action == shuttle) {
    if ( roundfl(s[t_now+1]) < shockS)   // if on the shock side
      return roundfl(s[t_now+1])+6;      // next state is on the Safe side.
    else
      return roundfl(s[t_now+1])+1 ;     // attempting to shuttle from the safe side
  }                                      // is disallowed !
  else if (action == stay)
    return roundfl(s[t_now+1])+1 ;
  return -1;                             // never reached.
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR0::delta2(int this_S, int next_S)
{
  float delta = delta0(this_S, next_S);
  float bDA = appetMod;
  float avM = averMod;
  // now put some boundaries on the biases, to avoid changing 'sides' from pain
  // to pleasure :
  if (bDA < -1)  bDA = -1;
  if (avM < -1)  avM = -1;
  return ( (delta < 0) ? (1+bDA)*delta : (1+avM)*delta ) ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR0::delta3(int this_S, int this_A, int next_S)
{
  CheckState(this_S);  CheckState(next_S);  CheckAction(this_A);
  float delta = delta0(this_S, next_S);
  if (this_A == shuttle)  delta += shuttleRet;
  else                    delta += stayRet ;
  float apM = appetMod;
  float avM = averMod;
  // now put some boundaries on the biases, to avoid changing 'sides' from pain
  // to pleasure :
  if (apM < -1)  apM = -1;
  if (avM < -1)  avM = -1;
  return ( (delta < 0) ? (1+apM)*delta : (1+avM)*delta ) ;
}
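/* Illustrative sketch, not used by the simulation classes (kept inside #if 0 so
   it does not affect the build) : the two-sided state layout assumed by nextS /
   nextSNoRet above - states 1..5 progress towards shock (state 6) on one side,
   while shuttling maps state i to i+6 on the safe side (states 7..11).  The
   function and constant names are made up for the sketch.                    */
#if 0
static int carNextStateSketch(int curState, int action, bool allowReturn)
{
  const int kShuttle = 2, kShockState = 6;        // action 1 = stay, 2 = shuttle
  if (action == kShuttle) {
    if (curState < kShockState)  return curState + 6;   // shock side -> safe side
    if (allowReturn)             return curState - 4;   // safe side -> back to the shock side
    return curState + 1;                                // "NoRet" : shuttling back is disallowed
  }
  return curState + 1;                                  // staying just advances in time
}
#endif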
//------------------------------------------------------------------------------
//B.b. CAR model with Q values
CAR1::CAR1() : CAR0(), a()
{
  int n=StatNo+1;
  int i;
  eps  = 0.1 ;                       // init / default epsilon.
  Temp = 0.2 ;                       // default denominator of the Gibbs-type exponents
  Vector auxQ( ActNo ) ;
  for (i=1; i<=n; i++)  Q[i] = auxQ ;
  Vector auxVect3(t_end-t_start+1);  // as per Vector s above.
  a = auxVect3 ;
  vectVarNo = 1+1+1+StatNo ;         // 1 each for s, V, a and StatNo for Q
  // if (ptrToVecVar)  delete ptrToVecVar ;
  ptrToVecVar = new Vector* [vectVarNo+1];   // +1 as index goes 1->vectVarNo; 0 wasted
  ptrToVecVar[1]= &s ;  ptrToVecVar[2]= &V ;  ptrToVecVar[3]= &a;
  for (i=4; i<= vectVarNo; i++)  ptrToVecVar[i] = &Q[i-3] ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
CAR1::~CAR1()
{
  if (ptrToVecVar) { delete[] ptrToVecVar ;  ptrToVecVar = NULL; }
  // delete[] Q ;   // hopefully this suffices to call the dtor repeatedly !
  Delete(&a) ;
  dtor_aux() ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR1::grQPol( int s, int a)
{
  int i;
  int nMax = 1;                // no. of actions sharing the maximum Q value.
  int n = Q[s].Size();         // no. of possible actions in this state
  if ((a<1 || a>n) || (s<1 || s>StatNo))
    Error("%s%s \n", CAR1::ERROR,"arg s or a out of range in fn. grQPol" ) ;

  float Qmax= Q[s].GetValue(1);
  int sMax=1;                  // initialise to state 1
  for (i=2; i<=n; i++) {
    if ( DAbs(Q[s].GetValue(i) - Qmax) < SMALLFLTSIG)    // float 'equality'
      nMax += 1 ;
    else if ( Q[s].GetValue(i) > Qmax ) {   // we have found a new max. Q,
      nMax = 1 ;                            // so need to reset nMax
      Qmax = Q[s].GetValue(i) ;
    }
  }
  if ( DAbs(Q[s].GetValue(a) - Qmax) < SMALLFLTSIG)   // if one of the max-Q actions ...
    return (1.0-eps+eps*nMax/n)/nMax;                 // S&B p. 122
  else
    return eps/n ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR1::sarsa_mod1(int this_S, int this_A, int next_S, int next_A)
{
  CheckState(this_S);  CheckState(next_S);  CheckAction(this_A);
  float deltaQ;
  // Build up deltaQ depending on whether next_S is terminal.  r[...] has -ve sign
  // as aversive r is coded as positive, but reduces Qs.
  deltaQ = -r[next_S]-Q[this_S].GetValue(this_A) ;   // common to term. & non-term.
  if (sNotTerminal(next_S)) {
    CheckAction(next_A);        // if next_S were terminal, this would be no good.
    deltaQ += D*Q[next_S].GetValue(next_A) ;
  }
  // modulate according to 'DA blockade' etc :
  float apM = appetMod;  float avM = averMod;  float viM = vigourMod ;
  // now put some boundaries on the biases, to avoid changing 'sides' from pain
  // to pleasure :
  if (apM < -1)  apM = -1+SMALLFLTSIG ;   // clip to avoid zero division later.
  if (avM < -1)  avM = -1 ;               // Note high values not clipped.
  if (viM < 0 )  viM = 0.0 ;
  if (viM > 1 )  viM = 1.0 ;              // clip viM.
  // Add the direct cost of the action BEFORE multiplicative modulation for this
  // delta, unlike delta_p1, _p2, _m1, _m2 - here more conservative.
  if (this_A == shuttle)  deltaQ -= shuttleRet/(1+apM*viM) ;
  else                    deltaQ -= stayRet ;
  if (deltaQ > 0)  deltaQ *= 1+apM;
  else             deltaQ *= 1+avM;
  return deltaQ ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
void CAR1::Reset()
{
  int i;
  s *= 0.0;  s[1] = 1 ;
  V *= 0.0;
  a *= 0.0;
  for (i=1; i<=StatNo; i++)  Q[i] *= 0.0 ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
void CAR1::reStart()
{
  s *= 0.0;  s[1] = 1 ;
  a *= 0.0;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR1::Qlearn1d(int thisS, int thisA, int nextS)
{
  CheckState(thisS);  CheckState(nextS);  CheckAction(thisA);
  // first find & remember max_over_a[Q(nextS,a)] :
  int n = Q[nextS].Size();     // no. of possible actions in state nextS
  float Q_a_max = Q[nextS].GetValue(1);
  float Q_aux;
  for (int i=2; i<=n; i++)
    if ( (Q_aux=Q[nextS].GetValue(i)) > Q_a_max)  Q_a_max = Q_aux;
  // Now calculate delta for 1-step Q learning:
  return -r[nextS]+ D*Q_a_max - Q[thisS].GetValue(thisA);
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR1::Qlearn1d_mod(int thisS, int thisA, int nextS)
{
  float delta = Qlearn1d(thisS, thisA, nextS);
  float bDA = appetMod;
  float avM = averMod;
  // now put some boundaries on the biases, to avoid changing 'sides' from pain
  // to pleasure :
  if (avM < -1)  avM = -1;
  if (bDA < -1)  bDA = -1;
  return ( (delta > 0) ? (1+bDA)*delta : (1+avM)*delta ) ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
double CAR1::gibbsQPol(int s, int a)
{
  CheckState(s);  CheckAction(a);
  gibbs.SetT(Temp);
  gibbs.SetEPtr( &Q[s] );
  return gibbs.Py0(a);
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/*
int CAR1::selActQGibbs(int state)
{
}
*/
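/* Illustrative sketch, not used by the simulation classes (kept inside #if 0 so
   it does not affect the build) : the sign convention behind Qlearn1d and
   sarsa_mod1 above.  Aversive outcomes are coded as positive r, so they enter
   the one-step error with a minus sign:
      delta = -r(s') + D * max_a Q(s',a) - Q(s,a)     (Q-learning)
      delta = -r(s') + D * Q(s',a')      - Q(s,a)     (SARSA, non-terminal s')
   The toy numbers and the helper name below are made up for the example.     */
#if 0
static float qLearnDeltaSketch()
{
  const float D = 0.93f;                 // discount rate, as in the SLBK0 ctor
  float r_next     = 1.0f;               // shock received (aversive, coded positive)
  float Q_sa       = -0.5f;              // current estimate for (s,a)
  float Q_next_max = -0.2f;              // best (least aversive) Q in the next state
  return -r_next + D*Q_next_max - Q_sa;  // = -1 - 0.186 + 0.5 = -0.686 : worse than expected
}
#endif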
//------------------------------------------------------------------------------
//B.c. CAR model w. Actor-Critic
CAR2::CAR2() : CAR1()
{
  int n=StatNo+1;
  int i;
  Vector aux_p( ActNo ) ;
  for (i=1; i<=n; i++)  m[i] = aux_p ;
  vectVarNo = 1+1+1+StatNo +StatNo ;   // 1 each for s, V, a and StatNo for each of Q and m
  if (ptrToVecVar) { delete[] ptrToVecVar ;  ptrToVecVar=NULL; }
  ptrToVecVar = new Vector* [vectVarNo+1];   // +1 as index goes 1->vectVarNo; 0 wasted
  ptrToVecVar[1]= &s ;  ptrToVecVar[2]= &V ;  ptrToVecVar[3]= &a;
  for (i=3+1; i<= 3+StatNo; i++)          ptrToVecVar[i] = &Q[i-3] ;
  for (i=3+StatNo+1; i<= vectVarNo; i++)  ptrToVecVar[i] = &m[i-(3+StatNo)] ;
  // Initially set the learning rate for modifying the policy parameters as a
  // smallish fraction of the one for states etc.  NOTE: THE EFFECT / CORRECT SIGN
  // DEPENDS ON THE SPECIFIC IMPLEMENTATION OF THE VARIABLE INCREMENT !!!!!
  lrnRPol = 0.15 * lrnR ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
CAR2::~CAR2()
{
  if (ptrToVecVar) { delete[] ptrToVecVar ;  ptrToVecVar = NULL; }
  // m and Q hopefully automatically deleted when they go out of scope !
  Delete(&a) ;
  dtor_aux() ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
void CAR2::Reset2()
{
  int i;
  s *= 0.0;  s[1] = 1 ;
  V *= 0.0;
  a *= 0.0;
  for (i=1; i<=StatNo; i++)  Q[i] *= 0.0 ;
  for (i=1; i<=StatNo; i++)  m[i] *= 0.0 ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
double CAR2::GibbsLen(int state)
{
  if (state<1 || state>StatNo)
    Error("%s%s \n",CAR2::ERROR,"GibbsLen: input state off range." ) ;
  int n=m[state].Size();
  double Sum_j=0.0;
  for(int i=1; i<=n; i++)
    Sum_j += exp( m[state].GetValue(i) / Temp ) ;
  return Sum_j ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
double CAR2::pGibbs(int state, int action)
{
  int n=m[state].Size();
  if (action<1 || action >n)
    Error("%s%s \n",CAR2::ERROR,"pGibbs: input action off range." ) ;
  double Denom= GibbsLen(state) ;
  return exp( m[state].GetValue(action)/Temp )/Denom ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int CAR2::selGAct(int state)
{
  double rn = rand01() ;           // in interval 0 to 1
  int i=1;
  int N= m[state].Size();
  double denom = GibbsLen(state) ;
  double sum= exp( m[state].GetValue(1)/Temp )/denom ;
  while ((sum < rn) && (i < N)) {  // roulette-wheel selection over the Gibbs probabilities
    i++ ;
    sum += exp( m[state].GetValue(i)/Temp )/denom ;
  }
  return i ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR2::delta_p1(int this_S, int this_A, int next_S)
{
  CheckState(this_S);  CheckState(next_S);  CheckAction(this_A);
  // Basic delta has opposite signs as +ve V's and r's are aversive here.
  float delta = -r[next_S] - D*V[next_S] + V[this_S] ;
  // modulate according to 'DA blockade' etc :
  float apM = appetMod;  float avM = averMod;
  // now put some boundaries on the biases, to avoid changing 'sides' from pain
  // to pleasure :
  if (apM < -1)  apM = -1;
  if (avM < -1)  avM = -1;
  if (delta > 0)  delta *= 1+apM;
  else            delta *= 1+avM;
  // strong form : add the direct cost of the action AFTER modulation !  May be
  // overdone ?
  if (this_A == shuttle)  delta -= shuttleRet;
  else                    delta -= stayRet ;
  return delta - m[this_S].GetValue(this_A) ;
}
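/* Illustrative sketch, not used by the simulation classes (kept inside #if 0 so
   it does not affect the build) : the temperature parameter Temp in GibbsLen /
   pGibbs above.  P(a|s) = exp(m(s,a)/Temp) / sum_b exp(m(s,b)/Temp), so a small
   Temp sharpens the policy towards the largest preference while a large Temp
   pushes it towards uniform.  The plain-array helper is made up for the example. */
#if 0
#include <cmath>
static double gibbsProbSketch(const double *m, int n, int a, double Temp)
{
  double denom = 0.0;
  for (int i = 0; i < n; i++)  denom += std::exp(m[i] / Temp);
  return std::exp(m[a] / Temp) / denom;
}
// e.g. with m = {1.0, 0.0}: Temp = 0.2 gives P(first) ~ 0.993,
//                           Temp = 2.0 gives P(first) ~ 0.622.
#endif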
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR2::delta_p2(int this_S, int this_A, int next_S)
{
  CheckState(this_S);  CheckState(next_S);  CheckAction(this_A);
  // Basic delta has opposite signs as +ve V's and r's are aversive here.
  float delta = -r[next_S] - D*V[next_S] + V[this_S] ;
  // modulate according to 'DA blockade' etc :
  float apM = appetMod;  float avM = averMod;  float viM = vigourMod ;
  // now put some boundaries on the biases, to avoid changing 'sides' from pain
  // to pleasure :
  if (apM < -1)              // clip to avoid zero division later.  Note high
    apM = -1+SMALLFLTSIG ;   // values not clipped.
  if (avM < -1)  avM = -1 ;
  if (viM < 0 )  viM = 0.0 ;   // clip viM.
  if (viM > 1 )  viM = 1.0 ;
  if (delta > 0)  delta *= 1+apM;
  else            delta *= 1+avM;
  // strong form : add the direct cost of the action AFTER modulation !  May be
  // overdone ?
  if (this_A == shuttle)  delta -= shuttleRet/(1+apM*viM) ;
  else                    delta -= stayRet ;
  return delta - m[this_S].GetValue(this_A) ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR2::delta_p3(int this_S, int this_A, int next_S)
{
  CheckState(this_S);  CheckState(next_S);  CheckAction(this_A);
  // Basic delta has opposite signs as +ve V's and r's are aversive here.
  float delta = -r[next_S] - D*V[next_S] + V[this_S] ;
  // modulate according to 'DA blockade' etc :
  float apM = appetMod ;  float viM = vigourMod ;
  // now put some boundaries on the biases, to avoid changing 'sides' from pain
  // to pleasure :
  if (apM < -1)              // clip to avoid zero division later.  Note high
    apM = -1+SMALLFLTSIG ;   // values not clipped.
  if (viM < 0 )  viM = 0.0 ;   // clip viM.
  if (viM > 1 )  viM = 1.0 ;
  delta *= 1+apM;   // all prediction errors are used and are modulated
                    // according to the 'appetitive' NT, putatively DA.
  // strong form : add the direct cost of the action AFTER modulation !  May be
  // overdone ?
  if (this_A == shuttle)  delta -= shuttleRet/(1+apM*viM) ;
  else                    delta -= stayRet ;
  return delta - m[this_S].GetValue(this_A) ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR2::delta_p4(int this_S, int this_A, int next_S)
// _p4 is again advantage-like, but the only advantage learning that
// takes place happens for better-than-expected outcomes and is mediated by
// the 'reward' (most likely Dopamine) NT :
{
  CheckState(this_S);  CheckState(next_S);  CheckAction(this_A);
  // Basic delta has opposite signs as +ve V's and r's are aversive here.
  float delta = -r[next_S] - D*V[next_S] + V[this_S] ;
  // modulate according to 'DA blockade' etc :
  float apM = appetMod ;  float viM = vigourMod ;
  // now put some boundaries on the biases, to avoid changing 'sides' from pain
  // to pleasure :
  if (apM < -1)              // clip to avoid zero division later.  Note high
    apM = -1+SMALLFLTSIG ;   // values not clipped.
  if (viM < 0 )  viM = 0.0 ;   // clip viM.
  if (viM > 1 )  viM = 1.0 ;
  // Add the direct cost of the action before the main modulation - more conservative
  if (this_A == shuttle)  delta -= shuttleRet/(1+apM*viM) ;
  else                    delta -= stayRet ;
  if (delta > 0)
    delta *= 1+apM;   // positive ('better-than' here !) prediction errors are
                      // modulated acc. to the 'appetitive' NT, putatively DA.
  else
    delta = 0.0 ;
  return delta - m[this_S].GetValue(this_A) ;
}
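/* Illustrative sketch, not used by the simulation classes (kept inside #if 0 so
   it does not affect the build) : the two orderings of action cost and
   appetitive/aversive modulation used above are not equivalent.  delta_p2
   scales delta first and subtracts the cost afterwards ('strong form'), while
   delta_p4 and sarsa_mod1 subtract the cost first ('more conservative').  The
   toy numbers below are made up for the comparison.                          */
#if 0
static void costOrderingSketch()
{
  float delta = 0.4f, cost = 0.1f, apM = -0.5f;   // e.g. partial 'DA blockade'
  float afterMod  = (1+apM)*delta - cost;         // cost after modulation  : 0.2 - 0.1 = 0.1
  float beforeMod = (1+apM)*(delta - cost);       // cost before modulation : 0.5*0.3  = 0.15
  (void)afterMod; (void)beforeMod;                // the blockade damps the cost too in the
}                                                 // 'before' ordering, but not in the 'after' one.
#endif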
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR2::delta_m2(int this_S, int this_A, int next_S)
// much like delta_p2, but for actor-critic rather than advantage learning.
{
  CheckState(this_S);  CheckState(next_S);  CheckAction(this_A);
  // Basic delta has opposite signs to those used for V learning, as +ve V's
  // and r's are aversive here, while greater policy values map to 'more
  // likely that the action is taken'.
  float delta = -r[next_S] - D*V[next_S] + V[this_S] ;
  // modulate according to 'DA blockade' etc :
  float apM = appetMod;  float avM = averMod;  float viM = vigourMod ;
  // now put some boundaries on the biases, to avoid changing 'sides' from pain
  // to pleasure :
  if (apM < -1)              // clip to avoid zero division later.  Note high
    apM = -1+SMALLFLTSIG ;   // values not clipped.
  if (avM < -1)  avM = -1 ;
  if (viM < 0 )  viM = 0.0 ;   // clip viM.  The sign of the vigour-modulation ...
  if (viM > 1 )  viM = 1.0 ;   // ... term depends on apM only (below).
  // in delta_m2, modulation as in delta_p2 :
  if (delta > 0)  delta *= 1+apM;
  else            delta *= 1+avM;
  // strong form : add the direct cost of the action AFTER modulation !  May be
  // overdone ?
  if (this_A == shuttle)  delta -= shuttleRet/(1+apM*viM) ;
  else                    delta -= stayRet ;
  return delta ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR2::delta_m3(int this_S, int this_A, int next_S)
{
  CheckState(this_S);  CheckState(next_S);  CheckAction(this_A);
  // Basic delta has opposite signs to those used for V learning, as +ve V's
  // and r's are aversive here, while greater policy values map to 'more
  // likely that the action is taken'.
  float delta = -r[next_S] - D*V[next_S] + V[this_S] ;
  // modulate according to 'DA blockade' etc :
  float apM = appetMod;  float viM = vigourMod ;
  // now put some boundaries on the biases, to avoid changing 'sides' from pain
  // to pleasure :
  if (apM < -1)              // clip to avoid zero division later.  Note high
    apM = -1+SMALLFLTSIG ;   // values not clipped.
  if (viM < 0 )  viM = 0.0 ;   // clip viM.  The sign of the vigour-modulation ...
  if (viM > 1 )  viM = 1.0 ;   // ... term depends on apM only (below).
  // in delta_m3, all values of delta are treated as represented by the
  // NT that is involved in appetitive signalling in V-learning (while
  // delta_m3 is about policy learning) :
  delta *= 1+apM;
  // strong form : add the direct cost of the action AFTER modulation !  May be
  // overdone ?
  if (this_A == shuttle)  delta -= shuttleRet/(1+apM*viM) ;
  else                    delta -= stayRet ;
  return delta ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
float CAR2::PAvoidG()
{
  float PSafeThis, PSafePrev;
  PSafePrev = pGibbs(1,shuttle);      // this is the Py of being in state 7
  for(int i=7; i<=10; i++) {
    PSafeThis = PSafePrev*pGibbs(i,stay) + (1-PSafePrev)*pGibbs(i-5,shuttle);
    PSafePrev = PSafeThis ;
  }
  return PSafeThis ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Version with shuttling back not allowed.
float CAR2::PAvoidGNoRet()
// Here P(safe_state,stay) is always 1
{
  float PSafeThis, PSafePrev;
  PSafePrev = pGibbs(1,shuttle);      // this is the Py of being in state 7
  for(int i=7; i<=10; i++) {
    PSafeThis = PSafePrev + (1-PSafePrev)*pGibbs(i-5,shuttle);
    PSafePrev = PSafeThis ;
  }
  return PSafeThis ;
}
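/* Illustrative sketch, not used by the simulation classes (kept inside #if 0 so
   it does not affect the build) : the forward recursion used by PAvoidG above.
   Writing P_k for the probability of being on the safe side after step k,
       P_1 = P(shuttle | state 1)
       P_k = P_{k-1} * P(stay | safe state) + (1 - P_{k-1}) * P(shuttle | shock-side state),
   and the returned value is P_5, the probability of avoiding the shock.  The
   simplified version below (uniform per-step probabilities) is made up for the
   example.                                                                    */
#if 0
static float pAvoidSketch(float pShuttleShockSide, float pStaySafeSide)
{
  float pSafe = pShuttleShockSide;                 // after the first decision
  for (int k = 2; k <= 5; k++)
    pSafe = pSafe*pStaySafeSide + (1.0f - pSafe)*pShuttleShockSide;
  return pSafe;                                    // probability of avoiding the shock
}
// e.g. pAvoidSketch(0.3f, 1.0f) ~ 0.83 : even a modest per-step shuttle
// probability accumulates over the five opportunities when returns are barred.
#endif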
//------------------------------------------------------------------------------
//B.d. CAR model with 'degenerate' actions
CAR3::CAR3() : CAR2(), dgn()
{
  int i;
  // just for sizing and setting to zero the vector dgn :
  Vector auxDgn(ActNo);
  dgn = auxDgn ;                 // init to zero; non-zero degeneracies below:
  dgn[shuttle] = 1;
  dgn[stay] = 6;                 // Don't know how much this should be ...
  eligThresh = 0.01;             // just above .7^13 or .9^44 : traces decayed that far get cut.
  lambda = 0.0 ;
  Vector auxElV(StatNo);
  elV = auxElV;
  Vector auxElSA( ActNo );
  for (i=1; i<=StatNo+1; i++)  elSA[i] = auxElSA ;
  vectVarNo = 1+1+1+StatNo+StatNo+1+StatNo ;   // 1 each for s, V, a, elV ...
                                               // and StatNo for each of Q, m, elSA
  if (ptrToVecVar) { delete[] ptrToVecVar ;  ptrToVecVar=NULL; }
  ptrToVecVar = new Vector* [vectVarNo+1];     // +1 as index goes 1->vectVarNo; 0 wasted
  ptrToVecVar[1]= &s ;  ptrToVecVar[2]= &V ;  ptrToVecVar[3]= &a;
  for (i=3+1; i<= 3+StatNo; i++)              ptrToVecVar[i] = &Q[i-3] ;
  for (i=3+StatNo+1; i<= 3+2*StatNo; i++)     ptrToVecVar[i] = &m[i-(3+StatNo)] ;
  ptrToVecVar[3+2*StatNo+1] = &elV ;
  for (i=4+2*StatNo+1; i<= 4+3*StatNo; i++)   ptrToVecVar[i] = &elSA[i-(4+2*StatNo)] ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
CAR3::~CAR3()
{
  if (ptrToVecVar) { delete[] ptrToVecVar ;  ptrToVecVar = NULL; }
  // m, Q and elSA hopefully automatically deleted when they go out of scope !
  Delete(&a) ;
  Delete(&dgn) ;
  Delete(&elV) ;
  dtor_aux() ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
double CAR3::GibbsLenDgn(int state)
{
  if (state<1 || state>StatNo)
    Error("%s%s \n",CAR3::ERROR,"GibbsLenDgn: input state off range." ) ;
  int n=m[state].Size();
  double Sum_j=0.0;
  for(int i=1; i<=n; i++)
    Sum_j += dgn.GetValue(i) * exp( m[state].GetValue(i) / Temp ) ;
  return Sum_j ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
void CAR3::Reset3()
{
  int i;
  s *= 0.0;  s[1] = 1 ;
  V *= 0.0;
  a *= 0.0;
  for (i=1; i<=StatNo; i++)  Q[i] *= 0.0 ;
  for (i=1; i<=StatNo; i++)  m[i] *= 0.0 ;
  elV *= 0.0 ;
  for (i=1; i<=StatNo; i++)  elSA[i] *= 0.0 ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
void CAR3::reStart3()
{
  int i;
  s *= 0.0;  s[1] = 1 ;
  a *= 0.0;
  elV *= 0.0;
  for (i=1; i<=StatNo; i++)  elSA[i] *= 0.0 ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
int CAR3::selGActNoRetDgn(int state)
{
  CheckState(state);
  if (state < shockS) {
    double rn = rand01() ;           // in interval 0 to 1
    int act=1;                       // consider the first action, whatever it is.
    int N= m[state].Size();          // as many actions as there are policy params
    double denom = GibbsLenDgn(state) ;
    double sum= dgn.GetValue(1)* exp( m[state].GetValue(1)/Temp )/denom ;
    while ((sum < rn) && (act < N)) {   // roulette-wheel selection over the
      act++ ;                           // degeneracy-weighted Gibbs probabilities
      sum += dgn.GetValue(act)* exp( m[state].GetValue(act)/Temp )/denom ;
    }
    return act ;
  }
  else             // on the safe side shuttling back is not allowed, ...
    return stay ;  // ... so the only action on offer is to stay.
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
double CAR3::pGibbsDgn(int state, int action)
{
  int n=m[state].Size();
  if (action<1 || action >n)
    Error("%s%s \n",CAR3::ERROR,"pGibbsDgn: input action off range." ) ;
  double Denom= GibbsLenDgn(state) ;
  return (dgn.GetValue(action)*exp(m[state].GetValue(action)/Temp))/Denom ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Version with shuttling back not allowed.
float CAR3::PAvoidGNoRetDgn()
// Here P(safe_state,stay) is always 1
{
  float PSafeThis, PSafePrev;
  PSafePrev = pGibbsDgn(1,shuttle);   // this is the Py of being in state 7
  for(int i=7; i<=10; i++) {
    PSafeThis = PSafePrev + (1-PSafePrev)*pGibbsDgn(i-5,shuttle);
    PSafePrev = PSafeThis ;
  }
  return PSafeThis ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
double CAR3::decEligSA(int state, int action, int s_this, int a_this)
{
  CheckState(state);   CheckState(s_this);
  CheckAction(action); CheckAction(a_this);
  if ((state == s_this) && (action != a_this))
    return 0.0 ;
  else
    return lambda*D*elSA[state].GetValue(action) ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
double CAR3::decElActC(int state, int action, int s_this, int a_this)
{
  CheckState(state);   CheckState(s_this);
  CheckAction(action); CheckAction(a_this);
  if ((state == s_this) && (action != a_this))
    return -pGibbsDgn(state,action) ;
  else
    return lambda*D*elSA[state].GetValue(action) ;
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
//------------------------------------------------------------------------------
// eof