rl.h: basic classes for the Conditioned Avoidance generative model



// rl.h - Classes for models of reinforcement learning systems

#ifndef RL_H
#define RL_H

#include
#include
#include
#include
#include <iostream>     // input & output streams
#include <iomanip>      // i-o parametrized manipulators
#include <sstream>      // in-memory string manipulations for i/o, new standard
#include
#include
#include

using namespace std;

#include "maths.h"
#include "nr_ut.h"
#include "conx2.h"      // full connections class
#include "in_out.h"

// A.a. very basic CAR model of Andrew J Smith et al 05
//   b. very basic CAR model with Q learning
//   c. very basic CAR model with Q learning and Actor-Critic      SLBK2
// B.a. CAR model with timed safety states                         CAR0
//   b. very basic CAR model with Q learning                       CAR1
//   c. very basic CAR model with Q learning and Actor-Critic      CAR2
//   d. less basic CAR model with multiple 'stay' actions.         CAR3

// ----------------------------------------------------------------------------
// A.a. very basic CAR model of Andrew J Smith et al 05

class SLBK0 {

 friend class RetVectMap ;        // we want the return map machinery to have
                                  // full access to the stuff here, in future !
 protected:
  static const char *const ERROR;
  static const int StatNo = 7;    // States available = 1 to 7
  int vectVarNo ;
  int ActNo ;                     // 2 actions:
  int stay, shuttle ;             // labels for actions -> stay=1 and shuttle=2
  int safetyS, shockS ;           // labels for states  -> safetyS=7 and shockS=6
  int t_start, t_end ;            // t_end-t_start is StatNo-2 in SLBK, as we go forward
                                  // EITHER to 7 (safety) or 6 (shock)
  LinSig V2P;                     // linear sigmoid to map V to probability of action

  // used in destructors
  void dtor_aux(){ Delete(&V); Delete(&s); Delete(&r); };

  inline void CheckState( int s) ;    // just check that state is in range.
  inline void CheckAction( int a);    // just check that action is in range.

 public:
  // Parameters - don't change during trial :
  float biasDA ;                  // Additive DA bias
  float assoc ;                   // def= 1.0  multiplicative 'associativity'
  float lrnR ;                    // def= 0.5
  float D ;                       // def= 0.93 Discount rate
  float averAtt ;                 // def= 0    atten. of painful events param.
  Vector r ;                      // return of getting into corresp. state

  // variables - change during trial :
  Vector V ;                      // to keep current Values for states 1 to 7
  Vector s ;                      // to keep record of states visited within trial

  // methods
  SLBK0() ;                       // - - - -  ctor  - - - - - - - -
  ~SLBK0() ;                      // - - - -  dtor  - - - - - - - -

  int vvNum(){ return vectVarNo ;} ;     // - - -  looking things up  - - -
  int tStart(){ return t_start; } ;
  int tEnd(){ return t_end; } ;
  int StateNum(){ return StatNo; } ;
  int VSize(){ return V.Size(); };       // = StatNo   = 7 usually !
  int sSize(){ return s.Size(); };       // = StatNo-1 = 6 usually !
  int Stay(){ return stay; };
  int Shuttle(){ return shuttle; };
  float LRate(){ return lrnR ; };

  void zero_s_record(){ s *= 0.0 ; } ;   // - - -  (re) initialising  - - -
  void zero_Values()  { V *= 0.0 ; } ;
  void set_rnd_Val(float scale) {
    for (int i=1; i<=V.Size(); i++) V[i]=scale*(2.0*rand()/RAND_MAX - 1) ; } ;

  float VtoP(float Val){ return V2P.LinV(Val); };              // auxiliary
  int selAction(float P) { return bin_dec(P) ? shuttle : stay ; };

  /* - - - - - -  some substantial RL functions  - - - - - - - - - - - - */
  int nextS(int t_now, int action);       // allowed to jump out of safety
  int SmithNextS(int t_now, int action);  // not allowed to jump out of safety
  float nextR(int nextS) { return (nextS == shockS) ? 1.0 : 0.0 ; };

  // TD error Delta for state Values. Note the sign of the return is as in Smith et al.
  // For this version of delta the return only depends on the state where we landed,
  // not on how we got there, so the fn. doesn't need to be given nextR.
  float delta0(int thisS, int nextS){ return r[nextS] + D*V[nextS] - V[thisS]; };

  // delta1 below also includes the Smith et al. assumptions re. DA blockade
  // and Latent Inhibition manipulations :
  float delta1(int this_S, int next_S){
    return assoc*(delta0(this_S, next_S) + biasDA); };

  // Smith appears not to correct Vs if the next state is safety:
  float deltaSmith(int this_S, int next_S){
    return (next_S==safetyS) ? 0 : assoc*(delta0(this_S, next_S) + biasDA); };

  float totPEsc_Smith() ;   // total Py of escape during a trial, using current
                            // V values.

  // delta2 assumes that DA reports rewarding deltas, which in our formulation
  // so far are negative, so DA block attenuates -ve deltas only. It uses
  // averAtt to attenuate +ve deltas.
  float delta2(int this_S, int next_S);

} ;
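// ----------------------------------------------------------------------------
// Illustrative sketch only (not part of the original model classes): one way a
// single learning step of SLBK0 might be driven from outside the class,
// assuming, as in the Smith setup, that before shuttling the state index
// coincides with the time step. The choice of V as input to the linear
// sigmoid and the simple V[s] += lrnR * delta update are assumptions for
// illustration - the real iteration engine lives in the simulation code.
inline void example_SLBK0_step( SLBK0 &model, int s_now )
{
    float pShuttle = model.VtoP( model.V[s_now] );   // assumed: V drives P(shuttle)
    int   act      = model.selAction( pShuttle );    // stay or shuttle
    int   s_next   = model.nextS( s_now, act );      // Smith-style transition
    // critic update on the state just left, using the DA / associativity
    // modulated TD error :
    model.V[s_now] += model.LRate() * model.delta1( s_now, s_next );
}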
// ----------------------------------------------------------------------------
// A.b. very basic CAR model with Q learning

class SLBK1 : public SLBK0 {

 protected:
  static const char *const ERROR;
  float eps;                      // epsilon for eps-greedy etc. policies.
  Vector **ptrToVecVar ;          // Used to communicate with iteration engines etc.

 public:
  Vector Q[StatNo+1] ;   // each state has a vector of Q values, w. |A(s)| entries.
  Vector a ;             // record of actions for each state visited

  SLBK1();
  ~SLBK1();

  int aSize(){ return a.Size(); };          // = StatNo-1 = 6 usually !
  Vector **VecVarPtr(){ return ptrToVecVar ; };
  float QVal(int s, int a){ return Q[s].GetValue(a) ; } ;

  void Reset() ;     // Ready to start from scratch again
  void reStart() ;   // retain learning, reset s, a

  float grQPol( int s, int a);   // Py of sel. a under max-Q eps-greedy policy

  // SARSA: On-policy TD Control, S&B p. 145
  // SARSA Q-'error'; again the return only depends on the state where we landed,
  // not on how we got there, so the fn. doesn't need to be given nextR. Note -r,
  // as r above is +1 for the shock event.
  float sarsa0d(int thisS, int thisA, int nextS, int nextA){
    return -r[nextS]+D*Q[nextS].GetValue(nextA)-Q[thisS].GetValue(thisA); };

  // In the sarsa1d version of the error, the Smith et al. assumptions re.
  // 'associativity' and 'receptor blockade' are incorporated :
  float sarsa1d(int this_S, int this_A, int next_S, int next_A){
    return assoc*( sarsa0d(this_S, this_A, next_S, next_A) + biasDA); };

  // Q-Learning: Off-policy TD control, S&B p. 148,
  // using the same conventions as SARSA above. REM the Q-learning delta uses
  // not the next action actually taken, but the next action that maximises the next Q.
  // First, delta for 1-step Q learning, WITHOUT DA BLOCK, ASSOC/TY etc:
  float Qlearn1d(int thisS, int thisA, int nextS);
  // Now, with DA block/facil. modulation. If Qlearn1d is positive (which would
  // be reported by DA), multiply by (1-biasDA):
  float Qlearn1d_DAmod(int thisS, int thisA, int nextS);
};

// -----------------------------------------------------------------------------
// A.c. very basic CAR model with Q learning and Actor-Critic

class SLBK2 : public SLBK1 {

 protected:
  static const char *const ERROR;
  double GibbsLen(int state);

 public:
  Vector p[StatNo+1] ;   // each state has a vector of modifiable policy
                         // p values, also w. |A(s)| entries.
  float lrnRPol ;        // learning rate for altering policy params. Negative
                         // for aversive events signified by positive state V's

  SLBK2();
  ~SLBK2();

  float pVal(int s, int a){ return p[s].GetValue(a) ; } ;

  void Reset2() ;   // Ready to start from scratch again - resets s,V,a,Q,p
  // void reStart() is just like the base fn.: retain learning, reset s, a only

  // Probability calculated acc. to the Gibbs softmax, exp(p(s,a))/Sum_j(exp(p(s,j)))
  double pGibbs(int state, int action) ;

  // return an action, randomly chosen acc. to the params. of the actions in
  // a particular state and the Gibbs softmax Probs.:
  int selGAct(int state);

  // Corrections applied to policy params, much as for the SLBK0 deltas,
  // but incl. learning rates. Note the sign conventions for V's and lrnRPol:
  double corrP0(int sThis, int sNext) { return lrnRPol*delta1(sThis,sNext); };
  // change weighted by the policy Gibbs-Py of the action that got us from sThis to sNext:
  double corrPGibbs(int s_this, int s_next, int act_this) {
    return corrP0(s_this, s_next)*(1-pGibbs(s_this,act_this)); };

  // Version 2, for biasDA modulating reward and averAtt modulating pain learning.
  // Note the sign conventions for V's and lrnRPol:
  double corrP2(int sThis, int sNext) { return lrnRPol*delta2(sThis,sNext); };
  // change weighted by the policy Gibbs-Py of the action that got us from sThis to sNext:
  double corrPGibbs2(int s_this, int s_next, int act_this) {
    return corrP2(s_this, s_next)*(1-pGibbs(s_this,act_this)); };
};
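// -----------------------------------------------------------------------------
// Illustrative sketch only (not part of the original classes): the Gibbs /
// softmax probability that pGibbs() above is documented to compute, written
// out for a plain array of policy parameters. The function and argument names
// are made up for the example, and exp() is assumed to be available from the
// standard math header included above.
inline double example_gibbs_prob( const double *p, int nActions, int action )
{
    double denom = 0.0;
    for (int j = 0; j < nActions; j++)
        denom += exp( p[j] );           // Sum_j exp(p(s,j))
    return exp( p[action] ) / denom;    // exp(p(s,a)) / Sum_j exp(p(s,j))
}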
// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// B.a. CAR model with timed safety states :

class CAR0 {

 friend class RetVectMap ;        // we want the return map machinery to have
                                  // full access to the stuff here, in future !
 protected:
  static const char *const ERROR;
  static const int StatNo = 11;   // States available = 1: CS. 2-5: be on shock side.
                                  // 6: Shock. 7-11: on safe side. 6 and 11 end trial,
                                  // at objective time = 5.
  int vectVarNo ;
  int ActNo ;                     // 2 actions:
  int stay, shuttle ;             // labels for actions -> stay=1 and shuttle=2
  int shockS, safeEndS ;          // labels for states  -> shockS=6, safeEndS=StatNo
  int t_start, t_end ;            // t_end-t_start is 5 in this CAR, as we go forward
                                  // EITHER to 7 (safety) or 6 (shock)

  // used in destructors
  void dtor_aux(){ Delete(&V); Delete(&s); Delete(&r); };

  inline void CheckState( int s) ;    // just check that state is in range.
  inline void CheckAction( int a);    // just check that action is in range.

 public:
  // Parameters - don't change during trial :
  float appetMod ;                // Appetitive-signal modulation / DA bias
  float averMod ;                 // def= 0    atten. of painful events param.
  float vigourMod ;               // How much 'energy' the animal has to be active.
  float lrnR ;                    // def= 0.2
  float D ;                       // def= 0.93 Discount rate - 1.0 for testing.
  float shuttleRet, stayRet;      // Intrinsic returns of actions.
  Vector r ;                      // return of getting into corresp. state

  // variables - change during trial :
  Vector V ;                      // to keep current Values for states 1 to StatNo
  Vector s ;                      // to keep record of states visited within trial

  // methods
  CAR0() ;                        // - - - -  ctor  - - - - - - - -
  ~CAR0() ;                       // - - - -  dtor  - - - - - - - -

  int vvNum(){ return vectVarNo ;} ;     // - - -  looking things up  - - -
  int tStart(){ return t_start; } ;
  int tEnd(){ return t_end; } ;
  int StateNum(){ return StatNo; } ;
  int ActNum(){ return ActNo ; } ;
  float VVal(int s){ return V.GetValue(s); } ;
  int VSize(){ return V.Size(); };       // = StatNo = 11
  int sSize(){ return s.Size(); };       // = 6 usually !
  int Stay(){ return stay; };
  int Shuttle(){ return shuttle; };
  int ShockS(){ return shockS; };        // usu. state 6
  float LRate(){ return lrnR ; };
  int sNotTerminal(int s){ return ((s==shockS) || (s==safeEndS)) ? 0 : 1 ;} ;

  void zero_s_record(){ s *= 0.0 ; } ;   // - - -  (re) initialising  - - -
  void zero_Values()  { V *= 0.0 ; } ;
  void set_rnd_Val(float scale) {
    for (int i=1; i<=V.Size(); i++) V[i]=scale*(2.0*rand()/RAND_MAX - 1) ; } ;

  void setReturn(int state, float newRet) {      // - - -  set cost (or gain)  - - -
    CheckState(state); r[state] = newRet; };
  void setShockSAmpl( float shockAmplitude ) { setReturn(shockS,shockAmplitude); };

  int selAction(float P) { return bin_dec(P) ? shuttle : stay ; };

  /* - - - - - -  some substantial RL functions  - - - - - - - - - - - - */
  int latency();   // Return the (first) state where shuttling occurred, or the shock state.
                   // Only use this function when a whole trial has been completed and
                   // stored in s[].
  int nextS(int t_now, int action);        // without time or other uncertainty.
  int nextSNoRet(int t_now, int action);   // disallows shuttling back to the unsafe side.
  float nextR(int nextS) { return (nextS == shockS) ? 1.0 : 0.0 ; };

  // TD error Delta for state Values. Note the sign of the return is as in Smith et al.
  // For this version of delta the return only depends on the state where we landed,
  // not on how we got there, so the fn. doesn't need to be given nextR.
  float delta0(int thisS, int nextS){ return r[nextS] + D*V[nextS] - V[thisS]; };

  // delta2 assumes that DA reports rewarding deltas, which in our formulation
  // so far are negative, so DA block attenuates -ve deltas only. It uses
  // averMod to attenuate +ve deltas. The return still depends only on where we landed.
  float delta2(int this_S, int next_S);

  // In delta3 there are costs / benefits integral to shuttling; otherwise as
  // in delta2
  float delta3(int this_S, int this_A, int next_S);

} ;
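// ----------------------------------------------------------------------------
// Illustrative sketch only (not part of the original classes): a TD error of
// the delta3 kind described above, i.e. a one-step error in which the action
// taken carries its own intrinsic return (stayRet or shuttleRet). Exactly how
// the action return enters is an assumption for illustration; the real delta3
// is defined in the implementation file.
inline float example_delta_with_action_cost( float stateReturn,   // r[next_S]
                                              float actionReturn,  // stayRet or shuttleRet
                                              float V_next,        // V[next_S]
                                              float V_this,        // V[this_S]
                                              float discount )     // D
{
    // one-step TD error with the action's own return folded in
    return stateReturn + actionReturn + discount*V_next - V_this;
}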
// ----------------------------------------------------------------------------
// B.b. very basic CAR model with Q learning

class CAR1 : public CAR0 {

 protected:
  static const char *const ERROR;
  float eps;               // epsilon for eps-greedy etc. policies.
  Vector **ptrToVecVar ;   // Used to communicate with iteration engines etc.
  Gibbs gibbs;             // Aux. class for Gibbs / softmax computations.

 public:
  Vector Q[StatNo+1] ;   // each state has a vector of Q values, w. |A(s)| entries.
  Vector a ;             // record of actions for each state visited
  float Temp ;           // for Py etc. estimations ~ exp( E/T)

  CAR1();
  ~CAR1();

  int aSize(){ return a.Size(); };   // 6 usually !
  Vector **VecVarPtr(){ return ptrToVecVar ; };
  float QVal(int s, int a){ return Q[s].GetValue(a) ; } ;

  void Reset() ;     // Ready to start from scratch again
  void reStart() ;   // retain learning, reset s, a

  float grQPol( int s, int a);   // Py of sel. a under max-Q eps-greedy policy

  // SARSA: On-policy TD Control, S&B p. 145
  // SARSA Q-'error'; again the return only depends on the state where we landed,
  // not on how we got there, so the fn. doesn't need to be given nextR. Note -r,
  // as r above is +1 for the shock event.
  float sarsa0d(int thisS, int thisA, int nextS, int nextA){
    return -r[nextS]+D*Q[nextS].GetValue(nextA)-Q[thisS].GetValue(thisA); };

  // In sarsa_mod1 there is modulation of the pred. error, w. appetMod affecting
  // +ve deltas only, averMod only -ve, and vigour modulation.
  // Also deals w. terminal states (terminal S_next) by ignoring
  // the A_next and using Q(s_t+1,a_t+1) zero as per S&B p. 145 !
  float sarsa_mod1(int this_S, int this_A, int next_S, int next_A);

  // Q-Learning: Off-policy TD control, S&B p. 148,
  // using the same conventions as SARSA above. REM the Q-learning delta uses
  // not the next action actually taken, but the next action that maximises the next Q.
  // First, delta for 1-step Q learning, WITHOUT DA BLOCK, ASSOC/TY etc:
  float Qlearn1d(int thisS, int thisA, int nextS);
  // Now, with DA block/facil. modulation. If Qlearn1d is positive (which would
  // be reported by DA), multiply by (1+appetMod), else by (1+averMod):
  float Qlearn1d_mod(int thisS, int thisA, int nextS);

  // --------- Methods for Gibbs-based action choice -----------
  void SetT(double T){ gibbs.SetT(Temp=T); };
  double gibbsQPol( int s, int a);   // Py of sel. a under Gibbs-Q-softmax policy.
  // int selActQGibbs( int s) ;
};
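// -----------------------------------------------------------------------------
// Illustrative sketch only (not part of the original classes): the terminal-
// state handling that sarsa_mod1() above is documented to use - at a terminal
// next state the Q(s',a') term is taken as zero (S&B p. 145). The unmodulated
// form and the function name are assumptions; the modulated version proper is
// defined in the implementation file.
inline float example_sarsa_terminal_aware( CAR1 &model, int s_this, int a_this,
                                            int s_next, int a_next )
{
    float qNext = model.sNotTerminal(s_next) ? model.QVal(s_next, a_next) : 0.0f;
    // note -r, as r is +1 for the shock event (cf. sarsa0d above)
    return -model.r[s_next] + model.D*qNext - model.QVal(s_this, a_this);
}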
// -----------------------------------------------------------------------------
// B.c. very basic CAR model with Q learning and Actor-Critic

class CAR2 : public CAR1 {

 protected:
  static const char *const ERROR;
  double GibbsLen(int state);

 public:
  Vector m[StatNo+1] ;   // each state has a vector of modifiable policy
                         // m values, also w. |A(s)| entries. 'm' follows the D&A book convention.
  float lrnRPol ;        // learning rate for altering policy params. Negative
                         // for aversive events signified by positive state V's

  CAR2();
  ~CAR2();

  float mVal(int s, int a){ return m[s].GetValue(a) ; } ;
  float LrnRPol(){ return lrnRPol ; } ;

  void Reset2() ;   // Ready to start from scratch again - resets s,V,a,Q,m
  // void reStart() is just like the base fn.: retain learning, reset s, a only

  // Probability calculated acc. to the Gibbs softmax, exp(p(s,a))/Sum_j(exp(p(s,j)))
  double pGibbs(int state, int action) ;

  // return an action, randomly chosen acc. to the params. of the actions in a
  // particular state and the Gibbs softmax Probs. First, allowing returns to the unsafe side:
  int selGAct(int state);
  // and selGActNoRet doesn't allow shuttling back :
  int selGActNoRet(int state);

  // Corrections applied to policy params, much as for the CAR0 deltas,
  // but incl. learning rates. Note the sign conventions for V's and lrnRPol:
  double corrP0(int sThis, int sNext) { return lrnRPol*delta0(sThis,sNext); };
  // change weighted by the policy Gibbs-Py of the action that got us from sThis to sNext:
  double corrPGibbs(int s_this, int s_next, int act_this) {
    return corrP0(s_this, s_next)*(1-pGibbs(s_this,act_this)); };

  // In the following, appetMod modulates reward and averMod modulates pain learning.
  // Note the sign conventions for V's and lrnRPol. The cost of actions has been incorporated.
  double corrP3(int sThis, int actThis, int sNext) {
    return lrnRPol*delta3(sThis, actThis, sNext); } ;
  // change weighted by the policy Gibbs-Py of the action that got us from sThis to sNext:
  double corrPGibbs3(int s_this, int s_next, int act_this) {
    return corrP3(s_this, act_this, s_next)*(1-pGibbs(s_this,act_this)); };

  // The following deltas have the conventional sign - positive for 'better than
  // normal' when aversive Values are also positive. They are intended for policy
  // learning under the original Smith conventions for V etc.
  // _p1: Advantage-learning-like :
  float delta_p1(int this_S, int this_A, int next_S);
  // another version, where Appet. block also increases the effective cost of the motor
  // response, "don't have the energy to do this shuttling" :
  float delta_p2(int this_S, int this_A, int next_S);
  // _p3 is again advantage-like, but the 'reward' NT mediates all adv. learning :
  float delta_p3(int this_S, int this_A, int next_S);
  // _p4 is again advantage-like, but the only advantage learning that
  // takes place happens for better-than-expected outcomes and is mediated by
  // the 'reward' (most likely Dopamine) NT. Also a vigour modulation
  // takes place first.
  float delta_p4(int this_S, int this_A, int next_S);

  // deltas for actor-critic where policy learning is only mediated by the 'reward' NT:
  // delta_m2 is much like delta_p2, except for the last line (the 'advantage' difference),
  // the policy params being denoted by m, away from S&B where they are p's.
  float delta_m2(int this_S, int this_A, int next_S);
  // delta_m3 is modulated by appetMod for both -ve and +ve excursions. Called
  // _m3 rather than _p3 to adopt the Dayan & Abbott notation for policy parameters.
  float delta_m3(int this_S, int this_A, int next_S);

  // Probability to 'avoid' the shock (reach state 11), acc. to the
  // act-crit parameters and the Gibbs calculation - at each step, i.e. shuttling
  // forth *and back* is reflected in the act-crit params and Gibbs Py's.
  float PAvoidG() ;
  // Similar, but no returns allowed by construction:
  float PAvoidGNoRet() ;
};
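// -----------------------------------------------------------------------------
// Illustrative sketch only (not part of the original classes): how the actor
// (policy-parameter) correction corrPGibbs3() above might be applied after one
// step, alongside a critic update of V with the cost-aware error delta3. The
// use of Vector::operator[] to write into V and m[s] is an assumption about
// the Vector class; the real update loop lives in the simulation code.
inline void example_CAR2_actor_critic_step( CAR2 &ac, int s_this, int a_this, int s_next )
{
    // critic : state-value update with the cost-aware TD error
    ac.V[s_this] += ac.LRate() * ac.delta3( s_this, a_this, s_next );
    // actor : nudge the policy parameter of the action actually taken
    ac.m[s_this][a_this] += (float)ac.corrPGibbs3( s_this, s_next, a_this );
}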
// -----------------------------------------------------------------------------
// B.d. Slightly less basic CAR model with multiple 'stay' actions.
// Dgn stands for 'degeneracy', i.e. the no. of identical-property actions. It is
// understood that these actions are identical for the purpose of the
// particular experimental setup. They may be different for the learner
// (e.g. rearing, scratching, sniffing are all 'stay' actions - not only
// are they not 'flee' actions, but this is well known to the learner's CNS).
//
// Assumes that all states have the same actions - the vector dgn applies to all.

class CAR3 : public CAR2 {

 protected:
  static const char *const ERROR;
  double GibbsLenDgn(int state);
  Vector dgn;          // two-component vector, dgn[shuttle] usu.=1 and dgn[stay] usu.=6.
  float eligThresh ;   // for updating values w. elig. traces above this only (S&B p. 189)

 public:
  float lambda ;
  Vector elV;              // Each state has an eligibility for state-value learning
  Vector elSA[StatNo+1] ;  // each state has a vector of eligibility traces for each
                           // pair of (state,action) - similar structure to m(s,a), Q(s,a) ...

  CAR3();
  ~CAR3();

  float EligSA(int s, int a){ return elSA[s].GetValue(a); };
  float EligV(int s){ return elV.GetValue(s) ; };
  int nonTrivElig( float x) { return ((x-eligThresh)*(x+eligThresh)>0) ? 1:0 ;} ;
  int StayDgn(){ return roundfl(dgn[stay]) ;} ;
  int ShuttleDgn() {return roundfl(dgn[shuttle]) ;} ;

  void Reset3();     // Ready to start from scratch - resets s,V,a,Q,m,elV,elSA
  void reStart3();   // retain learning - only reset s,a,elV,elSA

  // Probability calculated acc. to the modified Gibbs softmax,
  // dgn[a]*exp(p(s,a)/Temp)/Sum_j(dgn[j]*exp(p(s,j)/Temp))
  double pGibbsDgn(int state, int action) ;

  // return an action, randomly chosen acc. to the params. of the actions in
  // a particular state; selGActNoRetDgn doesn't allow shuttling back :
  int selGActNoRetDgn(int state);

  // Probability to 'avoid' the shock (reach state 11), acc. to the
  // act-crit parameters and the MODIFIED Gibbs calculation - at each step,
  // i.e. shuttling forth (NOT back) is reflected in the act-crit params and
  // MODIFIED Gibbs Py's.
  float PAvoidGNoRetDgn() ;

  // Some more correction terms for Actor-Critic models :
  // corr_m_Gib3: as in corr_m_Gib4 below (coded earlier !) but 'reward' NT
  // (via appetMod) for +ve deltas, 'aversive' NT (via averMod) for -ve deltas.
  double corr_m_Gib3(int act, int a_this, int s_this, int s_next) {
    return lrnRPol*delta_m2(s_this,a_this,s_next)*(KronD(act,a_this)-pGibbsDgn(s_this,act)); };

  // corr_m_Gib4 uses the 'reward' NT only in policy learning, and the
  // Kronecker delta formulation to return the correction for action act in state s_this,
  // given what action was actually taken (a_this) and what happened (s_next etc.).
  // Also uses the 'degenerate' formulation (of 'stay', basically) :
  double corr_m_Gib4(int act, int a_this, int s_this, int s_next) {
    return lrnRPol*delta_m3(s_this,a_this,s_next)*(KronD(act,a_this)-pGibbsDgn(s_this,act)); };

  // TD(lambda) related methods :
  double dec1Elg(){ return lambda*D ; };   // factor needed to calc. the elig. of state
                                           // s after a step of decay, by lambda*gamma.
                                           // S&B p. 174, fig. 7.7
  // Next is the fn. for replace-trace eligibilities for SARSA (based on S&B p. 188, eq. 7.14).
  // NB THIS IS ONLY THE DECAY PART - AUGMENTING THE CURRENT elSA(s_this,a_this)
  // HAS TO BE DONE SEPARATELY.
  double decEligSA(int state, int action, int s_this, int a_this);
  // And the following is the equivalent, replacing-trace version for the actor learning in
  // actor-critic, similar to eq. 9.25 of A&D TN, as guessed by myself ! Uses
  // degeneracy-modified probabilities.
  double decElActC(int state, int action, int s_this, int a_this);
};
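// -----------------------------------------------------------------------------
// Illustrative sketch only (not part of the original classes): the kind of
// eligibility-trace bookkeeping the TD(lambda) methods above imply - decay
// every trace by lambda*D (cf. dec1Elg()), then set the trace of the state and
// (state,action) pair just visited to 1, i.e. replacing traces (S&B p. 188).
// Writing into elV / elSA via operator[] and actions running 1..ActNum() are
// assumptions; the real decay terms are decEligSA() / decElActC() in the .cc file.
inline void example_CAR3_refresh_traces( CAR3 &car, int s_this, int a_this )
{
    for (int st = 1; st <= car.StateNum(); st++) {
        car.elV[st] *= (float)car.dec1Elg();                    // state-value traces
        for (int ac = 1; ac <= car.ActNum(); ac++)
            car.elSA[st][ac] *= (float)car.dec1Elg();           // (state,action) traces
    }
    car.elV[s_this]          = 1.0f;   // replace the trace of the state just left
    car.elSA[s_this][a_this] = 1.0f;   // ... and of the action actually taken there
}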
// ----------------------------------------------------------------------------
#endif // RL_H