From 5b27e9d273c43fd59905a7f126ddf8edfab7fae7 Mon Sep 17 00:00:00 2001
From: Neil Kollack
Date: Sun, 7 Nov 2021 14:34:08 -0600
Subject: initial commit

---
 src/learningAgents.py | 258 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 258 insertions(+)
 create mode 100644 src/learningAgents.py

diff --git a/src/learningAgents.py b/src/learningAgents.py
new file mode 100644
index 0000000..ef6c51c
--- /dev/null
+++ b/src/learningAgents.py
@@ -0,0 +1,258 @@
+# learningAgents.py
+# -----------------
+# Licensing Information:  You are free to use or extend these projects for
+# educational purposes provided that (1) you do not distribute or publish
+# solutions, (2) you retain this notice, and (3) you provide clear
+# attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
+#
+# Attribution Information: The Pacman AI projects were developed at UC Berkeley.
+# The core projects and autograders were primarily created by John DeNero
+# (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
+# Student side autograding was added by Brad Miller, Nick Hay, and
+# Pieter Abbeel (pabbeel@cs.berkeley.edu).
+
+
+from game import Directions, Agent, Actions
+
+import random,util,time
+
+class ValueEstimationAgent(Agent):
+    """
+    Abstract agent which assigns values, i.e. Q-values, to
+    (state,action) pairs for an environment, as well as a value
+    to each state and a policy, given respectively by
+
+    V(s) = max_{a in actions} Q(s,a)
+    policy(s) = arg_max_{a in actions} Q(s,a)
+
+    Both ValueIterationAgent and QLearningAgent inherit
+    from this agent. While a ValueIterationAgent has
+    a model of the environment via a MarkovDecisionProcess
+    (see mdp.py) that is used to estimate Q-values before
+    ever actually acting, the QLearningAgent estimates
+    Q-values while acting in the environment.
+    """
+
+    def __init__(self, alpha=1.0, epsilon=0.05, gamma=0.8, numTraining = 10):
+        """
+        Sets options, which can be passed in via the Pacman command line using -a alpha=0.5,...
+        alpha    - learning rate
+        epsilon  - exploration rate
+        gamma    - discount factor
+        numTraining - number of training episodes, i.e. no learning after this many episodes
+        """
+        self.alpha = float(alpha)
+        self.epsilon = float(epsilon)
+        self.discount = float(gamma)
+        self.numTraining = int(numTraining)
+
+    ####################################
+    #    Override These Functions      #
+    ####################################
+    def getQValue(self, state, action):
+        """
+        Should return Q(state,action)
+        """
+        util.raiseNotDefined()
+
+    def getValue(self, state):
+        """
+        What is the value of this state under the best action?
+        Concretely, this is given by
+
+        V(s) = max_{a in actions} Q(s,a)
+        """
+        util.raiseNotDefined()
+
+    def getPolicy(self, state):
+        """
+        What is the best action to take in the state? Note that because
+        we might want to explore, this might not coincide with getAction.
+        Concretely, this is given by
+
+        policy(s) = arg_max_{a in actions} Q(s,a)
+
+        If many actions achieve the maximal Q-value,
+        it doesn't matter which is selected.
+        """
+        util.raiseNotDefined()
+
+    def getAction(self, state):
+        """
+        state: can call state.getLegalActions()
+        Choose an action and return it.
+        """
+        util.raiseNotDefined()
+
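To make the V(s) and policy(s) definitions above concrete, here is a minimal sketch of a ValueEstimationAgent subclass that keeps its Q-values in a dictionary. The QTableAgent name and its qvalues attribute are assumptions made for illustration; this class is not part of the commit.

    class QTableAgent(ValueEstimationAgent):
        # Illustrative only; Q-values live in a dict keyed by
        # (state, action), and unseen pairs default to 0.0.
        def __init__(self, **args):
            ValueEstimationAgent.__init__(self, **args)
            self.qvalues = {}  # hypothetical storage, not in this commit

        def getQValue(self, state, action):
            return self.qvalues.get((state, action), 0.0)

        def getValue(self, state):
            # V(s) = max_{a in actions} Q(s,a); 0.0 when no legal actions exist
            actions = state.getLegalActions()
            if not actions:
                return 0.0
            return max(self.getQValue(state, a) for a in actions)

        def getPolicy(self, state):
            # policy(s) = arg_max_{a in actions} Q(s,a); ties broken arbitrarily
            actions = state.getLegalActions()
            if not actions:
                return None
            return max(actions, key=lambda a: self.getQValue(state, a))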
+ """ + util.raiseNotDefined() + +class ReinforcementAgent(ValueEstimationAgent): + """ + Abstract Reinforcemnt Agent: A ValueEstimationAgent + which estimates Q-Values (as well as policies) from experience + rather than a model + + What you need to know: + - The environment will call + observeTransition(state,action,nextState,deltaReward), + which will call update(state, action, nextState, deltaReward) + which you should override. + - Use self.getLegalActions(state) to know which actions + are available in a state + """ + #################################### + # Override These Functions # + #################################### + + def update(self, state, action, nextState, reward): + """ + This class will call this function, which you write, after + observing a transition and reward + """ + util.raiseNotDefined() + + #################################### + # Read These Functions # + #################################### + + def getLegalActions(self,state): + """ + Get the actions available for a given + state. This is what you should use to + obtain legal actions for a state + """ + return self.actionFn(state) + + def observeTransition(self, state,action,nextState,deltaReward): + """ + Called by environment to inform agent that a transition has + been observed. This will result in a call to self.update + on the same arguments + + NOTE: Do *not* override or call this function + """ + self.episodeRewards += deltaReward + self.update(state,action,nextState,deltaReward) + + def startEpisode(self): + """ + Called by environment when new episode is starting + """ + self.lastState = None + self.lastAction = None + self.episodeRewards = 0.0 + + def stopEpisode(self): + """ + Called by environment when episode is done + """ + if self.episodesSoFar < self.numTraining: + self.accumTrainRewards += self.episodeRewards + else: + self.accumTestRewards += self.episodeRewards + self.episodesSoFar += 1 + if self.episodesSoFar >= self.numTraining: + # Take off the training wheels + self.epsilon = 0.0 # no exploration + self.alpha = 0.0 # no learning + + def isInTraining(self): + return self.episodesSoFar < self.numTraining + + def isInTesting(self): + return not self.isInTraining() + + def __init__(self, actionFn = None, numTraining=100, epsilon=0.5, alpha=0.5, gamma=1): + """ + actionFn: Function which takes a state and returns the list of legal actions + + alpha - learning rate + epsilon - exploration rate + gamma - discount factor + numTraining - number of training episodes, i.e. no learning after these many episodes + """ + if actionFn == None: + actionFn = lambda state: state.getLegalActions() + self.actionFn = actionFn + self.episodesSoFar = 0 + self.accumTrainRewards = 0.0 + self.accumTestRewards = 0.0 + self.numTraining = int(numTraining) + self.epsilon = float(epsilon) + self.alpha = float(alpha) + self.discount = float(gamma) + + ################################ + # Controls needed for Crawler # + ################################ + def setEpsilon(self, epsilon): + self.epsilon = epsilon + + def setLearningRate(self, alpha): + self.alpha = alpha + + def setDiscount(self, discount): + self.discount = discount + + def doAction(self,state,action): + """ + Called by inherited class when + an action is taken in a state + """ + self.lastState = state + self.lastAction = action + + ################### + # Pacman Specific # + ################### + def observationFunction(self, state): + """ + This is where we ended up after our last action. 
+    ###################
+    # Pacman Specific #
+    ###################
+    def observationFunction(self, state):
+        """
+        This is where we ended up after our last action.
+        The simulation should ensure this is called after every action.
+        """
+        if self.lastState is not None:
+            reward = state.getScore() - self.lastState.getScore()
+            self.observeTransition(self.lastState, self.lastAction, state, reward)
+        return state
+
+    def registerInitialState(self, state):
+        self.startEpisode()
+        if self.episodesSoFar == 0:
+            print 'Beginning %d episodes of Training' % (self.numTraining)
+
+    def final(self, state):
+        """
+        Called by Pacman game at the terminal state
+        """
+        deltaReward = state.getScore() - self.lastState.getScore()
+        self.observeTransition(self.lastState, self.lastAction, state, deltaReward)
+        self.stopEpisode()
+
+        # Make sure we have this var
+        if 'episodeStartTime' not in self.__dict__:
+            self.episodeStartTime = time.time()
+        if 'lastWindowAccumRewards' not in self.__dict__:
+            self.lastWindowAccumRewards = 0.0
+        self.lastWindowAccumRewards += state.getScore()
+
+        NUM_EPS_UPDATE = 100
+        if self.episodesSoFar % NUM_EPS_UPDATE == 0:
+            print 'Reinforcement Learning Status:'
+            windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE)
+            if self.episodesSoFar <= self.numTraining:
+                trainAvg = self.accumTrainRewards / float(self.episodesSoFar)
+                print '\tCompleted %d out of %d training episodes' % (
+                       self.episodesSoFar,self.numTraining)
+                print '\tAverage Rewards over all training: %.2f' % (
+                       trainAvg)
+            else:
+                testAvg = float(self.accumTestRewards) / (self.episodesSoFar - self.numTraining)
+                print '\tCompleted %d test episodes' % (self.episodesSoFar - self.numTraining)
+                print '\tAverage Rewards over testing: %.2f' % testAvg
+            print '\tAverage Rewards for last %d episodes: %.2f' % (
+                   NUM_EPS_UPDATE,windowAvg)
+            print '\tEpisode took %.2f seconds' % (time.time() - self.episodeStartTime)
+            self.lastWindowAccumRewards = 0.0
+            self.episodeStartTime = time.time()
+
+        if self.episodesSoFar == self.numTraining:
+            msg = 'Training Done (turning off epsilon and alpha)'
+            print '%s\n%s' % (msg,'-' * len(msg))
--
cgit v1.2.3-70-g09d2
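Taken together, the Pacman-specific hooks assume a driver shaped roughly like the sketch below. Every game-side name used here (getInitialState, isOver, applyAction) is an assumption made for illustration; only the agent-side calls mirror this file.

    def runPacmanEpisode(agent, game):
        # Hypothetical driver showing the expected call order; the real
        # loop lives in the project's game machinery, not in this commit.
        state = game.getInitialState()                # assumed accessor
        agent.registerInitialState(state)             # starts the episode
        while not game.isOver(state):                 # assumed terminal test
            state = agent.observationFunction(state)  # score delta becomes the reward
            action = agent.getAction(state)           # subclass records it via doAction
            state = game.applyAction(action)          # assumed transition
        agent.final(state)                            # final reward, stats, stopEpisode()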