I followed the TensorFlow Agents tutorials and the multi-armed bandits tutorial, and I am now trying to get one of the agents that are already implemented in the examples to work in my own environment. Basically, my environment has 5 actions and 5 observations, and applying action i leads to the corresponding state i. Taking an action also involves another step: the action id is sent over a socket to a different program, and the program's answer is interpreted to obtain a reward. The environment itself seems to work; I use the small test script below to exercise the observation and action functionality. I know that is not a complete proof, but it shows that it works at least at a basic level. What I am missing now is the part that maps observations to actions, i.e. the agent with its policy. I followed the structure of the examples, but every agent I try on my environment fails with a different error. I am apparently applying them to my environment incorrectly, but I cannot figure out what I am doing wrong.
Is it not possible to apply one of these end-to-end agents from the examples as described? I have searched through the TensorFlow tutorials and documentation but found no answer. My environment should be simple enough, so I seem to be missing some important step.
The errors for each agent:
Greedy:
Input 0 of layer "dense_3" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (50,)
Call arguments received:
• observation=tf.Tensor(shape=(), dtype=int32)
• step_type=tf.Tensor(shape=(), dtype=int32)
• network_state=()
• training=False
LinUCB:
ValueError: Global observation shape is expected to be [None, 1]. Got [].
LinThompson:
lib/python3.8/site-packages/tf_agents/bandits/policies/linear_bandit_policy.py", line 242, in _distribution
raise ValueError(
ValueError: Global observation shape is expected to be [None, 1]. Got [].
Exp3:
lib/python3.8/site-packages/tensorflow/python/framework/ops.py", line 7107, in raise_from_not_ok_status
raise core._status_to_exception(e) from None # pylint: disable=protected-access
tensorflow.python.framework.errors_impl.InvalidArgumentError: cannot compute Mul as input #1(zero-based) was expected to be a int32 tensor but is a float tensor [Op:Mul]
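For LinUCB and LinThompson, both traces complain about the shape of the global observation. As far as I understand the message, it contrasts a scalar observation spec like mine with one that has an explicit feature dimension; a minimal sketch of the two (hypothetical specs, not my actual environment code):
import numpy as np
from tf_agents.specs import tensor_spec

# Scalar observation spec, like the one my environment defines: shape ().
scalar_obs_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=np.int32, minimum=0, maximum=4, name='observation')

# Observation spec with an explicit feature dimension: shape (1,), which
# becomes [None, 1] once a batch dimension is added.
vector_obs_spec = tensor_spec.BoundedTensorSpec(
    shape=(1,), dtype=np.float32, minimum=0.0, maximum=4.0, name='observation')

print(scalar_obs_spec.shape)   # ()
print(vector_obs_spec.shape)   # (1,)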
The environment:
import sys

import numpy as np
import tensorflow as tf

# imports as in the bandits tutorial / tf_agents package
from tf_agents.bandits.environments import bandit_py_environment
from tf_agents.specs import tensor_spec

nest = tf.nest

# https://www.tensorflow.org/agents/tutorials/2_environments_tutorial
# State machine environment
#
# Actions:
#   n actions: every state of the state machine represents one bandit with one action.
#   For now it is 5 states.
#
# Observations:
#   one of the 5 states
class AFLEnvironment(bandit_py_environment.BanditPyEnvironment):

    def __init__(self):
        action_spec = tensor_spec.BoundedTensorSpec(
            shape=(), dtype=np.int32, minimum=0, maximum=4,
            name='action')  # actions: 0,1,2,3,4 for 5 states
        observation_spec = tensor_spec.BoundedTensorSpec(
            shape=(), dtype=np.int32, minimum=0, maximum=4,
            name='observation')  # 5 possible states
        self._state = tf.constant(0)
        super(AFLEnvironment, self).__init__(observation_spec, action_spec)

    def _observe(self):
        self._observation = self._state
        return self._observation

    # Implementation of taking the action: send the action id over a socket
    # and map the program's answer to a reward.
    def _apply_action(self, action):
        sock = self.__connectToSocket()
        # answer: NO_FAULT = 0, FSRV_RUN_TMOUT = 1, FSRV_RUN_CRASH = 2, FSRV_RUN_ERROR = 3
        answer = self.__fuzz(action, sock)
        if answer == "0":
            reward = 0.0
        elif answer == "1":
            reward = 1.0
        elif answer == "2":
            reward = 1.0
        elif answer == "3":
            reward = 1.0
        else:
            print("Error in return value from fuzzing: %s" % answer)
            sys.exit(1)
        self._state = tf.constant(action)
        print("Step ended, reward is: %s" % reward)
        return reward
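As a side note, tf_agents also has a generic spec check that could be pointed at this environment. A minimal sketch, assuming the stock tf_agents.environments.utils helper (my test script at the end does not depend on it):
from tf_agents.environments import utils as env_utils

# validate_py_environment samples random actions from the action spec, so it
# actually drives _apply_action, i.e. the socket/fuzzing side has to be running.
env = AFLEnvironment()
env_utils.validate_py_environment(env, episodes=3)  # raises if time_steps do not match the specs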
The different agents:
import os

import tensorflow as tf

from absl import app
from absl import flags

# imports as in the tf_agents bandit examples
from tf_agents.bandits.agents import exp3_agent
from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.agents import neural_epsilon_greedy_agent as eps_greedy_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.metrics import py_metrics
from tf_agents.networks import q_network

nest = tf.nest

flags.DEFINE_string('root_dir', os.getenv('TEST_UNDECLARED_OUTPUTS_DIR'),
                    'Root directory for writing logs/summaries/checkpoints.')
flags.DEFINE_enum(
    'agent', 'EXP3', ['GREEDY', 'LINUCB', 'LINTHOMPSON', 'EXP3'],
    'Which agent to use. Possible values are `GREEDY`, `LINUCB`, `LINTHOMPSON` and `EXP3`. Default is GREEDY.')
FLAGS = flags.FLAGS

# From the example, change here for training parameters.
BATCH_SIZE = 8
TRAINING_LOOPS = 200
STEPS_PER_LOOP = 2
CONTEXT_DIM = 15

# LinUCB agent constants.
AGENT_ALPHA = 10.0

# Epsilon-greedy constants.
EPSILON = 0.05
LAYERS = (50, 50, 50)
LR = 0.005


def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994
        env = AFLEnvironment()

        # 'GREEDY', 'LINUCB', 'LINTHOMPSON', 'EXP3'
        if FLAGS.agent == 'GREEDY':
            network = q_network.QNetwork(
                input_tensor_spec=env.time_step_spec().observation,
                action_spec=env.action_spec(),
                fc_layer_params=LAYERS)
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=env.time_step_spec(),
                action_spec=env.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                epsilon=EPSILON)
        elif FLAGS.agent == 'LINUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=env.time_step_spec(),
                action_spec=env.action_spec(),
                alpha=AGENT_ALPHA,
                gamma=0.95,  # sometimes left out in the examples
                emit_log_probability=False,
                dtype=tf.float32)
        elif FLAGS.agent == 'LINTHOMPSON':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=env.time_step_spec(),
                action_spec=env.action_spec())
        elif FLAGS.agent == 'EXP3':
            agent = exp3_agent.Exp3Agent(
                time_step_spec=env.time_step_spec(),
                action_spec=env.action_spec(),
                learning_rate=1)

        replay_buffer = []
        metric = py_metrics.AverageReturnMetric()
        observers = [replay_buffer.append, metric]

        driver = dynamic_step_driver.DynamicStepDriver(
            env=env,
            policy=agent.collect_policy,
            observers=observers,
            num_steps=200)

        initial_time_step = env.reset()
        print("initial_time_step")
        print(initial_time_step)
        final_time_step, _ = driver.run(initial_time_step)

        print('Replay Buffer:')
        for traj in replay_buffer:
            print(traj)


if __name__ == '__main__':
    app.run(main)
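One thing I am unsure about is whether the Python environment has to be wrapped before it is handed to the agents and the driver; the v2 bandit examples wrap it in a TFPyEnvironment. A sketch of that variant (wrapping is the only change, and I have not confirmed that it removes the errors above):
from tf_agents.environments import tf_py_environment

py_env = AFLEnvironment()
env = tf_py_environment.TFPyEnvironment(py_env)  # adds the batch dimension and converts to TF types

# the rest of main() stays unchanged: the selected agent and the
# DynamicStepDriver are built against `env` instead of the raw py environment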
The test script:
env = AFLEnvironment()

observation = env.reset().observation
print("observation: %d" % observation)

action = 1  #@param
print("action: %d" % action)

reward = env.step(action).reward
print("reward: %f" % reward)
print("observation: %d" % env._observe())