I followed the TensorFlow Agents tutorials and the multi-armed bandits tutorial, and I am now trying to get one of the agents that are already implemented in the examples to work in my own environment. Basically, my environment has 5 actions and 5 observations, and applying action i leads to the corresponding state i. Taking an action also involves another step: the action id is sent over a socket to a different program, and the program's answer is interpreted to obtain a reward. The environment itself seems to work; I use the small test script below to exercise the observation and action functionality. I know that is not a complete proof, but it shows that it works at least at a basic level. What I am missing now is the part that maps observations to actions, i.e. the agent with its policy. I followed the structure of the examples, but every agent I try on my environment fails with a different error. I am apparently applying them to my environment incorrectly, but I cannot figure out what I am doing wrong.
Is it not possible to apply one of these end-to-end agents from the examples as described? I have searched through the TensorFlow tutorials and documentation but found no answer. My environment should be simple enough, so I seem to be missing some important step.
The errors for each agent:
Greedy:
Input 0 of layer "dense_3" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (50,)
Call arguments received:
• observation=tf.Tensor(shape=(), dtype=int32)
• step_type=tf.Tensor(shape=(), dtype=int32)
• network_state=()
• training=False
LinUCB:
ValueError: Global observation shape is expected to be [None, 1]. Got [].
LinThompson:
lib/python3.8/site-packages/tf_agents/bandits/policies/linear_bandit_policy.py", line 242, in _distribution
raise ValueError(
ValueError: Global observation shape is expected to be [None, 1]. Got [].
Exp3:
lib/python3.8/site-packages/tensorflow/python/framework/ops.py", line 7107, in raise_from_not_ok_status
raise core._status_to_exception(e) from None # pylint: disable=protected-access
tensorflow.python.framework.errors_impl.InvalidArgumentError: cannot compute Mul as input #1(zero-based) was expected to be a int32 tensor but is a float tensor [Op:Mul]
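For LinUCB and LinThompson, both traces complain about the shape of the global observation. As far as I understand the message, it contrasts a scalar observation spec like mine with one that has an explicit feature dimension; a minimal sketch of the two (hypothetical specs, not my actual environment code):
import numpy as np
from tf_agents.specs import tensor_spec

# Scalar observation spec, like the one my environment defines: shape ().
scalar_obs_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=np.int32, minimum=0, maximum=4, name='observation')

# Observation spec with an explicit feature dimension: shape (1,), which
# becomes [None, 1] once a batch dimension is added.
vector_obs_spec = tensor_spec.BoundedTensorSpec(
    shape=(1,), dtype=np.float32, minimum=0.0, maximum=4.0, name='observation')

print(scalar_obs_spec.shape)   # ()
print(vector_obs_spec.shape)   # (1,)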
The environment:
import sys

import numpy as np
import tensorflow as tf

# imports as in the bandits tutorial / tf_agents package
from tf_agents.bandits.environments import bandit_py_environment
from tf_agents.specs import tensor_spec

nest = tf.nest

# https://www.tensorflow.org/agents/tutorials/2_environments_tutorial
# State machine environment
#
# Actions:
#   n actions: every state of the state machine represents one bandit with one action.
#   For now it is 5 states.
#
# Observations:
#   one of the 5 states
class AFLEnvironment(bandit_py_environment.BanditPyEnvironment):

    def __init__(self):
        action_spec = tensor_spec.BoundedTensorSpec(
            shape=(), dtype=np.int32, minimum=0, maximum=4,
            name='action')  # actions: 0,1,2,3,4 for 5 states
        observation_spec = tensor_spec.BoundedTensorSpec(
            shape=(), dtype=np.int32, minimum=0, maximum=4,
            name='observation')  # 5 possible states
        self._state = tf.constant(0)
        super(AFLEnvironment, self).__init__(observation_spec, action_spec)

    def _observe(self):
        self._observation = self._state
        return self._observation

    # Implementation of taking the action: send the action id over a socket
    # and map the program's answer to a reward.
    def _apply_action(self, action):
        sock = self.__connectToSocket()
        # answer: NO_FAULT = 0, FSRV_RUN_TMOUT = 1, FSRV_RUN_CRASH = 2, FSRV_RUN_ERROR = 3
        answer = self.__fuzz(action, sock)
        if answer == "0":
            reward = 0.0
        elif answer == "1":
            reward = 1.0
        elif answer == "2":
            reward = 1.0
        elif answer == "3":
            reward = 1.0
        else:
            print("Error in return value from fuzzing: %s" % answer)
            sys.exit(1)
        self._state = tf.constant(action)
        print("Step ended, reward is: %s" % reward)
        return reward
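As a side note, tf_agents also has a generic spec check that could be pointed at this environment. A minimal sketch, assuming the stock tf_agents.environments.utils helper (my test script at the end does not depend on it):
from tf_agents.environments import utils as env_utils

# validate_py_environment samples random actions from the action spec, so it
# actually drives _apply_action, i.e. the socket/fuzzing side has to be running.
env = AFLEnvironment()
env_utils.validate_py_environment(env, episodes=3)  # raises if time_steps do not match the specs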
The different agents:
import os

import tensorflow as tf

from absl import app
from absl import flags

# imports as in the tf_agents bandit examples
from tf_agents.bandits.agents import exp3_agent
from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.agents import neural_epsilon_greedy_agent as eps_greedy_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.metrics import py_metrics
from tf_agents.networks import q_network

nest = tf.nest

flags.DEFINE_string('root_dir', os.getenv('TEST_UNDECLARED_OUTPUTS_DIR'),
                    'Root directory for writing logs/summaries/checkpoints.')
flags.DEFINE_enum(
    'agent', 'EXP3', ['GREEDY', 'LINUCB', 'LINTHOMPSON', 'EXP3'],
    'Which agent to use. Possible values are `GREEDY`, `LINUCB`, `LINTHOMPSON` and `EXP3`. Default is GREEDY.')
FLAGS = flags.FLAGS

# From the example, change here for training parameters.
BATCH_SIZE = 8
TRAINING_LOOPS = 200
STEPS_PER_LOOP = 2
CONTEXT_DIM = 15

# LinUCB agent constants.
AGENT_ALPHA = 10.0

# Epsilon-greedy constants.
EPSILON = 0.05
LAYERS = (50, 50, 50)
LR = 0.005


def main(unused_argv):
    tf.compat.v1.enable_v2_behavior()  # The trainer only runs with V2 enabled.

    with tf.device('/CPU:0'):  # due to b/128333994
        env = AFLEnvironment()

        # 'GREEDY', 'LINUCB', 'LINTHOMPSON', 'EXP3'
        if FLAGS.agent == 'GREEDY':
            network = q_network.QNetwork(
                input_tensor_spec=env.time_step_spec().observation,
                action_spec=env.action_spec(),
                fc_layer_params=LAYERS)
            agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
                time_step_spec=env.time_step_spec(),
                action_spec=env.action_spec(),
                reward_network=network,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR),
                epsilon=EPSILON)
        elif FLAGS.agent == 'LINUCB':
            agent = lin_ucb_agent.LinearUCBAgent(
                time_step_spec=env.time_step_spec(),
                action_spec=env.action_spec(),
                alpha=AGENT_ALPHA,
                gamma=0.95,  # sometimes left out in the examples
                emit_log_probability=False,
                dtype=tf.float32)
        elif FLAGS.agent == 'LINTHOMPSON':
            agent = lin_ts_agent.LinearThompsonSamplingAgent(
                time_step_spec=env.time_step_spec(),
                action_spec=env.action_spec())
        elif FLAGS.agent == 'EXP3':
            agent = exp3_agent.Exp3Agent(
                time_step_spec=env.time_step_spec(),
                action_spec=env.action_spec(),
                learning_rate=1)

        replay_buffer = []
        metric = py_metrics.AverageReturnMetric()
        observers = [replay_buffer.append, metric]

        driver = dynamic_step_driver.DynamicStepDriver(
            env=env,
            policy=agent.collect_policy,
            observers=observers,
            num_steps=200)

        initial_time_step = env.reset()
        print("initial_time_step")
        print(initial_time_step)
        final_time_step, _ = driver.run(initial_time_step)

        print('Replay Buffer:')
        for traj in replay_buffer:
            print(traj)


if __name__ == '__main__':
    app.run(main)
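One thing I am unsure about is whether the Python environment has to be wrapped before it is handed to the agents and the driver; the v2 bandit examples wrap it in a TFPyEnvironment. A sketch of that variant (wrapping is the only change, and I have not confirmed that it removes the errors above):
from tf_agents.environments import tf_py_environment

py_env = AFLEnvironment()
env = tf_py_environment.TFPyEnvironment(py_env)  # adds the batch dimension and converts to TF types

# the rest of main() stays unchanged: the selected agent and the
# DynamicStepDriver are built against `env` instead of the raw py environment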
The test script:
env = AFLEnvironment()

observation = env.reset().observation
print("observation: %d" % observation)

action = 1  #@param
print("action: %d" % action)

reward = env.step(action).reward
print("reward: %f" % reward)
print("observation: %d" % env._observe())