Coach: the most modular reinforcement learning framework

Adding an agent

The agent class hierarchy (see http://coach.nervanasys.com/contributing/add_agent/index.html):

class Agent(object):
class PolicyOptimizationAgent(Agent):
class ActorCriticAgent(PolicyOptimizationAgent):
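
To add a new agent, the linked page has you subclass one of these bases and override its training step. The sketch below is schematic only, assuming the import path and a learn_from_batch hook as described on that page; verify the exact API against agents/agent.py in your Coach checkout.

# Schematic sketch only: the import path and the learn_from_batch hook name
# are assumptions based on the add_agent docs above, not verbatim Coach API.
from agents.actor_critic_agent import ActorCriticAgent  # assumed module path


class MyActorCriticVariant(ActorCriticAgent):
    def learn_from_batch(self, batch):
        # compute the policy and value losses for the sampled transitions,
        # apply the gradients, and return the total loss for logging
        total_loss = 0.0
        return total_loss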

Example CARLA presets from presets.py:

class Carla_A3C(Preset):
    def __init__(self):
        Preset.__init__(self, ActorCritic, Carla, EntropyExploration)
        self.agent.embedder_complexity = EmbedderComplexity.Deep
        self.agent.policy_gradient_rescaler = 'GAE'
        self.learning_rate = 0.0001
        self.num_heatup_steps = 0
        # self.env.reward_scaling = 1.0e9
        self.agent.discount = 0.99
        self.agent.apply_gradients_every_x_episodes = 1
        self.agent.num_steps_between_gradient_updates = 30
        self.agent.gae_lambda = 1
        self.agent.beta_entropy = 0.01
        self.clip_gradients = 40
        self.agent.middleware_type = MiddlewareTypes.FC


class Carla_DDPG(Preset):
    def __init__(self):
        Preset.__init__(self, DDPG, Carla, OUExploration)
        self.agent.embedder_complexity = EmbedderComplexity.Deep
        self.learning_rate = 0.0001
        self.num_heatup_steps = 1000
        self.agent.num_consecutive_training_steps = 5


class Carla_BC(Preset):
    def __init__(self):
        Preset.__init__(self, BC, Carla, ExplorationParameters)
        self.agent.embedder_complexity = EmbedderComplexity.Deep
        self.agent.load_memory_from_file_path = 'datasets/carla_town1.p'
        self.learning_rate = 0.0005
        self.num_heatup_steps = 0
        self.evaluation_episodes = 5
        self.batch_size = 120
        self.evaluate_every_x_training_iterations = 5000
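
Note that these presets pass the parameter classes themselves (not instances) to Preset.__init__, shown further down, and then override class attributes such as self.agent.discount. A custom preset follows the same pattern; the example below is hypothetical and assembled only from classes that appear later in this post.

# Hypothetical preset built from building blocks defined later in this post.
class Doom_Basic_PG_Custom(Preset):
    def __init__(self):
        Preset.__init__(self, PolicyGradient, Doom, ExplorationParameters)
        self.learning_rate = 0.0001      # GeneralParameters default is 0.00025
        self.num_heatup_steps = 0
        self.agent.discount = 0.99
        self.agent.beta_entropy = 0.01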

Configuration parameters:

class Preset(GeneralParameters):
    def __init__(self, agent, env, exploration, visualization=VisualizationParameters):
        """
        :type agent: AgentParameters
        :type env: EnvironmentParameters
        :type exploration: ExplorationParameters
        :type visualization: VisualizationParameters
        """
        self.visualization = visualization
        self.agent = agent
        self.env = env
        self.exploration = exploration


class ActorCritic(AgentParameters):
    type = 'ActorCriticAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.V, OutputTypes.Pi]
    loss_weights = [0.5, 1.0]
    stop_gradients_from_head = [False, False]
    num_episodes_in_experience_replay = 2
    policy_gradient_rescaler = 'A_VALUE'
    hidden_layers_activation_function = 'elu'
    apply_gradients_every_x_episodes = 5
    beta_entropy = 0
    num_steps_between_gradient_updates = 5000  # this is called t_max in all the papers
    gae_lambda = 0.96
    shared_optimizer = True
    estimate_value_using_gae = False
    async_training = True


class PolicyGradient(AgentParameters):
    type = 'PolicyGradientsAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.Pi]
    loss_weights = [1.0]
    num_episodes_in_experience_replay = 2
    policy_gradient_rescaler = 'FUTURE_RETURN_NORMALIZED_BY_TIMESTEP'
    apply_gradients_every_x_episodes = 5
    beta_entropy = 0
    num_steps_between_gradient_updates = 20000  # this is called t_max in all the papers
    async_training = True


class DDPG(AgentParameters):
    type = 'DDPGAgent'
    input_types = {'observation': InputTypes.Observation, 'action': InputTypes.Action}
    output_types = [OutputTypes.V]  # V is used because we only want a single Q value
    loss_weights = [1.0]
    hidden_layers_activation_function = 'relu'
    num_episodes_in_experience_replay = 10000
    num_steps_between_copying_online_weights_to_target = 1
    rate_for_copying_weights_to_target = 0.001
    shared_optimizer = True
    async_training = True
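
A new agent (like the one sketched at the top of this post) also needs a parameters class of this kind so that presets can reference it. Judging by the classes above, the type string names the agent class to instantiate, and everything not set here falls back to the AgentParameters defaults listed next. The sketch below is hypothetical and modeled on ActorCritic.

# Hypothetical parameters class for the agent sketched earlier; the name and
# values are placeholders modeled on ActorCritic above.
class MyActorCritic(AgentParameters):
    type = 'MyActorCriticVariant'  # appears to name the agent class to run
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.V, OutputTypes.Pi]
    loss_weights = [0.5, 1.0]
    policy_gradient_rescaler = 'GAE'
    beta_entropy = 0.01

It can then be passed to Preset.__init__ in place of ActorCritic.
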
class AgentParameters(Parameters):
    agent = ''

    # Architecture parameters
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.Q]
    middleware_type = MiddlewareTypes.FC
    loss_weights = [1.0]
    stop_gradients_from_head = [False]
    embedder_complexity = EmbedderComplexity.Shallow
    num_output_head_copies = 1
    use_measurements = False
    use_accumulated_reward_as_measurement = False
    add_a_normalized_timestep_to_the_observation = False
    l2_regularization = 0
    hidden_layers_activation_function = 'relu'
    optimizer_type = 'Adam'
    async_training = False
    use_separate_networks_per_head = False

    # Agent parameters
    num_consecutive_playing_steps = 1
    num_consecutive_training_steps = 1
    update_evaluation_agent_network_after_every_num_steps = 3000
    bootstrap_total_return_from_old_policy = False
    n_step = -1
    num_episodes_in_experience_replay = 200
    num_transitions_in_experience_replay = None
    discount = 0.99
    policy_gradient_rescaler = 'A_VALUE'
    apply_gradients_every_x_episodes = 5
    beta_entropy = 0
    num_steps_between_gradient_updates = 20000  # t_max
    num_steps_between_copying_online_weights_to_target = 1000
    rate_for_copying_weights_to_target = 1.0
    monte_carlo_mixing_rate = 0.1
    gae_lambda = 0.96
    step_until_collecting_full_episodes = False
    targets_horizon = 'N-Step'
    replace_mse_with_huber_loss = False
    load_memory_from_file_path = None
    collect_new_data = True
    input_rescaler = 255.0

    # PPO related params
    target_kl_divergence = 0.01
    initial_kl_coefficient = 1.0
    high_kl_penalty_coefficient = 1000
    value_targets_mix_fraction = 0.1
    clip_likelihood_ratio_using_epsilon = None
    use_kl_regularization = True
    estimate_value_using_gae = False

    # DFP related params
    num_predicted_steps_ahead = 6
    goal_vector = [1.0, 1.0]
    future_measurements_weights = [0.5, 0.5, 1.0]

    # NEC related params
    dnd_size = 500000
    l2_norm_added_delta = 0.001
    new_value_shift_coefficient = 0.1
    number_of_knn = 50
    DND_key_error_threshold = 0.01

    # Framework support
    neon_support = False
    tensorflow_support = True

    # distributed agents params
    shared_optimizer = True
    share_statistics_between_workers = True


class EnvironmentParameters(Parameters):
    type = 'Doom'
    level = 'basic'
    observation_stack_size = 4
    frame_skip = 4
    desired_observation_width = 76
    desired_observation_height = 60
    normalize_observation = False
    crop_observation = False
    random_initialization_steps = 0
    reward_scaling = 1.0
    reward_clipping_min = None
    reward_clipping_max = None
    human_control = False


class ExplorationParameters(Parameters):
    # Exploration policies
    policy = 'EGreedy'
    evaluation_policy = 'Greedy'
    # -- bootstrap dqn parameters
    bootstrapped_data_sharing_probability = 0.5
    architecture_num_q_heads = 1
    # -- dropout approximation of thompson sampling parameters
    dropout_discard_probability = 0
    initial_keep_probability = 0.0  # unused
    final_keep_probability = 0.99  # unused
    keep_probability_decay_steps = 50000  # unused
    # -- epsilon greedy parameters
    initial_epsilon = 0.5
    final_epsilon = 0.01
    epsilon_decay_steps = 50000
    evaluation_epsilon = 0.05
    # -- epsilon greedy at end of episode parameters
    average_episode_length_over_num_episodes = 20
    # -- boltzmann softmax parameters
    initial_temperature = 100.0
    final_temperature = 1.0
    temperature_decay_steps = 50000
    # -- additive noise
    initial_noise_variance_percentage = 0.1
    final_noise_variance_percentage = 0.1
    noise_variance_decay_steps = 1
    # -- Ornstein-Uhlenbeck process
    mu = 0
    theta = 0.15
    sigma = 0.3
    dt = 0.01
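
Just as the Carla presets above swap in EntropyExploration and OUExploration, a preset can plug in its own exploration schedule by subclassing ExplorationParameters. A hypothetical slower epsilon-greedy anneal, overriding only fields listed above:

# Hypothetical schedule: start fully random and anneal over more steps.
class SlowEGreedy(ExplorationParameters):
    policy = 'EGreedy'
    initial_epsilon = 1.0
    final_epsilon = 0.05
    epsilon_decay_steps = 200000
    evaluation_epsilon = 0.01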


class GeneralParameters(Parameters):
    train = True
    framework = Frameworks.TensorFlow
    threads = 1
    sess = None

    # distributed training options
    num_threads = 1
    synchronize_over_num_threads = 1
    distributed = False

    # Agent blocks
    memory = 'EpisodicExperienceReplay'
    architecture = 'GeneralTensorFlowNetwork'

    # General parameters
    clip_gradients = None
    kl_divergence_constraint = 100000
    num_training_iterations = 10000000000
    num_heatup_steps = 1000
    heatup_using_network_decisions = False
    batch_size = 32
    save_model_sec = None
    save_model_dir = None
    checkpoint_restore_dir = None
    learning_rate = 0.00025
    learning_rate_decay_rate = 0
    learning_rate_decay_steps = 0
    evaluation_episodes = 5
    evaluate_every_x_episodes = 1000000
    evaluate_every_x_training_iterations = 0
    rescaling_interpolation_type = 'bilinear'
    current_episode = 0

    # setting a seed will only work for non-parallel algorithms. Parallel algorithms add uncontrollable noise in
    # the form of different workers starting at different times, and getting different assignments of CPU
    # time from the OS.
    seed = None

    checkpoints_path = ''

    # Testing parameters
    test = False
    test_min_return_threshold = 0
    test_max_step_threshold = 1
    test_num_workers = 1



class Atari(EnvironmentParameters):
    type = 'Gym'
    frame_skip = 4
    observation_stack_size = 4
    desired_observation_height = 84
    desired_observation_width = 84
    reward_clipping_max = 1.0
    reward_clipping_min = -1.0
    random_initialization_steps = 30
    crop_observation = False  # in the original paper the observation is cropped but not in the Nature paper


class Doom(EnvironmentParameters):
    type = 'Doom'
    frame_skip = 4
    observation_stack_size = 3
    desired_observation_height = 60
    desired_observation_width = 76


class Carla(EnvironmentParameters):
    type = 'Carla'
    frame_skip = 1
    observation_stack_size = 4
    desired_observation_height = 128
    desired_observation_width = 180
    normalize_observation = False
    server_height = 256
    server_width = 360
    config = 'environments/CarlaSettings.ini'
    level = 'town1'
    verbose = True
    stereo = False
    semantic_segmentation = False
    depth = False
    episode_max_time = 100000  # milliseconds for each episode
    continuous_to_bool_threshold = 0.5
    allow_braking = False
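
Environment variants follow the same subclass-and-override pattern. For example, a lower-resolution CARLA setup with braking enabled could look like the hypothetical sketch below, which only touches fields from the Carla class above.

# Hypothetical CARLA variant: smaller observations and braking allowed.
class CarlaSmall(Carla):
    desired_observation_height = 84
    desired_observation_width = 84
    allow_braking = True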

(coach_env) ubuntu@ubuntu-Default-string:~/github/coach$ python3 coach.py -l

Warning: failed to import the following packages - RoboSchool, CARLA, Neon, ViZDoom, GymExtensions, PyBullet

Available Presets:

Alien_DQN
Alien_NEC
AntBullet_A3C
AntMaze_A3C
Ant_A3C
Ant_ClippedPPO
Ant_DDPG
Atari_DQN_TestBench
BipedalWalker_A3C
Breakout_A3C
Breakout_C51
Breakout_DDQN
Breakout_DQN
Breakout_Dueling_DDQN
Breakout_NEC
Breakout_QRDQN
Carla_A3C
Carla_BC
Carla_DDPG
CartPole_A2C
CartPole_A3C
CartPole_Bootstrapped_DQN
CartPole_C51
CartPole_ClippedPPO
CartPole_DQN
CartPole_Dueling_DDQN
CartPole_MMC
CartPole_NEC
CartPole_NStepQ
CartPole_OneStepQ
CartPole_PAL
CartPole_PG
CartPole_PPO
CartPole_QRDQN
Doom_Basic_A2C
Doom_Basic_A3C
Doom_Basic_BC
Doom_Basic_DFP
Doom_Basic_DQN
Doom_Basic_Dueling_DDQN
Doom_Basic_Dueling_DQN
Doom_Basic_NEC
Doom_Basic_NStepQ
Doom_Basic_OneStepQ
Doom_Basic_PG
Doom_Basic_QRDQN
Doom_Deadly_Corridor_Bootstrapped_DQN
Doom_Deathmatch_BC
Doom_Defend_BC
Doom_Health_DFP
Doom_Health_DQN
Doom_Health_MMC
Doom_Health_NEC
HalfCheetah_ClippedPPO_Roboschool
HalfCheetah_DDPG
HopperBullet_A3C
HopperIceWall_A3C
HopperStairs_A3C
Hopper_A3C
Hopper_ClippedPPO
Hopper_ClippedPPO_Distributed
Hopper_ClippedPPO_Roboschool
Hopper_DDDPG
Hopper_DDPG
Hopper_DDPG_Roboschool
Hopper_DPPO
Hopper_NAF
Hopper_PPO
Hopper_PPO_Roboschool
Humanoid_A3C
Humanoid_ClippedPPO
InvertedPendulum_A3C
InvertedPendulum_ClippedPPO
InvertedPendulum_ClippedPPO_Roboschool
InvertedPendulum_DDPG
InvertedPendulum_NAF
InvertedPendulum_PG
InvertedPendulum_PPO
Kuka_ClippedPPO
Minitaur_ClippedPPO
MontezumaRevenge_BC
Montezuma_NEC
MountainCar_A3C
Pendulum_A3C
Pendulum_ClippedPPO
Pendulum_DDPG
Pendulum_NAF
Pendulum_PG
Pong_A3C
Pong_DQN
Pong_NEC
Pong_NEC_LSTM
Walker_A3C
Walker_PPO
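
The listing above is the output of python3 coach.py -l. To train with one of these presets you pass its name back to coach.py; in Coach of this era that was the -p flag (for example, python3 coach.py -p Carla_A3C), but check python3 coach.py --help for the exact options in your installed version. The warning about failed imports just means those optional environment packages are not installed, so the presets that depend on them (the Carla_* and Doom_* entries here, for instance) will not launch until they are.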

Originally published on the WeChat public account CreateAMind (createamind), 2018-03-30.
