提交训练任务

最近更新时间:2025-02-12 16:09:52

我的收藏
def describe_postpaid_training_price(self):
"""查询每种配置的每小时的价格,价格单位:元

:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeBillingSpecsResponse`
"""

def describe_system_reasoning_images(self):
"""获取平台内置的推理镜像

:return: 推理镜像信息
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeInferTemplatesResponse`

返回的数据格式如:
{
    "FrameworkTemplates": [
        {
            "Framework": "TENSORFLOW",
            "FrameworkVersion": "2.4",
            "Groups": [
                "TENSORFLOW",
                "LIGHT"
            ],
            "InferTemplates": [
                {
                    "InferTemplateId": "tf2.4-py38-cpu",
                    "InferTemplateImage": "ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-cpu:py38-tensorflow2.4-cpu-20211206"
                },
                {
                    "InferTemplateId": "tf2.4-py38-gpu",
                    "InferTemplateImage": "ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-gpu:py38-tensorflow2.4-cu110-20211206"
                }
            ]
        }
    ],
    "RequestId": "3654e19b-c2ba-4953-b131-d66495723008"
}
"""

def create_training_task(self,
name,
framework,
cos_output,
worker_resource,
code_package_path,
ps_resource=None,
input_data_config=None,
worker_start_cmd=None,
ps_start_cmd=None,
tags=None,
tuning_parameters_dict={},
resource_group_id="",
remark=None,
log_enable=False,
log_logset_id=None,
log_topic_id=None,
vpc_id=None,
sub_net_id=None,
restart_limit=0):
"""创建训练任务

:param name: 任务名称
:type name: str
:param framework: 运行的框架环境
:type framework: :class:`tikit.models.FrameworkInfo`
:param cos_output: 输出的cos信息
:type cos_output: str
:param worker_resource: worker节点的配置
:type worker_resource: :class:`tikit.models.ResourceConfigInfo`
:param code_package_path: 代码的cos信息
:type code_package_path: str
:param ps_resource: ps节点的配置
:type ps_resource: :class:`tikit.models.ResourceConfigInfo`
:param input_data_config: 输入的数据信息
:type input_data_config: list or :class:`tikit.models.TrainingDataConfig`
:param worker_start_cmd: worker的启动命令
:type worker_start_cmd: str
:param ps_start_cmd: ps节点的启动命令
:type ps_start_cmd: str
:param tags: 标签
:type tags: list of :class:`tikit.tencentcloud.tione.v20211111.models.Tag`
:param tuning_parameters_dict: 调参字典
:type tuning_parameters_dict: dict
:param resource_group_id: 预付费的资源组id
:type resource_group_id: str
:param remark: 描述
:type remark: str
:param log_enable: 日志开关
:type log_enable: bool
:param log_logset_id: 日志集id
:type log_logset_id: str
:param log_topic_id: 日志的topic id
:type log_topic_id: str
:param vpc_id: vpc的id
:type vpc_id: str
:param sub_net_id: 子网id
:type sub_net_id: str
:param restart_limit: 当前任务最大重启次数,最高10次,超过后任务被标记为异常
:type restart_limit: int
:return:
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.CreateTrainingTaskResponse`
"""

def describe_training_frameworks(self):
"""查看训练框架

:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeTrainingFrameworksResponse`

"""

def describe_training_tasks(self, filters=None, tag_filters=None, offset=0, limit=50, order="DESC",
order_field="UpdateTime"):
"""获取训练任务列表

:param filters: 过滤器,eg:[{ "Name": "TaskStatus", "Values": ["Running"] }]
:type filters: list of Filter
:param tag_filters: 标签过滤器,eg:[{ "TagKey": "TagKeyA", "TagValue": ["TagValueA"] }]
:type tag_filters: list of TagFilter
:param offset: 偏移量,默认为0
:type offset: int
:param limit: 返回数量,默认为50
:type limit: int
:param order: 输出列表的排列顺序。取值范围:ASC:升序排列 DESC:降序排列
:type order: str
:param order_field: 排序的依据字段, 取值范围 "CreateTime" "UpdateTime"
:type order_field: str
:return:
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeTrainingTasksResponse`
"""

def describe_training_task(self, task_id):
"""获取单个训练任务信息

:param task_id: 训练任务ID
:type task_id: str
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeTrainingTaskResponse`
"""

def describe_training_task_pods(self, task_id):
"""获取训练任务的pod列表

:param task_id: 训练任务ID
:type task_id: str
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeTrainingTaskPodsResponse`
"""

def describe_train_logs(self, pod_name, start_time=None, end_time=None, limit=None, order=None,
context=None, filters=None):
"""查看训练任务的日志

:param pod_name: 查询哪个Pod的日志,支持通配符。查看某个训练任务的全部pod的日志可以填: "<task_id>-*",如:train-51cd6bf7ec1000-*
:type pod_name: str
:param start_time: 日志查询开始时间。RFC3339格式的时间字符串,比如2021-12-16T13:20:24+08:00,默认值为当前时间的前一个小时
:type start_time: str
:param end_time: 日志查询结束时间。RFC3339格式的时间字符串,比如2021-12-16T13:20:24+08:00,默认值为当前时间
:type end_time: str
:param limit: 日志查询条数,默认值100,最大值100
:type limit: int
:param order: 排序方向。(ASC | DESC) 默认值为DESC
:type order: str
:param context: 分页的游标
:type context: str
:param filters: 过滤Filters
:type filters: list of tikit.tencentcloud.tione.v20211111.models.Filter
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeLogsResponse`

返回的对象如果非空,就会有 next() 方法,能不断地获取下一页的日志(如果有多页的话),如下:
now_time = datetime.datetime.now(datetime.timezone.utc)
now_time_str = now_time.isoformat()
result = client.describe_train_logs("train-51cd6bf7ec1000-37c5p5nlr01s-launcher",
"2021-12-10T09:32:03.823509+00:00",
now_time_str,
limit=30)
print(result)
print(result.next())
print(result.next())
print(result.next())
"""

def push_training_metrics(self, timestamp, value_map, task_id=None, epoch=None, total_steps=None, step=None):
"""上报训练自定义指标(单条)。单个子账号每秒可以调用20次,请在您的训练代码中注意控制上报频率,避免超限报错;也可以改用 push_training_metrics_list 一次上报多条指标

:param timestamp: 时间戳
:type timestamp: int
:param value_map: 指标映射。 指标名称 -> 指标值
:type value_map: map: str -> float
:param task_id: 任务ID。若为空,就取当前任务节点环境的 TI_TASK_ID 环境变量的值
:type task_id: str
:param epoch: epoch值
:type epoch: int
:param total_steps: 总步数
:type total_steps: int
:param step: 第几步
:type step: int
:return:
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.PushTrainingMetricsResponse`

client.push_training_metrics(int(time.time()), {"field1": 11, "field2": 12}, "task-id-00001", 3, 1000, 66)
"""

def push_training_metrics_list(self, metric_list):
"""上报训练自定义指标(列表)

:param metric_list: MetricData 数组。若任务ID为空,就取当前任务节点环境的 TI_TASK_ID 环境变量的值
:type metric_list: list of :class:`tikit.tencentcloud.tione.v20211111.models.MetricData`
:return:
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.PushTrainingMetricsResponse`
"""

def describe_training_metrics(self, task_id):
"""查询训练自定义指标

:param task_id: 任务ID
:type task_id: str
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeTrainingMetricsResponse`
"""

def stop_training_task(self, task_id):
"""停止某个训练任务

:param task_id: 训练任务ID
:type task_id: str
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.StopTrainingTaskResponse`
"""

def delete_training_task(self, task_id):
"""删除某个训练任务

:param task_id: 训练任务ID
:type task_id: str
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DeleteTrainingTaskResponse`
"""


中间配置:
tikit.models.FrameworkInfo

def new_custom(training_mode, image_type, image_url, registry_region=None, registry_id=None,
user_name=None, passwd=None):
"""自定义训练框架的配置 通过describe_training_frameworks()查看列表

:param training_mode: 训练模式,如"DDP",仅训练任务需要配置
:type training_mode: str
:param image_type: 镜像类型,CCR为腾讯云容器镜像服务个人版,TCR为腾讯云容器镜像服务企业版,CUSTOM表示第三方自定义镜像
:type image_type: str
:param image_url: 镜像地址 必填
:type image_url: str
:param registry_region: 腾讯云容器镜像服务的镜像仓库的地域
:type registry_region: str
:param registry_id: 腾讯云容器镜像服务的镜像仓库ID
:type registry_id: str
:param user_name: 自定义镜像仓库的用户名
:type user_name: str
:param passwd: 自定义镜像仓库的密码
:type passwd: str
:return:
:rtype:
"""

def new_system_framework(framework_name, framework_environment, training_mode):
"""系统内置的训练框架

:param framework_name: 框架名称。 通过describe_training_frameworks()查看列表
:type framework_name: str
:param framework_environment: 框架环境。 通过describe_training_frameworks()查看列表
:type framework_environment: str
:param training_mode: 训练模式。 通过describe_training_frameworks()查看列表
:type training_mode: str
:return:
:rtype:
"""

----------------------------------------------------
tikit.models.ResourceConfigInfo

def new_postpaid(instance_type, instance_num):
"""获取后付费模式下的资源配置

:param instance_type: 实例类型。通过 describe_postpaid_training_price() 查看实例列表
:type instance_type: str
:param instance_num: 实例数量
:type instance_num: int
:return:
:rtype:
"""

def new_prepaid(cpu=0, memory=0, gpu=0, gpu_type=None, instance_num=1):
"""获取预付费模式下的资源配置,当资源组开启GPU按比例预设后,cpu和memory参数需要不设置或者设置为0,该两项的值将通过GPU卡数按照比例分配

:param cpu: CPU个数,单位是核
:type cpu: float
:param memory: 内存大小,单位是GB
:type memory: float
:param gpu_type: gpu类型
:type gpu_type: str
:param gpu: gpu个数
:type gpu: float
:param instance_num: 实例数量
:type instance_num: int
:return:
:rtype:
"""

----------------------------------------------------
tikit.models.TrainingDataConfig

def new_mount_cos(cos_str, target_path):
"""一个cos类型的训练数据

:param cos_str: cos存储,格式: <bucket>/<cos path>/
:type cos_str: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""

def new_dataset_mount(dataset_id, target_path):
"""一个dataset类型的训练数据

:param dataset_id: 数据集ID
:type dataset_id: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""

def new_mount_cfs(cfs_id, source_path, target_path):
"""新建一个cfs类型的训练数据集配置

:param cfs_id: CFS的ID
:type cfs_id: str
:param source_path: CFS的路径
:type source_path: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""

def new_mount_hdfs(hdfs_id, source_path, target_path):
"""新建一个hdfs类型的训练数据集配置

:param hdfs_id: EMR上HDFS的ID
:type hdfs_id: str
:param source_path: HDFS的路径
:type source_path: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""

def new_mount_wedata_hdfs(wedata_id, source_path):
"""新建一个wedata hdfs类型的训练数据集配置

:param wedata_id: wedata数据源id
:type wedata_id: int
:param source_path: HDFS的路径
:type source_path: str
:return:
:rtype:
"""

def new_dataset(id_target_dict):
"""Deprecated !
新建一个dataset类型的训练数据集配置

:param id_target_dict: 数据集信息。 dataset id -> 下载的目标路径
:type id_target_dict: dict
:return:
:rtype:
"""

def new_cos_data(cos_str_target_dict):
"""Deprecated !
新建一个cos类型的训练数据集配置

:param cos_str_target_dict: 数据集信息。 <bucket>/<cos path>/ -> 下载的目标路径
:type cos_str_target_dict: dict
:return:
:rtype:
"""

def new_mount_goosefs(goosefs_id, source_path, namespace, target_path):
"""新建一个goosefs类型的训练数据集配置

:param goosefs_id: goosefs实例id
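:type goosefs_id: str
:param source_path: goosefs的路径
:type source_path: str
:param namespace: goosefs的命名空间
:type namespace: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""
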
def new_mount_goosefsx(goosefsx_id, goosefsx_path, target_path):
"""新建一个goosefsx类型的存储配置

:param goosefsx_id: goosefsx实例id
:type goosefsx_id: str
:param goosefsx_path: goosefsx路径
:type goosefsx_path: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""