提交训练任务

最近更新时间:2024-02-26 10:22:31

我的收藏
describe_postpaid_training_price(self)
查询每种配置的每小时的价格,价格单位:元
:rtype: tencentcloud.tione.v20211111.models.DescribeBillingSpecsResponse

describe_system_reasoning_images(self)
获取平台内置的推理镜像

:return: 推理镜像信息
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeInferTemplatesResponse`
数据格式如:
{
"FrameworkTemplates": [
{
"Framework": "TENSORFLOW",
"FrameworkVersion": "2.4",
"Groups": [
"TENSORFLOW",
"LIGHT"
],
"InferTemplates": [
{
"InferTemplateId": "tf2.4-py38-cpu",
"InferTemplateImage": "ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-cpu:py38-tensorflow2.4-cpu-20211206"
},
{
"InferTemplateId": "tf2.4-py38-gpu",
"InferTemplateImage": "ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-gpu:py38-tensorflow2.4-cu110-20211206"
}
]
}
],
"RequestId": "3654e19b-c2ba-4953-b131-d66495723008"
}

print返回的结果,输出如下(镜像标识用来配置新模型的运行环境):
+------------+----------------+---------------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+
| 算法框架 | 算法框架版本号 | 支持的训练框架集合 | 镜像标识 | 镜像url |
+------------+----------------+---------------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+
| TENSORFLOW | 2.4 | ['TENSORFLOW', 'LIGHT'] | tf2.4-py38-cpu | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-cpu:py38-tf2.4-cpu-1.0.0 |
| TENSORFLOW | 2.4 | ['TENSORFLOW', 'LIGHT'] | tf2.4-py38-gpu | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-cpu:py38-tf2.4-cu11.0-1.0.0 |
| TENSORFLOW | 1.15 | ['TENSORFLOW', 'LIGHT', 'TI_ACC'] | tf1.15-py37-cpu | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-cpu:py37-tf1.15-cpu-1.0.0 |
| TENSORFLOW | 1.15 | ['TENSORFLOW', 'LIGHT', 'TI_ACC'] | tf1.15-py37-gpu | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-tensorflow-cpu:py37-tf1.15-cu10.0-1.0.0 |
| PYTORCH | 1.9 | ['PYTORCH', 'LIGHT', 'TI_ACC', 'AUTOML_CV'] | torch1.9.0-py38-cpu | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-pytorch-cpu:py38-torch1.9.0-cpu-1.0.0 |
| PYTORCH | 1.9 | ['PYTORCH', 'LIGHT', 'TI_ACC', 'AUTOML_CV'] | torch1.9.0-py38-cu111 | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-pytorch-gpu:py38-torch1.9.0-cu111-1.0.0 |
| PMML | 0.9.12 | ['SPARK', 'PYSPARK'] | pypmml-py38 | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-pmml:py38-pmml0.9.16-1.0.0 |
| OCR | 112 | ['AUTOML_OCR'] | ocr-serving | ccr.ccs.tencentyun.com/qcloud-ti-platform/ocr_serving_gpu:serving-v121-335584d |
| DETECTRON2 | 112 | ['DETECTRON2'] | detectron2-torch1.9.0-py38-cu111 | ccr.ccs.tencentyun.com/qcloud-ti-platform/ti-cloud-infer-pytorch-gpu:py38-torch1.9.0-detectron2-cu111-1.0.0 |
+------------+----------------+---------------------------------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------+

def create_training_task(self,
name,
framework,
cos_output,
worker_resource,
code_package_path,
ps_resource=None,
input_data_config=None,
worker_start_cmd=None,
ps_start_cmd=None,
tags=None,
tuning_parameters_dict={},
resource_group_id="",
remark=None,
log_enable=False,
log_logset_id=None,
log_topic_id=None,
vpc_id=None,
sub_net_id=None):
"""创建训练任务

:param name: 任务名称
:type name: str
:param framework: 运行的框架环境
:type framework: :class:`tikit.models.FrameworkInfo`
:param cos_output: 输出的cos信息
:type cos_output: str
:param worker_resource: worker节点的配置
:type worker_resource: :class:`tikit.models.ResourceConfigInfo`
:param code_package_path: 代码的cos信息
:type code_package_path: str
:param ps_resource: ps节点的配置
:type ps_resource: :class:`tikit.models.ResourceConfigInfo`
:param input_data_config: 输入的数据信息
:type input_data_config: list or :class:`tikit.models.TrainingDataConfig`
:param worker_start_cmd: worker的启动命令
:type worker_start_cmd: str
:param ps_start_cmd: ps节点的启动命令
:type ps_start_cmd: str
:param tags: 标签
:type tags: list of :class:`tikit.tencentcloud.tione.v20211111.models.Tag`
:param tuning_parameters_dict: 调参字典
:type tuning_parameters_dict: dict
:param resource_group_id: 预付费的资源组id
:type resource_group_id: str
:param remark: 描述
:type remark: str
:param log_enable: 日志开关
:type log_enable: bool
:param log_logset_id: 日志集id
:type log_logset_id: str
:param log_topic_id: 日志的topic id
:type log_topic_id: str
:param vpc_id: vpc的id
:type vpc_id: str
:param sub_net_id: 子网id
:type sub_net_id: str
:return:
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.CreateTrainingTaskResponse`
"""

def describe_training_frameworks(self):
"""查看训练框架

:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeTrainingFrameworksResponse`
print返回的结果,输出如下:
+------------+----------------------------------------+-------------------------+
| 框架名称 | 版本 | 训练模式 |
+------------+----------------------------------------+-------------------------+
| TENSORFLOW | ti-acc1.0-tf1.15-py3.6-cuda10.0-gpu | PS_WORKER |
| TENSORFLOW | light3.1.3-tf2.4-py3.8-cuda11.1-gpu | MPI, HOROVOD |
| TENSORFLOW | tf1.15-py3.7-cpu | PS_WORKER, MPI, HOROVOD |
| TENSORFLOW | tf1.15-py3.7-cuda10.0-gpu | PS_WORKER, MPI, HOROVOD |
| TENSORFLOW | tf2.4-py3.8-cpu | PS_WORKER, MPI, HOROVOD |
| TENSORFLOW | tf2.4-py3.8-cuda11.1-gpu | PS_WORKER, MPI, HOROVOD |
| PYTORCH | ti-acc1.0-torch1.9-py3.8-cuda11.1-gpu | DDP |
| PYTORCH | light3.1.3-torch1.9-py3.8-cuda11.1-gpu | DDP, MPI, HOROVOD |
| PYTORCH | torch1.9-py3.8-cuda11.1-gpu | DDP, MPI, HOROVOD |
| SPARK | spark2.4.5-cpu | SPARK |
| PYSPARK | spark2.4.5-py3.6-cpu | SPARK |
+------------+----------------------------------------+-------------------------+
"""

def describe_training_tasks(self, filters=None, tag_filters=None, offset=0, limit=50, order="DESC",
order_field="UpdateTime"):
"""获取训练任务列表

:param filters: 过滤器,eg:[{ "Name": "TaskStatus", "Values": ["Running"] }]
:type filters: list of Filter
:param tag_filters: 标签过滤器,eg:[{ "TagKey": "TagKeyA", "TagValue": ["TagValueA"] }]
:type tag_filters: list of TagFilter
:param offset: 偏移量,默认为0
:type offset: int
:param limit: 返回数量,默认为50
:type limit: int
:param order: 输出列表的排列顺序。取值范围:ASC:升序排列 DESC:降序排列
:type order: str
:param order_field: 排序的依据字段, 取值范围 "CreateTime" "UpdateTime"
:type order_field: str
:return:
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeTrainingTasksResponse`
"""

describe_training_task(self, task_id)
获取单个训练任务信息
:param task_id: 训练任务ID
:type task_id: str
:rtype: :class:`tencentcloud.tione.v20211111.models.DescribeTrainingTaskResponse`

describe_training_task_pods(self, task_id)
获取训练任务的pod列表
:param task_id: 训练任务ID
:type task_id: str
:rtype: :class:`tencentcloud.tione.v20211111.models.DescribeTrainingTaskPodsResponse`

def describe_train_logs(self, pod_name, start_time=None, end_time=None, limit=None, order=None,
context=None, filters=None):
"""查看训练任务的日志

:param pod_name: 查询哪个Pod的日志,支持通配符。查看某个训练任务的全部pod的日志可以填: "<task_id>*",如:train-51cd6bf7ec1000*
:type pod_name: str
:param start_time: 日志查询开始时间。RFC3339格式的时间字符串,比如2021-12-16T13:20:24+08:00,默认值为当前时间的前一个小时
:type start_time: str
:param end_time: 日志查询结束时间。RFC3339格式的时间字符串,比如2021-12-16T13:20:24+08:00,默认值为当前时间
:type end_time: str
:param limit: 日志查询条数,默认值100,最大值100
:type limit: int
:param order: 排序方向。(ASC | DESC) 默认值为DESC
:type order: str
:param context: 分页的游标
:type context: str
:param filters: 过滤Filters
:type filters: list of tikit.tencentcloud.tione.v20211111.models.Filter
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.DescribeLogsResponse`

返回的对象如果非空,就会有 next() 方法,能不断地获取下一页的日志(如果有多页的话),如下:
now_time = datetime.datetime.now(datetime.timezone.utc)
now_time_str = now_time.isoformat()
result = client.describe_train_logs("train-51cd6bf7ec1000-37c5p5nlr01s-launcher",
"2021-12-10T09:32:03.823509+00:00",
now_time_str,
limit=30)
print(result)
print(result.next())
print(result.next())
print(result.next())
"""

def push_training_metrics(self, timestamp, value_map, task_id=None, epoch=None, total_steps=None, step=None):
"""上报训练自定义指标(单条)。单个子账号每秒可以调用20次,请在您的训练代码中注意控制上报频率,避免超限报错;也可以改用 push_training_metrics_list 以列表形式上报

:param timestamp: 时间戳
:type timestamp: int
:param value_map: 指标映射。 指标名称 -> 指标值
:type value_map: map: str -> float
:param task_id: 任务ID。若为空,就取当前任务节点环境的 TI_TASK_ID 环境变量的值
:type task_id: str
:param epoch: epoch值
:type epoch: int
:param total_steps: 总步数
:type total_steps: int
:param step: 第几步
:type step: int
:return:
:rtype: :class:`tikit.tencentcloud.tione.v20211111.models.PushTrainingMetricsResponse`

client.push_training_metrics(int(time.time()), {"field1": 11, "field2": 12}, "task-id-00001", 3, 1000, 66)
"""

push_training_metrics_list(self, metric_list)
上报训练自定义指标(列表)
:param metric_list: MetricData 数组。 若任务ID为空,就取当前任务节点环境的 TI_TASK_ID 环境变量的值
:type metric_list: list of :class:`tencentcloud.tione.v20211111.models.MetricData`
:return:
:rtype: :class:`tencentcloud.tione.v20211111.models.PushTrainingMetricsResponse`

describe_training_metrics(self, task_id)
查询训练自定义指标
:param task_id: 任务ID
:type task_id: str
:rtype: :class:`tencentcloud.tione.v20211111.models.DescribeTrainingMetricsResponse`

stop_training_task(self, task_id)
停止某个训练任务
:param task_id: 训练任务ID
:type task_id: str
:rtype: :class:`tencentcloud.tione.v20211111.models.StopTrainingTaskResponse`

delete_training_task(self, task_id)
删除某个训练任务
:param task_id: 训练任务ID
:type task_id: str
:rtype: :class:`tencentcloud.tione.v20211111.models.DeleteTrainingTaskResponse`


中间配置:
tikit.models.FrameworkInfo

def new_custom(training_mode, image_type, image_url, registry_region=None, registry_id=None):
"""自定义训练框架的配置

:param training_mode: 训练模式。 通过describe_training_frameworks()查看列表
:type training_mode: str
:param image_type: 腾讯云容器镜像服务的镜像类型,如"CCR"
:type image_type: str
:param image_url: 腾讯云容器镜像服务的镜像地址
:type image_url: str
:param registry_region: 腾讯云容器镜像服务的镜像仓库所在的地域
:type registry_region: str
:param registry_id: 腾讯云容器镜像服务的镜像仓库ID
:type registry_id: str
:return:
:rtype:
"""

def new_custom_image(image_type, image_url, registry_region=None, registry_id=None):
"""自定义镜像的配置

:param image_type: 腾讯云容器镜像服务的镜像类型,如"CCR"
:type image_type: str
:param image_url: 腾讯云容器镜像服务的镜像地址
:type image_url: str
:param registry_region: 腾讯云容器镜像服务的镜像仓库所在的地域
:type registry_region: str
:param registry_id: 腾讯云容器镜像服务的镜像仓库ID
:type registry_id: str
:return:
:rtype:
"""

def new_system_framework(framework_name, framework_environment, training_mode):
"""系统内置的训练框架

:param framework_name: 框架名称。 通过describe_training_frameworks()查看列表
:type framework_name: str
:param framework_environment: 框架环境。 通过describe_training_frameworks()查看列表
:type framework_environment: str
:param training_mode: 训练模式。 通过describe_training_frameworks()查看列表
:type training_mode: str
:return:
:rtype:
"""

----------------------------------------------------
tikit.models.ResourceConfigInfo

def new_postpaid(instance_type, instance_num):
"""获取后付费模式下的资源配置

:param instance_type: 实例类型。通过 describe_postpaid_training_price() 查看实例列表
:type instance_type: str
:param instance_num: 实例数量
:type instance_num: int
:return:
:rtype:
"""

def new_prepaid(cpu, memory, gpu=0, gpu_type=None, instance_num=1):
"""获取预付费模式下的资源配置

:param cpu: CPU个数,单位是核
:type cpu: float
:param memory: 内存大小,单位是GB
:type memory: float
:param gpu_type: gpu类型
:type gpu_type: str
:param gpu: gpu个数
:type gpu: float
:param instance_num: 实例数量
:type instance_num: int
:return:
:rtype:
"""

----------------------------------------------------
tikit.models.TrainingDataConfig

def new_mount_cos(cos_str, target_path):
"""一个cos类型的训练数据

:param cos_str: cos存储,格式: <bucket>/<cos path>/
:type cos_str: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""

def new_dataset_mount(dataset_id, target_path):
"""一个dataset类型的训练数据

:param dataset_id: 数据集ID
:type dataset_id: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""

def new_mount_cfs(cfs_id, source_path, target_path):
"""新建一个cfs类型的训练数据集配置

:param cfs_id: CFS的ID
:type cfs_id: str
:param source_path: CFS的路径
:type source_path: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""

def new_mount_hdfs(hdfs_id, source_path, target_path):
"""新建一个hdfs类型的训练数据集配置

:param hdfs_id: EMR上HDFS的ID
:type hdfs_id: str
:param source_path: HDFS的路径
:type source_path: str
:param target_path: 目标挂载路径
:type target_path: str
:return:
:rtype:
"""

def new_mount_wedata_hdfs(wedata_id, source_path):
"""新建一个wedata hdfs类型的训练数据集配置

:param wedata_id: wedata数据源id
:type wedata_id: int
:param source_path: HDFS的路径
:type source_path: str
:return:
:rtype:
"""

def new_dataset(id_target_dict):
""" Deprecated !
新建一个dataset类型的训练数据集配置

:param id_target_dict: 数据集信息。 dataset id -> 下载的目标路径
:type id_target_dict: dict
:return:
:rtype:
"""

def new_cos_data(cos_str_target_dict):
"""Deprecated !
新建一个cos类型的训练数据集配置

:param cos_str_target_dict: 数据集信息。 <bucket>/<cos path>/ -> 下载的目标路径
:type cos_str_target_dict: dict
:return:
:rtype:
"""