查看模型评估任务报告

更新时间：2025-04-16

功能介绍

用于根据评估任务id，查看模型评估任务报告。

使用说明

本文API支持通过Python SDK、Go SDK、Java SDK 和 Node.js SDK调用，调用流程请参考SDK安装及使用流程。

权限说明

调用本文API需符合权限要求。权限介绍及分配方式，请查看角色与权限控制列表、账号创建与权限分配。调用者需具有以下任一权限：

完全控制千帆大模型平台的权限：QianfanFullControlAccessPolicy
只读访问千帆大模型平台的权限：QianfanReadAccessPolicy
完全控制千帆大模型平台模型调优的权限：QianfanModelTuningFullControlAccessPolicy
只读访问千帆大模型平台模型调优的权限：QianfanModelTuningReadAccessPolicy

SDK调用

调用示例

import os
from qianfan  import resources

# Initialize the credentials through environment variables.
# Authenticate with an IAM AK/SK pair: replace your_iam_ak with your Access Key
# and your_iam_sk with your Secret Key. To obtain them, see
# https://cloud.baidu.com/doc/Reference/s/9jwvz2egb
os.environ["QIANFAN_ACCESS_KEY"] = "your_iam_ak"
os.environ["QIANFAN_SECRET_KEY"] = "your_iam_sk"

# Fixed value for this API, do not modify; it is the request-URL suffix listed
# under "request structure" in the API documentation.
api_route = "/v2/eval"
# Fixed value for this API, do not modify; it is the Action query parameter
# listed under "request parameters" in the API documentation.
api_action = "DescribeEvalTaskReport"
# Body parameters; choose them according to the request-parameter section of
# this document.
request_body = {"taskId": "ame-4kvnxxxxx"}

resp = resources.console.utils.call_action(api_route, api_action, request_body)
print(resp.body)

package main

import (
    "context"
    "fmt"
    "os"

    "github.com/baidubce/bce-qianfan-sdk/go/qianfan"
)

// main queries the evaluation report of one model evaluation task through the
// Qianfan console API and prints the raw response body.
func main() {
	// Authenticate with an IAM AK/SK pair supplied through environment
	// variables; replace your_iam_ak with your Access Key and your_iam_sk
	// with your Secret Key.
	os.Setenv("QIANFAN_ACCESS_KEY", "your_iam_ak")
	os.Setenv("QIANFAN_SECRET_KEY", "your_iam_sk")

	// Body parameters; choose them according to the request-parameter
	// section of this document.
	reqBody := map[string]interface{}{
		"taskId": "ame-4kvnxxx",
	}

	action := qianfan.NewConsoleAction()
	resp, callErr := action.Call(
		context.TODO(),
		// Fixed value for this API, do not modify; it is the request-URL
		// suffix listed under "request structure" in the API documentation.
		"/v2/eval",
		// Fixed value for this API, do not modify; it is the Action query
		// parameter listed under "request parameters" in the API documentation.
		"DescribeEvalTaskReport",
		reqBody,
	)
	if callErr != nil {
		panic(callErr)
	}
	fmt.Println(string(resp.Body))
}

import com.baidubce.qianfan.Qianfan;
import com.baidubce.qianfan.model.console.ConsoleResponse;
import com.baidubce.qianfan.util.CollUtils;
import com.baidubce.qianfan.util.Json;
import java.util.Map;

/**
 * Sample that queries the evaluation report of one model evaluation task
 * through the Qianfan console API and prints the serialized response.
 */
public class Dome {
    public static void main(String[] args) {
        // Authenticate with an IAM AK/SK pair; replace your_iam_ak with your
        // Access Key and your_iam_sk with your Secret Key.
        Qianfan client = new Qianfan("your_iam_ak", "your_iam_sk");

        ConsoleResponse<Map<String, Object>> report = client.console()
                // Fixed value for this API, do not modify; it is the request-URL
                // suffix listed under "request structure" in the API documentation.
                .route("/v2/eval")
                // Fixed value for this API, do not modify; it is the Action query
                // parameter listed under "request parameters" in the API documentation.
                .action("DescribeEvalTaskReport")
                // When parameters are required, either wrap them in your own request
                // class or build the request body with Map.of(). On Java 8, use the
                // SDK-provided CollUtils.mapOf() in place of Map.of().
                // Choose the body parameters according to the request-parameter
                // section of this document.
                .body(CollUtils.mapOf("taskId", "ame-4kvnxxx"))
                .execute();

        System.out.println(Json.serialize(report));
    }
}

import {consoleAction, setEnvVariable} from "@baiducloud/qianfan";

// Authenticate with an IAM AK/SK pair supplied through environment variables;
// replace your_iam_ak with your Access Key and your_iam_sk with your Secret Key.
setEnvVariable('QIANFAN_ACCESS_KEY','your_iam_ak');
setEnvVariable('QIANFAN_SECRET_KEY','your_iam_sk');

/**
 * Requests the evaluation report of one model evaluation task through
 * consoleAction and logs the response.
 */
async function main() {
  const res = await consoleAction({
    // base_api_route: fixed value for this API, do not modify; it is the
    // request-URL suffix listed under "request structure" in the API documentation.
    base_api_route: '/v2/eval',
    // action: fixed value for this API, do not modify; it is the Action query
    // parameter listed under "request parameters" in the API documentation.
    action: 'DescribeEvalTaskReport',
    // data: body parameters; choose them according to the request-parameter
    // section of this document.
    data: {
      "taskId": "ame-4kvnxxx"
    }
  });

  console.log(res);
}

main();

返回示例

{
	'requestId': 'd60a00c4-a724-4851-96e5-b4dc3b258ca0',
	'result': [
		{
			'taskId': 'ame-4kvnxxxx',
			'taskName': '自动评估_停止测试0910',
			'modelId': 'amv-tts8v6re61hp',
			'inferDatasetId': 'ds-ecwqqjb787dk1vm6',
			'evalObjectType': 'service',
			'evalMode': 'rule',
			'effectMetric': {
				'accuracy': 0,
				'f1Score': 0.34983957,
				'rouge_1': 0.33882716,
				'rouge_2': 0.15241386,
				'rouge_l': 0.26100817,
				'bleu4': 0.09671887,
				'avgJudgeScore': 0,
				'stdJudgeScore': 0,
				'medianJudgeScore': 0,
				'scoreDistribution': null,
				'manualAvgScore': 0,
				'goodCaseProportion': 0,
				'subjectiveImpression': '',
				'manualScoreDistribution': null,
				'gsbDistribution': null
			}
		},
		{
			'taskId': 'ame-4kvnxxxx',
			'taskName': '自动评估_停止测试0910',
			'modelId': 'amv-6j6is3sp166h',
			'inferDatasetId': 'ds-sueg3fqnd14h9kqt',
			'evalObjectType': 'service',
			'evalMode': 'rule',
			'effectMetric': {
				'accuracy': 0,
				'f1Score': 0.34691638,
				'rouge_1': 0.32689363,
				'rouge_2': 0.13487022,
				'rouge_l': 0.25140443,
				'bleu4': 0.087691635,
				'edit_dist': 331.97778,
				'embedding_dist': 0.16930991,
				'avgJudgeScore': 0,
				'stdJudgeScore': 0,
				'medianJudgeScore': 0,
				'scoreDistribution': null,
				'manualAvgScore': 0,
				'goodCaseProportion': 0,
				'subjectiveImpression': '',
				'manualScoreDistribution': null,
				'gsbDistribution': null
			}
		}
	]
}

{
	"requestId": "d60a00c4-a724-4851-96e5-b4dc3b258ca0",
	"result": [
		{
			"taskId": "ame-4kvnxxxx",
			"taskName": "自动评估_停止测试0910",
			"modelId": "amv-tts8v6re61hp",
			"inferDatasetId": "ds-ecwqqjb787dk1vm6",
			"evalObjectType": "service",
			"evalMode": "rule",
			"effectMetric": {
				"accuracy": 0,
				"f1Score": 0.34983957,
				"rouge_1": 0.33882716,
				"rouge_2": 0.15241386,
				"rouge_l": 0.26100817,
				"bleu4": 0.09671887,
				"avgJudgeScore": 0,
				"stdJudgeScore": 0,
				"medianJudgeScore": 0,
				"scoreDistribution": null,
				"manualAvgScore": 0,
				"goodCaseProportion": 0,
				"subjectiveImpression": "",
				"manualScoreDistribution": null,
				"gsbDistribution": null
			}
		},
		{
			"taskId": "ame-4kvnxxxx",
			"taskName": "自动评估_停止测试0910",
			"modelId": "amv-6j6is3sp166h",
			"inferDatasetId": "ds-sueg3fqnd14h9kqt",
			"evalObjectType": "service",
			"evalMode": "rule",
			"effectMetric": {
				"accuracy": 0,
				"f1Score": 0.34691638,
				"rouge_1": 0.32689363,
				"rouge_2": 0.13487022,
				"rouge_l": 0.25140443,
				"bleu4": 0.087691635,
				"edit_dist": 331.97778,
				"embedding_dist": 0.16930991,
				"avgJudgeScore": 0,
				"stdJudgeScore": 0,
				"medianJudgeScore": 0,
				"scoreDistribution": null,
				"manualAvgScore": 0,
				"goodCaseProportion": 0,
				"subjectiveImpression": "",
				"manualScoreDistribution": null,
				"gsbDistribution": null
			}
		}
	]
}

{
	"requestId": "d60a00c4-a724-4851-96e5-b4dc3b258ca0",
	"result": [
		{
			"taskId": "ame-4kvnxxxx",
			"taskName": "自动评估_停止测试0910",
			"modelId": "amv-tts8v6re61hp",
			"inferDatasetId": "ds-ecwqqjb787dk1vm6",
			"evalObjectType": "service",
			"evalMode": "rule",
			"effectMetric": {
				"accuracy": 0,
				"f1Score": 0.34983957,
				"rouge_1": 0.33882716,
				"rouge_2": 0.15241386,
				"rouge_l": 0.26100817,
				"bleu4": 0.09671887,
				"avgJudgeScore": 0,
				"stdJudgeScore": 0,
				"medianJudgeScore": 0,
				"scoreDistribution": null,
				"manualAvgScore": 0,
				"goodCaseProportion": 0,
				"subjectiveImpression": "",
				"manualScoreDistribution": null,
				"gsbDistribution": null
			}
		},
		{
			"taskId": "ame-4kvnxxxx",
			"taskName": "自动评估_停止测试0910",
			"modelId": "amv-6j6is3sp166h",
			"inferDatasetId": "ds-sueg3fqnd14h9kqt",
			"evalObjectType": "service",
			"evalMode": "rule",
			"effectMetric": {
				"accuracy": 0,
				"f1Score": 0.34691638,
				"rouge_1": 0.32689363,
				"rouge_2": 0.13487022,
				"rouge_l": 0.25140443,
				"bleu4": 0.087691635,
				"edit_dist": 331.97778,
				"embedding_dist": 0.16930991,
				"avgJudgeScore": 0,
				"stdJudgeScore": 0,
				"medianJudgeScore": 0,
				"scoreDistribution": null,
				"manualAvgScore": 0,
				"goodCaseProportion": 0,
				"subjectiveImpression": "",
				"manualScoreDistribution": null,
				"gsbDistribution": null
			}
		}
	]
}

{
	requestId: 'd60a00c4-a724-4851-96e5-b4dc3b258ca0',
	result: [
		{
			taskId: 'ame-4kvnxxxx',
			taskName: '自动评估_停止测试0910',
			modelId: 'amv-tts8v6re61hp',
			inferDatasetId: 'ds-ecwqqjb787dk1vm6',
			evalObjectType: 'service',
			evalMode: 'rule',
			effectMetric: {
				accuracy: 0,
				f1Score: 0.34983957,
				rouge_1: 0.33882716,
				rouge_2: 0.15241386,
				rouge_l: 0.26100817,
				bleu4: 0.09671887,
				avgJudgeScore: 0,
				stdJudgeScore: 0,
				medianJudgeScore: 0,
				scoreDistribution: null,
				manualAvgScore: 0,
				goodCaseProportion: 0,
				subjectiveImpression: '',
				manualScoreDistribution: null,
				gsbDistribution: null
			}
		},
		{
			taskId: 'ame-4kvnxxxx',
			taskName: '自动评估_停止测试0910',
			modelId: 'amv-6j6is3sp166h',
			inferDatasetId: 'ds-sueg3fqnd14h9kqt',
			evalObjectType: 'service',
			evalMode: 'rule',
			effectMetric: {
				accuracy: 0,
				f1Score: 0.34691638,
				rouge_1: 0.32689363,
				rouge_2: 0.13487022,
				rouge_l: 0.25140443,
				bleu4: 0.087691635,
				edit_dist: 331.97778,
				embedding_dist: 0.16930991,
				avgJudgeScore: 0,
				stdJudgeScore: 0,
				medianJudgeScore: 0,
				scoreDistribution: null,
				manualAvgScore: 0,
				goodCaseProportion: 0,
				subjectiveImpression: '',
				manualScoreDistribution: null,
				gsbDistribution: null
			}
		}
	]
}

请求参数

名称	类型	必填	描述
taskId	string	是	评估任务ID，说明：可以通过以下方式获取该字段值： · 方式一，调用创建模型评估任务接口，从返回的result字段获取 · 方式二，在控制台-模型调优-模型评估，点击某评估任务名称打开详情页，在任务详情的基本信息中查看，如下图所示

返回参数

名称	类型	描述
requestId	string	请求ID
code	string	错误码，请求失败时返回
message	string	错误信息，请求失败时返回
result	List<object>	请求结果，请求成功时返回

result说明

名称	类型	描述
taskId	string	评估任务ID
taskName	string	评估任务名称
modelId	string	模型版本ID
inferDatasetId	string	当前评估子任务使用的推理结果集id
evalObjectType	string	评估的数据类型，说明： · model：模型推理 · inferDataset：推理结果集
evalMode	string	评估模式，可选值如下： · rule：基于规则 · model：裁判员模型 · manual：人工评估 · rule,model：同时支持自动规则和自动裁判员评估
effectMetric	object	效果指标

effectMetric说明

名称	类型	描述
accuracy	number	基于规则-准确率打分（accuracy）
f1Score	number	基于规则-准确率打分（F1 分数）
rouge_1	number	基于规则-相似度打分（ROUGE-1）
rouge_2	number	基于规则-相似度打分（ROUGE-2）
rouge_l	number	基于规则-相似度打分（ROUGE-L）
bleu4	number	基于规则-相似度打分（BLEU-4）
avgJudgeScore	number	裁判员打分-均值
stdJudgeScore	number	裁判员打分-标准差
medianJudgeScore	number	裁判员打分-中位数
scoreDistribution	map[string]int	裁判员打分-分值分布，说明：（1）含有从最小值到最大值的所有分数（2）-1为无效打分
manualAvgScore	number	人工打分-平均分
goodCaseProportion	number	good case占比
subjectiveImpression	string	人工打分-主观印象
manualScoreDistribution	object[]	人工打分-维度分数分布
gsbDistribution	map[string]int	gsb打分分布

manualScoreDistribution说明

名称	类型	描述
dimension	string	评价维度
scoreDistribution	map[string]int	维度分值分布，key为分值，value为分值的个数

查看模型评估任务详情

删除模型评估任务