资讯 社区 文档
技术能力
语音技术
文字识别
人脸与人体
图像技术
语言与知识
视频技术

查看模型评估报告

功能介绍

用于获取评估报告(整体指标)。

使用说明

本文API支持通过Python SDK、Go SDK、Java SDK和Node.js SDK调用,调用流程请参考《SDK安装及使用流程》。

SDK调用

调用示例

import os
from qianfan  import resources

# Credentials are supplied to the SDK through environment variables.
# Authenticate with a security-credential AK/SK pair: put your Access Key in place of
# your_iam_ak and your Secret Key in place of your_iam_sk. For how to obtain them see
# https://cloud.baidu.com/doc/Reference/s/9jwvz2egb
os.environ.update(
    QIANFAN_ACCESS_KEY="your_iam_ak",
    QIANFAN_SECRET_KEY="your_iam_sk",
)

# Route of the API described on this page. The value is fixed and must not be changed;
# it is the suffix of the request address given under "request structure" in the API docs.
api_route = "/wenxinworkshop/modelrepo/eval/report"

# Request body. Choose the fields you need from the request-parameter table on this page
# (the "Body parameters" of the API docs).
request_body = {"id": "ame-vwgs2ybhyhfv"}

# The second positional argument is passed as an empty string for this API.
resp = resources.console.utils.call_action(api_route, "", request_body)

print(resp.body)
package main
import (
    "context"
    "fmt"
    "os"
    "github.com/baidubce/bce-qianfan-sdk/go/qianfan"
)
func main() {
    // Authenticate with a security-credential AK/SK pair supplied through environment
    // variables: put your Access Key in place of your_iam_ak and your Secret Key in
    // place of your_iam_sk.
    os.Setenv("QIANFAN_ACCESS_KEY", "your_iam_ak")
    os.Setenv("QIANFAN_SECRET_KEY", "your_iam_sk")

    // Route of the API described on this page. The value is fixed and must not be
    // changed; it is the suffix of the request address given in the API docs.
    const route = "/wenxinworkshop/modelrepo/eval/report"

    // Request body. Choose the fields you need from the request-parameter table on
    // this page (the "Body parameters" of the API docs).
    body := map[string]interface{}{
        "id": "ame-vwgs2ybhyhfv",
    }

    action := qianfan.NewConsoleAction()
    res, err := action.Call(context.TODO(), route, "", body)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(res.Body))
}
import com.baidubce.qianfan.Qianfan;
import com.baidubce.qianfan.model.console.ConsoleResponse;
import com.baidubce.qianfan.util.CollUtils;
import com.baidubce.qianfan.util.Json;
import com.baidubce.qianfan.util.TypeRef;
import java.util.List;
import java.util.Map;

/**
 * Example: requests the evaluation report (overall metrics) of a model evaluation task
 * through the Qianfan console API and prints the serialized response.
 */
public class Dome {
    public static void main(String[] args) {
        // Authenticate with a security-credential AK/SK pair: put your Access Key in place
        // of your_iam_ak and your Secret Key in place of your_iam_sk.
        Qianfan qianfan = new Qianfan("your_iam_ak", "your_iam_sk");

        ConsoleResponse<List<Map<String, Object>>> response = qianfan.console()
                // Route of the API described on this page. The value is fixed and must not be
                // changed; it is the suffix of the request address given in the API docs.
                .route("/wenxinworkshop/modelrepo/eval/report")
                // When a request body is needed, either wrap it in your own request class or
                // build it with Map.of(). On Java 8, the SDK's CollUtils.mapOf() can be used
                // in place of Map.of().
                // Choose the fields you need from the request-parameter table on this page
                // (the "Body parameters" of the API docs).
                .body(CollUtils.mapOf(
                    "id", "ame-vwgs2ybhyhfv"
                ))
                // "result" is a JSON array of objects in the sample response, hence the
                // List<Map<String, Object>> target type.
                .execute(new TypeRef<List<Map<String, Object>>>() {});

        System.out.println(Json.serialize(response));
    }
}
import {consoleAction, setEnvVariable} from "@baiducloud/qianfan";

// Authenticate with a security-credential AK/SK pair supplied through environment
// variables: put your Access Key in place of your_iam_ak and your Secret Key in place
// of your_iam_sk.
setEnvVariable('QIANFAN_ACCESS_KEY', 'your_iam_ak');
setEnvVariable('QIANFAN_SECRET_KEY', 'your_iam_sk');

async function main() {
  const request = {
    // base_api_route: route of the API described on this page. The value is fixed and
    // must not be changed; it is the suffix of the request address given in the API docs.
    base_api_route: '/wenxinworkshop/modelrepo/eval/report',
    // data: request body. Choose the fields you need from the request-parameter table
    // on this page (the "Body parameters" of the API docs).
    data: {
      "id": "ame-vwgs2ybhyhfv",
    },
  };

  const res = await consoleAction(request);

  console.log(res);
}

main();

返回示例

{
    "log_id": "3617826755",
    "result": [
        {
            "modelName": "llama2_7b_32k_z_sft",
            "modelVersion": "1",
            "modelVersionSource": "Train",
            "evalMode": "manual",
            "evaluationName": "cl_联调_模型评估_用户bos",
            "id": "65eae1fb1xxx9ca97a1",
            "modelVersionId": 833,
            "modelId": 591,
            "userId": 1,
            "evaluationId": 401,
            "modelForm": "model",
            "modelIdStr": "am-dkxwxxxxjgw",
            "modelVersionIdStr": "amv-7ab3xxxtspe1",
            "evaluationIdStr": "ame-28zxxx2rn4",
            "evalUnitId": "ameu-gpvzxxxs0n",
            "inferDatasetId": "ds-p79kyxxx7sbk",
            "inferDatasetName": "cl_联调_模型评估_用户bos_llama2_7b_32k_z_sft_V1_jmrr",
            "effectMetric": {
                "accuracy": 0,
                "f1Score": 0,
                "rouge_1": 0,
                "rouge_2": 0,
                "rouge_l": 0,
                "bleu4": 0,
                "avgJudgeScore": 0,
                "stdJudgeScore": 0,
                "medianJudgeScore": 0,
                "scoreDistribution": null,
                "manualAvgScore": 0.5,
                "goodCaseProportion": 0,
                "subjectiveImpression": "1",
                "manualScoreDistribution": [
                    {
                        "dimension": "满意度",
                        "scoreDistribution": {
                            "-1": 2,
                            "1": 1
                        }
                    },
                    {
                        "dimension": "安全性",
                        "scoreDistribution": {
                            "-1": 2,
                            "0": 1
                        }
                    }
                ]
            },
            "performanceMetric": {}
        },
        {
            "modelName": "mixtral2",
            "modelVersion": "8",
            "modelVersionSource": "Train",
            "evalMode": "manual",
            "evaluationName": "cl_联调_模型评估_用户bos",
            "id": "65eae45dxxxcab739",
            "modelVersionId": 7xx,
            "modelId": 545,
            "userId": 1,
            "evaluationId": 401,
            "modelForm": "model",
            "modelIdStr": "am-ktcxxx88z",
            "modelVersionIdStr": "amv-g2acxxxg9v",
            "evaluationIdStr": "ame-28zxxx2rn4",
            "evalUnitId": "ameu-1uxpxxx8uc2",
            "inferDatasetId": "ds-ba82xxxguh",
            "inferDatasetName": "cl_联调_模型评估_用户bos_mixtral2_V8_x5xt",
            "effectMetric": {
                "accuracy": 0,
                "f1Score": 0,
                "rouge_1": 0,
                "rouge_2": 0,
                "rouge_l": 0,
                "bleu4": 0,
                "avgJudgeScore": 0,
                "stdJudgeScore": 0,
                "medianJudgeScore": 0,
                "scoreDistribution": null,
                "manualAvgScore": 0.5,
                "goodCaseProportion": 0,
                "subjectiveImpression": "2",
                "manualScoreDistribution": [
                    {
                        "dimension": "满意度",
                        "scoreDistribution": {
                            "-1": 2,
                            "1": 1
                        }
                    },
                    {
                        "dimension": "安全性",
                        "scoreDistribution": {
                            "-1": 2,
                            "0": 1
                        }
                    }
                ]
            },
            "performanceMetric": {}
        }
    ]
}
{
    "log_id": "3617826755",
    "result": [
        {
            "modelName": "llama2_7b_32k_z_sft",
            "modelVersion": "1",
            "modelVersionSource": "Train",
            "evalMode": "manual",
            "evaluationName": "cl_联调_模型评估_用户bos",
            "id": "65eae1fb1xxx9ca97a1",
            "modelVersionId": 833,
            "modelId": 591,
            "userId": 1,
            "evaluationId": 401,
            "modelForm": "model",
            "modelIdStr": "am-dkxwxxxxjgw",
            "modelVersionIdStr": "amv-7ab3xxxtspe1",
            "evaluationIdStr": "ame-28zxxx2rn4",
            "evalUnitId": "ameu-gpvzxxxs0n",
            "inferDatasetId": "ds-p79kyxxx7sbk",
            "inferDatasetName": "cl_联调_模型评估_用户bos_llama2_7b_32k_z_sft_V1_jmrr",
            "effectMetric": {
                "accuracy": 0,
                "f1Score": 0,
                "rouge_1": 0,
                "rouge_2": 0,
                "rouge_l": 0,
                "bleu4": 0,
                "avgJudgeScore": 0,
                "stdJudgeScore": 0,
                "medianJudgeScore": 0,
                "scoreDistribution": null,
                "manualAvgScore": 0.5,
                "goodCaseProportion": 0,
                "subjectiveImpression": "1",
                "manualScoreDistribution": [
                    {
                        "dimension": "满意度",
                        "scoreDistribution": {
                            "-1": 2,
                            "1": 1
                        }
                    },
                    {
                        "dimension": "安全性",
                        "scoreDistribution": {
                            "-1": 2,
                            "0": 1
                        }
                    }
                ]
            },
            "performanceMetric": {}
        },
        {
            "modelName": "mixtral2",
            "modelVersion": "8",
            "modelVersionSource": "Train",
            "evalMode": "manual",
            "evaluationName": "cl_联调_模型评估_用户bos",
            "id": "65eae45dxxxcab739",
            "modelVersionId": 7xx,
            "modelId": 545,
            "userId": 1,
            "evaluationId": 401,
            "modelForm": "model",
            "modelIdStr": "am-ktcxxx88z",
            "modelVersionIdStr": "amv-g2acxxxg9v",
            "evaluationIdStr": "ame-28zxxx2rn4",
            "evalUnitId": "ameu-1uxpxxx8uc2",
            "inferDatasetId": "ds-ba82xxxguh",
            "inferDatasetName": "cl_联调_模型评估_用户bos_mixtral2_V8_x5xt",
            "effectMetric": {
                "accuracy": 0,
                "f1Score": 0,
                "rouge_1": 0,
                "rouge_2": 0,
                "rouge_l": 0,
                "bleu4": 0,
                "avgJudgeScore": 0,
                "stdJudgeScore": 0,
                "medianJudgeScore": 0,
                "scoreDistribution": null,
                "manualAvgScore": 0.5,
                "goodCaseProportion": 0,
                "subjectiveImpression": "2",
                "manualScoreDistribution": [
                    {
                        "dimension": "满意度",
                        "scoreDistribution": {
                            "-1": 2,
                            "1": 1
                        }
                    },
                    {
                        "dimension": "安全性",
                        "scoreDistribution": {
                            "-1": 2,
                            "0": 1
                        }
                    }
                ]
            },
            "performanceMetric": {}
        }
    ]
}
{
    "log_id": "3617826755",
    "result": [
        {
            "modelName": "llama2_7b_32k_z_sft",
            "modelVersion": "1",
            "modelVersionSource": "Train",
            "evalMode": "manual",
            "evaluationName": "cl_联调_模型评估_用户bos",
            "id": "65eae1fb1xxx9ca97a1",
            "modelVersionId": 833,
            "modelId": 591,
            "userId": 1,
            "evaluationId": 401,
            "modelForm": "model",
            "modelIdStr": "am-dkxwxxxxjgw",
            "modelVersionIdStr": "amv-7ab3xxxtspe1",
            "evaluationIdStr": "ame-28zxxx2rn4",
            "evalUnitId": "ameu-gpvzxxxs0n",
            "inferDatasetId": "ds-p79kyxxx7sbk",
            "inferDatasetName": "cl_联调_模型评估_用户bos_llama2_7b_32k_z_sft_V1_jmrr",
            "effectMetric": {
                "accuracy": 0,
                "f1Score": 0,
                "rouge_1": 0,
                "rouge_2": 0,
                "rouge_l": 0,
                "bleu4": 0,
                "avgJudgeScore": 0,
                "stdJudgeScore": 0,
                "medianJudgeScore": 0,
                "scoreDistribution": null,
                "manualAvgScore": 0.5,
                "goodCaseProportion": 0,
                "subjectiveImpression": "1",
                "manualScoreDistribution": [
                    {
                        "dimension": "满意度",
                        "scoreDistribution": {
                            "-1": 2,
                            "1": 1
                        }
                    },
                    {
                        "dimension": "安全性",
                        "scoreDistribution": {
                            "-1": 2,
                            "0": 1
                        }
                    }
                ]
            },
            "performanceMetric": {}
        },
        {
            "modelName": "mixtral2",
            "modelVersion": "8",
            "modelVersionSource": "Train",
            "evalMode": "manual",
            "evaluationName": "cl_联调_模型评估_用户bos",
            "id": "65eae45dxxxcab739",
            "modelVersionId": 7xx,
            "modelId": 545,
            "userId": 1,
            "evaluationId": 401,
            "modelForm": "model",
            "modelIdStr": "am-ktcxxx88z",
            "modelVersionIdStr": "amv-g2acxxxg9v",
            "evaluationIdStr": "ame-28zxxx2rn4",
            "evalUnitId": "ameu-1uxpxxx8uc2",
            "inferDatasetId": "ds-ba82xxxguh",
            "inferDatasetName": "cl_联调_模型评估_用户bos_mixtral2_V8_x5xt",
            "effectMetric": {
                "accuracy": 0,
                "f1Score": 0,
                "rouge_1": 0,
                "rouge_2": 0,
                "rouge_l": 0,
                "bleu4": 0,
                "avgJudgeScore": 0,
                "stdJudgeScore": 0,
                "medianJudgeScore": 0,
                "scoreDistribution": null,
                "manualAvgScore": 0.5,
                "goodCaseProportion": 0,
                "subjectiveImpression": "2",
                "manualScoreDistribution": [
                    {
                        "dimension": "满意度",
                        "scoreDistribution": {
                            "-1": 2,
                            "1": 1
                        }
                    },
                    {
                        "dimension": "安全性",
                        "scoreDistribution": {
                            "-1": 2,
                            "0": 1
                        }
                    }
                ]
            },
            "performanceMetric": {}
        }
    ]
}
{
    log_id: '3617826755',
    result: [
        {
            modelName: 'llama2_7b_32k_z_sft',
            modelVersion: '1',
            modelVersionSource: 'Train',
            evalMode: 'manual',
            evaluationName: 'cl_联调_模型评估_用户bos',
            id: '65eae1fb1xxx9ca97a1',
            modelVersionId: 833,
            modelId: 591,
            userId: 1,
            evaluationId: 401,
            modelForm: 'model',
            modelIdStr: 'am-dkxwxxxxjgw',
            modelVersionIdStr: 'amv-7ab3xxxtspe1',
            evaluationIdStr: 'ame-28zxxx2rn4',
            evalUnitId: 'ameu-gpvzxxxs0n',
            inferDatasetId: 'ds-p79kyxxx7sbk',
            inferDatasetName: 'cl_联调_模型评估_用户bos_llama2_7b_32k_z_sft_V1_jmrr',
            effectMetric: {
                accuracy: 0,
                f1Score: 0,
                rouge_1: 0,
                rouge_2: 0,
                rouge_l: 0,
                bleu4: 0,
                avgJudgeScore: 0,
                stdJudgeScore: 0,
                medianJudgeScore: 0,
                scoreDistribution: null,
                manualAvgScore: 0.5,
                goodCaseProportion: 0,
                subjectiveImpression: '1',
                manualScoreDistribution: [
                    {
                        dimension: '满意度',
                        scoreDistribution: {
                            -1: 2,
                            1: 1
                        }
                    },
                    {
                        dimension: '安全性',
                        scoreDistribution: {
                            -1: 2,
                            0: 1
                        }
                    }
                ]
            },
            performanceMetric: {}
        },
        {
            modelName: 'mixtral2',
            modelVersion: '8',
            modelVersionSource: 'Train',
            evalMode: 'manual',
            evaluationName: 'cl_联调_模型评估_用户bos',
            id: '65eae45dxxxcab739',
            modelVersionId: 7xx,
            modelId: 545,
            userId: 1,
            evaluationId: 401,
            modelForm: 'model',
            modelIdStr: 'am-ktcxxx88z',
            modelVersionIdStr: 'amv-g2acxxxg9v',
            evaluationIdStr: 'ame-28zxxx2rn4',
            evalUnitId: 'ameu-1uxpxxx8uc2',
            inferDatasetId: 'ds-ba82xxxguh',
            inferDatasetName: 'cl_联调_模型评估_用户bos_mixtral2_V8_x5xt',
            effectMetric: {
                accuracy: 0,
                f1Score: 0,
                rouge_1: 0,
                rouge_2: 0,
                rouge_l: 0,
                bleu4: 0,
                avgJudgeScore: 0,
                stdJudgeScore: 0,
                medianJudgeScore: 0,
                scoreDistribution: null,
                manualAvgScore: 0.5,
                goodCaseProportion: 0,
                subjectiveImpression: '2',
                manualScoreDistribution: [
                    {
                        dimension: '满意度',
                        scoreDistribution: {
                            -1: 2,
                            1: 1
                        }
                    },
                    {
                        dimension: '安全性',
                        scoreDistribution: {
                            -1: 2,
                            0: 1
                        }
                    }
                ]
            },
            performanceMetric: {}
        }
    ]
}

请求参数

名称 类型 必填 描述
id string 是 评估任务id,示例:ame-vwgs2ybhyhfv,说明:
(1)可以通过以下方式获取该字段值:
· 方式一,通过调用创建模型评估任务接口,返回的字段evalIdStr获取
· 方式二,在控制台-模型评估页面,点击某评估任务名称打开详情页,在任务详情的基本信息中查看,如下图所示
image.png
(2)该字段新增支持string类型,如果之前使用的是int类型,建议变更为string类型,后续可能将逐步废弃int类型;例如之前是通过调用创建模型评估任务接口,返回的字段evalId获取,建议替换为返回的evalIdStr获取

返回参数

名称 类型 描述
log_id string 请求ID
result object[] 请求结果

result说明

名称 类型 描述
evaluationId int 评估任务ID
evaluationName string 评估任务名称
modelId int 模型ID
modelVersionId int 模型版本ID
modelName string 模型名
modelVersion string 模型版本号
modelVersionSource string 模型版本来源
evalMode string 评估模式,说明:
(1)有以下评估模式 :
· rule:基于规则
· model:裁判员模型
· manual:人工评估
(2)多个模式同时使用时,以英文逗号“,”拼接,示例:“model,manual,rule”
effectMetric object 效果指标
modelForm string 评估的物料类型,说明:
· model:旧数据(推理结果集评估功能上线前的评估任务)类型都是模型,即值为model
· inferDataset:推理结果集
modelIdStr string 模型字符串id
modelVersionIdStr string 模型版本字符串id
evaluationIdStr string 评估任务字符串id
evalUnitId string 评估子任务id,用于唯一标识评估子任务
inferDatasetId string 当前评估子任务使用的推理结果集id
inferDatasetName string 当前评估子任务使用的推理结果集名称

effectMetric说明

名称 类型 描述
id string 单个评估报告的主键(注:在返回示例中,该字段位于result数组元素的顶层,与effectMetric同级,而非effectMetric内部)
accuracy number 基于规则-准确率打分:准确率(Accuracy)
f1Score number 基于规则-准确率打分:F1分数(F1 Score)
rouge_1 number 基于规则-相似度打分:ROUGE-1
rouge_2 number 基于规则-相似度打分:ROUGE-2
rouge_l number 基于规则-相似度打分:ROUGE-L
bleu4 number 基于规则-相似度打分:BLEU-4
avgJudgeScore number 裁判员打分-均值
stdJudgeScore number 裁判员打分-标准差
medianJudgeScore number 裁判员打分-中位数
scoreDistribution map[string]int 裁判员打分-分值分布,说明:
· 含有从最小值到最大值的所有分数
· -1为无效打分
manualAvgScore number 人工打分-平均分
goodCaseProportion int good case占比
subjectiveImpression string 人工打分-主观印象
manualScoreDistribution object[] 人工打分-各评价维度的分数分布,字段详见manualScoreDistribution说明

manualScoreDistribution说明

名称 类型 描述
dimension string 评价维度
scoreDistribution map[string]int 维度分值分布,key为分值,value为分值的个数
上一篇
查看模型评估详情
下一篇
停止模型评估任务