资讯 社区 文档
技术能力
语音技术
文字识别
人脸与人体
图像技术
语言与知识
视频技术

查看数据清洗任务详情

功能介绍

本接口用于查看数据清洗任务详情。

注意事项

(1)通过API查看数据清洗任务时,返回的字段与千帆控制台页面展示的字段不完全一致:

  • 本文API参数中的部分字段,可能不会在千帆控制台页面展示
  • 千帆控制台页面展示的部分字段,可能不包含在本文API参数中
  • 后续会持续完善API功能,请关注API文档更新

(2)本文API支持通过Python SDK、Go SDK、Java SDK 和 Node.js SDK调用,调用流程请参考SDK安装及使用流程

SDK调用

调用示例

import os
from qianfan  import resources

# Authentication is initialised through environment variables.
# Call with the security-authentication AK/SK: replace your_iam_ak with your
# Access Key and your_iam_sk with your Secret Key. For how to obtain them, see
# https://cloud.baidu.com/doc/Reference/s/9jwvz2egb
os.environ.update(
    {
        "QIANFAN_ACCESS_KEY": "your_iam_ak",
        "QIANFAN_SECRET_KEY": "your_iam_sk",
    }
)

# Route of this API. The value is fixed and must not be modified; it is the
# suffix of the request address given in the HTTP documentation
# (request description -> request address).
api_route = "/wenxinworkshop/etl/detail"

# Corresponds to the Body parameters in the HTTP documentation
# (request description -> request parameters -> Body parameters).
# See the Body parameter description and pick the parameters you need.
request_body = {"etlId": "task-9tff1q3h7ngdmgh4"}

resp = resources.console.utils.call_action(api_route, "", request_body)

print(resp.body)
package main
import (
    "context"
    "fmt"
    "os"
    "github.com/baidubce/bce-qianfan-sdk/go/qianfan"
)
// main queries the details of a data-cleaning task through the console
// action client and prints the raw response body.
func main() {
    // Authenticate with the security-authentication AK/SK, initialised through
    // environment variables. Replace your_iam_ak with your Access Key and
    // your_iam_sk with your Secret Key.
    os.Setenv("QIANFAN_ACCESS_KEY", "your_iam_ak")
    os.Setenv("QIANFAN_SECRET_KEY", "your_iam_sk")

    // Route of this API. The value is fixed and must not be modified; it is
    // the suffix of the request address given in the HTTP documentation
    // (request description -> request address).
    const apiRoute = "/wenxinworkshop/etl/detail"

    // Corresponds to the Body parameters in the HTTP documentation
    // (request description -> request parameters -> Body parameters).
    // See the Body parameter description and pick the parameters you need.
    requestBody := map[string]interface{}{
        "etlId": "task-9tff1q3h7ngdmgh4",
    }

    action := qianfan.NewConsoleAction()
    resp, err := action.Call(context.TODO(), apiRoute, "", requestBody)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(resp.Body))
}
import com.baidubce.qianfan.Qianfan;
import com.baidubce.qianfan.model.console.ConsoleResponse;
import com.baidubce.qianfan.util.CollUtils;
import com.baidubce.qianfan.util.Json;
import java.util.Map;

/**
 * Sample that queries the details of a data-cleaning task through the
 * console API and prints the serialized response.
 */
public class Dome {
    public static void main(String[] args) {
        // Route of this API. The value is fixed and must not be modified; it is
        // the suffix of the request address given in the HTTP documentation
        // (request description -> request address).
        final String apiRoute = "/wenxinworkshop/etl/detail";

        // Authenticate with the security-authentication AK/SK. Replace
        // your_iam_ak with your Access Key and your_iam_sk with your Secret Key.
        Qianfan client = new Qianfan("your_iam_ak", "your_iam_sk");

        // When parameters must be passed, you can wrap them in your own request
        // class or build the request body with Map.of().
        // On Java 8, CollUtils.mapOf() provided by the SDK can replace Map.of().
        // The body corresponds to the Body parameters in the HTTP documentation
        // (request description -> request parameters -> Body parameters);
        // see the Body parameter description and pick the parameters you need.
        ConsoleResponse<Map<String, Object>> response = client.console()
                .route(apiRoute)
                .body(CollUtils.mapOf("etlId", "task-9tff1q3h7ngdmgh4"))
                .execute();

        System.out.println(Json.serialize(response));
    }
}
import {consoleAction, setEnvVariable} from "@baiducloud/qianfan";

// Authenticate with the security-authentication AK/SK, initialised through
// environment variables. Replace your_iam_ak with your Access Key and
// your_iam_sk with your Secret Key.
setEnvVariable('QIANFAN_ACCESS_KEY', 'your_iam_ak');
setEnvVariable('QIANFAN_SECRET_KEY', 'your_iam_sk');

/**
 * Queries the details of a data-cleaning task through the console action
 * helper and logs the response.
 */
async function main() {
  const request = {
    // base_api_route: route of this API. The value is fixed and must not be
    // modified; it is the suffix of the request address given in the HTTP
    // documentation (request description -> request address).
    base_api_route: '/wenxinworkshop/etl/detail',
    // data: corresponds to the Body parameters in the HTTP documentation
    // (request description -> request parameters -> Body parameters);
    // see the Body parameter description and pick the parameters you need.
    data: {
      "etlId": "task-9tff1q3h7ngdmgh4"
    },
  };

  const res = await consoleAction(request);

  console.log(res);
}

main();

返回示例

{
    "log_id": "44k3yj73ms178179",
    "result": {
        "id": 273,
        "etlTaskId": "task-7bynx9aaa1qyex2s",
        "userId": 113,
        "sourceDatasetId": 2235,
        "destDatasetId": 2230,
        "taskId": 5331,
        "entityCount": 1,
        "entityType": 2,
        "operationsV2": {
            "clean": [
                {
                    "name": "remove_invisible_character",
                    "args": {}
                },
                {
                    "name": "replace_uniform_whitespace",
                    "args": {}
                },
                {
                    "name": "remove_non_meaning_characters",
                    "args": {}
                },
                {
                    "name": "replace_traditional_chinese_to_simplified",
                    "args": {}
                },
                {
                    "name": "remove_web_identifiers",
                    "args": {}
                },
                {
                    "name": "remove_emoji",
                    "args": {}
                },
                {
                    "name": "save_pipeline_clean",
                    "args": {}
                }
            ],
            "deduplication": [
                {
                    "name": "deduplication_simhash",
                    "args": {
                        "distance": 5.6511
                    }
                },
                {
                    "name": "save_pipeline_deduplication",
                    "args": {}
                }
            ],
            "desensitization": [
                {
                    "name": "replace_emails",
                    "args": {}
                },
                {
                    "name": "replace_ip",
                    "args": {}
                },
                {
                    "name": "replace_identifier",
                    "args": {}
                },
                {
                    "name": "save_pipeline_desensitization",
                    "args": {}
                }
            ],
            "filter": [
                {
                    "name": "filter_check_number_words",
                    "args": {
                        "number_words_max_cutoff": 10000,
                        "number_words_min_cutoff": 2.2
                    }
                },
                {
                    "name": "filter_check_character_repetition_removal",
                    "args": {
                        "default_character_repetition_max_cutoff": 0.2
                    }
                },
                {
                    "name": "filter_check_word_repetition_removal",
                    "args": {
                        "word_repetition_max_cutoff": 0.6
                    }
                },
                {
                    "name": "filter_check_special_characters",
                    "args": {
                        "special_characters_max_cutoff": 0.3
                    }
                },
                {
                    "name": "filter_check_flagged_words",
                    "args": {
                        "flagged_words_max_cutoff": 0.50556
                    }
                },
                {
                    "name": "filter_check_lang_id",
                    "args": {
                        "lang_id_min_cutoff": 0.5
                    }
                },
                {
                    "name": "filter_check_perplexity",
                    "args": {
                        "perplexity_max_cutoff": 1110
                    }
                },
                {
                    "name": "save_pipeline_filter",
                    "args": {}
                }
            ]
        },
        "result": {
            "RET_OK": 0,
            "pipeline_stage_result": {
                "clean": {
                    "status": "Success",
                    "operator_count": 6,
                    "entity_match_count": 1,
                    "each_operator_result": [
                        {
                            "name": "remove_invisible_character",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "replace_uniform_whitespace",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "remove_non_meaning_characters",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "replace_traditional_chinese_to_simplified",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "remove_web_identifiers",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "remove_emoji",
                            "remaining_count": 1,
                            "drop_count": 0
                        }
                    ]
                },
                "deduplication": {
                    "status": "Success",
                    "operator_count": 1,
                    "entity_match_count": 0,
                    "each_operator_result": [
                        {
                            "name": "deduplication_simhash",
                            "remaining_count": 0,
                            "drop_count": 0
                        }
                    ]
                },
                "desensitization": {
                    "status": "Success",
                    "operator_count": 3,
                    "entity_match_count": 0,
                    "each_operator_result": [
                        {
                            "name": "replace_emails",
                            "remaining_count": 0,
                            "drop_count": 0
                        },
                        {
                            "name": "replace_ip",
                            "remaining_count": 0,
                            "drop_count": 0
                        },
                        {
                            "name": "replace_identifier",
                            "remaining_count": 0,
                            "drop_count": 0
                        }
                    ]
                },
                "filter": {
                    "status": "Success",
                    "operator_count": 7,
                    "entity_match_count": 1,
                    "each_operator_result": [
                        {
                            "name": "filter_check_number_words",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_character_repetition_removal",
                            "remaining_count": 0,
                            "drop_count": 1
                        },
                        {
                            "name": "filter_check_word_repetition_removal",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_special_characters",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_flagged_words",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_lang_id",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_perplexity",
                            "remaining_count": 1,
                            "drop_count": 0
                        }
                    ]
                }
            },
            "export_entity_num": 0,
            "remaining_entity": 0,
            "unprocessed_entity": 0,
            "remove_emoji": {
                "processed_entity": 0
            },
            "remove_url": {
                "processed_entity": 0
            },
            "trad_to_simp": {
                "processed_entity": 0
            },
            "remove_id_card": {
                "processed_entity": 0
            },
            "remove_phone_number": {
                "processed_entity": 0
            },
            "remove_exception_char": {
                "processed_entity": 0
            },
            "replace_sim2trad": {
                "processed_entity": 0
            },
            "replace_trad2sim": {
                "processed_entity": 0
            },
            "replace_upper2lower": {
                "processed_entity": 0
            },
            "cut": {
                "remaining_entity": 0,
                "unprocessed_entity": 0
            },
            "failReason": "",
            "pauseReason": ""
        },
        "processStatus": 2,
        "status": 0,
        "createTime": "2023-11-06T14:31:03+08:00",
        "finishTime": "2023-11-06T14:32:11+08:00",
        "creatorName": "yyw02",
        "sourceDatasetName": "zy_泛文本5-V1",
        "sourceDatasetStrId": "ds-xarnk5tdirfjky2q",
        "destDatasetName": "g423423-V2",
        "destDatasetStrId": "ds-9tf91q1h7n3dm7h4",
        "etlResult": "",
        "remainingEntity": 0,
        "exceptionResult": "",
        "startTime": "2023-11-06 14:31:03",
        "endTime": "2023-11-06 14:32:11",
        "modifyTime": "2023-11-06 14:32:11",
        "logPath": "https://bj.bcebos.com/easydata-qabosqa/qianfan/qianfan1019/_system_/dataset/ds-u7898jqx2aabjp38/cleaning/2235-2230-273-20231106143103.txt?x-bce-security-token=ZjkyZmQ2YmQxZTQ3NDxxxxxZp70QaweY1MNyT32OKRGNCew%3D%3D\u0026authorization=bce-auth-v1%2F24ec282b7c6d11eexxxxx4d33b5123"
    },
    "status": 200,
    "success": true
}
{
    "log_id": "44k3yj73ms178179",
    "result": {
        "id": 273,
        "etlTaskId": "task-7bynx9aaa1qyex2s",
        "userId": 113,
        "sourceDatasetId": 2235,
        "destDatasetId": 2230,
        "taskId": 5331,
        "entityCount": 1,
        "entityType": 2,
        "operationsV2": {
            "clean": [
                {
                    "name": "remove_invisible_character",
                    "args": {}
                },
                {
                    "name": "replace_uniform_whitespace",
                    "args": {}
                },
                {
                    "name": "remove_non_meaning_characters",
                    "args": {}
                },
                {
                    "name": "replace_traditional_chinese_to_simplified",
                    "args": {}
                },
                {
                    "name": "remove_web_identifiers",
                    "args": {}
                },
                {
                    "name": "remove_emoji",
                    "args": {}
                },
                {
                    "name": "save_pipeline_clean",
                    "args": {}
                }
            ],
            "deduplication": [
                {
                    "name": "deduplication_simhash",
                    "args": {
                        "distance": 5.6511
                    }
                },
                {
                    "name": "save_pipeline_deduplication",
                    "args": {}
                }
            ],
            "desensitization": [
                {
                    "name": "replace_emails",
                    "args": {}
                },
                {
                    "name": "replace_ip",
                    "args": {}
                },
                {
                    "name": "replace_identifier",
                    "args": {}
                },
                {
                    "name": "save_pipeline_desensitization",
                    "args": {}
                }
            ],
            "filter": [
                {
                    "name": "filter_check_number_words",
                    "args": {
                        "number_words_max_cutoff": 10000,
                        "number_words_min_cutoff": 2.2
                    }
                },
                {
                    "name": "filter_check_character_repetition_removal",
                    "args": {
                        "default_character_repetition_max_cutoff": 0.2
                    }
                },
                {
                    "name": "filter_check_word_repetition_removal",
                    "args": {
                        "word_repetition_max_cutoff": 0.6
                    }
                },
                {
                    "name": "filter_check_special_characters",
                    "args": {
                        "special_characters_max_cutoff": 0.3
                    }
                },
                {
                    "name": "filter_check_flagged_words",
                    "args": {
                        "flagged_words_max_cutoff": 0.50556
                    }
                },
                {
                    "name": "filter_check_lang_id",
                    "args": {
                        "lang_id_min_cutoff": 0.5
                    }
                },
                {
                    "name": "filter_check_perplexity",
                    "args": {
                        "perplexity_max_cutoff": 1110
                    }
                },
                {
                    "name": "save_pipeline_filter",
                    "args": {}
                }
            ]
        },
        "result": {
            "RET_OK": 0,
            "pipeline_stage_result": {
                "clean": {
                    "status": "Success",
                    "operator_count": 6,
                    "entity_match_count": 1,
                    "each_operator_result": [
                        {
                            "name": "remove_invisible_character",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "replace_uniform_whitespace",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "remove_non_meaning_characters",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "replace_traditional_chinese_to_simplified",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "remove_web_identifiers",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "remove_emoji",
                            "remaining_count": 1,
                            "drop_count": 0
                        }
                    ]
                },
                "deduplication": {
                    "status": "Success",
                    "operator_count": 1,
                    "entity_match_count": 0,
                    "each_operator_result": [
                        {
                            "name": "deduplication_simhash",
                            "remaining_count": 0,
                            "drop_count": 0
                        }
                    ]
                },
                "desensitization": {
                    "status": "Success",
                    "operator_count": 3,
                    "entity_match_count": 0,
                    "each_operator_result": [
                        {
                            "name": "replace_emails",
                            "remaining_count": 0,
                            "drop_count": 0
                        },
                        {
                            "name": "replace_ip",
                            "remaining_count": 0,
                            "drop_count": 0
                        },
                        {
                            "name": "replace_identifier",
                            "remaining_count": 0,
                            "drop_count": 0
                        }
                    ]
                },
                "filter": {
                    "status": "Success",
                    "operator_count": 7,
                    "entity_match_count": 1,
                    "each_operator_result": [
                        {
                            "name": "filter_check_number_words",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_character_repetition_removal",
                            "remaining_count": 0,
                            "drop_count": 1
                        },
                        {
                            "name": "filter_check_word_repetition_removal",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_special_characters",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_flagged_words",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_lang_id",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_perplexity",
                            "remaining_count": 1,
                            "drop_count": 0
                        }
                    ]
                }
            },
            "export_entity_num": 0,
            "remaining_entity": 0,
            "unprocessed_entity": 0,
            "remove_emoji": {
                "processed_entity": 0
            },
            "remove_url": {
                "processed_entity": 0
            },
            "trad_to_simp": {
                "processed_entity": 0
            },
            "remove_id_card": {
                "processed_entity": 0
            },
            "remove_phone_number": {
                "processed_entity": 0
            },
            "remove_exception_char": {
                "processed_entity": 0
            },
            "replace_sim2trad": {
                "processed_entity": 0
            },
            "replace_trad2sim": {
                "processed_entity": 0
            },
            "replace_upper2lower": {
                "processed_entity": 0
            },
            "cut": {
                "remaining_entity": 0,
                "unprocessed_entity": 0
            },
            "failReason": "",
            "pauseReason": ""
        },
        "processStatus": 2,
        "status": 0,
        "createTime": "2023-11-06T14:31:03+08:00",
        "finishTime": "2023-11-06T14:32:11+08:00",
        "creatorName": "yyw02",
        "sourceDatasetName": "zy_泛文本5-V1",
        "sourceDatasetStrId": "ds-xarnk5tdirfjky2q",
        "destDatasetName": "g423423-V2",
        "destDatasetStrId": "ds-9tf91q1h7n3dm7h4",
        "etlResult": "",
        "remainingEntity": 0,
        "exceptionResult": "",
        "startTime": "2023-11-06 14:31:03",
        "endTime": "2023-11-06 14:32:11",
        "modifyTime": "2023-11-06 14:32:11",
        "logPath": "https://bj.bcebos.com/easydata-qabosqa/qianfan/qianfan1019/_system_/dataset/ds-u7898jqx2aabjp38/cleaning/2235-2230-273-20231106143103.txt?x-bce-security-token=ZjkyZmQ2YmQxZTQ3NDxxxxxZp70QaweY1MNyT32OKRGNCew%3D%3D\u0026authorization=bce-auth-v1%2F24ec282b7c6d11eexxxxx4d33b5123"
    },
    "status": 200,
    "success": true
}
{
    "log_id": "44k3yj73ms178179",
    "result": {
        "id": 273,
        "etlTaskId": "task-7bynx9aaa1qyex2s",
        "userId": 113,
        "sourceDatasetId": 2235,
        "destDatasetId": 2230,
        "taskId": 5331,
        "entityCount": 1,
        "entityType": 2,
        "operationsV2": {
            "clean": [
                {
                    "name": "remove_invisible_character",
                    "args": {}
                },
                {
                    "name": "replace_uniform_whitespace",
                    "args": {}
                },
                {
                    "name": "remove_non_meaning_characters",
                    "args": {}
                },
                {
                    "name": "replace_traditional_chinese_to_simplified",
                    "args": {}
                },
                {
                    "name": "remove_web_identifiers",
                    "args": {}
                },
                {
                    "name": "remove_emoji",
                    "args": {}
                },
                {
                    "name": "save_pipeline_clean",
                    "args": {}
                }
            ],
            "deduplication": [
                {
                    "name": "deduplication_simhash",
                    "args": {
                        "distance": 5.6511
                    }
                },
                {
                    "name": "save_pipeline_deduplication",
                    "args": {}
                }
            ],
            "desensitization": [
                {
                    "name": "replace_emails",
                    "args": {}
                },
                {
                    "name": "replace_ip",
                    "args": {}
                },
                {
                    "name": "replace_identifier",
                    "args": {}
                },
                {
                    "name": "save_pipeline_desensitization",
                    "args": {}
                }
            ],
            "filter": [
                {
                    "name": "filter_check_number_words",
                    "args": {
                        "number_words_max_cutoff": 10000,
                        "number_words_min_cutoff": 2.2
                    }
                },
                {
                    "name": "filter_check_character_repetition_removal",
                    "args": {
                        "default_character_repetition_max_cutoff": 0.2
                    }
                },
                {
                    "name": "filter_check_word_repetition_removal",
                    "args": {
                        "word_repetition_max_cutoff": 0.6
                    }
                },
                {
                    "name": "filter_check_special_characters",
                    "args": {
                        "special_characters_max_cutoff": 0.3
                    }
                },
                {
                    "name": "filter_check_flagged_words",
                    "args": {
                        "flagged_words_max_cutoff": 0.50556
                    }
                },
                {
                    "name": "filter_check_lang_id",
                    "args": {
                        "lang_id_min_cutoff": 0.5
                    }
                },
                {
                    "name": "filter_check_perplexity",
                    "args": {
                        "perplexity_max_cutoff": 1110
                    }
                },
                {
                    "name": "save_pipeline_filter",
                    "args": {}
                }
            ]
        },
        "result": {
            "RET_OK": 0,
            "pipeline_stage_result": {
                "clean": {
                    "status": "Success",
                    "operator_count": 6,
                    "entity_match_count": 1,
                    "each_operator_result": [
                        {
                            "name": "remove_invisible_character",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "replace_uniform_whitespace",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "remove_non_meaning_characters",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "replace_traditional_chinese_to_simplified",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "remove_web_identifiers",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "remove_emoji",
                            "remaining_count": 1,
                            "drop_count": 0
                        }
                    ]
                },
                "deduplication": {
                    "status": "Success",
                    "operator_count": 1,
                    "entity_match_count": 0,
                    "each_operator_result": [
                        {
                            "name": "deduplication_simhash",
                            "remaining_count": 0,
                            "drop_count": 0
                        }
                    ]
                },
                "desensitization": {
                    "status": "Success",
                    "operator_count": 3,
                    "entity_match_count": 0,
                    "each_operator_result": [
                        {
                            "name": "replace_emails",
                            "remaining_count": 0,
                            "drop_count": 0
                        },
                        {
                            "name": "replace_ip",
                            "remaining_count": 0,
                            "drop_count": 0
                        },
                        {
                            "name": "replace_identifier",
                            "remaining_count": 0,
                            "drop_count": 0
                        }
                    ]
                },
                "filter": {
                    "status": "Success",
                    "operator_count": 7,
                    "entity_match_count": 1,
                    "each_operator_result": [
                        {
                            "name": "filter_check_number_words",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_character_repetition_removal",
                            "remaining_count": 0,
                            "drop_count": 1
                        },
                        {
                            "name": "filter_check_word_repetition_removal",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_special_characters",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_flagged_words",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_lang_id",
                            "remaining_count": 1,
                            "drop_count": 0
                        },
                        {
                            "name": "filter_check_perplexity",
                            "remaining_count": 1,
                            "drop_count": 0
                        }
                    ]
                }
            },
            "export_entity_num": 0,
            "remaining_entity": 0,
            "unprocessed_entity": 0,
            "remove_emoji": {
                "processed_entity": 0
            },
            "remove_url": {
                "processed_entity": 0
            },
            "trad_to_simp": {
                "processed_entity": 0
            },
            "remove_id_card": {
                "processed_entity": 0
            },
            "remove_phone_number": {
                "processed_entity": 0
            },
            "remove_exception_char": {
                "processed_entity": 0
            },
            "replace_sim2trad": {
                "processed_entity": 0
            },
            "replace_trad2sim": {
                "processed_entity": 0
            },
            "replace_upper2lower": {
                "processed_entity": 0
            },
            "cut": {
                "remaining_entity": 0,
                "unprocessed_entity": 0
            },
            "failReason": "",
            "pauseReason": ""
        },
        "processStatus": 2,
        "status": 0,
        "createTime": "2023-11-06T14:31:03+08:00",
        "finishTime": "2023-11-06T14:32:11+08:00",
        "creatorName": "yyw02",
        "sourceDatasetName": "zy_泛文本5-V1",
        "sourceDatasetStrId": "ds-xarnk5tdirfjky2q",
        "destDatasetName": "g423423-V2",
        "destDatasetStrId": "ds-9tf91q1h7n3dm7h4",
        "etlResult": "",
        "remainingEntity": 0,
        "exceptionResult": "",
        "startTime": "2023-11-06 14:31:03",
        "endTime": "2023-11-06 14:32:11",
        "modifyTime": "2023-11-06 14:32:11",
        "logPath": "https://bj.bcebos.com/easydata-qabosqa/qianfan/qianfan1019/_system_/dataset/ds-u7898jqx2aabjp38/cleaning/2235-2230-273-20231106143103.txt?x-bce-security-token=ZjkyZmQ2YmQxZTQ3NDxxxxxZp70QaweY1MNyT32OKRGNCew%3D%3D\u0026authorization=bce-auth-v1%2F24ec282b7c6d11eexxxxx4d33b5123"
    },
    "status": 200,
    "success": true
}
{
    log_id: '44k3yj73ms178179',
    result: {
        id: 273,
        etlTaskId: 'task-7bynx9aaa1qyex2s',
        userId: 113,
        sourceDatasetId: 2235,
        destDatasetId: 2230,
        taskId: 5331,
        entityCount: 1,
        entityType: 2,
        operationsV2: {
            clean: [
                {
                    name: 'remove_invisible_character',
                    args: {}
                },
                {
                    name: 'replace_uniform_whitespace',
                    args: {}
                },
                {
                    name: 'remove_non_meaning_characters',
                    args: {}
                },
                {
                    name: 'replace_traditional_chinese_to_simplified',
                    args: {}
                },
                {
                    name: 'remove_web_identifiers',
                    args: {}
                },
                {
                    name: 'remove_emoji',
                    args: {}
                },
                {
                    name: 'save_pipeline_clean',
                    args: {}
                }
            ],
            deduplication: [
                {
                    name: 'deduplication_simhash',
                    args: {
                        distance: 5.6511
                    }
                },
                {
                    name: 'save_pipeline_deduplication',
                    args: {}
                }
            ],
            desensitization: [
                {
                    name: 'replace_emails',
                    args: {}
                },
                {
                    name: 'replace_ip',
                    args: {}
                },
                {
                    name: 'replace_identifier',
                    args: {}
                },
                {
                    name: 'save_pipeline_desensitization',
                    args: {}
                }
            ],
            filter: [
                {
                    name: 'filter_check_number_words',
                    args: {
                        number_words_max_cutoff: 10000,
                        number_words_min_cutoff: 2.2
                    }
                },
                {
                    name: 'filter_check_character_repetition_removal',
                    args: {
                        default_character_repetition_max_cutoff: 0.2
                    }
                },
                {
                    name: 'filter_check_word_repetition_removal',
                    args: {
                        word_repetition_max_cutoff: 0.6
                    }
                },
                {
                    name: 'filter_check_special_characters',
                    args: {
                        special_characters_max_cutoff: 0.3
                    }
                },
                {
                    name: 'filter_check_flagged_words',
                    args: {
                        flagged_words_max_cutoff: 0.50556
                    }
                },
                {
                    name: 'filter_check_lang_id',
                    args: {
                        lang_id_min_cutoff: 0.5
                    }
                },
                {
                    name: 'filter_check_perplexity',
                    args: {
                        perplexity_max_cutoff: 1110
                    }
                },
                {
                    name: 'save_pipeline_filter',
                    args: {}
                }
            ]
        },
        result: {
            RET_OK: 0,
            pipeline_stage_result: {
                clean: {
                    status: 'Success',
                    operator_count: 6,
                    entity_match_count: 1,
                    each_operator_result: [
                        {
                            name: 'remove_invisible_character',
                            remaining_count: 1,
                            drop_count: 0
                        },
                        {
                            name: 'replace_uniform_whitespace',
                            remaining_count: 1,
                            drop_count: 0
                        },
                        {
                            name: 'remove_non_meaning_characters',
                            remaining_count: 1,
                            drop_count: 0
                        },
                        {
                            name: 'replace_traditional_chinese_to_simplified',
                            remaining_count: 1,
                            drop_count: 0
                        },
                        {
                            name: 'remove_web_identifiers',
                            remaining_count: 1,
                            drop_count: 0
                        },
                        {
                            name: 'remove_emoji',
                            remaining_count: 1,
                            drop_count: 0
                        }
                    ]
                },
                deduplication: {
                    status: 'Success',
                    operator_count: 1,
                    entity_match_count: 0,
                    each_operator_result: [
                        {
                            name: 'deduplication_simhash',
                            remaining_count: 0,
                            drop_count: 0
                        }
                    ]
                },
                desensitization: {
                    status: 'Success',
                    operator_count: 3,
                    entity_match_count: 0,
                    each_operator_result: [
                        {
                            name: 'replace_emails',
                            remaining_count: 0,
                            drop_count: 0
                        },
                        {
                            name: 'replace_ip',
                            remaining_count: 0,
                            drop_count: 0
                        },
                        {
                            name: 'replace_identifier',
                            remaining_count: 0,
                            drop_count: 0
                        }
                    ]
                },
                filter: {
                    status: 'Success',
                    operator_count: 7,
                    entity_match_count: 1,
                    each_operator_result: [
                        {
                            name: 'filter_check_number_words',
                            remaining_count: 1,
                            drop_count: 0
                        },
                        {
                            name: 'filter_check_character_repetition_removal',
                            remaining_count: 0,
                            drop_count: 1
                        },
                        {
                            name: 'filter_check_word_repetition_removal',
                            remaining_count: 1,
                            drop_count: 0
                        },
                        {
                            name: 'filter_check_special_characters',
                            remaining_count: 1,
                            drop_count: 0
                        },
                        {
                            name: 'filter_check_flagged_words',
                            remaining_count: 1,
                            drop_count: 0
                        },
                        {
                            name: 'filter_check_lang_id',
                            remaining_count: 1,
                            drop_count: 0
                        },
                        {
                            name: 'filter_check_perplexity',
                            remaining_count: 1,
                            drop_count: 0
                        }
                    ]
                }
            },
            export_entity_num: 0,
            remaining_entity: 0,
            unprocessed_entity: 0,
            remove_emoji: {
                processed_entity: 0
            },
            remove_url: {
                processed_entity: 0
            },
            trad_to_simp: {
                processed_entity: 0
            },
            remove_id_card: {
                processed_entity: 0
            },
            remove_phone_number: {
                processed_entity: 0
            },
            remove_exception_char: {
                processed_entity: 0
            },
            replace_sim2trad: {
                processed_entity: 0
            },
            replace_trad2sim: {
                processed_entity: 0
            },
            replace_upper2lower: {
                processed_entity: 0
            },
            cut: {
                remaining_entity: 0,
                unprocessed_entity: 0
            },
            failReason: '',
            pauseReason: ''
        },
        processStatus: 2,
        status: 0,
        createTime: '2023-11-06T14:31:03+08:00',
        finishTime: '2023-11-06T14:32:11+08:00',
        creatorName: 'yyw02',
        sourceDatasetName: 'zy_泛文本5-V1',
        sourceDatasetStrId: 'ds-xarnk5tdirfjky2q',
        destDatasetName: 'g423423-V2',
        destDatasetStrId: 'ds-9tf91q1h7n3dm7h4',
        etlResult: '',
        remainingEntity: 0,
        exceptionResult: '',
        startTime: '2023-11-06 14:31:03',
        endTime: '2023-11-06 14:32:11',
        modifyTime: '2023-11-06 14:32:11',
        logPath: 'https://bj.bcebos.com/easydata-qabosqa/qianfan/qianfan1019/_system_/dataset/ds-u7898jqx2aabjp38/cleaning/2235-2230-273-20231106143103.txt?x-bce-security-token=ZjkyZmQ2YmQxZTQ3NDxxxxxZp70QaweY1MNyT32OKRGNCew==\u0026authorization=bce-auth-v1/24ec282b7c6d11eexxxxx4d33b5123'
    },
    status: 200,
    success: true
}

请求参数

名称 类型 必填 描述
etlId string 是 数据清洗任务序号,说明:
(1)可以通过以下任一方式获取该字段值:
· 方式一,通过调用创建数据清洗任务接口,返回的字段result获取
· 方式二,通过调用查看清洗任务列表接口,返回的字段etlStrId获取
· 方式三,在控制台-数据处理-数据清洗页面,查看任务序号,如下图所示:
image.png
(2)该字段新增支持string类型。如果之前使用的是int类型,建议变更为string类型,后续可能逐步废弃int类型;例如,之前通过调用查看清洗任务列表接口返回的etlId字段获取数据清洗任务序号的,请改为使用该接口返回的etlStrId字段

返回参数

说明:返回的部分字段如下,未说明的字段暂无需关注。

名称 类型 描述
log_id string 操作记录id
result object 返回结果
status int 状态码
success bool 是否操作成功,说明:
· true:成功
· false:失败

返回结果result说明

名称 类型 描述
id int 任务序号,注意:该字段后续将废弃,如果有使用此字段,建议变更为etlTaskId字段
etlTaskId string 任务序号
userId int 用户ID
sourceDatasetId int 清洗前的源数据集版本ID,注意:该字段后续将废弃,如果有使用此字段,建议变更为sourceDatasetStrId字段
sourceDatasetStrId string 清洗前的源数据集版本ID
destDatasetId int 清洗后的目标数据集版本ID,注意: 该字段后续将废弃,如果有使用此字段,建议变更为destDatasetStrId字段
destDatasetStrId string 清洗后的目标数据集版本ID
taskId int 数据清洗任务ID
entityCount int 样本个数
entityType int 样本类型,说明:
1:图片
2:文本
3:音频
4:视频
operationsV2 map[string][]operationV2 清洗配置,说明:
(1)key为string,有以下值:
· 清洗:clean
· 过滤:filter
· 去重:deduplication
· 去隐私:desensitization
(2)value为list,值为单个阶段用户所选择的所有算子组成的列表
· 列表中的每个元素,对应某个算子的配置,格式参考operationV2说明
· 如果用户没有在对应阶段选择任何算子,则value为空列表
result object 清洗结果
processStatus int 清洗状态信息,说明:
· 0:无状态,表示没有任务
· 1:进行中
· 2:已完成
· 3:已终止
· 4:清洗失败
· 5:任务暂停
status int 清洗任务状态,说明:
· 0:正常
· 1:删除
createTime string 创建时间
finishTime string 完成时间
creatorName string 创建者名称
sourceDatasetName string 源数据集名称
destDatasetName string 目标数据集名称
etlResult string 清洗结果
remainingEntity int 清洗后剩余的样本数量
exceptionResult string 异常原因
startTime string 任务启动时间
endTime string 任务结束时间
modifyTime string 更改时间
logPath string 清洗日志文件路径,如/minio/v-abc/some/path/1-2-1-20231010181818.txt

operationV2说明

名称 类型 描述
name string 算子名称,见各阶段name值和args值
(1)Clean清洗阶段算子
· remove_emoji:去除文档中的表情
· remove_invisible_character:移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围
· replace_uniform_whitespace:将不同的unicode空格(比如 \u2008)转成正常的空格
· remove_non_meaning_characters:去除乱码和无意义的unicode
· replace_traditional_chinese_to_simplified:繁体转简体,如“不經意,妳的笑容”清洗成“不经意,你的笑容”
· remove_web_identifiers:移除文档中的html标签,如<html>,<div>,<p>
(2)Filter过滤阶段算子
· filter_check_number_words:检查文档的词数目,词数目不在指定范围会被过滤掉,如中文[1,10000]
· filter_check_word_repetition_removal:检查文档的词重复率,如果词重复率太高,意味着文档中重复的词太多,文档会被过滤掉
· filter_check_character_repetition_removal:检查文档的字重复率,如果字重复率太高,意味着文档中重复的字太多,文档会被过滤掉
· filter_check_special_characters:检查文档的特殊字符率,如果特殊字符率太高,意味着文档中特殊字符太多,文档会被过滤掉
· filter_check_flagged_words:检查文档的色情暴力词率,如果色情暴力词率太高,文档会被过滤掉
· filter_check_lang_id:检查文档的语言概率,如果语言概率太低,文档会被过滤掉
· filter_check_perplexity:检查文档的困惑度,如果困惑度太高,文档会被过滤掉
(3)Deduplication去重阶段算子
· deduplication_simhash:根据海明距离计算文档相似度, 相似度<=海明距离,认为两个文档相似。
(4)Desensitization 去隐私阶段算子
· replace_emails:去除email地址
· replace_ip:去除IPv4 或者 IPv6 地址
· replace_identifier:去除数字和字母数字标识符,如电话号码、信用卡号、十六进制散列等,同时跳过年份和简单数字的实例
args object 算子参数,格式随参数名称而变化,见各阶段name值对应的args说明:
· 当name为Clean清洗阶段算子,args值为空
· 当name为Desensitization 去隐私阶段算子,args值为空
· 当name为Deduplication去重阶段算子或Filter过滤阶段算子,请查看args说明

args说明

  • 当name为Clean清洗阶段算子,args值为空
  • 当name为Desensitization 去隐私阶段算子,args值为空
  • 当name为Deduplication去重阶段算子,args说明如下
名称 类型 描述
distance int 范围4-6
  • 当name为Filter过滤阶段算子,args说明如下
名称 类型 描述
number_words_min_cutoff float 最小词数目
· 范围为[1,10000]
· 当name=filter_check_number_words,该字段必填
number_words_max_cutoff float 最大词数目
· 范围为[1,10000]
· 当name=filter_check_number_words,该字段必填
word_repetition_max_cutoff float 文档的词重复率
· 范围为0-1
· 当name=filter_check_word_repetition_removal,该字段必填
default_character_repetition_max_cutoff float 文档的字重复率
· 范围为0-1
· 当name=filter_check_character_repetition_removal,该字段必填
special_characters_max_cutoff float 检查文档的特殊字符率,如果特殊字符率太高,意味着文档中特殊字符太多,文档会被过滤掉
· 范围为0-1
· 当name=filter_check_special_characters,该字段必填
flagged_words_max_cutoff float 检查文档的色情暴力词率,如果色情暴力词率太高,文档会被过滤掉
· 范围为0-1
· 当name=filter_check_flagged_words,该字段必填
lang_id_min_cutoff float 检查文档的语言概率,如果语言概率太低,文档会被过滤掉
· 范围为0-1
· 当name=filter_check_lang_id,该字段必填
perplexity_max_cutoff float 检查文档的困惑度,如果困惑度太高,文档会被过滤掉
· 范围为1-5000
· 当name=filter_check_perplexity,该字段必填

清洗结果result说明

名称 类型 描述
RET_OK int 清洗结果
pipeline_stage_result object pipeline状态结果
export_entity_num int 导出样本数量
remaining_entity int 剩余样本
unprocessed_entity int 尚未清洗样本
remove_emoji object 只有一个int字段,processed_entity:某个算子被执行的行数
remove_url object 只有一个int字段,processed_entity:某个算子被执行的行数
trad_to_simp object 只有一个int字段,processed_entity:某个算子被执行的行数
remove_id_card object 只有一个int字段,processed_entity:某个算子被执行的行数
remove_phone_number object 只有一个int字段,processed_entity:某个算子被执行的行数
remove_exception_char object 只有一个int字段,processed_entity:某个算子被执行的行数
replace_sim2trad object 只有一个int字段,processed_entity:某个算子被执行的行数
replace_trad2sim object 只有一个int字段,processed_entity:某个算子被执行的行数
replace_upper2lower object 只有一个int字段,processed_entity:某个算子被执行的行数
cut object 裁剪,说明:
· remaining_entity:剩余样本数量
· unprocessed_entity:尚未清洗样本
failReason string 失败原因
pauseReason string 暂停原因

pipeline_stage_result说明

名称 类型 描述
clean object 数据清洗clean阶段执行结果
deduplication object 数据清洗deduplication阶段执行结果
desensitization object 数据清洗desensitization阶段执行结果
filter object 数据清洗filter阶段执行结果

执行结果说明

clean、deduplication、desensitization、filter 阶段执行结果字段相同,如下

名称 类型 描述
status string 数据清洗某阶段执行结果,例:"Success"
operator_count int 该阶段算子数
entity_match_count int 匹配到的样本数量
each_operator_result object[] 具体到算子的清洗结果列表

each_operator_result 说明

名称 类型 描述
name string 算子名称
remaining_count int 通过该算子清洗后剩余样本数
drop_count int 通过该算子清洗掉的样本数
上一篇
创建数据清洗任务
下一篇
删除数据清洗任务