C++ API
更新时间:2022-08-01
本文介绍数据预处理模块的C++ API的使用说明,适用于将预处理模块作为编译依赖,进行C++联编的应用场景。
一、使用说明
- 切字接口
class Tokenizer {
public:
Tokenizer();
virtual ~Tokenizer();
/*
* Initialize: load the global dictionaries from a config file.
* @params:
*   conf_path: directory containing the config file
*   conf_file: config file name
* @return:
*   Status::OK: success
*   Status(-1): failure
*/
Status init(const char* conf_path, const char* conf_file);
/*
* Tokenize the raw input text into characters.
* @params:
*   query: input string, UTF-8 encoded
*   vec_chars: output vector receiving the tokenization result
* @return:
*   Status::OK: success
*   Status(-1): failure
*/
virtual Status tokenize(const std::string& query, std::vector<std::string>& vec_chars) { return Status::OK; }
};
- 配置文件: a. 词典相关配置(conf/tokenize.conf)
[GlobalData]
# wordseg词典,WordsegTokenizer会用到,其他的可以不配置
seg_dict_dir : ./nlpc_wordseg_3016/conf/nlp_data
# 切字词典文件
vocab: ./vocab/ernie_base.txt
is_ernie: 1
[Auth]
# 用户需要复制NLPC平台上的AK/SK, 填写到下面的ak/sk配置项,用于鉴权;
# NLPC平台申请地址:http://nlp.baidu-int.com/wenxin/apply
ak: xxxxxxxxxxxxxxxxx
sk: xxxxxxxxxxxxxxxxx
# 鉴权server地址,请保持默认值
url: bns://group.opera-onlineOperate-wenxinStatistics-all-hb.NLPC.all
b. 日志配置文件(conf/log.conf)
COMLOG_PROCNAME : wenxin_inference
COMLOG_LEVEL : 8
COMLOG_SELFDEFINE: USER
COMLOG_DEVICE_NUM: 2
COMLOG_DEVICE0: NORMLOG
COMLOG_DEVICE1: WFLOG
COMLOG_LOGLENGTH: 1048576
NORMLOG_NAME: wenxin_inference.log
NORMLOG_TYPE: FILE
NORMLOG_SYSLEVEL: 8
NORMLOG_PATH: ./log
NORMLOG_SELFLEVEL: NULL
NORMLOG_SIZE : 2048
NORMLOG_SPLITE_TYPE : TRUNCT
NORMLOG_DATA_CUTTIME : 1
NORMLOG_LAYOUT : %L: %D:%U %P * %T %R
WFLOG_NAME: wenxin_inference.log.wf
WFLOG_TYPE: FILE
WFLOG_SYSLEVEL: 2
WFLOG_PATH: ./log
WFLOG_SELFLEVEL: NULL
WFLOG_SIZE : 2048
WFLOG_SPLITE_TYPE : TRUNCT
WFLOG_DATA_CUTTIME : 1
WFLOG_LAYOUT : %L: %D:%U %P * %T %R
- BCLOUD文件 a. protobuf 2.4.1版本
#edit-mode: -*- python -*-
#coding:utf-8
# BCLOUD build script for the protobuf 2.4.1 toolchain.
WORKROOT('../../../')
COMPILER('gcc82')
# C flags.
CFLAGS('-g -O3 -pipe -W -Wall -fPIC -fpermissive')
#C++ flags.
CXXFLAGS('-g -O3 -pipe -W -Wall -Wno-unused-parameter -fPIC -fpermissive -std=gnu++11')
#link flags
LDFLAGS('-luuid -lpthread -ldl -lcrypt -lrt -lz -Bsymbolic -rdynamic -Wl,-rpath,./lib')
# Dependency modules (pinned versions — do not bump casually).
CONFIGS("baidu/base/common@stable")
CONFIGS("lib2-64/dict@dict_3-1-15-0_PD_BL")
CONFIGS('third-64/boost@boost_1-63-0-101_PD_BL')
CONFIGS("baidu/base/uconv@uconv_2-0-6-1_PD_BL@git_tag")
CONFIGS("baidu/base/ullib@stable")
CONFIGS("lib2-64/ccode@ccode_3-2-3-2_PD_BL")
CONFIGS('lib2-64/others-ex@others-ex_3-1-14-2_PD_BL')
CONFIGS('baidu/third-party/gflags@77592648e3f3be87d6c7123eb81cbad75f9aef5a@git_branch')
CONFIGS('baidu/lib/wordseg@wordseg_3-2-47-2_gcc345_PD_BL@git_tag')
CONFIGS('baidu/lib/libcrf@libcrf_2-4-17-1_gcc345_PD_BL@git_tag')
CONFIGS('public/odict@odict_1-1-2-1_PD_BL')
CONFIGS('third-64/glog@glog_0-3-3-100_PD_BL')
CONFIGS('baidu/base/configure@stable')
CONFIGS('baidu/base/baidu-rpc@stable')
CONFIGS('third-64/json-cpp@json-cpp_0-6-1-400_PD_BL')
CONFIGS("baidu/third-party/openssl@openssl_V1.0.1.2_GCC820_4U3_K3_GEN_PD_BL@git_tag")
CONFIGS('baidu/lib/textone-data-process@textone-data-process_1-0-1-8_gcc482_PD_BL@git_tag')
# Sources and target.
user_sources=GLOB("./src/demo.cpp")
Application('demo', Sources(user_sources))
b. protobuf 3.2版本
#edit-mode: -*- python -*-
#coding:utf-8
# BCLOUD build script for the protobuf 3.2 toolchain.
WORKROOT('../../../')
COMPILER('gcc82')
# C flags.
CFLAGS('-g -O3 -pipe -W -Wall -fPIC -fpermissive')
#C++ flags.
CXXFLAGS('-g -O3 -pipe -W -Wall -Wno-unused-parameter -fPIC -fpermissive -std=gnu++11')
#link flags
LDFLAGS('-luuid -lpthread -ldl -lcrypt -lrt -lz -Bsymbolic -rdynamic -Wl,-rpath,./lib')
# Dependency modules (pinned versions — do not bump casually).
CONFIGS("baidu/base/common@stable")
CONFIGS("lib2-64/dict@dict_3-1-15-0_PD_BL")
CONFIGS('third-64/boost@boost_1-63-0-101_PD_BL')
CONFIGS("baidu/base/uconv@uconv_2-0-6-1_PD_BL@git_tag")
CONFIGS("baidu/base/ullib@stable")
CONFIGS("lib2-64/ccode@ccode_3-2-3-2_PD_BL")
CONFIGS("lib2-64/others-ex@others-ex_3-1-25-0_PD_BL")
CONFIGS('baidu/third-party/gflags@77592648e3f3be87d6c7123eb81cbad75f9aef5a@git_branch')
CONFIGS('baidu/third-party/protobuf@2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a@git_branch')
CONFIGS('baidu/lib/wordseg@wordseg_3-2-47-5_gcc482_PD_BL@git_tag')
CONFIGS('baidu/lib/libcrf@libcrf_2-4-23-1_gcc482_PD_BL@git_tag')
CONFIGS('public/odict@odict_1-1-4-2_PD_BL')
CONFIGS('baidu/base/configure@stable')
CONFIGS('baidu/third-party/glog@v0.3.5@git_branch')
CONFIGS('baidu/base/baidu-rpc@stable')
CONFIGS('third-64/json-cpp@json-cpp_0-6-1-400_PD_BL')
CONFIGS("baidu/third-party/openssl@openssl_V1.0.1.2_GCC820_4U3_K3_GEN_PD_BL@git_tag")
CONFIGS('baidu/lib/textone-data-process@textone-data-process_1-0-6-1_gcc482_PD_BL@git_tag')
# Sources and target.
user_sources=GLOB("./src/demo.cpp")
Application('demo', Sources(user_sources))
二、使用示例
- 只切单字,不做最长字符串匹配(相当于Python版的BasicTokenizer类)
#include <gflags/gflags.h>
#include <base/comlog_sink.h>
#include "util/data_process_util.h"
#include "data/basic_tokenizer.h"
DEFINE_string(log_conf_file, "conf/log.conf", "log conf file");
int main(int argc, char** argv) {
// Initialize comlog from the log config file; abort on failure.
if (logging::ComlogSink::GetInstance()->SetupFromConfig(FLAGS_log_conf_file.c_str()) != 0) {
std::cerr << "failed to init comlog";
return -1;
}
// BasicTokenizer splits into single characters only; no dictionary needed,
// so no init() call is required here.
std::shared_ptr<nlp::infer::BasicTokenizer> tokenizer(new nlp::infer::BasicTokenizer(true));
std::string query = "谁有狂三这张高清的";
std::vector<std::string> result;
tokenizer->tokenize(query, result);
LOG(INFO) << "after tokenize, size: " << result.size();
nlp::infer::print_vector(result);
return 0;
}
- 先切成单字,再做最长字符串匹配,不在词典里面的token,切字结果是[UNK]。(相当于Python版的FullTokenizer类)
#include "data/full_tokenizer.h"
#include <gflags/gflags.h>
#include <base/comlog_sink.h>
DEFINE_string(log_conf_file, "conf/log.conf", "log conf file");
int main(int argc, char** argv) {
// Initialize comlog from the log config file; abort on failure.
if (logging::ComlogSink::GetInstance()->SetupFromConfig(FLAGS_log_conf_file.c_str()) != 0) {
std::cerr << "failed to init comlog";
return -1;
}
// Optional command-line override of the tokenizer config file name.
std::string config_file = "tokenizer.conf";
if (argc == 2) {
config_file = argv[1];
}
std::shared_ptr<nlp::infer::Tokenizer> tokenizer(new nlp::infer::FullTokenizer());
nlp::infer::Status status;
std::string conf_path = "./conf";
if ((status = tokenizer->init(conf_path.c_str(), config_file.c_str())) != nlp::infer::Status::OK) {
LOG(ERROR) << "Failed to init tokenizer";
return -1;
}
std::string query = "谁有狂三这张高清的";
std::vector<std::string> result;
tokenizer->tokenize(query, result);
LOG(INFO) << "after tokenize: ";
nlp::infer::print_vector(result);
std::vector<int64_t> token_ids;
// Convert tokens to ids.
tokenizer->convert_tokens_to_ids(result, token_ids);
LOG(INFO) << "after convert_token_to_id, ids: ";
nlp::infer::print_vector(token_ids);
std::vector<std::string> tokens;
// Convert ids back to the original tokens.
tokenizer->convert_ids_to_tokens(token_ids, tokens);
LOG(INFO) << "after convert_ids_to_tokens, tokens: ";
nlp::infer::print_vector(tokens);
return 0;
}
- WordsegTokenzier使用示例
#include <Configure.h>
#include <gflags/gflags.h>
#include <base/comlog_sink.h>
#include "util/data_process_util.h"
#include "util/status.h"
#include "data/tokenizers.h"
DEFINE_string(log_conf_file, "conf/log.conf", "log conf file");
// Example: tokenize a query with WordsegTokenizer (wordseg-based segmentation).
int main(int argc, char** argv) {
    // Initialize comlog from the log config file; abort on failure.
    if (logging::ComlogSink::GetInstance()->SetupFromConfig(FLAGS_log_conf_file.c_str()) != 0) {
        std::cerr << "failed to init comlog";
        return -1;
    }
    // Optional command-line override of the tokenizer config file name.
    std::string config_file = "tokenizer.conf";
    if (argc == 2) {
        config_file = argv[1];
    }
    // Create the Tokenizer object and initialize it.
    std::string tokenizer_name = "WordsegTokenizer";
    auto tokenizer = nlp::infer::create_tokenizer(tokenizer_name.c_str());
    nlp::infer::Status status;
    std::string conf_path = "./conf";
    if ((status = tokenizer->init(conf_path.c_str(), config_file.c_str())) != nlp::infer::Status::OK) {
        LOG(ERROR) << "Failed to init tokenizer";
        return -1;
    }
    // WordsegThreadData is thread-local state: create one instance per thread.
    // (Fixed: the original example used a '#' comment here, which is not
    // valid C++ and would not compile.)
    std::shared_ptr<nlp::infer::WordsegThreadData> wordseg_thread_data_ptr = std::make_shared<nlp::infer::WordsegThreadData>();
    if (wordseg_thread_data_ptr->init() != nlp::infer::Status::OK) {
        LOG(ERROR) << "Failed to init wordseg thread data";
        return -1;
    }
    std::string query = "谁有狂三这张高清的";
    std::vector<std::string> result;
    // Wordseg granularity; defaults to phrase granularity when unspecified.
    // SCW_OUT_WPCOMP: phrase granularity, SCW_OUT_BASIC: basic granularity.
    int type = SCW_OUT_WPCOMP;
    tokenizer->tokenize(query, wordseg_thread_data_ptr, result, type);
    std::cout << "after tokenize, size: " << result.size() << std::endl;
    for (auto& str: result) {
        std::cout << str << "\t";
    }
    std::cout << std::endl;
    return 0;
}
a. WordsegTokenizer依赖wordseg词典,下载词典解压到当前路径下
wget ftp://yq01-inf-yq01-tianqi55.yq01.baidu.com:/home/disk2/nlpt/nlpc_wordseg_3016.tar
#如果上述ftp链接失效,请使用下面链接
wget http://bj.bcebos.com/wenxin-models/nlpc_wordseg_3016.tar
tar xf nlpc_wordseg_3016.tar && rm nlpc_wordseg_3016.tar
- ErnieSimSlimTokenizer使用示例
#include <gflags/gflags.h>
#include <iostream>
#include <base/comlog_sink.h>
#include "util/data_process_util.h"
#include "data/tokenizers.h"
DEFINE_string(log_conf_file, "conf/log.conf", "log conf file");
// Example: tokenize a query directly into token ids with ErnieSimSlimTokenizer.
int main(int argc, char** argv) {
    // Initialize comlog from the log config file; abort on failure.
    if (logging::ComlogSink::GetInstance()->SetupFromConfig(FLAGS_log_conf_file.c_str()) != 0) {
        std::cerr << "failed to init comlog";
        return -1;
    }
    std::string conf_path = "./conf";
    std::string conf_file = "tokenizer.conf";
    // Create the Tokenizer object and initialize it.
    std::string tokenizer_name = "ErnieSimSlimTokenizer";
    auto tokenizer = nlp::infer::create_tokenizer(tokenizer_name.c_str());
    if (tokenizer->init(conf_path.c_str(), conf_file.c_str()) != nlp::infer::Status::OK) {
        LOG(ERROR) << "Failed to init tokenizer";
        return -1;
    }
    // WordsegThreadData is thread-local state: create one instance per thread.
    // (Fixed: the original example used a '#' comment here, which is not
    // valid C++ and would not compile.)
    std::shared_ptr<nlp::infer::WordsegThreadData> wordseg_thread_data_ptr = std::make_shared<nlp::infer::WordsegThreadData>();
    if (wordseg_thread_data_ptr->init() != nlp::infer::Status::OK) {
        LOG(ERROR) << "Failed to init wordseg thread data";
        return -1;
    }
    std::string query = "谁有狂三这张高清的";
    std::vector<int64_t> src_ids;
    tokenizer->tokenize(query, wordseg_thread_data_ptr, src_ids);
    std::cout << "after tokenize, size: " << src_ids.size() << std::endl;
    for (auto id: src_ids) {
        std::cout << id << "\t";
    }
    std::cout << std::endl;
    return 0;
}
a. ErnieSimSlimTokenizer依赖wordseg词典,下载词典解压到当前路径下:
# 注意:ErnieSimSlimTokenizer采用专用的切词词典,和WordsegTokenzier的词典版本不一致
wget ftp://yq01-inf-yq01-tianqi55.yq01.baidu.com:/home/disk2/wenxin/ernie_sim_slim/nlpc_wordseg_3016.tar
tar xf nlpc_wordseg_3016.tar && rm nlpc_wordseg_3016.tar
如果上述ftp链接失效,请使用下面链接下载
wget http://bj.bcebos.com/wenxin-models/nlpc_wordseg_3016_sim.tar
tar xf nlpc_wordseg_3016_sim.tar && rm nlpc_wordseg_3016_sim.tar
三、使用说明
- Tokenizer都是线程安全的,用户只需要进程初始化一个Tokenizer对象,就可以在各个线程中使用。