开始训练和预测（ERNIE百亿&15亿模型）

更新时间：2022-08-01

开始训练&预测

目录结构

以文本分类任务为例：

分类任务位于wenxin_appzoo/wenxin_appzoo/tasks/text_classification

.
├── data	# 训练数据
│   └── xnli
│       ├── dev
│       │   └── dev.tsv
│       ├── predict
│       │   └── predict.tsv
│       ├── test
│       │   └── test.tsv
│       └── train
│           └── train.tsv
├── data_set_reader	# 数据加载相关类
│   ├── ernie_classification_base_dataset_reader.py
│   ├── ernie_classification_dataset_reader.py
│   └── ...
├── examples
│   ├── cls_ernie_1.5b_xnli_ch_infer.json	# ERNIE3.0-1.5B利用inference-model预测的json配置文件
│   ├── cls_ernie_1.5b_xnli_ch.json	# ERNIE3.0-1.5B训练与eval的json配置文件
│   ├── cls_ernie_1.5b_xnli_ch_save_infer_from_ckpt.json	# ERNIE3.0-1.5B转化checkpoints为inference_model的配置文件
│   ├── cls_ernie_3.0_xnli_ch_infer.json	# ERNIE3.0利用inference-model预测的json配置文件
│   ├── cls_ernie_3.0_xnli_ch.json	# ERNIE3.0训练与eval的json配置文件
│   ├── cls_ernie_3.0_xnli_ch_save_infer_from_ckpt.json	# ERNIE3.0转化checkpoints为inference_model的配置文件
│   └── ...
├── inference	# 推理相关类
│   ├── custom_cls_inference_ernie3.py
│   └── ...
├── model	# 模型文件
│   ├── ernie3_classification.py
│   ├── ernie_billions_classification.py
│   └── ...
├── package	# ernie_3词表与配置文件
│   ├── config
│   │   ├── ernie_base_config.json
│   │   └── ernie_config.json
│   └── dict
│       └── vocab_ernie.txt
├── run_infer_ernie3.py	# ernie_3推理入口文件，只依靠json进行模型预测的入口脚本
├── run_trainer.py	# 训练文件入口文件，只依靠json进行模型训练的入口脚本
└── trainer
    ├── custom_trainer_ernie3.py
    └── ...

模型准备

模型均存放于wenxin_appzoo/wenxin_appzoo/models_hub文件夹下，

进入文件夹执行sh download_ernie_3.0_ch.sh下载3.0模型参数、字典与推断所需环境。
执行sh download_ernie_3.0_1.5b_ch.sh可下载ERNIE3.0-1.5B模型参数、字典和网络配置文件。

cd wenxin_appzoo/wenxin_appzoo/models_hub
#若使用ERNIE3.0模型，请执行以下命令
sh download_ernie_3.0_ch.sh
#若使用ERNIE3.0-1.5B模型，请执行以下命令
sh download_ernie_3.0_1.5b_ch.sh

训练准备

训练与验证的配置代码如下（cls_ernie_3.0_xnli_ch.json）。

{
  "is_ernie3": true, ## 指示是否使用ERNIE 3.0模型
  "dataset_reader": {
    "train_reader": {
      "name": "train_reader", ## 训练、验证、测试各自基于不同的数据集，数据格式也可能不一样，可以在json中配置不同的reader，此处为训练集的reader。
      "type": "ErnieClassificationReader", ## 采用ErnieClassificationReader，其封装了常见的读取tsv、txt文件、组batch等操作。
      "fields": [ ## 域（field）是文心的高阶封装，对于同一个样本存在不同域的时候，不同域有单独的数据类型（文本、数值、整型、浮点型）、单独的词表(vocabulary)等，可以根据不同域进行语义表示，如文本转id等操作，field_reader是实现这些操作的类。
        {
          "name": "text_a",  ## 文本分类任务的第一个特征域，命名为"text_a"。
          "data_type": "string",
          "reader": null,
          "tokenizer": null,
          "need_convert": true,
          "vocab_path": "",
          "max_seq_len": 256,
          "truncation_type": 0,
          "padding_id": 0
        },
        {
          "name": "text_b", ## 文本分类任务的第二个特征域，命名为"text_b"。
          "data_type": "string",
          "reader": null,
          "tokenizer": null,
          "need_convert": true,
          "vocab_path": "",
          "max_seq_len": 256,
          "truncation_type": 0,
          "padding_id": 0
        },
        {
          "name": "label",
          "data_type": "int",
          "reader": null,
          "tokenizer": null,
          "need_convert": false,
          "vocab_path": "",
          "max_seq_len": 1,
          "truncation_type": 0,
          "padding_id": 0,
          "embedding": null
        }
      ],
      "config": {
        "data_path": "./data/xnli/train", ## 数据路径。
        "shuffle": true, ## 数据在读取过程中是否需要打乱顺序。
        "batch_size": 6, ## 超参数之一，表示每张卡每个step训练多少个样本，每个step训练的总样本数为batch_size×卡数。
        "epoch": 3, ## 超参数之一，表示这个数据集中的数据会被重复训练多少轮。
        "sampling_rate": 1.0, ## 数据集的采样率，文心预留参数，暂时不起作用，后续版本会升级。
        "need_data_distribute": true,
        "extra_params":{
          "vocab_path":"../../models_hub/ernie_3.0_ch_dir/vocab.txt", ## 指定词表
          "label_map_config":"",
          "max_seq_len":256, ## 控制截断长度，text_a, text_b整体不超过max_seq_len个tokens
          "do_lower_case":true,
          "in_tokens":false,
          "tokenizer": "FullTokenizer" ## 指定text_a, text_b的tokenizer，ernie_3固定为FullTokenizer分词器。
        }
      }
    },
    "dev_reader": { # 此处为验证集的reader。
      "name": "dev_reader",
      "type": "ErnieClassificationReader",
      "fields": [
        {
          "name": "text_a",
          "data_type": "string",
          "reader": null,
          "tokenizer": null,
          "need_convert": true,
          "vocab_path": "",
          "max_seq_len": 256,
          "truncation_type": 0,
          "padding_id": 0
        },
        {
          "name": "text_b",
          "data_type": "string",
          "reader": null,
          "tokenizer": null,
          "need_convert": true,
          "vocab_path": "",
          "max_seq_len": 256,
          "truncation_type": 0,
          "padding_id": 0
        },
        {
          "name": "label",
          "data_type": "int",
          "reader": null,
          "tokenizer": null,
          "need_convert": false,
          "vocab_path": "",
          "max_seq_len": 1,
          "truncation_type": 0,
          "padding_id": 0,
          "embedding": null
        }
      ],
      "config": {
        "data_path": "./data/xnli/dev",
        "shuffle": false,
        "batch_size": 16,
        "epoch": 1,
        "sampling_rate": 1.0,
        "need_data_distribute": true,
        "extra_params":{
          "vocab_path":"../../models_hub/ernie_3.0_ch_dir/vocab.txt",
          "label_map_config":"",
          "max_seq_len":256,
          "do_lower_case":true,
          "in_tokens":false,
          "tokenizer": "FullTokenizer"
        }
      }
    },
    "test_reader": { # 此处为测试集的reader。
      "name": "test_reader",
      "type": "ErnieClassificationReader",
      "fields": [
        {
          "name": "text_a",
          "data_type": "string",
          "reader": null,
          "tokenizer": null,
          "need_convert": true,
          "vocab_path": "",
          "max_seq_len": 256,
          "truncation_type": 0,
          "padding_id": 0
        },
        {
          "name": "text_b",
          "data_type": "string",
          "reader": null,
          "tokenizer": null,
          "need_convert": true,
          "vocab_path": "",
          "max_seq_len": 256,
          "truncation_type": 0,
          "padding_id": 0
        },
        {
          "name": "label",
          "data_type": "int",
          "reader": null,
          "tokenizer": null,
          "need_convert": false,
          "vocab_path": "",
          "max_seq_len": 1,
          "truncation_type": 0,
          "padding_id": 0,
          "embedding": null
        }
      ],
      "config": {
        "data_path": "./data/xnli/test",
        "shuffle": false,
        "batch_size": 16,
        "epoch": 1,
        "sampling_rate": 1.0,
        "need_data_distribute": true,
        "extra_params":{
          "vocab_path":"../../models_hub/ernie_3.0_ch_dir/vocab.txt",
          "label_map_config":"",
          "max_seq_len":256,
          "do_lower_case":true,
          "in_tokens":false,
          "tokenizer": "FullTokenizer"
        }
      }
    }
  },
  "model": {
    "type": "Ernie3Classification",
    "embedding": {
      "emb_dim": 4096,
      "use_amp": true,
      "mem_len": 0,
      "weight_sharing": false,
      "training_server": false, # 是否开启server的训练。对于生成任务不起效。
      "config_path": "../../models_hub/ernie_3.0_ch_dir/ernie_config.json"
    },
    "optimization":{  ## 优化器设置，文心ERNIE推荐的默认设置。
      "learning_rate": 5e-5,
      "use_lr_decay": true,
      "use_default_decay": false,
      "lr_scheduler": "linear_warmup_decay",
      "use_release_paddle": false,
      "epsilon": 1e-6,
      "warmup_steps": 0,
      "warmup_proportion": 0.1,
      "weight_decay": 0.01,
      "use_dynamic_loss_scaling": false,
      "init_loss_scaling": 524288,
      "incr_every_n_steps": 1000,
      "decr_every_n_nan_or_inf": 2,
      "incr_ratio": 2.0,
      "decr_ratio": 0.8,
      "use_layer_decay": false,
      "layer_decay_ratio": 0.95,
      "n_layers": 60, # 整体模型层数
      "sharing_layers": 48 # server层数
    }
  },
  "trainer": {
    "type" : "CustomTrainerErnie3",
    "PADDLE_PLACE_TYPE": "gpu",
    "PADDLE_IS_FLEET": 1,
    "is_recompute": true, # 是否开启重计算，默认开启。通过使用内存与磁盘存储训练前向阶段的激活值，减少了显存的占用，可以加载具有更多参数的模型。
    "ramdom_seed": 1,
    "use_amp": true, # 是否开启混合精度训练，默认开启。
    "use_sharding": true, # 是否开启sharding，默认开启。可以将模型参数自动分配到多个显卡上，实现超大模型训练的关键分布式训练技术。
    "save_inference_model": false, # 是否开启训练时保存inference_model，默认关闭。注意：暂不支持开启sharding训练时保存inference_model，请使用checkpoints到inference_model的转化工具。
    "use_fast_executor": true,
    "train_log_step": 10,
    "is_do_train": 1,
    "is_eval_dev": 1, # 是否在训练阶段进行验证集评估，此处为开启（1）。若验证较慢（例如生成任务）会极大影响训练速度，可设为0关闭，并使用训练阶段保存的checkpoints，在训练结束后设置is_do_train=0,is_eval_dev=1单独进行验证。
    "is_eval_test": 1,
    "eval_step": 500, ## 进行测试集或验证集评估的间隔步数。
    "save_model_step": 100000000,  ## 保存模型时的间隔步数，建议设置为eval_step的整数倍。
    "load_parameters": "",  ## 加载包含各op参数值的训练好的模型，用于热启动。此处填写checkpoint路径。不填则表示不使用热启动。
    "load_checkpoint": "",  ## 加载包含学习率等所有参数的训练模型，用于热启动。此处填写checkpoint路径。不填则表示不使用热启动。
    "pre_train_model": [
      {
        "name": "ernie_3.0_ch", ## 预训练模型的名称name
        "params_path": "../../models_hub/ernie_3.0_ch_dir/params" ## 预训练模型的目录params_path
      }
    ],
    "output_path": "./output/cls_ernie_3.0_xnli_ch"  ## 保存模型的输出路径
  }
}

注：ERNIE 3.0-1.5B可在单卡中运行。若采用单卡运行，需将use_sharding设为false，此时save_inference_model可设为true，即无需通过转化工具即可在训练时直接保存预测模型。

开始训练

#进入指定任务的目录
cd wenxin_appzoo/wenxin_appzoo/tasks/text_classification
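#创建日志目录（重定向到log/lanch.log之前该目录必须存在）
mkdir -p log
#以ERNIE3.0配置为例启动训练；若使用ERNIE3.0-1.5B单卡训练，请将配置文件替换为./examples/cls_ernie_1.5b_xnli_ch.json
fleetrun --log_dir log ./run_trainer.py --param_path "./examples/cls_ernie_3.0_xnli_ch.json" 1>log/lanch.log 2>&1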

通过上述脚本调用json文件开启训练（注意ERNIE3.0-1.5B单卡训练时也需fleetrun启动）。
训练阶段日志文件位于log文件夹下，workerlog.N 保存了第N张卡的log日志内容，如遇到程序报错可以通过查看不同卡的workerlog.N定位到有效的报错信息。
训练模型保存于./output/cls_ernie_3.0_xnli_ch文件夹下。

预测准备

如果上面的训练过程save_inference_model设置为false，则不会保存用于推理的inference model，因此需要将checkpoints格式参数转成预测推理的inference_model参数格式。
- 核心步骤就是使用load_parameters加载要转化的checkpoint，设置学习率learning_rate为0，打开save_inference_model开关，让模型训练过程中保存为inference model。
- 模型配置文件如下：(cls_ernie_3.0_xnli_ch_save_infer_from_ckpt.json)

{
  "is_ernie3": true,
  "dataset_reader": {
    "train_reader": {
      "name": "train_reader",
      "type": "ErnieClassificationReader",
      "fields": [
        {
          "name": "text_a",
          "data_type": "string",
          "reader": null,
          "tokenizer": null,
          "need_convert": true,
          "vocab_path": "",
          "max_seq_len": 256,
          "truncation_type": 0,
          "padding_id": 0
        },
        {
          "name": "text_b",
          "data_type": "string",
          "reader": null,
          "tokenizer": null,
          "need_convert": true,
          "vocab_path": "",
          "max_seq_len": 256,
          "truncation_type": 0,
          "padding_id": 0
        },
        {
          "name": "label",
          "data_type": "int",
          "reader": null,
          "tokenizer": null,
          "need_convert": false,
          "vocab_path": "",
          "max_seq_len": 1,
          "truncation_type": 0,
          "padding_id": 0,
          "embedding": null
        }
      ],
      "config": {
        "data_path": "./data/xnli/train",
        "shuffle": true,
        "batch_size": 6,
        "epoch": 1,                      #转换模型只跑1个epoch就行了
        "sampling_rate": 1.0,
        "need_data_distribute": true,
        "extra_params":{
          "vocab_path":"../../models_hub/ernie_3.0_ch_dir/vocab.txt",
          "label_map_config":"",
          "max_seq_len":256,
          "do_lower_case":true,
          "in_tokens":false,
          "tokenizer": "FullTokenizer"
        }
      }
    }
  },
  "model": {
    "type": "Ernie3Classification",
    "num_labels":3,
    "embedding": {
      "emb_dim": 4096,
      "use_amp": true,
      "mem_len": 0,
      "weight_sharing": false,
      "training_server": true,
      "config_path": "../../models_hub/ernie_3.0_ch_dir/ernie_config.json"
    },
    "optimization":{
      "learning_rate": 0,            ##学习率设置为0
      "use_lr_decay": true,
      "use_default_decay": false,
      "lr_scheduler": "linear_warmup_decay",
      "use_release_paddle": false,
      "epsilon": 1e-6,
      "warmup_steps": 0,
      "warmup_proportion": 0.1,
      "weight_decay": 0.01,
      "use_dynamic_loss_scaling": false,
      "init_loss_scaling": 524288,
      "incr_every_n_steps": 1000,
      "decr_every_n_nan_or_inf": 2,
      "incr_ratio": 2.0,
      "decr_ratio": 0.8,
      "use_layer_decay": false,
      "layer_decay_ratio": 0.95,
      "n_layers": 60,
      "sharing_layers": 48
    }
  },
  "trainer": {
    "type" : "CustomTrainerErnie3",
    "PADDLE_PLACE_TYPE": "cpu",
    "PADDLE_IS_FLEET": 1,
    "is_recompute": false,
    "ramdom_seed": 1,
    "use_amp": false,
    "use_sharding": false,
    "save_inference_model": true,
    "use_fast_executor": true,
    "train_log_step": 10,
    "is_do_train": 1,
    "is_eval_dev": 0,
    "is_eval_test": 0,
    "eval_step": 500,
    "save_model_step": 10000000,                 ##设置大的step，这样跑完训练只会保存一个inference model，即转化的model
    "load_parameters": "./output/cls_ernie_3.0_xnli_ch/save_checkpoints/checkpoints_step_1200",     ##填入要转化的checkpoint路径，这里以checkpoints_step_1200为例
    "load_checkpoint": "",
    "pre_train_model": [],     ##这里为空
    "output_path": "./output/cls_ernie_3.0_xnli_ch"
  }
}

运行如下脚本保存inference_model，耗时大约20分钟（ERNIE3.0-百亿）

export CPU_NUM='1'
export CUDA_VISIBLE_DEVICES=0 # mask out other gpus for saving inference-model
 
fleetrun --log_dir log ./run_trainer.py  --param_path "./examples/cls_ernie_3.0_xnli_ch_save_infer_from_ckpt.json" 1>log/lanch.log 2>&1

预测模型保存于./output/cls_ernie_3.0_xnli_ch/save_inference_model路径下

（注：若ERNIE 3.0-1.5B单卡运行且保存了预测模型时，即save_inference_model设置为true，可跳过使用转化工具这个步骤，直接预测即可）

预测配置文件如下: （cls_ernie_3.0_xnli_ch_infer.json）

{
  "is_ernie3": true,
  "dataset_reader": {
    "predict_reader": {
      "name": "predict_reader",
      "type": "ErnieClassificationReader",
      "fields": [
        {
          "name": "text_a",
          "data_type": "string",
          "reader": null,
          "tokenizer": null,
          "need_convert": true,
          "vocab_path": "",
          "max_seq_len": 256,
          "truncation_type": 0,
          "padding_id": 0
        },
        {
          "name": "text_b",
          "data_type": "string",
          "reader": null,
          "tokenizer": null,
          "need_convert": true,
          "vocab_path": "",
          "max_seq_len": 256,
          "truncation_type": 0,
          "padding_id": 0
        },
        {
          "name": "label",
          "data_type": "int",
          "reader": null,
          "tokenizer": null,
          "need_convert": false,
          "vocab_path": "",
          "max_seq_len": 1,
          "truncation_type": 0,
          "padding_id": 0,
          "embedding": null
        }
      ],
      "config": {
        "data_path": "./data/xnli/dev",
        "shuffle": false,
        "batch_size": 1,
        "epoch": 1,
        "sampling_rate": 1.0,
        "need_data_distribute": true,
        "extra_params":{
          "vocab_path":"../../models_hub/ernie_3.0_ch_dir/vocab.txt",
          "label_map_config":"",
          "max_seq_len":256,
          "do_lower_case":true,
          "in_tokens":false,
          "tokenizer": "FullTokenizer"
        }
      }
    }
  },
  "model": {
    "type": "Ernie3Classification",
    "num_labels":3,
    "embedding": {
      "emb_dim": 4096,
      "use_amp": true,
      "mem_len": 0,
      "weight_sharing": false,
      "training_server": true,
      "config_path": "../../models_hub/ernie_3.0_ch_dir/ernie_config.json"
    },
    "optimization":{
      "learning_rate": 5e-5,
      "use_lr_decay": true,
      "use_default_decay": false,
      "lr_scheduler": "linear_warmup_decay",
      "use_release_paddle": false,
      "epsilon": 1e-6,
      "warmup_steps": 0,
      "warmup_proportion": 0.1,
      "weight_decay": 0.01,
      "use_dynamic_loss_scaling": false,
      "init_loss_scaling": 524288,
      "incr_every_n_steps": 1000,
      "decr_every_n_nan_or_inf": 2,
      "incr_ratio": 2.0,
      "decr_ratio": 0.8,
      "use_layer_decay": false,
      "layer_decay_ratio": 0.95,
      "n_layers": 60,
      "sharing_layers": 48
    }
  },
  "inference": {
    "type": "CustomClassificationInferenceErnie3",
    "output_path": "./output/predict_result.txt",
    "output_server_path": "./output/predict_result_server.txt",
    "PADDLE_PLACE_TYPE": "gpu",
    "training_server": true,
    "turn_on_trt": true,
    "use_cache": false,
    "num_labels": 3,
    "thread_num": 2,
    "inference_model_path": "./output/cls_ernie_3.0_xnli_ch/save_inference_model/inference_step_1/",
    "config_path": "../../models_hub/ernie_3.0_ch_dir/ernie_config.json",
    "extra_param": {
      "meta":{
        "job_type": "text_classification"
      },
      "max_seq_len": 256
    }
  }
}

开始预测

ERNIE 3.0百亿模型预测：

# 下面为inference必须要导入的环境变量以及依赖的python
BASE_PATH="../../models_hub/ernie_3.0_ch_dir/infer_env/" # inference所需环境路径
export CUDA_VISIBLE_DEVICES=0 # 屏蔽其它显卡
export PATH="${BASE_PATH}/py37/bin/:$PATH"
export PYTHONPATH="${BASE_PATH}/py37/"
# ERNIE 3.0百亿模型依赖TensorRT以fp16精度进行单卡预测，下面为所需的TensorRT库，以及对应的cuda和cudnn版本
export LD_LIBRARY_PATH=$BASE_PATH:${BASE_PATH}/cuda-11.0.3/lib64:${BASE_PATH}/cudnn-11.0-linux-x64-v8.0.5.39/lib64:${BASE_PATH}/TensorRT-7.2.1.6/lib:/home/opt/nvidia_lib:${BASE_PATH}/libs:$LD_LIBRARY_PATH
export FLAGS_allocator_strategy=auto_growth # for inference，没有这一行会报显存溢出的错误
 
mkdir -p log
fleetrun --log_dir log ./run_infer_ernie3.py  --param_path "./examples/cls_ernie_3.0_xnli_ch_infer.json" 1>log/lanch.log 2>&1

ERNIE 3.0-1.5B模型预测：

python ./run_infer_ernie3.py  --param_path "./examples/cls_ernie_1.5b_xnli_ch_infer.json"

预测结果保存于./output/predict_result.txt文件中。

准备工作

情感分析任务