Fma策略

更新时间：2022-12-17

简介

Fine-tuned model average（FMA）策略是指将多个fine-tuned后的模型进行参数平均，得到一个平均后的模型。
目前适配的任务有：
- 文本分类
- 文本匹配

文本分类

任务代码位于./wenxin_appzoo/tasks/text_classification目录

cd ./wenxin_appzoo/tasks/text_classification

数据准备

示例数据使用clue-iflytek数据集，使用下面命令下载全量数据集

cd data
sh download_iflytek_data.sh

注：如果只是简单测试，可直接使用./data/iflytek目录下自带的demo数据，跳过本步骤，无需下载全量数据集
数据格式与[文本分类->数据准备]中的数据示例相同

模型准备

示例model使用ERNIE3.0 base

cd ../models_hub
sh download_ernie_3.0_base_ch.sh

开始训练

配置文件

文心中的各种参数都是在json文件中进行配置的，你可以通过修改所加载的json文件来进行参数的自定义配置。json配置文件主要分为三个部分：dataset_reader（数据部分）、model（网络部分）、trainer（训练任务）或inference（预测部分），在模型训练的时候，json文件中需要配置dataset_reader、model和trainer这三个部分；在预测推理的时候，json文件中需要配置dataset_reader、inference这两个部分。
fma策略的配置文件为./wenxin_appzoo/tasks/text_classification/examples/cls_ernie_fc_ch_with_fma.json，json配置大体与正常的文本分类json配置类似，只针对trainer部分进行了修改，以下介绍与fma相关的json配置。

{
    "dataset_reader": {
      "train_reader": {
        "name": "train_reader",
        "type": "BasicDataSetReader",
        "fields": [
          {
            "name": "text_a",
            "data_type": "string",
            "reader": {
              "type": "ErnieTextFieldReader"
            },
            "tokenizer": {
              "type": "FullTokenizer",
              "split_char": " ",
              "unk_token": "[UNK]"
            },
            "need_convert": true,
            "vocab_path": "../../models_hub/ernie_3.0_base_ch_dir/vocab.txt",
            "max_seq_len": 128,
            "truncation_type": 0,
            "padding_id": 0
          },
          {
            "name": "label",
            "data_type": "int",
            "reader": {
              "type": "ScalarFieldReader"
            },
            "tokenizer": null,
            "need_convert": false,
            "vocab_path": "",
            "max_seq_len": 1,
            "truncation_type": 0,
            "padding_id": 0,
            "embedding": null
          }
        ],
        "config": {
          "data_path": "./data/iflytek/train_data",
          "shuffle": true,
          "batch_size": 16,
          "epoch": 5,
          "sampling_rate": 1.0,
          "need_data_distribute": true,
          "need_generate_examples": false
        }
      },
      "test_reader": {
        "name": "test_reader",
        "type": "BasicDataSetReader",
        "fields": [
          {
            "name": "text_a",
            "data_type": "string",
            "reader": {
              "type": "ErnieTextFieldReader"
            },
            "tokenizer": {
              "type": "FullTokenizer",
              "split_char": " ",
              "unk_token": "[UNK]"
            },
            "need_convert": true,
            "vocab_path": "../../models_hub/ernie_3.0_base_ch_dir/vocab.txt",
            "max_seq_len": 128,
            "truncation_type": 0,
            "padding_id": 0
          },
          {
            "name": "label",
            "data_type": "int",
            "need_convert": false,
            "reader": {
              "type": "ScalarFieldReader"
            },
            "tokenizer": null,
            "vocab_path": "",
            "max_seq_len": 1,
            "truncation_type": 0,
            "padding_id": 0,
            "embedding": null
          }
        ],
        "config": {
          "data_path": "./data/iflytek/test_data",
          "shuffle": false,
          "batch_size": 8,
          "epoch": 1,
          "sampling_rate": 1.0,
          "need_data_distribute": false,
          "need_generate_examples": false
        }
      }
    },
    "model": {
      "type": "ErnieClassification",
      "use_rdrop": false,
      "rdrop_alpha": 1,
      "is_dygraph": 1,
      "use_adv": false,
      "use_alpha": true,
      "adv_alpha": 0.5, 
      "attack_after_drop": true,
      "adv_epsilon": 0.001,
      "small_constant_for_finite_diff": 1e-05,
      "optimization": {
        "learning_rate": 6e-05,
        "use_lr_decay": true,
        "warmup_steps": 0,
        "warmup_proportion": 0.1,
        "weight_decay": 0.01,
        "use_dynamic_loss_scaling": false,
        "init_loss_scaling": 128,
        "incr_every_n_steps": 100,
        "decr_every_n_nan_or_inf": 2,
        "incr_ratio": 2.0,
        "decr_ratio": 0.8,
        "use_layer_decay":false,
        "layer_decay_ratio":0.8
      },
      "embedding": {
        "config_path": "../../models_hub/ernie_3.0_base_ch_dir/ernie_config.json"
      },
      "num_labels": 119
    },
    "trainer": {
      "type": "CustomDynamicTrainer",
      "PADDLE_PLACE_TYPE": "gpu",
      "PADDLE_IS_FLEET": 0,
      "use_fma":true,           #开启fma
      "fma_use_best_n":true,    #是否自动选择评估得分最好的n个模型进行模型平均，开启后需要设置fma_score和fma_num_model参数
      "fma_score":"acc",        #根据什么评估指标选择模型
      "fma_num_model":4,        #选择4个模型进行模型平均
      "fma_dev_or_test":"test", #根据哪个评估集的得分进行模型选择，可选项为（dev、test），此处为test，那么is_eval_test需要设置为1
      "train_log_step": 10,
      "use_amp": true,
      "is_eval_dev": 0,
      "is_eval_test": 1,
      "eval_step": 50,          #每隔多少步进行一次评估
      "save_model_step": 300,   #save_model_step尽量设置为eval_step的整数倍
      "load_parameters": "",
      "load_checkpoint": "",
      "pre_train_model": [
        {
          "name": "ernie_3.0_base_ch",
          "params_path": "../../models_hub/ernie_3.0_base_ch_dir/params"
      }
      ],
      "output_path": "./output/cls_ernie_3.0_base_fc_ch_dy_fma",
      "extra_param": {
        "meta":{
          "job_type": "text_classification"
        }

      }
    }
  }

启动训练

python run_trainer.py --param_path ./examples/cls_ernie_fc_ch_with_fma.json

训练运行的日志会自动保存在./log/test.log文件中。
训练中以及结束后产生的模型文件会保存在json配置文件中的output_path字段值的目录下（此处为./output/cls_ernie_3.0_base_fc_ch_dy_fma），其中save_inference_model文件夹会保存用于预测的模型文件，save_checkpoints文件夹会保存用于热启动的模型文件。
⚠️注意：模型平均后的checkpoint文件都会保存在/output/save_checkpoints/checkpoints_step_fma_model的文件夹中，注意该文件夹中只有平均后的模型参数文件wenxin.pdparams，需要从其他step中拷贝模型组网文件wenxin.pdopt。如下图，需要将checkpoints_step_36/wenxin.pdopt 拷贝到checkpoints_step_fma_model/中。

开始预测

配置文件

使用./wenxin_appzoo/tasks/text_classification/examples/cls_ernie_fc_ch_infer_with_iflytek.json进行推理配置
注意需要修改cls_ernie_fc_ch_infer_with_iflytek.json里的"inference_model_path"字段，填入上面训练过程中保存的save_inference_model文件夹下的model路径，配置如下

{
  "dataset_reader": {
    "predict_reader": {
      "name": "predict_reader",
      "type": "BasicDataSetReader",
      "fields": [
        {
          "name": "text_a",
          "data_type": "string",
          "reader": {
            "type": "ErnieTextFieldReader"
          },
          "tokenizer": {
            "type": "FullTokenizer",
            "split_char": " ",
            "unk_token": "[UNK]",
            "params": null
          },
          "need_convert": true,
          "vocab_path": "../../models_hub/ernie_3.0_base_ch_dir/vocab.txt",
          "max_seq_len": 512,
          "truncation_type": 0,
          "padding_id": 0,
          "embedding": null
        }
      ],
      "config": {
        "data_path": "./data/iflytek/predict_data",
        "shuffle": false,
        "batch_size": 8,
        "epoch": 1,
        "sampling_rate": 1.0,
        "need_data_distribute": false,
        "need_generate_examples": true
      }
    }
  },

  "inference": {
    "type": "CustomInference",
    "output_path": "./output/predict_result.txt",
    "PADDLE_PLACE_TYPE": "cpu",
    "num_labels": 119,
    "thread_num": 2,
    "inference_model_path": "./output/cls_ernie_3.0_base_fc_ch_dy/save_inference_model/inference_step_126/", ###此处修改对应需要预测的模型路径
    "extra_param": {
      "meta":{
        "job_type": "text_classification"
      }

    }
  }
}

启动预测

python run_infer.py --param_path ./examples/cls_ernie_fc_ch_infer_with_iflytek.json

预测结果保存于./output/predict_result.txt文件中。

文本匹配

任务代码位于./wenxin_appzoo/tasks/text_matching

cd ./wenxin_appzoo/tasks/text_matching

数据准备

示例数据使用clue-afqmc数据集，使用下面命令下载全量数据集

cd data
sh download_afqmc_data.sh

注：如果只是简单测试，可直接使用./data/afqmc目录下自带的demo数据，跳过本步骤，无需下载全量数据集
数据格式与[文本匹配->数据准备]中的数据示例相同

模型准备

示例model使用ERNIE3.0 base

cd ../models_hub
sh download_ernie_3.0_base_ch.sh

开始训练

配置文件

文心中的各种参数都是在json文件中进行配置的，你可以通过修改所加载的json文件来进行参数的自定义配置。json配置文件主要分为三个部分：dataset_reader（数据部分）、model（网络部分）、trainer（训练任务）或inference（预测部分），在模型训练的时候，json文件中需要配置dataset_reader、model和trainer这三个部分；在预测推理的时候，json文件中需要配置dataset_reader、inference这两个部分。
fma策略的json配置大体与正常的文本匹配json配置类似，只针对trainer部分进行了修改，以下介绍与fma相关的json配置。
文本匹配分为三类（详细信息见文本匹配任务），不同类别的文本匹配对应着不同类别的json配置文件：
- 单塔pointwise：./examples/mtch_ernie_fc_pointwise_ch_with_fma.json
- 双塔pointwise：./examples/mtch_ernie_pointwise_simnet_ch_with_fma.json
- 双塔pairwise：./examples/mtch_ernie_pairwise_simnet_ch_with_fma.json
单塔pointwise：对应json为./examples/mtch_ernie_fc_pointwise_ch_with_fma.json，相比于常规的./examples/mtch_ernie_fc_pointwise_ch.json重点修改了trainer部分

{
  "dataset_reader": {
    "train_reader": {.....},
    "test_reader": {...},
    "dev_reader": {......}
  },
  "model": {.....},
  "trainer": {
     "type": "CustomDynamicTrainer",
    "PADDLE_PLACE_TYPE": "gpu",
    "PADDLE_IS_FLEET": 0,
    "use_fma":true,
    "fma_use_best_n":true,
    "fma_score":"acc",
    "fma_num_model":5,
    "fma_dev_or_test":"dev",
    "train_log_step": 10,
    "use_amp": true,
    "is_eval_dev": 1,
    "is_eval_test": 0,
    "eval_step": 100,
    "save_model_step": 300,
    "load_parameters": "",
    "load_checkpoint": "",
    "pre_train_model": [
      {
        "name": "ernie_3.0_base_ch",
        "params_path": "../../models_hub/ernie_3.0_base_ch_dir/params"
      }
    ],
    "output_path": "./output/mtch_ernie_3.0_base_fc_pointwise_ch_fma",
    "extra_param": {
      "meta":{
        "job_type": "text_matching"
      }
    }
  }
}

启动训练

python run_trainer.py --param_path ./examples/mtch_ernie_fc_pointwise_ch_with_fma.json

⚠️注意：模型平均后的checkpoint文件都会保存在/output/save_checkpoints/checkpoints_step_fma_model的文件夹中，注意该文件夹中只有平均后的模型参数文件wenxin.pdparams，需要从其他step中拷贝模型组网文件wenxin.pdopt。如下图，需要将checkpoints_step_36/wenxin.pdopt 拷贝到checkpoints_step_fma_model/中。

开始预测

预测方式与常规文本匹配任务的预测方式相同，仅需在推理配置文件中将inference_model_path设置为要加载的推理模型路径，并将data_path设置为预测数据路径。

修改配置文件：下面以cls_ernie_fc_ch_infer_with_iflytek.json为例，修改其中的推理模型路径（文本匹配任务请替换为对应的推理配置文件，修改方式相同）

{
  "dataset_reader": {
    "predict_reader": {
      "name": "predict_reader",
      "type": "BasicDataSetReader",
      "fields": [
        {
          "name": "text_a",
          "data_type": "string",
          "reader": {
            "type": "ErnieTextFieldReader"
          },
          "tokenizer": {
            "type": "FullTokenizer",
            "split_char": " ",
            "unk_token": "[UNK]",
            "params": null
          },
          "need_convert": true,
          "vocab_path": "../../models_hub/ernie_3.0_base_ch_dir/vocab.txt",
          "max_seq_len": 512,
          "truncation_type": 0,
          "padding_id": 0,
          "embedding": null
        }
      ],
      "config": {
        "data_path": "./data/iflytek/predict_data",
        "shuffle": false,
        "batch_size": 8,
        "epoch": 1,
        "sampling_rate": 1.0,
        "need_data_distribute": false,
        "need_generate_examples": true
      }
    }
  },

  "inference": {
    "type": "CustomInference",
    "output_path": "./output/predict_result.txt", #预测文件路径
    "PADDLE_PLACE_TYPE": "cpu",
    "num_labels": 119,   #预测label数目
    "thread_num": 2,
    "inference_model_path": "./output/cls_ernie_3.0_base_fc_ch_dy/save_inference_model/inference_step_126/", ##修改此处路径为对应的模型路径
    "extra_param": {
      "meta":{
        "job_type": "text_classification"
      }

    }
  }
}

启动预测

python run_infer.py --param_path ./examples/cls_ernie_fc_ch_infer_with_iflytek.json

Layer decay策略

进阶指南