reader has raised error
一添之饥在于晨 发布于2020-10 浏览:1002 回复:2
0
收藏

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddlehub as hub
from paddlehub.dataset import InputExample, BaseDataset
import paddle

from collections import namedtuple
import codecs
import os
import csv

# Number of label classes (labels are the strings "0".."36") and the
# maximum token sequence length fed to the model / reader.
max_labels = 37
max_seq_lens = 500
# NOTE(review): a coding declaration is only honored on the first or
# second line of a file (PEP 263); this far down it has no effect.
# -*- coding: utf-8 -*-
"""
Created on Sun Oct  4 18:40:40 2020

@author: MikeDean
"""

'''
load local dataset
'''
class MyDataset(BaseDataset):
    """Multi-label text classification dataset read from local TSV files.

    Expects ``train.tsv`` / ``dev.tsv`` / ``test.tsv`` under *dataset_dir*,
    each with a header row followed by lines of ``label<TAB>text``.

    NOTE(review): every label must be an integer string ("0".."36") —
    MultiLabelClassifyReader calls ``int(label)`` on each one, so any
    free text in the label column (e.g. '录') crashes the feed thread
    with ``ValueError: invalid literal for int()``.
    """

    def __init__(self, dataset_dir):
        # Directory holding the three TSV split files.
        self.dataset_dir = dataset_dir
        self._load_train_examples()
        self._load_test_examples()
        self._load_dev_examples()

    def _load_train_examples(self):
        """Read train.tsv into self.train_examples."""
        self.train_file = os.path.join(self.dataset_dir, "train.tsv")
        self.train_examples = self._read_tsv(self.train_file)

    def _load_dev_examples(self):
        """Read dev.tsv into self.dev_examples."""
        self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
        self.dev_examples = self._read_tsv(self.dev_file)

    def _load_test_examples(self):
        """Read test.tsv into self.test_examples."""
        self.test_file = os.path.join(self.dataset_dir, "test.tsv")
        self.test_examples = self._read_tsv(self.test_file)

    def get_train_examples(self):
        return self.train_examples

    def get_dev_examples(self):
        return self.dev_examples

    def get_test_examples(self):
        return self.test_examples

    def get_labels(self):
        """Return the label set; adjust max_labels for a real dataset."""
        return [str(i) for i in range(max_labels)]

    @property
    def num_labels(self):
        """
        Return the number of labels in the dataset.
        """
        return len(self.get_labels())

    def _read_tsv(self, input_file, quotechar=None):
        """Read a tab-separated value file into a list of InputExample.

        Column 0 is the label, column 1 the text. The header row is
        skipped; blank or malformed rows (fewer than two columns) are
        ignored instead of raising IndexError, and an empty file yields
        an empty list instead of raising StopIteration.
        """
        with codecs.open(input_file, "r", encoding="UTF-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            next(reader, None)  # skip header; tolerate an empty file
            examples = []
            seq_id = 0
            for line in reader:
                if len(line) < 2:
                    # blank line or row without both label and text
                    continue
                examples.append(
                    InputExample(guid=seq_id, label=line[0], text_a=line[1]))
                seq_id += 1
            return examples

"""
main codes
"""
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

class DemoDataset(BaseNLPDataset):
    """Multi-label classification dataset built on BaseNLPDataset.

    Generalized: the dataset directory and the number of label classes
    are now parameters whose defaults reproduce the original behavior
    (``DemoDataset()`` is unchanged for existing callers).
    """

    def __init__(self, dataset_dir="drive/dataset/", num_label_classes=37):
        # Directory containing train.tsv / dev.tsv / test.tsv.
        self.dataset_dir = dataset_dir
        super(DemoDataset, self).__init__(
            base_path=self.dataset_dir,
            train_file="train.tsv",
            dev_file="dev.tsv",
            test_file="test.tsv",
            # Optional unlabeled prediction data may go in predict.tsv:
            # predict_file="predict.tsv",
            # The TSV files are declared header-less. NOTE(review): if a
            # file actually has a header row, its first line is read as
            # an example and the label column will fail the reader's
            # int() conversion — matching the posted ValueError.
            train_file_with_header=False,
            dev_file_with_header=False,
            test_file_with_header=False,
            # predict_file_with_header=False,
            # Label set: the integer strings "0".."num_label_classes-1";
            # generated instead of the original hand-written 37-item list.
            label_list=[str(i) for i in range(num_label_classes)])
# --- Fine-tuning script: module-level side effects — loads the
# ernie_tiny module and immediately starts training on import/run. ---

# NOTE(review): model_path is only consumed by the commented-out
# MyDataset branch below; DemoDataset hard-codes its own path.
model_path = "drive\\dataset\\"
module = hub.Module(name="ernie_tiny")
# Build the pretrained program context; max_seq_len must match the reader's.
inputs, outputs, program = module.context(trainable=True, max_seq_len=max_seq_lens)
#dataset = MyDataset(dataset_dir=model_path)
dataset = DemoDataset()
# Reader converting examples to model feed tensors.
# NOTE(review): MultiLabelClassifyReader calls int(label) on every label
# value — the reported "ValueError: invalid literal for int() with base
# 10: '录'" means the TSV label column contains text, e.g. the files have
# a header row despite *_file_with_header=False, or the label and text
# columns are swapped. Fix the data files; this code is not at fault.
# TODO confirm the actual TSV layout.
reader = hub.reader.MultiLabelClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=max_seq_lens,
    )
#metrics_choices = ['acc','f1']

# AdamW-style fine-tuning strategy: no warmup, linear LR decay.
strategy = hub.AdamWeightDecayStrategy(
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_proportion=0.0,
    lr_scheduler="linear_decay",
)

# CPU run; small batch size keeps memory low for 500-token sequences.
config = hub.RunConfig(use_cuda=False, num_epoch=5, batch_size=3, strategy=strategy)

# Define a classification finetune task via PaddleHub's API.
pooled_output = outputs["pooled_output"]
# Feed order must match the tensor order the reader yields for ERNIE models.
feed_list = [
        inputs["input_ids"].name, 
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name
    ]
    
multi_label_cls_task = hub.MultiLabelClassifierTask(
    data_reader=reader,
    feature=pooled_output,
    feed_list=feed_list,
    num_classes=dataset.num_labels,
    config=config)

# Finetune and evaluate by PaddleHub's API:
# runs training, evaluation, testing, and saves the model automatically.
#multi_label_cls_task.finetune()
multi_label_cls_task.finetune_and_eval()

 

 

错误（运行时报错日志，error log from the run）:

[2020-10-06 05:12:45,274] [ INFO] - Installing ernie_tiny module
[2020-10-06 05:12:45,277] [ INFO] - Module ernie_tiny already installed in /root/.paddlehub/modules/ernie_tiny
[2020-10-06 05:12:48,429] [ INFO] - Dataset label map = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, '10': 10, '11': 11, '12': 12, '13': 13, '14': 14, '15': 15, '16': 16, '17': 17, '18': 18, '19': 19, '20': 20, '21': 21, '22': 22, '23': 23, '24': 24, '25': 25, '26': 26, '27': 27, '28': 28, '29': 29, '30': 30, '31': 31, '32': 32, '33': 33, '34': 34, '35': 35, '36': 36}
[2020-10-06 05:12:48,514] [ INFO] - Checkpoint dir: ckpt_20201006051248
[2020-10-06 05:12:48,606] [ WARNING] - PaddleHub v1.8 has deprecated the reader and feed_list parameters in the nlp Task. We provided an easier usage, in which you can use your tokenizer to preprocess dataset and run task in a clear flow. New demo see https://github.com/PaddlePaddle/PaddleHub/blob/release/v1.8/demo/text_classification/text_cls.py
/usr/local/lib/python3.6/dist-packages/paddle/fluid/clip.py:779: UserWarning: Caution! 'set_gradient_clip' is not recommended and may be deprecated in future! We recommend a new strategy: set 'grad_clip' when initializing the 'optimizer'. This method can reduce the mistakes, please refer to documention of 'optimizer'.
warnings.warn("Caution! 'set_gradient_clip' is not recommended "
[2020-10-06 05:12:50,089] [ INFO] - Strategy with linear decay, slanted triangle learning rate, weight decay regularization,
/usr/local/lib/python3.6/dist-packages/paddle/fluid/executor.py:1093: UserWarning: There are no operators in the program to be executed. If you pass Program manually, please use fluid.program_guard to ensure the current Program is being used.
warnings.warn(error_info)
[2020-10-06 05:12:50,130] [ INFO] - Try loading checkpoint from ckpt_20201006051248/ckpt.meta
[2020-10-06 05:12:50,132] [ INFO] - PaddleHub model checkpoint not found, start from scratch...
[2020-10-06 05:12:50,266] [ INFO] - PaddleHub finetune start
WARNING:root:Your reader has raised an exception!
Exception in thread Thread-17:
Traceback (most recent call last):
File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
self.run()
File "/usr/lib/python3.6/threading.py", line 864, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/reader.py", line 1145, in __thread_main__
six.reraise(*sys.exc_info())
File "/usr/local/lib/python3.6/dist-packages/six.py", line 703, in reraise
raise value
File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/reader.py", line 1125, in __thread_main__
for tensors in self._tensor_reader():
File "/usr/local/lib/python3.6/dist-packages/paddlehub/reader/nlp_reader.py", line 257, in wrapper
examples, batch_size, phase=phase):
File "/usr/local/lib/python3.6/dist-packages/paddlehub/reader/nlp_reader.py", line 186, in _prepare_batch_data
self.tokenizer, phase)
File "/usr/local/lib/python3.6/dist-packages/paddlehub/reader/nlp_reader.py", line 541, in _convert_example_to_record
label_ids.append(int(label))
ValueError: invalid literal for int() with base 10: '录'


---------------------------------------------------------------------------

EnforceNotMet Traceback (most recent call last)

in ()
147 # will finish training, evaluation, testing, save model automatically
148 #multi_label_cls_task.finetune()
--> 149 multi_label_cls_task.finetune_and_eval()
150



3 frames
/usr/local/lib/python3.6/dist-packages/paddle/fluid/reader.py in __next__(self)
1102 return self._reader.read_next_list()
1103 else:
-> 1104 return self._reader.read_next()
1105 except StopIteration:
1106 self._queue.close()


EnforceNotMet:

--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::string paddle::platform::GetTraceBackString(std::string const&, char const*, int)
1 paddle::platform::EnforceNotMet::EnforceNotMet(std::string const&, char const*, int)
2 paddle::operators::reader::BlockingQueue > >::Receive(std::vector >*)
3 paddle::operators::reader::PyReader::ReadNext(std::vector >*)
4 std::_Function_handler (), std::__future_base::_Task_setter, std::__future_base::_Result_base::_Deleter>, unsigned long> >::_M_invoke(std::_Any_data const&)
5 std::__future_base::_State_base::_M_do_set(std::function ()>&, bool&)
6 ThreadPool::ThreadPool(unsigned long)::{lambda()#1}::operator()() const

----------------------
Error Message Summary:
----------------------
Error: Blocking queue is killed because the data reader raises an exception
[Hint: Expected killed_ != true, but received killed_:1 == true:1.] at (/paddle/paddle/fluid/operators/reader/blocking_queue.h:141)

收藏
点赞
0
个赞
快速回复
TOP
切换版块