Tokenizer
Last updated: 2022-07-05
Base class definition
The Tokenizer base class is defined in ./wenxin/data/tokenizer/tokenizer.py.
```python
# -*- coding: utf-8 -*-
"""
:py:class:`Tokenizer`
"""
from wenxin.common.register import RegisterSet
from wenxin.data.vocabulary import Vocabulary


@RegisterSet.tokenizer.register
class Tokenizer(object):
    """Tokenizer"""

    def __init__(self, vocab_file, split_char=" ", unk_token="[UNK]", params=None):
        """
        :param vocab_file: path to the vocabulary file
        :param split_char: delimiter used in the plain text, defaults to a space
        :param unk_token: token used for unknown words, defaults to [UNK]
        :param params: extra parameters used by individual tokenizers, dict
        """
        self.vocabulary = None
        if vocab_file:
            self.vocabulary = Vocabulary(vocab_file, unk_token)
        self.split_char = split_char
        self.unk_token = unk_token
        self.params = params

    def tokenize(self, text):
        """
        Tokenize the plain text.
        :param text:
        :return: tokens, a list
        """
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens):
        """
        Convert a list of plain-text tokens into a list of ids.
        :param tokens:
        :return:
        """
        raise NotImplementedError

    def convert_ids_to_tokens(self, ids):
        """
        Convert a list of ids back into a list of plain-text tokens.
        :param ids:
        :return:
        """
        raise NotImplementedError

    def covert_id_to_token(self, id):
        """
        Convert a single id into a single plain-text token.
        :param id:
        :return: token
        """
        return self.vocabulary.covert_id_to_token(id)

    def covert_token_to_id(self, token):
        """
        Convert a single plain-text token into an id.
        :param token:
        :return: id
        """
        return self.vocabulary.covert_token_to_id(token)

    def merge_subword(self, tokens):
        """
        In subword mode, merge multiple subword tokens back into one full token.
        :param tokens:
        :return: merged_tokens
        """
        return tokens
```
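Note that covert_id_to_token and covert_token_to_id simply delegate to the Vocabulary, and merge_subword is a no-op hook that subword-based tokenizers can override. Purely as an illustration (this class is not part of 文心), a subclass that uses WordPiece-style "##" continuation markers might merge subwords roughly like this:

```python
# Hypothetical sketch: overriding merge_subword() in a subclass, assuming
# WordPiece-style "##" continuation markers. Shown only to illustrate the hook.
from wenxin.data.tokenizer.tokenizer import Tokenizer


class WordpieceStyleTokenizer(Tokenizer):
    def merge_subword(self, tokens):
        """Merge ["play", "##ing"] back into ["playing"]."""
        merged_tokens = []
        for token in tokens:
            if token.startswith("##") and merged_tokens:
                # Continuation piece: append it to the previous token.
                merged_tokens[-1] += token[2:]
            else:
                merged_tokens.append(token)
        return merged_tokens


# e.g. merge_subword(["play", "##ing", "foot", "##ball"]) -> ["playing", "football"]
```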
Core functions
As a base class, Tokenizer leaves 3 core functions for users to implement in a subclass according to their own business scenario:
- tokenize(self, text): tokenizes the plain text into a list of tokens.
- convert_tokens_to_ids(self, tokens): converts a list of tokens into a list of ids.
- convert_ids_to_tokens(self, ids): converts a list of ids back into a list of tokens.
Custom implementation example
Taking CustomTokenizer, the most common general-purpose Tokenizer currently provided by 文心 (wenxin), as an example, the implementation of its 3 core functions is explained below; see the following code and the comments on its key parts.
```python
# -*- coding: utf-8 -*-
"""
:py:class:`CustomTokenizer`
"""
from wenxin.common.register import RegisterSet
from wenxin.data.tokenizer.tokenizer import Tokenizer
from wenxin.utils.util_helper import convert_to_unicode


@RegisterSet.tokenizer.register
class CustomTokenizer(Tokenizer):
    """CustomTokenizer: for text the user has already segmented; this tokenizer
    only splits it into a list by a delimiter (e.g. " ")."""

    def __init__(self, vocab_file, split_char=" ", unk_token="[UNK]", params=None):
        """
        :param vocab_file: path to the vocabulary file
        :param split_char: delimiter used in the plain text, defaults to a space
        """
        Tokenizer.__init__(self, vocab_file, split_char, unk_token, params)
        self.split_char = split_char

    def tokenize(self, text):
        """
        Split the pre-segmented text into tokens by the delimiter.
        :param text:
        :return:
        """
        text = convert_to_unicode(text)
        split_tokens = text.split(self.split_char)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """
        Look up each token in the vocabulary and return its id.
        :param tokens:
        :return:
        """
        return self.vocabulary.convert_tokens_to_ids(tokens)

    def convert_ids_to_tokens(self, ids):
        """
        Map each id back to its token via the vocabulary.
        :param ids:
        :return:
        """
        return self.vocabulary.convert_ids_to_tokens(ids)
```
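A minimal usage sketch follows. The import path and the vocabulary file name and contents are assumptions for illustration (a one-token-per-line vocab file is the common convention); check the actual module path and Vocabulary file format against the wenxin source.

```python
# Hypothetical usage sketch; the module path below is an assumption.
from wenxin.data.tokenizer.custom_tokenizer import CustomTokenizer

# "vocab.txt" is a hypothetical vocabulary file, assumed to contain one token
# per line, e.g.:
#   [PAD]
#   [UNK]
#   百度
#   文心
tokenizer = CustomTokenizer(vocab_file="vocab.txt")

# The input text is expected to be pre-segmented, with tokens joined by split_char.
tokens = tokenizer.tokenize("百度 文心")          # ["百度", "文心"]
ids = tokenizer.convert_tokens_to_ids(tokens)     # e.g. [2, 3]
back = tokenizer.convert_ids_to_tokens(ids)       # ["百度", "文心"]
```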