tokenizer
我获取了opt类型的tokenizer,那么enc是什么类型呢?有哪些方法呢?
python
from transformers import AutoTokenizer
enc = AutoTokenizer.from_pretrained('facebook/opt-125m')
可以通过print(enc)看到,enc是GPT2TokenizerFast类型,搜索类型的定义,在python安装包的transformers/models/gpt2/tokenization_gpt2_fast.py
python
class GPT2TokenizerFast(PreTrainedTokenizerFast):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = GPT2Tokenizer
def __init__(
self,
vocab_file=None,
merges_file=None,
tokenizer_file=None,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
**kwargs,
):
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
python
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
vocab_files_names = VOCAB_FILES_NAMES
slow_tokenizer_class: PreTrainedTokenizer = None
def __init__(self, *args, **kwargs):
tokenizer_object = kwargs.pop("tokenizer_object", None)
slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
from_slow = kwargs.pop("from_slow", False)
added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
@property#属性装饰器的作用在于将成员函数变成成员变量,访问的时候不需要/不能加()
def is_fast(self) -> bool:
@property
def can_save_slow_tokenizer(self) -> bool:
@property
def vocab_size(self) -> int:
def get_vocab(self) -> Dict[str, int]:
@property
def vocab(self) -> Dict[str, int]:
@property
def added_tokens_encoder(self) -> Dict[str, int]:
@property
def added_tokens_decoder(self) -> Dict[int, AddedToken]:
def get_added_vocab(self) -> Dict[str, int]:
def __len__(self) -> int:
@property
def backend_tokenizer(self) -> TokenizerFast:
@property
def decoder(self) -> DecoderFast:
def _convert_encoding(
self,
encoding: EncodingFast,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> Tuple[Dict[str, Any], List[EncodingFast]]:
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
def _convert_token_to_id_with_added_voc(self, token: str) -> int:
def _convert_id_to_token(self, index: int) -> Optional[str]:
def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
def num_special_tokens_to_add(self, pair: bool = False) -> int:
def convert_ids_to_tokens(
self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
def set_truncation_and_padding(
self,
padding_strategy: PaddingStrategy,
truncation_strategy: TruncationStrategy,
max_length: int,
stride: int,
pad_to_multiple_of: Optional[int],
):
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
def convert_tokens_to_string(self, tokens: List[str]) -> str:
def _decode(
self,
token_ids: Union[int, List[int]],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
**kwargs,
) -> str:
def _save_pretrained(
self,
save_directory: Union[str, os.PathLike],
file_names: Tuple[str],
legacy_format: Optional[bool] = None,
filename_prefix: Optional[str] = None,
) -> Tuple[str]:
def train_new_from_iterator(
self,
text_iterator,
vocab_size,
length=None,
new_special_tokens=None,
special_tokens_map=None,
**kwargs,
):