HuggingFace's transformers library

tokenizer

I loaded the tokenizer for an OPT model. What type is enc, and which methods does it expose?

python
from transformers import AutoTokenizer

# download (or load from cache) the tokenizer files shipped with facebook/opt-125m
enc = AutoTokenizer.from_pretrained('facebook/opt-125m')

Calling print(enc) shows that enc is a GPT2TokenizerFast: OPT reuses GPT-2's byte-level BPE tokenizer. The class itself is defined in transformers/models/gpt2/tokenization_gpt2_fast.py inside the installed transformers package.
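
A quick way to confirm this without opening the source is to inspect the type and its method resolution order. A minimal sketch; the exact ancestry printed depends on your transformers version:

python

from transformers import AutoTokenizer

enc = AutoTokenizer.from_pretrained('facebook/opt-125m')
print(type(enc).__name__)  # GPT2TokenizerFast

# walk the inheritance chain to see where each method comes from
for cls in type(enc).__mro__:
    print(cls.__module__, cls.__name__)

The skeleton of GPT2TokenizerFast is short; beyond wiring up the GPT-2 vocabulary files, it mainly pins down the special tokens: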

python
class GPT2TokenizerFast(PreTrainedTokenizerFast):
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]  # keys produced when encoding
    slow_tokenizer_class = GPT2Tokenizer  # pure-Python fallback implementation
    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )
    # (bodies elided)
    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: ...
    def _encode_plus(self, *args, **kwargs) -> BatchEncoding: ...
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: ...
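
Note that unk_token, bos_token, and eos_token all default to "<|endoftext|>": GPT-2's byte-level BPE can encode any string, so the unknown token is largely vestigial. A short round-trip demo; the facebook/opt-125m checkpoint overrides some of these defaults in its tokenizer_config.json, so the printed values are checkpoint-specific:

python

from transformers import AutoTokenizer

enc = AutoTokenizer.from_pretrained('facebook/opt-125m')
print(enc.unk_token, enc.bos_token, enc.eos_token)  # checkpoint-specific values

ids = enc("Hello world")["input_ids"]  # __call__ eventually reaches _encode_plus
print(ids)
print(enc.decode(ids))                 # the inverse mapping, via _decode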
Most of the interesting machinery lives in the base class PreTrainedTokenizerFast, whose surface looks like this (bodies elided):

python
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class: PreTrainedTokenizer = None

    def __init__(self, *args, **kwargs):
        # pop fast-tokenizer-specific kwargs before the generic initialization
        tokenizer_object = kwargs.pop("tokenizer_object", None)
        slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
        fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
        from_slow = kwargs.pop("from_slow", False)
        added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
    @property  # a property turns a method into a read-only attribute: it is accessed without (and cannot take) parentheses
    def is_fast(self) -> bool:
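        # True for every PreTrainedTokenizerFast: backed by the Rust tokenizers library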
    @property
    def can_save_slow_tokenizer(self) -> bool:
    @property
    def vocab_size(self) -> int:
    def get_vocab(self) -> Dict[str, int]:
    @property
    def vocab(self) -> Dict[str, int]:
    @property
    def added_tokens_encoder(self) -> Dict[str, int]:
    @property
    def added_tokens_decoder(self) -> Dict[int, AddedToken]:
    def get_added_vocab(self) -> Dict[str, int]:
    def __len__(self) -> int:
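        # vocab_size plus the number of added tokens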
    @property
    def backend_tokenizer(self) -> TokenizerFast:
    @property
    def decoder(self) -> DecoderFast:
    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> Tuple[Dict[str, Any], List[EncodingFast]]:
    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
    def _convert_id_to_token(self, index: int) -> Optional[str]:
    def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
    def num_special_tokens_to_add(self, pair: bool = False) -> int:
    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
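        # text -> list of subword strings, without mapping to ids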
    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
    ):
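        # applies the chosen truncation/padding settings to the backend Rust tokenizer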
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
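        # the workhorse behind __call__ / batch_encode_plus for list inputs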
    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
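        # single-input variant; delegates to _batch_encode_plus internally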
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs,
    ) -> str:
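        # ids -> text via the backend tokenizer's decoder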
    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
    def train_new_from_iterator(
        self,
        text_iterator,
        vocab_size,
        length=None,
        new_special_tokens=None,
        special_tokens_map=None,
        **kwargs,
    ):
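
With the signatures above in mind, here is a quick tour of the most commonly used members. This is a minimal sketch; the exact ids, sizes, and printed types depend on the checkpoint and the transformers version:

python

from transformers import AutoTokenizer

enc = AutoTokenizer.from_pretrained('facebook/opt-125m')

print(enc.is_fast)      # True: Rust-backed implementation
print(enc.vocab_size)   # base vocabulary size, excluding added tokens
print(len(enc))         # __len__: vocab_size + number of added tokens

tokens = enc.tokenize("Hello world")      # text -> subword strings
ids = enc.convert_tokens_to_ids(tokens)   # strings -> ids
print(tokens, ids)
print(enc.convert_ids_to_tokens(ids))     # ids -> strings again

# A list input routes through _batch_encode_plus; padding requires a pad
# token, which the facebook/opt-125m checkpoint defines.
batch = enc(["short", "a much longer sentence"], padding=True)
print(batch["input_ids"])
print(batch["attention_mask"])

print(type(enc.backend_tokenizer))  # the underlying Rust tokenizer object

Finally, train_new_from_iterator retrains the underlying BPE model on new text while keeping the same pipeline configuration. A sketch with a hypothetical two-line corpus:

python

corpus = ["some in-domain text", "more in-domain text"]  # hypothetical corpus
new_enc = enc.train_new_from_iterator(corpus, vocab_size=1000)
print(new_enc.vocab_size)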