HuggingFace's transformers library

tokenizer

I loaded the tokenizer for an OPT model. What type is enc, and what methods does it provide?

python
from transformers import AutoTokenizer
enc = AutoTokenizer.from_pretrained('facebook/opt-125m')
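
Before reading the class definition, the quickest way to see what enc can do is simply to call it. A minimal sketch, continuing from the snippet above (it assumes the facebook/opt-125m files are cached or can be downloaded; the sample sentence is made up for illustration):

python
out = enc("Hello, tokenizer!")          # calling the tokenizer returns a BatchEncoding (dict-like)
print(out["input_ids"])                 # token ids; OPT prepends its BOS token
print(out["attention_mask"])            # 1 for every real token

print(enc.convert_ids_to_tokens(out["input_ids"]))             # ids -> sub-word tokens
print(enc.decode(out["input_ids"], skip_special_tokens=True))  # ids -> text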

print(enc) shows that enc is of type GPT2TokenizerFast. Searching for this type's definition, it lives in transformers/models/gpt2/tokenization_gpt2_fast.py inside the installed Python package:

python
class GPT2TokenizerFast(PreTrainedTokenizerFast):
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = GPT2Tokenizer
    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )
    # Bodies omitted; these two overrides mainly check that add_prefix_space is set
    # when the input is already split into words, then defer to the parent class.
    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: ...

    def _encode_plus(self, *args, **kwargs) -> BatchEncoding: ...

    # Writes vocab.json and merges.txt.
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: ...
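So GPT2TokenizerFast itself only overrides _batch_encode_plus, _encode_plus, and save_vocabulary. A short sketch of how those are reached through the public API, continuing with the enc loaded earlier (the output directory name is just an example):

python
# A single string goes through encode_plus -> _encode_plus,
# a list of strings goes through batch_encode_plus -> _batch_encode_plus.
single = enc("a single sentence")
batch = enc(["first sentence", "second sentence"], padding=True)  # OPT has a <pad> token, so padding works
print(single["input_ids"])
print(batch["input_ids"])               # one list of ids per input sentence

# save_pretrained() writes the tokenizer files to disk; for this GPT-2 style
# tokenizer that includes vocab.json and merges.txt, produced by save_vocabulary().
enc.save_pretrained("./opt-125m-tokenizer")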
Everything else comes from the parent class PreTrainedTokenizerFast, defined in transformers/tokenization_utils_fast.py; its interface (signatures only) looks like this:

python
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class: PreTrainedTokenizer = None

    def __init__(self, *args, **kwargs):
        tokenizer_object = kwargs.pop("tokenizer_object", None)
        slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
        fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
        from_slow = kwargs.pop("from_slow", False)
        added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
        # ... rest of __init__ omitted; the method bodies below are omitted as well
    @property  # @property exposes a method as an attribute: it is read without parentheses (and cannot be called with them)
    def is_fast(self) -> bool:
    @property
    def can_save_slow_tokenizer(self) -> bool:
    @property
    def vocab_size(self) -> int:
    def get_vocab(self) -> Dict[str, int]:
    @property
    def vocab(self) -> Dict[str, int]:
    @property
    def added_tokens_encoder(self) -> Dict[str, int]:
    @property
    def added_tokens_decoder(self) -> Dict[int, AddedToken]:
    def get_added_vocab(self) -> Dict[str, int]:
    def __len__(self) -> int:
    @property
    def backend_tokenizer(self) -> TokenizerFast:
    @property
    def decoder(self) -> DecoderFast:
    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> Tuple[Dict[str, Any], List[EncodingFast]]:
    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
    def _convert_id_to_token(self, index: int) -> Optional[str]:
    def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
    def num_special_tokens_to_add(self, pair: bool = False) -> int:
    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
    ):
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs,
    ) -> str:
    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
    def train_new_from_iterator(
        self,
        text_iterator,
        vocab_size,
        length=None,
        new_special_tokens=None,
        special_tokens_map=None,
        **kwargs,
    ):
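
Most everyday use of enc maps directly onto the methods and properties listed above. A sketch exercising a few of them, again with the enc from the beginning (printed values depend on the tokenizer version, so they are not shown here):

python
print(enc.is_fast)                        # True: backed by the Rust `tokenizers` library
print(enc.vocab_size)                     # size of the base vocabulary, without added tokens
print(len(enc))                           # __len__: base vocabulary plus added tokens

tokens = enc.tokenize("Hello world")      # text -> sub-word tokens
ids = enc.convert_tokens_to_ids(tokens)   # tokens -> ids
print(tokens, ids)
print(enc.convert_ids_to_tokens(ids))     # ids -> tokens
print(enc.decode(ids))                    # ids -> text

print(type(enc.backend_tokenizer))        # the underlying tokenizers.Tokenizer object
print(list(enc.get_vocab().items())[:5])  # a few (token, id) pairs from the full vocabulary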