Loading Meta-Llama-3.1-8B-bnb-4bit:
```python
import os
from pathlib import Path

import torch
from PIL import Image
from huggingface_hub import InferenceClient
from torch import nn
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    BitsAndBytesConfig,
    PreTrainedTokenizer,
    PreTrainedTokenizerFast,
)
# ===============================
# CLIP (SigLIP)
# ===============================
# model_id = "google/siglip-so400m-patch14-384"
# CLIP_PATH = download_hg_model(model_id, "clip")
# clip_processor = AutoProcessor.from_pretrained(
#     CLIP_PATH,
#     trust_remote_code=True,
# )
# clip_model = AutoModel.from_pretrained(
#     CLIP_PATH,
#     trust_remote_code=True,
# )
# clip_model = clip_model.vision_model
# clip_model.eval()
# clip_model.requires_grad_(False)
# clip_model.to("cuda")
# ===============================
# LLM (LLaMA 3.1 4bit)
# ===============================
MODEL_PATH = "/data/lbg/models/textoon/ComfyUI/models/LLM/Meta-Llama-3.1-8B-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    use_fast=True,
)
assert isinstance(
    tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
), f"Tokenizer is of type {type(tokenizer)}"
# ⭐ 2️⃣ bitsandbytes 4-bit configuration (the key part)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store weights in 4-bit precision
    bnb_4bit_quant_type="nf4",             # NF4 quantization (recommended for LLM weights)
    bnb_4bit_compute_dtype=torch.float16,  # run compute in fp16
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
)
# ⭐ 3️⃣ Load the 4-bit model correctly
text_model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto",  # place layers on available devices automatically
    trust_remote_code=True,
)
text_model.eval()
```
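After loading, a minimal generation call is a quick way to confirm the quantized model and tokenizer work together. This is only a sketch, not part of the original script; the prompt and generation parameters below are arbitrary, and it reuses the `tokenizer` and `text_model` objects created above.

```python
# Smoke test for the 4-bit model loaded above (sketch; prompt is arbitrary).
prompt = "Describe a cartoon character wearing a red scarf."

# Llama 3.1 ships without a pad token; reuse EOS so generate() does not complain.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

inputs = tokenizer(prompt, return_tensors="pt").to(text_model.device)

with torch.inference_mode():
    output_ids = text_model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```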
`bitsandbytes` must be installed for the 4-bit load to work:

```bash
pip install bitsandbytes
```
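To check that bitsandbytes is picked up and the weights were actually quantized, the model's memory footprint is a useful rough indicator; `get_memory_footprint()` is a standard `PreTrainedModel` method, and the size mentioned in the comment is only a ballpark expectation, not a measured value.

```python
# Sanity check after loading (sketch; assumes `text_model` from the script above).
import bitsandbytes as bnb
import torch

print("bitsandbytes:", bnb.__version__)
print("CUDA available:", torch.cuda.is_available())

# Rough check: an 8B model in 4-bit should occupy on the order of 5-6 GB,
# versus roughly 16 GB in fp16. Exact numbers depend on the model and config.
print(f"Model footprint: {text_model.get_memory_footprint() / 1e9:.2f} GB")
```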