Fooocus Framework Code Analysis

Given Fooocus's excellent inference performance, we are considering migrating from webui to Fooocus, which makes a deep dive into its code worthwhile. Fooocus runs SDXL comfortably on 11 GB of VRAM, while webui's SDXL needs at least 12 GB, so its optimizations relative to webui deserve particular attention. At the code level, however, it is not a framework in the same mold as webui: webui ships a hook-based plugin system, whereas Fooocus, which aims at a Midjourney-like experience, deliberately avoids the third-party-plugin route.

On AutoDL: python launch.py --listen --port 6006

entry_with_update.py

```bash
# launch
python entry_with_update.py --listen
```

launch.py ->
prepare_environment() ->
ini_comfy_args() ->
- args_manager.py -> args = comfy_cli.args -> backend/headless/comfy/cli_args.py
download_models() ->
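The reason ini_comfy_args() exists is that comfy's cli_args module runs argparse at import time, so Fooocus-only flags must be hidden from it. A minimal sketch of that pattern (names and details assumed for illustration, not copied from the source):

```python
# Hypothetical sketch of the argv handoff between Fooocus and the comfy backend.
import sys

def ini_comfy_args_sketch():
    argv = sys.argv
    sys.argv = [sys.argv[0]]          # hide Fooocus-specific flags from comfy's parser
    from comfy.cli_args import args   # importing triggers comfy's argparse
    sys.argv = argv                   # restore for Fooocus's own args_manager.py
    return args
```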

webui.py ->
- run_button.click().then(fn=generate_clicked) ->
-- modules/async_worker.py -> worker() -> threading.Thread(target=worker).start()
-- handler() -> # receives the task parameters and configures the job
-- prompt_processing/vary/upscale/inpaint/controlnet/
-- imgs = pipeline.process_diffusion(...) ->
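The worker thread decouples the Gradio click handler from the long-running diffusion job. A condensed sketch of this producer/consumer pattern with simplified names (the real async_worker.py carries far more state per task):

```python
import threading
import time

async_tasks = []  # the Gradio handler appends task objects here

def handler(task):
    # parse task args, then run vary/upscale/inpaint/controlnet/diffusion
    print("processing", task)

def worker():
    while True:           # daemon loop: poll for queued tasks
        time.sleep(0.01)
        if async_tasks:
            handler(async_tasks.pop(0))

threading.Thread(target=worker, daemon=True).start()
async_tasks.append({"prompt": "a cat"})  # what run_button.click() effectively does
time.sleep(0.1)                          # demo only: let the daemon pick it up
```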

modules/default_pipeline.py -> process_diffusion() is the main pipeline; in webui the counterparts are the two core interfaces, StableDiffusionProcessingTxt2Img and StableDiffusionProcessingImg2Img.

```python
if latent is None:
    empty_latent = core.generate_empty_latent(width, height, 1)
else:
    empty_latent = latent

sampled_latent = core.ksampler(final_unet, final_refiner, positive_cond, negative_cond,
                               empty_latent, steps, denoise, callback, cfg_scale,
                               sampler_name, scheduler_name, switch)
decoded_latent = core.decode_vae(vae, sampled_latent, ...)
images = core.pytorch_to_numpy(decoded_latent)
```

Default method: refresh_everything()

```python
refresh_everything() ->

refresh_refiner_model(refiner_model_name)
refresh_base_model(base_model_name)
refresh_loras(loras)

prepare_text_encoder(True)
```
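refresh_everything() is cheap to call repeatedly because each refresh_* step caches by model filename and returns early when nothing changed. A sketch of that caching idea (variable names assumed for illustration; core.load_model() is the checkpoint loader in core.py):

```python
import modules.core as core   # Fooocus's core module

model_base = None
model_base_name = None

def refresh_base_model_sketch(name):
    global model_base, model_base_name
    if model_base_name == name and model_base is not None:
        return model_base               # unchanged name: skip disk I/O and VRAM churn
    model_base = core.load_model(name)  # wraps CheckpointLoaderSimple
    model_base_name = name
    return model_base
```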

core.py -> generate_empty_latent()

```python
opEmptyLatentImage.generate(width, height, batch_size)[0]
```

- backend/headless/nodes.py -> EmptyLatentImage.generate()
- latent = torch.zeros([bs, 4, height // 8, width // 8]) -> {'samples': latent}
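The latent geometry is worth internalizing: the SDXL VAE downsamples each spatial dimension by 8 and uses 4 latent channels, so the "empty latent" is just zeros of that shape. A quick check:

```python
import torch

width, height, batch_size = 1024, 1024, 1
latent = torch.zeros([batch_size, 4, height // 8, width // 8])
print(latent.shape)  # torch.Size([1, 4, 128, 128])
```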

core.py -> ksampler() -> backend/headless/comfy/sample.py

```python
core -> ksampler()

latent_image = latent['samples']
noise = comfy.sample.prepare_noise(latent_image, seed, ...)

samples = comfy.sample.sample(model, noise, steps, cfg, sampler_name, scheduler,
                              positive, negative, latent_image, ...)
- backend/headless/comfy/sample.py -> sample()
- real_model, positive_copy, negative_copy, noise_mask, models = prepare_sampling(model, noise.shape, positive...)
- sampler = comfy.samplers.KSampler(...)
- samples = sampler.sample(noise, positive_copy, negative_copy, cfg, latent_image...)
-- sampler = sampler_class(self.sampler)
-- sample(self.model, noise, positive, ...)
```
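prepare_noise() is what makes generation reproducible for a given seed: it draws Gaussian noise the same shape as the latent from a seeded generator. A reduced sketch of that behaviour (the real function also handles batch indices and noise masks):

```python
import torch

def prepare_noise_sketch(latent_image: torch.Tensor, seed: int) -> torch.Tensor:
    generator = torch.manual_seed(seed)   # deterministic per request
    return torch.randn(latent_image.size(), dtype=latent_image.dtype,
                       layout=latent_image.layout, generator=generator, device="cpu")
```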

sampler_class() -> backend/headless/comfy/samplers.py

```python
sampler_class(name) ->
sampler = ksampler(name) -> class KSAMPLER(Sampler)
```

sample -> modules/sample_hijack.py hijacks comfy's sample() (originally in backend/headless/comfy/samplers.py)

```python
sample_hijack(model, noise, positive, negative, cfg, device, sampler, sigmas,
              model_options, latent_image, denoise_mask, callback, ...) ->

positive = positive[:]
negative = negative[:]

model_wrap = wrap_model(model)
- model_denoise = CFGNoisePredictor(model)
- model_wrap = k_diffusion_external.CompVisDenoiser(model_denoise)

calculate_start_end_timesteps(model_wrap, negative)
calculate_start_end_timesteps(model_wrap, positive)
for c in positive:
    create_cond_with_same_area_if_none(negative, c)
for c in negative:
    create_cond_with_same_area_if_none(positive, c)
pre_run_control(model_wrap, negative + positive)  # ControlNet pre-run

latent_image = model.process_latent_in(latent_image)

samples = sampler.sample(model_wrap, sigmas, extra_args, ...)
model.process_latent_out(samples)
```
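CFGNoisePredictor is where the cfg scale actually enters: the model is evaluated for both the conditional and unconditional prompts, and the two predictions are blended. The core formula, written as a standalone function (a sketch of the standard classifier-free guidance rule used across SD implementations, not the exact source):

```python
import torch

def cfg_combine(cond_pred: torch.Tensor, uncond_pred: torch.Tensor, cfg_scale: float) -> torch.Tensor:
    # classifier-free guidance: push the prediction away from the unconditional
    # result and toward the conditional one, scaled by cfg_scale
    return uncond_pred + cfg_scale * (cond_pred - uncond_pred)
```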

backend/headless/comfy/samplers.py

```python
class KSAMPLER -> sample(model_wrap, sigmas, extra_args, callback, ...)
model_k = KSamplerX0Inpaint(model_wrap)

if sampler_name == "dpm_fast":
    samples = k_diffusion_sampling.sample_dpm_fast(model_k, noise, ...)
elif sampler_name == "dpm_adaptive":
    samples = k_diffusion_sampling.sample_dpm_adaptive(model_k, noise, ...)
else:
    samples = getattr(k_diffusion_sampling, "sample_{}".format(sampler_name))(model_k, noise, ...)
```

backend/headless/comfy/k_diffusion/sampling.py

```python
sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args, callback, ...)
noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, ...) if noise_sampler is None else noise_sampler
sample_dpmpp_2m_sde(model, x, ...)
```
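What the sampler actually iterates over is the sigma schedule selected by the scheduler_name passed down from core.ksampler(). For example, Karras spacing can be produced with the helper in k_diffusion (import path shown for the upstream k-diffusion package; inside Fooocus the same code lives under backend/headless/comfy/k_diffusion):

```python
from k_diffusion.sampling import get_sigmas_karras

# 30 steps between typical SD sigma bounds; sigmas[0] is the largest,
# and the schedule is padded with a trailing zero
sigmas = get_sigmas_karras(n=30, sigma_min=0.0292, sigma_max=14.6146, rho=7.0)
print(sigmas[0].item(), sigmas[-1].item())  # ~14.61 ... 0.0
```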

sample_dpmpp_2m_sde

```python
s_in = x.new_ones([x.shape[0]])           # per-sample sigma broadcast helper
old_denoised, h_last, h = None, None, None

for i in trange(len(sigmas) - 1):
    denoised = model(x, sigmas[i] * s_in, **extra_args)
    if callback is not None:
        callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
    if sigmas[i + 1] == 0:
        x = denoised                      # final step: return the model's clean prediction
    else:
        # DPM-Solver++(2M) SDE, stepping in t = -log(sigma) space
        t, s = -sigmas[i].log(), -sigmas[i + 1].log()
        h = s - t
        eta_h = eta * h

        x = sigmas[i + 1] / sigmas[i] * (-eta_h).exp() * x + (-h - eta_h).expm1().neg() * denoised

        if old_denoised is not None:      # second-order correction from the previous step
            r = h_last / h
            if solver_type == 'heun':
                x = x + ((-h - eta_h).expm1().neg() / (-h - eta_h) + 1) * (1 / r) * (denoised - old_denoised)
            elif solver_type == 'midpoint':
                x = x + 0.5 * (-h - eta_h).expm1().neg() * (1 / r) * (denoised - old_denoised)

        if eta:                           # SDE part: re-inject fresh noise
            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * eta_h).expm1().neg().sqrt() * s_noise

    old_denoised = denoised
    h_last = h
```
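Note the change of variables t = -log(sigma): h is the step size in that space. With eta = 0, the extra decay factor exp(-eta_h) becomes 1 and the noise-injection branch is skipped, so the update collapses to the deterministic DPM-Solver++(2M) step; the "SDE" in the name is exactly that optional re-noising term drawn from the BrownianTreeNoiseSampler.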

backend/headless/nodes.py: a node is just a class.

```python
NODE_CLASS_MAPPINGS = {
    "KSampler": KSampler,
    "CheckpointLoaderSimple": CheckpointLoaderSimple,
    "CLIPTextEncode": CLIPTextEncode,
    "CLIPSetLastLayer": CLIPSetLastLayer,
    "VAEDecode": VAEDecode,
    "VAEEncode": VAEEncode,
    "VAEEncodeForInpaint": VAEEncodeForInpaint,
    "VAELoader": VAELoader,
    "EmptyLatentImage": EmptyLatentImage,
    "LatentUpscale": LatentUpscale,
    "LatentUpscaleBy": LatentUpscaleBy,
    "LatentFromBatch": LatentFromBatch,
    "RepeatLatentBatch": RepeatLatentBatch,
    "SaveImage": SaveImage,
    "PreviewImage": PreviewImage,
    "LoadImage": LoadImage,
    "LoadImageMask": LoadImageMask,
    "ImageScale": ImageScale,
    "ImageScaleBy": ImageScaleBy,
    "ImageInvert": ImageInvert,
    "ImageBatch": ImageBatch,
    "ImagePadForOutpaint": ImagePadForOutpaint,
    "EmptyImage": EmptyImage,
    "ConditioningAverage": ConditioningAverage,
    "ConditioningCombine": ConditioningCombine,
    "ConditioningConcat": ConditioningConcat,
    "ConditioningSetArea": ConditioningSetArea,
    "ConditioningSetAreaPercentage": ConditioningSetAreaPercentage,
    "ConditioningSetMask": ConditioningSetMask,
    "KSamplerAdvanced": KSamplerAdvanced,
    "SetLatentNoiseMask": SetLatentNoiseMask,
    "LatentComposite": LatentComposite,
    "LatentBlend": LatentBlend,
    "LatentRotate": LatentRotate,
    "LatentFlip": LatentFlip,
    "LatentCrop": LatentCrop,
    "LoraLoader": LoraLoader,
    "CLIPLoader": CLIPLoader,
    "UNETLoader": UNETLoader,
    "DualCLIPLoader": DualCLIPLoader,
    "CLIPVisionEncode": CLIPVisionEncode,
    "StyleModelApply": StyleModelApply,
    "unCLIPConditioning": unCLIPConditioning,
    "ControlNetApply": ControlNetApply,
    "ControlNetApplyAdvanced": ControlNetApplyAdvanced,
    "ControlNetLoader": ControlNetLoader,
    "DiffControlNetLoader": DiffControlNetLoader,
    "StyleModelLoader": StyleModelLoader,
    "CLIPVisionLoader": CLIPVisionLoader,
    "VAEDecodeTiled": VAEDecodeTiled,
    "VAEEncodeTiled": VAEEncodeTiled,
    "unCLIPCheckpointLoader": unCLIPCheckpointLoader,
    "GLIGENLoader": GLIGENLoader,
    "GLIGENTextBoxApply": GLIGENTextBoxApply,

    "CheckpointLoader": CheckpointLoader,
    "DiffusersLoader": DiffusersLoader,

    "LoadLatent": LoadLatent,
    "SaveLatent": SaveLatent,

    "ConditioningZeroOut": ConditioningZeroOut,
    "ConditioningSetTimestepRange": ConditioningSetTimestepRange,
}
```
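Every entry in the mapping follows the same lightweight contract: class-level metadata describing inputs and outputs, plus one entry-point method named by FUNCTION. A minimal illustrative node in that style (a simplified re-statement of EmptyLatentImage, not the actual source):

```python
import torch

class EmptyLatentImageSketch:
    @classmethod
    def INPUT_TYPES(cls):
        return {"required": {"width": ("INT", {"default": 1024}),
                             "height": ("INT", {"default": 1024}),
                             "batch_size": ("INT", {"default": 1})}}

    RETURN_TYPES = ("LATENT",)
    FUNCTION = "generate"
    CATEGORY = "latent"

    def generate(self, width, height, batch_size=1):
        latent = torch.zeros([batch_size, 4, height // 8, width // 8])
        return ({"samples": latent},)
```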