Qwen-Image-Edit

Following Qwen-Image and Qwen-Image-Edit, the latest version, Qwen-Image-Edit-2509, has been released. Let's try it out.

As of 2025-09-23 it does not work with diffusers-0.35.1. Install the development version (diffusers-0.36.0.dev0) with pip install git+https://github.com/huggingface/diffusers.
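
A quick way to confirm that the development version is actually the one in use (just a sanity check; the exact .dev version string may differ):

import diffusers
print(diffusers.__version__)  # expect 0.36.0.dev0 or later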

import torch
from PIL import Image
from diffusers import QwenImageEditPlusPipeline

pipeline = QwenImageEditPlusPipeline.from_pretrained("Qwen/Qwen-Image-Edit-2509", torch_dtype=torch.bfloat16)

pipeline.to('mps')  # use 'mps' on a Mac
pipeline.set_progress_bar_config(disable=None)
image1 = Image.open("input1.png")
image2 = Image.open("input2.png")
prompt = "The magician bear is on the left, the alchemist bear is on the right, facing each other in the central park square."
inputs = {
    "image": [image1, image2],
    "prompt": prompt,
    "generator": torch.manual_seed(0),
    "true_cfg_scale": 4.0,
    "negative_prompt": " ",
    "num_inference_steps": 40,
    "num_images_per_prompt": 1,
}
with torch.inference_mode():
    output = pipeline(**inputs)
    output_image = output.images[0]
    output_image.save("example.webp", quality=85)  # 非可逆
    # output_image.save("example.webp", lossless=True, method=6) # 可逆

The output can be saved as JPEG or PNG just as well, but I usually use WebP, either lossy or lossless, as in the example above.
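
For reference, the same Pillow image object can be written out in the other formats mentioned above; a minimal sketch (filenames are arbitrary):

output_image.save("example.png")              # PNG, lossless
output_image.save("example.jpg", quality=90)  # JPEG, lossy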

I had GPT-5 write code that saves the intermediate steps; only the changes are shown below. Adding this makes the whole run a little slower. Of the 40 steps, everything up to about step 20 is just noise, around step 25 the image finally starts to emerge vaguely, and only around step 30 can you tell whether the result is going to be acceptable.

height, width = 1024, 1024

# --- live preview callback ---
def on_step_end(pipe, i, t, kw):
    tokens = kw["latents"]  # shape: [B, num_patches, 64]
    with torch.no_grad():
        # 1) unpack packed tokens -> VAE latent grid [B, z_dim(=16), T(=1), H, W]
        lat = pipe._unpack_latents(tokens, height, width, pipe.vae_scale_factor)  # private helper used by the pipeline

        # 2) match VAE dtype/device
        lat = lat.to(pipe.vae.dtype, non_blocking=True)

        # 3) un-normalize (pipeline does this right before decoding)
        mean = torch.tensor(pipe.vae.config.latents_mean, device=lat.device, dtype=lat.dtype).view(1, pipe.vae.config.z_dim, 1, 1, 1)
        stdinv = 1.0 / torch.tensor(pipe.vae.config.latents_std, device=lat.device, dtype=lat.dtype).view(1, pipe.vae.config.z_dim, 1, 1, 1)
        lat = lat / stdinv + mean

        # 4) decode and postprocess to a PIL image
        frame = pipe.vae.decode(lat, return_dict=False)[0][:, :, 0]  # take temporal dim 0
        pil = pipe.image_processor.postprocess(frame, output_type="pil")[0]

        pil.save(f"step_{i:03d}.webp")  # or display in-notebook
        print(f"[preview] step {i:02d} saved")

    # IMPORTANT: return the (possibly updated) tensors for the sampler to continue
    return {"latents": tokens}

inputs = {
    "image": [image1, image2],
    "prompt": prompt,
    "generator": torch.manual_seed(0),
    "true_cfg_scale": 4.0,
    "negative_prompt": " ",
    "num_inference_steps": 40,
    "num_images_per_prompt": 1,
    "height": height,
    "width": width,
    "callback_on_step_end": on_step_end,
    "callback_on_step_end_tensor_inputs": ["latents"],  # what we want passed into the callback
}
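
The generation call itself is unchanged from the first example; running the pipeline with these inputs produces the final image as before and, via the callback, writes a step_NNN.webp preview at every step:

with torch.inference_mode():
    output = pipeline(**inputs)
    output.images[0].save("example.webp", quality=85)  # lossy, as above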