Query Vision Language Model#
Querying Qwen2.5-VL#
[ ]:
import nest_asyncio
nest_asyncio.apply()  # Run this first: patch asyncio so the Engine can run inside the notebook's event loop.
model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
chat_template = "qwen2-vl"
[ ]:
# Let's create a prompt.
from io import BytesIO
import requests
from PIL import Image
from sglang.srt.parser.conversation import chat_templates
image = Image.open(
    BytesIO(
        requests.get(
            "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
        ).content
    )
)
conv = chat_templates[chat_template].copy()
conv.append_message(conv.roles[0], f"What's shown here: {conv.image_token}?")  # user turn
conv.append_message(conv.roles[1], "")  # empty assistant turn for the model to complete
conv.image_data = [image]
print(conv.get_prompt())
image
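Each chat template defines its own image placeholder; inspecting conv.image_token shows the exact string the template inserts per image:
[ ]:
# Model-specific placeholder that marks where image embeddings go in the prompt.
print(repr(conv.image_token))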
Query via the offline Engine API#
[ ]:
from sglang import Engine
# mem_fraction_static bounds the share of GPU memory used for model weights and the KV cache.
llm = Engine(
    model_path=model_path, chat_template=chat_template, mem_fraction_static=0.8
)
[ ]:
out = llm.generate(prompt=conv.get_prompt(), image_data=[image])
print(out["text"])
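generate also accepts a sampling_params dict if you want to control decoding; a minimal sketch (temperature and max_new_tokens are standard SGLang sampling parameters):
[ ]:
# Greedy decoding with a bounded output length.
out = llm.generate(
    prompt=conv.get_prompt(),
    image_data=[image],
    sampling_params={"temperature": 0.0, "max_new_tokens": 64},
)
print(out["text"])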
Query via the offline Engine API, but send precomputed embeddings#
[ ]:
# Compute the image embeddings with Hugging Face Transformers.
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

processor = AutoProcessor.from_pretrained(model_path, use_fast=True)
# Keep only the vision tower; it maps pixel values to the embeddings the LLM consumes.
vision = (
    Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path).eval().visual.cuda()
)
[ ]:
processed_prompt = processor(
    images=[image], text=conv.get_prompt(), return_tensors="pt"
)
input_ids = processed_prompt["input_ids"][0].detach().cpu().tolist()
# Run the vision tower; Qwen2.5-VL also needs the image grid (T, H, W) metadata.
precomputed_embeddings = vision(
    processed_prompt["pixel_values"].cuda(), processed_prompt["image_grid_thw"].cuda()
)
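# Optional sanity check: one embedding row per merged vision token
# (the exact row count depends on the image resolution).
print(f"{precomputed_embeddings.shape=}")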
# Package the embeddings as a multimodal item the engine accepts directly.
mm_item = dict(
    modality="IMAGE",
    image_grid_thw=processed_prompt["image_grid_thw"],
    precomputed_embeddings=precomputed_embeddings,
)
out = llm.generate(input_ids=input_ids, image_data=[mm_item])
print(out["text"])
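When you are done with this engine, shut it down to release GPU memory before the next section loads Llama 4:
[ ]:
llm.shutdown()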
Querying Llama 4 (Vision)#
[ ]:
import nest_asyncio
nest_asyncio.apply()  # Run this first: patch asyncio so the Engine can run inside the notebook's event loop.
model_path = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
chat_template = "llama-4"
[ ]:
# Let's create a prompt.
from io import BytesIO
import requests
from PIL import Image
from sglang.srt.parser.conversation import chat_templates
image = Image.open(
    BytesIO(
        requests.get(
            "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
        ).content
    )
)
conv = chat_templates[chat_template].copy()
conv.append_message(conv.roles[0], f"What's shown here: {conv.image_token}?")
conv.append_message(conv.roles[1], "")
conv.image_data = [image]
print(conv.get_prompt())
print(f"Image size: {image.size}")
image
Query via the offline Engine API#
[ ]:
from sglang.test.test_utils import is_in_ci

if not is_in_ci():
    from sglang import Engine

    llm = Engine(
        model_path=model_path,
        trust_remote_code=True,
        enable_multimodal=True,
        mem_fraction_static=0.8,
        tp_size=4,  # Shard the model across 4 GPUs with tensor parallelism.
        attention_backend="fa3",
        context_length=65536,
    )
[ ]:
if not is_in_ci():
    out = llm.generate(prompt=conv.get_prompt(), image_data=[image])
    print(out["text"])
Query via the offline Engine API, but send precomputed embeddings#
[ ]:
if not is_in_ci():
    # Compute the image embeddings with Hugging Face Transformers.
    from transformers import AutoProcessor, Llama4ForConditionalGeneration

    processor = AutoProcessor.from_pretrained(model_path, use_fast=True)
    model = Llama4ForConditionalGeneration.from_pretrained(
        model_path, torch_dtype="auto"
    ).eval()
    # Llama 4 needs both the vision tower and the projector that maps
    # vision features into the language model's embedding space.
    vision = model.vision_model.cuda()
    multi_modal_projector = model.multi_modal_projector.cuda()
[ ]:
if not is_in_ci():
    processed_prompt = processor(
        images=[image], text=conv.get_prompt(), return_tensors="pt"
    )
    print(f'{processed_prompt["pixel_values"].shape=}')
    input_ids = processed_prompt["input_ids"][0].detach().cpu().tolist()
    image_outputs = vision(
        processed_prompt["pixel_values"].to("cuda"), output_hidden_states=False
    )
    image_features = image_outputs.last_hidden_state
    # Flatten the per-patch features and project them into the LLM embedding space.
    vision_flat = image_features.view(-1, image_features.size(-1))
    precomputed_embeddings = multi_modal_projector(vision_flat)

    # Llama 4 needs no grid metadata; the projected embeddings are enough.
    mm_item = dict(modality="IMAGE", precomputed_embeddings=precomputed_embeddings)
    out = llm.generate(input_ids=input_ids, image_data=[mm_item])
    print(out["text"])
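As before, shut down the engine to free GPU memory when finished:
[ ]:
if not is_in_ci():
    llm.shutdown()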