Hello, thank you for your great work! However, when evaluating Qwen3-VL-2B-Instruct on RefSpatial-Bench, the model achieves only 8.9% average accuracy (Location, Placement, Unseen), which is far below the reported result of 28.9%.
I followed the same evaluation pipeline as the other multimodal models and used a standard accuracy metric that measures whether the predicted coordinates fall within the ground-truth mask.
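For reference, the per-sample score is the fraction of predicted points that fall inside the ground-truth mask (the full compute_accuracy function is included below); a minimal sketch of that check on toy placeholder data:

import numpy as np

# Toy example (placeholder data): a 4x4 mask whose top-left 2x2 region is the ground truth.
mask = np.zeros((4, 4), dtype=np.uint8)
mask[:2, :2] = 1

# Two predicted points in (x, y) pixel coordinates: one inside the mask, one outside.
points = np.array([[1, 1], [3, 3]])

# Per-sample accuracy = fraction of predicted points that land inside the mask.
acc = mask[points[:, 1], points[:, 0]].mean()
print(acc)  # 0.5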
Generate predictions
import json
import os
import time

import torch
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

# encode_image is a small helper (defined elsewhere in my script) that reads an
# image file and returns its base64-encoded string.

def query_qwen3_vl_instruct_local(_qwen_model, _qwen_processor, image_paths, prompt, model_name='Qwen/Qwen3-VL-2B-Instruct'):
    """
    Query the Qwen3-VL model locally using the transformers library with the prompt and a list of image paths.

    Parameters:
    - image_paths: List of strings, the paths to the images.
    - prompt: String, the prompt.
    - model_name: String, the model name or path.
    """
    base64_images = [encode_image(image_path) for image_path in image_paths]
    try:
        # Prepare messages with images
        content = [{"type": "text", "text": prompt}]
        for i, image in enumerate(base64_images):
            content.append({"type": "image", "image": f"data:image/jpeg;base64,{image}"})
        messages = [
            {
                "role": "user",
                "content": content,
            }
        ]
        # Prepare inputs
        inputs = _qwen_processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs.pop("token_type_ids", None)
        # Move to device
        inputs = inputs.to(_qwen_model.device)
        # Generate
        with torch.no_grad():
            generated_ids = _qwen_model.generate(
                **inputs,
                max_new_tokens=8196,
                temperature=0.2
            ).to(_qwen_model.device)
        # Decode output (skip input tokens)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = _qwen_processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )
        print(f"Model response: {output_text[0]}")
        return output_text[0]
    except Exception as e:
        print(f"Error: {e}")
        time.sleep(1)
        return 'Failed: Query Error'
def get_prompt(model_name, object_name, prompt, suffix):
    if "Molmo" in model_name:
        return f"Locate several points of {object_name}."
    elif "RoboBrain" in model_name:
        return f"{prompt} Please provide its 2D coordinates."
    elif "Gemini" in model_name:
        return f"Locate one point of {object_name}."
    elif "Qwen" in model_name:
        return f"Locate {object_name} in this image and output the point coordinates in JSON format."
        # return f"Locate {object_name} in this image. Your answer should be formatted as a list of tuples, i.e. [(x1, y1)], where each tuple contains the x and y coordinates of a point satisfying the conditions above. The coordinates should be between 0 and 1, indicating the normalized pixel locations of the points in the image."
    else:
        return f"{prompt} {suffix}"
def eval_task(task_name, model_name, model_generate_func, output_save_folder):
    benchmark_question_file = f"./RefSpatial-Bench/{task_name}"
    # _qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    #     model_name,
    #     torch_dtype=torch.bfloat16,
    #     device_map="auto"
    # )
    _qwen_model = Qwen3VLForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    _qwen_processor = AutoProcessor.from_pretrained(model_name)
    with open(f"{benchmark_question_file}/question.json", "r") as f:
        questions = json.load(f)
    output_path = f'{output_save_folder}/{model_name}/{task_name}.jsonl'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    if os.path.exists(output_path):
        print(f'{output_path} already exists')
        return
    with open(output_path, "w") as ans_file:
        for question in questions:
            image_paths = [f"{benchmark_question_file}/{question['rgb_path']}"]
            instruction = get_prompt(model_name, question["object"], question["prompt"], question["suffix"])
            enable_depth = int("Depth" in model_name)
            gpt_answer = model_generate_func(_qwen_model, _qwen_processor, image_paths, instruction)
            result = {
                "question_id": question["id"],
                "prompt": question["prompt"],
                "object_name": question["object"],
                "suffix": question["suffix"],
                "instruction": instruction,
                "text": gpt_answer,
                "model_id": model_name,
                "rgb_path": question["rgb_path"],
                "mask_path": question["mask_path"],
                "category": question["category"],
                "step": question["step"]
            }
            ans_file.write(json.dumps(result) + "\n")
            ans_file.flush()

# task_name, model_name, and output_save_folder are set elsewhere in the script.
eval_task(task_name, model_name, query_qwen3_vl_instruct_local, output_save_folder)
Summarize
import argparse
import json
import os
from typing import Callable, Dict, List

import numpy as np
from PIL import Image
from tqdm import tqdm

def decode_json_points(text: str, width=640, height=480):
    """Parse coordinate points from text format"""
    try:
        # Strip markdown code-fence markers
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0]
        # Parse JSON
        data = json.loads(text)
        points = []
        labels = []
        for item in data:
            if "point_2d" in item:
                x, y = item["point_2d"]
                # Rescale from the 0-1000 normalized range to pixel coordinates
                x = int(x / 1000 * width)
                y = int(y / 1000 * height)
                points.append([x, y])
                # Get the label; use a default value if missing
                # label = item.get("label", f"point_{len(points)}")
                # labels.append(label)
        # print(points)
        return np.array(points)
    except Exception as e:
        print(f"Error: {e}")
        return []
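As a quick sanity check of the parser, here is an illustrative call (the sample string is made up, in the JSON format the parser expects, with coordinates on the 0-1000 normalized scale):

# Illustrative response string (not an actual model output): a JSON list of
# {"point_2d": [x, y]} items, optionally wrapped in a ```json fenced block.
sample = '[{"point_2d": [500, 250], "label": "cup"}]'
print(decode_json_points(sample, width=640, height=480))  # [[320 120]]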
def compute_accuracy(
    answers: List[Dict],
    task_name: str,
    parse_func: Callable[[str, int, int], np.ndarray]
) -> None:
    accuracy = []
    for answer in tqdm(answers):
        mask_path = os.path.join("./RefSpatial-Bench", task_name, answer['mask_path'])
        mask = np.array(Image.open(mask_path)) / 255.
        if mask.ndim == 3:
            mask = mask[:, :, 0]
        mask = (mask > 0).astype(np.uint8)
        try:
            points = parse_func(answer["text"], mask.shape[1], mask.shape[0])
        except Exception as e:
            print(f"Failed to parse question {answer['question_id']}: {e}")
            continue
        acc = 0.0
        if len(points) > 0:
            # Keep only points inside the image bounds
            in_range = (points[:, 0] >= 0) & (points[:, 0] < mask.shape[1]) & \
                       (points[:, 1] >= 0) & (points[:, 1] < mask.shape[0])
            # Per-sample accuracy: fraction of predicted points that land inside
            # the ground-truth mask; out-of-bounds points count as misses.
            acc = np.concatenate([
                mask[points[in_range, 1], points[in_range, 0]],
                np.zeros(points.shape[0] - in_range.sum())
            ]).mean()
        answer["accuracy"] = acc
        accuracy.append(acc)
    print(f"Accuracy: {np.mean(accuracy):.4f}, Evaluated: {len(accuracy)}, Total: {len(answers)}")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--task_name", type=str, required=True)
    args = parser.parse_args()
    answer_file = os.path.join('./outputs', f"{args.model_name}/{args.task_name}.jsonl")
    with open(answer_file, 'r') as f:
        answers = [json.loads(line) for line in f]
    print("Computing accuracy for Qwen models...")
    compute_accuracy(answers, args.task_name, lambda text, w, h: decode_json_points(text, w, h))

if __name__ == '__main__':
    main()
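For completeness, I invoke the summarize script once per split; a usage sketch (the filename eval_accuracy.py and the task name location are placeholders, assuming they match the local setup and the folder layout under ./RefSpatial-Bench):

python eval_accuracy.py --model_name Qwen/Qwen3-VL-2B-Instruct --task_name location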