
Qwen Evaluation #22

@ZehangLuo


Hello, and thank you for your great work! However, when evaluating Qwen3-VL-2B-Instruct on RefSpatial-Bench, the model achieves only 8.9% average accuracy across the Location, Placement, and Unseen splits, which is far below the reported 28.9%.

I followed the same evaluation pipeline as for the other multimodal models, and used the standard accuracy metric, which measures whether the predicted coordinates fall within the ground-truth mask.
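Concretely, the per-question score reduces to the following check (a minimal sketch of the metric; points is an (N, 2) integer array of predicted (x, y) pixel coordinates and mask is the binary ground-truth mask, matching the compute_accuracy code below):

def point_in_mask_score(points, mask):
    # Fraction of predicted points that land inside the mask;
    # out-of-bounds points count as misses.
    if len(points) == 0:
        return 0.0
    h, w = mask.shape
    in_range = (points[:, 0] >= 0) & (points[:, 0] < w) & \
               (points[:, 1] >= 0) & (points[:, 1] < h)
    return mask[points[in_range, 1], points[in_range, 0]].sum() / len(points)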

Generate predictions


import base64
import json
import os
import time

import torch
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration


def encode_image(image_path):
    # Assumed helper (defined elsewhere in my codebase): base64-encode an image file.
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def query_qwen3_vl_instruct_local(_qwen_model, _qwen_processor, image_paths, prompt, model_name='Qwen/Qwen3-VL-2B-Instruct'):
    """
    Query the Qwen3-VL model locally via the transformers library with a prompt and a list of image paths.

    Parameters:
    - _qwen_model: loaded Qwen3VLForConditionalGeneration instance.
    - _qwen_processor: matching AutoProcessor.
    - image_paths: list of strings, paths to the images.
    - prompt: string, the prompt.
    - model_name: string, the model name or path (kept for interface parity with other backends).
    """
    base64_images = [encode_image(image_path) for image_path in image_paths]

    try:
        # Prepare messages with images
        content = [{"type": "text", "text": prompt}]
        for i, image in enumerate(base64_images):
            content.append({"type": "image", "image": f"data:image/jpeg;base64,{image}"})
        messages = [
            {
                "role": "user",
                "content": content,
            }
        ]

        # Prepare inputs
        inputs = _qwen_processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        # Drop token_type_ids if the processor returned them; generate() does not accept them
        inputs.pop("token_type_ids", None)
            
        # Move to device
        inputs = inputs.to(_qwen_model.device)

        # Generate (do_sample=True so the temperature actually takes effect;
        # without it, transformers ignores temperature and decodes greedily)
        with torch.no_grad():
            generated_ids = _qwen_model.generate(
                **inputs,
                max_new_tokens=8196,
                do_sample=True,
                temperature=0.2,
            )
            
        # Decode output (skip input tokens)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
            
        output_text = _qwen_processor.batch_decode(
            generated_ids_trimmed, 
            skip_special_tokens=True, 
            clean_up_tokenization_spaces=False
        )
            
        print(f"Model response: {output_text[0]}")
        return output_text[0]
            
    except Exception as e:
        print(f"Error: {e}")
        time.sleep(1)
    
    return 'Failed: Query Error'
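For context, a single standalone call looks like this (the checkpoint is loaded exactly as in eval_task below; the image path and prompt here are placeholders):

model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-2B-Instruct", torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")
response = query_qwen3_vl_instruct_local(
    model, processor,
    ["./RefSpatial-Bench/Location/rgb/0.png"],  # placeholder path
    "Locate the red cup in this image and output the point coordinates in JSON format.",  # placeholder prompt
)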



def get_prompt(model_name, object_name, prompt, suffix):
    if "Molmo" in model_name:
        return f"Locate several points of {object_name}."
    elif "RoboBrain" in model_name:
        return f"{prompt} Please provide its 2D coordinates."
    elif "Gemini" in model_name:
        return f"Locate one point of {object_name}."
    elif "Qwen" in model_name:
        return f"Locate {object_name} in this image and output the point coordinates in JSON format."
        # return f"Locate {object_name} in this image.Your answer should be formatted as a list of tuples, i.e. [(x1, y1)], where each tuple contains the x and y coordinates of a point satisfying the conditions above. The coordinates should be between 0 and 1, indicating the normalized pixel locations of the points in the image."
    else:
        return f"{prompt} {suffix}"

def eval_task(task_name, model_name, model_generate_func, output_save_folder):
    benchmark_question_file = f"./RefSpatial-Bench/{task_name}"
    # Load the Qwen3-VL checkpoint (the Qwen2.5-VL runs used
    # Qwen2_5_VLForConditionalGeneration here instead)
    _qwen_model = Qwen3VLForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    
    _qwen_processor = AutoProcessor.from_pretrained(model_name)
    with open(f"{benchmark_question_file}/question.json", "r") as f:
        questions = json.load(f)

    output_path = f'{output_save_folder}/{model_name}/{task_name}.jsonl'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    if os.path.exists(output_path):
        print(f'{output_path} already exists')
        return

    with open(output_path, "w") as ans_file:
        for question in questions:
            image_paths = [f"{benchmark_question_file}/{question['rgb_path']}"]
            instruction = get_prompt(model_name, question["object"], question["prompt"], question["suffix"])
            gpt_answer = model_generate_func(_qwen_model, _qwen_processor, image_paths, instruction)

            result = {
                "question_id": question["id"],
                "prompt": question["prompt"],
                "object_name": question["object"],
                "suffix": question["suffix"],
                "instruction": instruction,
                "text": gpt_answer,
                "model_id": model_name,
                "rgb_path": question["rgb_path"],
                "mask_path": question["mask_path"],
                "category": question["category"],
                "step": question["step"]
            }

            ans_file.write(json.dumps(result) + "\n")
            ans_file.flush()

# task_name, model_name, and output_save_folder are defined earlier in my script
eval_task(task_name, model_name, query_qwen3_vl_instruct_local, output_save_folder)
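Concretely, I run it once per split and average the three accuracies (the split folder names below are my assumption about the benchmark layout):

for task_name in ["Location", "Placement", "Unseen"]:  # assumed split folder names
    eval_task(task_name, "Qwen/Qwen3-VL-2B-Instruct", query_qwen3_vl_instruct_local, "./outputs")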

Summarize


import argparse
import json
import os
from typing import Callable, Dict, List

import numpy as np
from PIL import Image
from tqdm import tqdm


def decode_json_points(text: str, width=640, height=480):
    """Parse coordinate points from the model's JSON response."""
    try:
        # Strip markdown code fences, if present
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0]

        # Parse the JSON payload
        data = json.loads(text)
        points = []

        for item in data:
            if "point_2d" in item:
                x, y = item["point_2d"]
                # Assumes the model emits coordinates on a 0-1000 normalized grid
                x = int(x / 1000 * width)
                y = int(y / 1000 * height)
                points.append([x, y])

        return np.array(points)

    except Exception as e:
        print(f"Error: {e}")
        return []
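For reference, the parser expects responses shaped like the synthetic example below; note the division by 1000, i.e. it assumes the coordinates come back on a 0-1000 normalized grid:

sample = '''```json
[{"point_2d": [512, 384], "label": "cup"}]
```'''
print(decode_json_points(sample, width=640, height=480))  # -> [[327 184]]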

def compute_accuracy(
    answers: List[Dict],
    task_name: str,
    parse_func: Callable[[str, int, int], np.ndarray]
) -> None:
    """Score each answer by the fraction of predicted points that fall inside the ground-truth mask."""
    accuracy = []

    for answer in tqdm(answers):
        mask_path = os.path.join("./RefSpatial-Bench", task_name, answer['mask_path'])
        mask = np.array(Image.open(mask_path)) / 255.
        if mask.ndim == 3:
            mask = mask[:, :, 0]
        mask = (mask > 0).astype(np.uint8)

        try:
            points = parse_func(answer["text"], mask.shape[1], mask.shape[0])
        except Exception as e:
            print(f"Failed to parse question {answer['question_id']}: {e}")
            continue

        acc = 0.0
        if len(points) > 0:
            in_range = (points[:, 0] >= 0) & (points[:, 0] < mask.shape[1]) & \
                       (points[:, 1] >= 0) & (points[:, 1] < mask.shape[0])
            acc = np.concatenate([
                mask[points[in_range, 1], points[in_range, 0]],
                np.zeros(points.shape[0] - in_range.sum())
            ]).mean()

        answer["accuracy"] = acc
        accuracy.append(acc)

    print(f"Accuracy: {np.mean(accuracy):.4f}, Evaluated: {len(accuracy)}, Total: {len(answers)}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True)
    parser.add_argument("--task_name", type=str, required=True)
    args = parser.parse_args()

    answer_file = os.path.join('./outputs', f"{args.model_name}/{args.task_name}.jsonl")
    with open(answer_file, 'r') as f:
        answers = [json.loads(line) for line in f]

    print("Computing accuracy for Qwen models...")
    compute_accuracy(answers, args.task_name, lambda text, w, h: decode_json_points(text, w, h))
      


if __name__ == '__main__':
    main()
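For the record, I invoke this once per split, e.g. python summarize.py --model_name Qwen/Qwen3-VL-2B-Instruct --task_name Location (the script filename and split name are just my local layout), and average the three printed accuracies; that average is the 8.9% quoted above.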
