@@ -297,7 +297,7 @@ def setUp(self):
297
297
cleanup (torch_device , gc_collect = True )
298
298
self .model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
299
299
self .model = Mistral3ForConditionalGeneration .from_pretrained (
300
- self .model_checkpoint , torch_dtype = torch .bfloat16
300
+ self .model_checkpoint , torch_dtype = torch .float16
301
301
)
302
302
accelerate .cpu_offload (self .model , execution_device = torch_device )
303
303
@@ -319,7 +319,7 @@ def test_mistral3_integration_generate_text_only(self):
319
319
320
320
inputs = processor .apply_chat_template (
321
321
messages , add_generation_prompt = True , tokenize = True , return_dict = True , return_tensors = "pt"
322
- ).to (torch_device , dtype = torch .bfloat16 )
322
+ ).to (torch_device , dtype = torch .float16 )
323
323
324
324
with torch .no_grad ():
325
325
generate_ids = self .model .generate (** inputs , max_new_tokens = 200 , do_sample = False )
@@ -351,7 +351,7 @@ def test_mistral3_integration_generate(self):
351
351
352
352
inputs = processor .apply_chat_template (
353
353
messages , add_generation_prompt = True , tokenize = True , return_dict = True , return_tensors = "pt"
354
- ).to (torch_device , dtype = torch .bfloat16 )
354
+ ).to (torch_device , dtype = torch .float16 )
355
355
with torch .no_grad ():
356
356
generate_ids = self .model .generate (** inputs , max_new_tokens = 20 , do_sample = False )
357
357
decoded_output = processor .decode (
@@ -361,8 +361,8 @@ def test_mistral3_integration_generate(self):
361
361
expected_outputs = Expectations (
362
362
{
363
363
("xpu" , 3 ): "The image features two cats resting on a pink blanket. The cat on the left is a kitten" ,
364
- ("cuda" , 7 ): " The image features two cats resting on a pink blanket. The cat on the left is a kitten" ,
365
- ("cuda" , 8 ): " The image features two cats resting on a pink blanket. The cat on the left is a small kit" ,
364
+ ("cuda" , 7 ): ' The image features two tabby cats lying on a pink surface, which appears to be a couch or' ,
365
+ ("cuda" , 8 ): ' The image features two cats lying on a pink surface, which appears to be a couch or a bed' ,
366
366
}
367
367
) # fmt: skip
368
368
expected_output = expected_outputs .get_expectation ()
@@ -378,7 +378,10 @@ def test_mistral3_integration_batched_generate(self):
378
378
{
379
379
"role" : "user" ,
380
380
"content" : [
381
- {"type" : "image" , "url" : "https://huggingface.co/ydshieh/kosmos-2.5/resolve/main/view.jpg" },
381
+ {
382
+ "type" : "image" ,
383
+ "url" : "https://huggingface.co/datasets/hf-internal-testing/testing-data-mistral3/resolve/main/view.jpg" ,
384
+ },
382
385
{"type" : "text" , "text" : "Write a haiku for this image" },
383
386
],
384
387
},
@@ -396,7 +399,7 @@ def test_mistral3_integration_batched_generate(self):
396
399
397
400
inputs = processor .apply_chat_template (
398
401
messages , padding = True , add_generation_prompt = True , tokenize = True , return_dict = True , return_tensors = "pt"
399
- ).to (torch_device , dtype = torch .bfloat16 )
402
+ ).to (torch_device , dtype = torch .float16 )
400
403
401
404
output = self .model .generate (** inputs , do_sample = False , max_new_tokens = 25 )
402
405
@@ -408,8 +411,8 @@ def test_mistral3_integration_batched_generate(self):
408
411
expected_outputs = Expectations (
409
412
{
410
413
("xpu" , 3 ): "Calm lake's mirror gleams,\n Whispering pines stand in silence,\n Path to peace begins." ,
411
- ("cuda" , 7 ): " Calm waters reflect\n Whispering pines stand in silence \n Path to peace begins" ,
412
- ("cuda" , 8 ): "Calm waters reflect \n Whispering pines stand in silence \n Path to peace begins " ,
414
+ ("cuda" , 7 ): ' Calm waters reflect\n Wooden path to distant shore \n Silence in the woods' ,
415
+ ("cuda" , 8 ): "Wooden path to calm, \n Reflections whisper secrets, \n Nature's peace unfolds. " ,
413
416
}
414
417
) # fmt: skip
415
418
expected_output = expected_outputs .get_expectation ()
@@ -424,8 +427,8 @@ def test_mistral3_integration_batched_generate(self):
424
427
expected_outputs = Expectations (
425
428
{
426
429
("xpu" , 3 ): "The image depicts a vibrant urban scene in what appears to be Chinatown. The focal point is a traditional Chinese archway" ,
427
- ("cuda" , 7 ): 'The image depicts a vibrant street scene in Chinatown, likely in a major city . The focal point is a traditional Chinese' ,
428
- ("cuda" , 8 ): 'The image depicts a vibrant street scene in what appears to be Chinatown in a major city . The focal point is a' ,
430
+ ("cuda" , 7 ): 'The image depicts a street scene in what appears to be a Chinatown district . The focal point is a traditional Chinese arch ' ,
431
+ ("cuda" , 8 ): 'The image depicts a street scene in what appears to be a Chinatown district . The focal point is a traditional Chinese arch ' ,
429
432
}
430
433
) # fmt: skip
431
434
expected_output = expected_outputs .get_expectation ()
@@ -446,7 +449,10 @@ def test_mistral3_integration_batched_generate_multi_image(self):
446
449
{
447
450
"role" : "user" ,
448
451
"content" : [
449
- {"type" : "image" , "url" : "https://huggingface.co/ydshieh/kosmos-2.5/resolve/main/view.jpg" },
452
+ {
453
+ "type" : "image" ,
454
+ "url" : "https://huggingface.co/datasets/hf-internal-testing/testing-data-mistral3/resolve/main/view.jpg" ,
455
+ },
450
456
{"type" : "text" , "text" : "Write a haiku for this image" },
451
457
],
452
458
},
@@ -457,11 +463,11 @@ def test_mistral3_integration_batched_generate_multi_image(self):
457
463
"content" : [
458
464
{
459
465
"type" : "image" ,
460
- "url" : "https://huggingface.co/ydshieh/kosmos-2.5 /resolve/main/Statue-of-Liberty-Island-New-York-Bay.jpg" ,
466
+ "url" : "https://huggingface.co/datasets/hf-internal-testing/testing-data-mistral3 /resolve/main/Statue-of-Liberty-Island-New-York-Bay.jpg" ,
461
467
},
462
468
{
463
469
"type" : "image" ,
464
- "url" : "https://huggingface.co/ydshieh/kosmos-2.5 /resolve/main/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg" ,
470
+ "url" : "https://huggingface.co/datasets/hf-internal-testing/testing-data-mistral3 /resolve/main/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg" ,
465
471
},
466
472
{
467
473
"type" : "text" ,
@@ -473,7 +479,7 @@ def test_mistral3_integration_batched_generate_multi_image(self):
473
479
]
474
480
inputs = processor .apply_chat_template (
475
481
messages , padding = True , add_generation_prompt = True , tokenize = True , return_dict = True , return_tensors = "pt"
476
- ).to (torch_device , dtype = torch .bfloat16 )
482
+ ).to (torch_device , dtype = torch .float16 )
477
483
478
484
output = self .model .generate (** inputs , do_sample = False , max_new_tokens = 25 )
479
485
gen_tokens = output [:, inputs ["input_ids" ].shape [1 ] :]
@@ -483,8 +489,8 @@ def test_mistral3_integration_batched_generate_multi_image(self):
483
489
expected_outputs = Expectations (
484
490
{
485
491
("xpu" , 3 ): "Still lake reflects skies,\n Wooden path to nature's heart,\n Silence speaks volumes." ,
486
- ("cuda" , 7 ): " Calm waters reflect\n Whispering pines stand in silence \n Path to peace begins" ,
487
- ("cuda" , 8 ): " Calm waters reflect\n Whispering pines stand in silence \n Path to peace begins" ,
492
+ ("cuda" , 7 ): ' Calm waters reflect\n Wooden path to distant shore \n Silence in the pines' ,
493
+ ("cuda" , 8 ): ' Calm waters reflect\n Wooden path to distant shore \n Silence in the pines' ,
488
494
}
489
495
) # fmt: skip
490
496
expected_output = expected_outputs .get_expectation ()
@@ -499,8 +505,8 @@ def test_mistral3_integration_batched_generate_multi_image(self):
499
505
expected_outputs = Expectations (
500
506
{
501
507
("xpu" , 3 ): "Certainly! The images depict two iconic landmarks:\n \n 1. The first image shows the Statue of Liberty in New York City." ,
502
- ("cuda" , 7 ): " Certainly! The images depict the following landmarks:\n \n 1. The first image shows the Statue of Liberty and the New York City" ,
503
- ("cuda" , 8 ): " Certainly! The images depict the following landmarks:\n \n 1. The first image shows the Statue of Liberty and the New York City" ,
508
+ ("cuda" , 7 ): ' Certainly! The images depict two famous landmarks in the United States :\n \n 1. The first image shows the Statue of Liberty,' ,
509
+ ("cuda" , 8 ): ' Certainly! The images depict two famous landmarks in the United States :\n \n 1. The first image shows the Statue of Liberty,' ,
504
510
}
505
511
) # fmt: skip
506
512
expected_output = expected_outputs .get_expectation ()
0 commit comments