@@ -318,7 +318,9 @@ def tearDown(self):
     @require_bitsandbytes
     def test_small_model_integration_test(self):
         # Let's make sure we test the preprocessing to replace what is used
-        model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True)
+        model = AriaForConditionalGeneration.from_pretrained(
+            "rhymes-ai/Aria", revision="6583f58908d092e52f348069485e64fef4867730", load_in_4bit=True
+        )
 
         prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
         image_file = "https://aria-vl.github.io/static/images/view.jpg"
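Every hunk in this commit applies the same fix: `from_pretrained` is pinned to a specific commit of the `rhymes-ai/Aria` Hub repository via the `revision` argument, so later pushes to the repo's `main` branch cannot silently change what the integration tests load. A minimal sketch of the pattern outside the test class (the constant name is ours; the SHA and model id come from the diff above):

```python
from transformers import AriaForConditionalGeneration, AutoProcessor

# Full commit SHA from the diff above; a branch name or tag also works,
# but a full SHA is immutable and therefore fully reproducible.
PINNED_REVISION = "6583f58908d092e52f348069485e64fef4867730"

model = AriaForConditionalGeneration.from_pretrained(
    "rhymes-ai/Aria",
    revision=PINNED_REVISION,
    load_in_4bit=True,  # quantized load; needs bitsandbytes, hence @require_bitsandbytes
)
# Note: this commit pins only the model. Pinning the processor the same way
# (shown here as an option, not something the diff does) would also guard
# against tokenizer/processor changes on the Hub.
processor = AutoProcessor.from_pretrained("rhymes-ai/Aria", revision=PINNED_REVISION)
```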
@@ -342,9 +344,11 @@ def test_small_model_integration_test_llama_single(self):
         # Let's make sure we test the preprocessing to replace what is used
         model_id = "rhymes-ai/Aria"
 
-        model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id, revision="6583f58908d092e52f348069485e64fef4867730", load_in_4bit=True
+        )
         processor = AutoProcessor.from_pretrained(model_id)
 
         prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place? ASSISTANT:"
         image_file = "https://aria-vl.github.io/static/images/view.jpg"
         raw_image = Image.open(requests.get(image_file, stream=True).raw)
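The context lines above also show the download idiom these tests rely on for images: stream the HTTP response and hand the raw file object straight to PIL. A self-contained sketch (URL taken from the diff):

```python
import requests
from PIL import Image

image_file = "https://aria-vl.github.io/static/images/view.jpg"
# `stream=True` defers downloading the body; PIL then reads directly from
# the underlying urllib3 stream exposed as `.raw`.
raw_image = Image.open(requests.get(image_file, stream=True).raw)
print(raw_image.size)  # (width, height) of the fetched test image
```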
@@ -364,7 +368,9 @@ def test_small_model_integration_test_llama_batched(self):
         # Let's make sure we test the preprocessing to replace what is used
         model_id = "rhymes-ai/Aria"
 
-        model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id, revision="6583f58908d092e52f348069485e64fef4867730", load_in_4bit=True
+        )
         processor = AutoProcessor.from_pretrained(model_id)
 
         prompts = [
@@ -389,7 +395,9 @@ def test_small_model_integration_test_llama_batched(self):
     @require_bitsandbytes
     def test_small_model_integration_test_batch(self):
         # Let's make sure we test the preprocessing to replace what is used
-        model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True)
+        model = AriaForConditionalGeneration.from_pretrained(
+            "rhymes-ai/Aria", revision="6583f58908d092e52f348069485e64fef4867730", load_in_4bit=True
+        )
         # The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!
         prompts = [
             "USER: <image>\nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:",
@@ -418,7 +426,12 @@ def test_small_model_integration_test_llama_batched_regression(self):
         model_id = "rhymes-ai/Aria"
 
         # Multi-image & multi-prompt (e.g. 3 images and 2 prompts) now fails with SDPA; this tests if "eager" works as before
-        model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True, attn_implementation="eager")
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id,
+            revision="6583f58908d092e52f348069485e64fef4867730",
+            load_in_4bit=True,
+            attn_implementation="eager",
+        )
         processor = AutoProcessor.from_pretrained(model_id, pad_token="<pad>")
 
         prompts = [
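This hunk differs from the others only in that the call also keeps `attn_implementation="eager"`: per the comment in the diff, multi-image/multi-prompt batches regressed under SDPA, so the test forces the plain PyTorch attention path. A hedged sketch of the knob (accepted values in recent transformers releases include "eager", "sdpa", and, where installed, "flash_attention_2"):

```python
from transformers import AriaForConditionalGeneration

model = AriaForConditionalGeneration.from_pretrained(
    "rhymes-ai/Aria",
    revision="6583f58908d092e52f348069485e64fef4867730",
    load_in_4bit=True,
    attn_implementation="eager",  # bypass torch scaled_dot_product_attention
)
# The selected backend is recorded on the config; `_attn_implementation` is a
# private attribute, checked here only for illustration.
assert model.config._attn_implementation == "eager"
```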
@@ -443,7 +456,9 @@ def test_small_model_integration_test_llama_batched_regression(self):
     @require_torch
     @require_vision
     def test_batched_generation(self):
-        model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True)
+        model = AriaForConditionalGeneration.from_pretrained(
+            "rhymes-ai/Aria", revision="6583f58908d092e52f348069485e64fef4867730", load_in_4bit=True
+        )
 
         processor = AutoProcessor.from_pretrained("rhymes-ai/Aria")
 
@@ -481,7 +496,9 @@ def test_aria_index_error_bug(self):
         # Please refer to that PR, or specifically https://github.com/huggingface/transformers/pull/28032#issuecomment-1860650043 for
         # more details
         model_id = "rhymes-ai/Aria"
-        model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id, revision="6583f58908d092e52f348069485e64fef4867730", load_in_4bit=True
+        )
 
         processor = AutoProcessor.from_pretrained(model_id)
 
@@ -501,7 +518,9 @@ def test_aria_index_error_bug(self):
     def test_aria_merge_inputs_error_bug(self):
         # This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore
         model_id = "rhymes-ai/Aria"
-        model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id, revision="6583f58908d092e52f348069485e64fef4867730", load_in_4bit=True
+        )
 
         # Simulate some user inputs
         pixel_values = torch.randn(
@@ -556,7 +575,9 @@ def test_tokenizer_integration(self):
     @require_bitsandbytes
     def test_generation_no_images(self):
         model_id = "rhymes-ai/Aria"
-        model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id, revision="6583f58908d092e52f348069485e64fef4867730", load_in_4bit=True
+        )
         processor = AutoProcessor.from_pretrained(model_id)
 
         # Prepare inputs with no images
@@ -569,7 +590,12 @@ def test_generation_no_images(self):
     @require_bitsandbytes
     def test_generation_siglip_backbone(self):
         model_id = "rhymes-ai/Aria"
-        model = AriaForConditionalGeneration.from_pretrained(model_id, torch_dtype="float16", device_map=torch_device)
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id,
+            revision="6583f58908d092e52f348069485e64fef4867730",
+            torch_dtype="float16",
+            device_map=torch_device,
+        )
         processor = AutoProcessor.from_pretrained(model_id)
 
         # check processing with expansion of inputs (w/o expansion should work with any backbone)
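Unlike the 4-bit loads elsewhere in this file, the SigLIP-backbone test above loads the model in half precision and places it explicitly with `device_map`. A sketch contrasting the two loading modes (loading both at once is just for illustration; `"cuda:0"` stands in for the test suite's `torch_device` variable):

```python
import torch
from transformers import AriaForConditionalGeneration

MODEL_ID = "rhymes-ai/Aria"
REVISION = "6583f58908d092e52f348069485e64fef4867730"

# 4-bit quantized load (what most tests in this file use):
model_4bit = AriaForConditionalGeneration.from_pretrained(
    MODEL_ID, revision=REVISION, load_in_4bit=True
)

# Half-precision load on an explicit device (what the SigLIP test uses);
# torch_dtype also accepts the string "float16", as in the diff.
model_fp16 = AriaForConditionalGeneration.from_pretrained(
    MODEL_ID, revision=REVISION, torch_dtype=torch.float16, device_map="cuda:0"
)
```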
@@ -594,7 +620,9 @@ def test_generation_siglip_backbone(self):
     @require_bitsandbytes
     def test_expansion_in_processing(self):
         model_id = "rhymes-ai/Aria"
-        model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id, revision="6583f58908d092e52f348069485e64fef4867730", load_in_4bit=True
+        )
         processor = AutoProcessor.from_pretrained(model_id)
 
         prompt = "USER: <image>\nDescribe the image:\nASSISTANT:"
@@ -624,7 +652,9 @@ def test_expansion_in_processing(self):
     @require_bitsandbytes
     def test_pixtral(self):
         model_id = "rhymes-ai/Aria"
-        model = AriaForConditionalGeneration.from_pretrained(model_id)
+        model = AriaForConditionalGeneration.from_pretrained(
+            model_id, revision="6583f58908d092e52f348069485e64fef4867730"
+        )
         processor = AutoProcessor.from_pretrained(model_id)
 
         IMG_URLS = [