@@ -638,111 +638,156 @@ int init_CL(int num_streams, cl_int *devnumber)
638638/*
639639 * set_gpu_type
640640 * try to extract the GPU type from the device info
641+ * The type informs our kernel selection (due to perf and compilability issues)
642+ * and #defines such as USE_DP.
643+ *
644+ * broadly speaking, each group of architectures with a certain amount of int32 mul
645+ * relative to other parts should get its own type.
646+ *
647+ * The "APU" type is only for VLIW. Newer GPUs seem to just match the arch; they mainly
648+ * lose out on memory due to no L3, but we don't use much of the memory bus anyway.
641649 */
642650void set_gpu_type ()
643651{
652+ #define PAT (b ) patmatch(deviceinfo.d_name,b,0 )
653+ #define STM (b ) strstr(deviceinfo.d_name,b)
644654 if (mystuff.gpu_type == GPU_AUTO)
645655 {
646656 // try to auto-detect the type of GPU
647- if (strstr (deviceinfo.d_name , " Capeverde" ) || // 7730, 7750, 7770, 8760, 8740, R7 250X
648- strstr (deviceinfo.d_name , " Pitcairn" ) || // 7850, 7870, 8870
649- strstr (deviceinfo.d_name , " Bonaire" ) || // 7790, R7 260, R7 260X
650- strstr (deviceinfo.d_name , " Oland" ) || // 8670, 8570, R9 240, R9 250
651- strstr (deviceinfo.d_name , " Sun" ) || // 85x0M
652- strstr (deviceinfo.d_name , " Mars" ) || // 86x0M, 87x0M
653- strstr (deviceinfo.d_name , " Venus" ) || // 88x0M
654- strstr (deviceinfo.d_name , " Saturn" ) || // 8930M, 8950M
655- strstr (deviceinfo.d_name , " Neptune" ) || // 8970M, 8990M
656- strstr (deviceinfo.d_name , " Curacao" ) || // R9 265, R9 270, R9 270X
657- strstr (deviceinfo.d_name , " Tonga" ) || // R9 285
658- strstr (deviceinfo.d_name , " Hainan" ) || // R9 285
659- strstr (deviceinfo.d_name , " Antigua" ) || // R9 380(X)
660- strstr (deviceinfo.d_name , " Kalindi" ) || // GCN APU, Kabini, R7 ???
661- strstr (deviceinfo.d_name , " D300" ) || // FirePro D-series
662- strstr (deviceinfo.d_name , " D500" ) ||
663- strstr (deviceinfo.d_name , " D700" )
657+ // There are basically two styles of names:
658+ // 1) Model: "Radeon HD 7770", "Radeon R7 260X", "Radeon R9 290X", "Radeon RX 6800 XT"
659+ // 2) Codename: "Capeverde", "Pitcairn", "Tahiti", "Hawaii", "Ellesmere", "gfx900", "gfx1010"
660+ // The first type is typical of mac and Windows. The second type is typical of Linux.
661+
662+ if (STM (" Capeverde" ) || // 7730, 7750, 7770, 8760, 8740, R7 250X
663+ STM (" Pitcairn" ) || // 7850, 7870, 8870
664+ STM (" Bonaire" ) || // 7790, R7 260, R7 260X
665+ STM (" Oland" ) || // 8670, 8570, R9 240, R9 250
666+ STM (" Sun" ) || // 85x0M
667+ STM (" Mars" ) || // 86x0M, 87x0M
668+ STM (" Venus" ) || // 88x0M
669+ STM (" Saturn" ) || // 8930M, 8950M
670+ STM (" Neptune" ) || // 8970M, 8990M
671+ STM (" Curacao" ) || // R9 265, R9 270, R9 270X
672+ STM (" Tonga" ) || // R9 285
673+ STM (" Hainan" ) || // R9 285
674+ STM (" Antigua" ) || // R9 380(X)
675+ STM (" Kalindi" ) || // GCN APU, Kabini, R7 ???
676+ PAT (" D[357]00" ) || // FirePro D-series
677+ PAT (" HD [78][0-7][0-9][0-9]" )
664678 )
665679 {
666680 mystuff.gpu_type = GPU_GCN;
667681 }
668- else if (strstr (deviceinfo.d_name , " Malta" ) || // 7990
669- strstr (deviceinfo.d_name , " Tahiti" ) // 7870XT, 7950, 7970, 8970, 8950, R9 280X
682+ else if (STM (" Malta" ) || // 7990
683+ STM (" Tahiti" ) || // 7870XT, 7950, 7970, 8970, 8950, R9 280X
684+ PAT (" HD [78][89][0-9]0" ) ||
685+ STM (" R9 280X" ) ||
686+ PAT (" gfx60[0-9]" ) // GCN-GFX6 Southern Islands
670687 )
671688 {
672689 mystuff.gpu_type = GPU_GCN2; // these cards have faster DP performance, allowing to use it for the division algorithms
673690 }
674- else if (strstr (deviceinfo. d_name , " Hawaii" ) || // R9 290, R9 290X
691+ else if (STM ( " Hawaii" ) || // R9 290, R9 290X
675692 // Hawaii is both desktop graphics (1:8) and workstation graphics (1:2) in W8100, W9100, S9150
676693 // 1:8 is just below the sweet spot for using DP. FirePro cards would run faster using DP
677- strstr (deviceinfo.d_name , " Vesuvius" ) || // 295X2
678- strstr (deviceinfo.d_name , " gfx803" ) // Fury X
694+ STM (" Vesuvius" ) || // 295X2
695+ STM (" gfx803" ) || // Fury X
696+ STM (" gfx802" ) || // Iceland, Tonga (guess)
697+ STM (" Polaris" ) || // (guess)
698+ PAT (" R9 29" ) ||
699+ PAT (" W[89]1[01][05]0" )
679700 )
680701 {
681702 mystuff.gpu_type = GPU_GCN3; // these cards have improved int32 performance over the previous GCNs, making for a changed kernel selection
682703 }
683- else if (strstr (deviceinfo.d_name , " Ellesmere" ) || // RX 470/480/570/580/590
684- strstr (deviceinfo.d_name , " gfx804" ) || // RX 550
685- strstr (deviceinfo.d_name , " Radeon Pro 560" ) || // Baffin, used on Macs
686- strstr (deviceinfo.d_name , " Lexa" ) || // small GCN 4.0 - not tested, only assumption
687- strstr (deviceinfo.d_name , " Baffin" ) // small GCN 4.0 - not tested, only assumption
704+ else if (STM (" Ellesmere" ) || // RX 470/480/570/580/590
705+ STM (" gfx804" ) || // RX 550
706+ STM (" gfx810" ) || // "TBA" APU
707+ STM (" gfx801" ) || // Carrizo APU
708+ STM (" Radeon Pro 560" ) || // Baffin, used on Macs
709+ STM (" Lexa" ) || // small GCN 4.0 - not tested, only assumption
710+ STM (" Baffin" ) // small GCN 4.0 - not tested, only assumption
711+
712+ // The following are GCN3/4 APUs. Need to verify that they do not interfere with FX CPU names
713+ // PAT("A[468]-") || // APU (guess)
714+ // STM("E2-") || // APU (guess)
715+ // PAT("A1[02]-") || // APU (guess)
688716 )
689717 {
690718 mystuff.gpu_type = GPU_GCN4;
691719 }
692- else if (strstr (deviceinfo. d_name , " gfx901" ) || // Vega 64(?)
693- strstr (deviceinfo. d_name , " gfx900" ) || // Vega 56
694- strstr (deviceinfo. d_name , " gfx902" ) || // Vega Ryzen 2xxx-3xxx iGPU
695- strstr (deviceinfo. d_name , " gfx903" ) // Vega Ryzen 2xxx-3xxx iGPU
720+ else if (STM ( " gfx901" ) || // Vega 64(?)
721+ STM ( " gfx900" ) || // Vega 56
722+ STM ( " gfx902" ) || // Vega Ryzen 2xxx-3xxx iGPU
723+ STM ( " gfx903" ) // Vega Ryzen 2xxx-3xxx iGPU
696724 )
697725 {
698726 mystuff.gpu_type = GPU_GCN5;
699727 }
700- else if (strstr (deviceinfo.d_name , " gfx906" ) // Radeon VII
728+ else if (STM (" gfx906" ) || // Radeon VII and Pro and MI50/60 (1:4, 1:1)
729+ STM (" Radeon Pro VII" ) || // Radeon VII
730+ STM (" Radeon VII" ) || // Radeon VII
731+
732+ PAT (" gfx90[89]" ) || // CDNA1: MI100
733+ PAT (" gfx90[ac]" ) || // CDNA2: MI210, MI250(X)
734+ STM (" gfx942" ) || // CDNA3: MI300(A/X)
735+ STM (" gfx950" ) || // CDNA4: MI350/355
736+
737+ PAT (" MI[5-6]0" ) || // MI50, MI60
738+ PAT (" MI[0-9][0-9][0-9]" ) // Any MI with three digits
701739 )
702740 {
703741 mystuff.gpu_type = GPU_GCNF;
704742 }
705- else if (strstr (deviceinfo. d_name , " gfx1010 " ) || // RX 5600-5700 XT
706- strstr (deviceinfo. d_name , " gfx1012 " ) || // RX 5300-5500 XT
707- strstr (deviceinfo. d_name , " gfx1011 " ) || //
708- strstr (deviceinfo. d_name , " gfx1030 " ) || // RX 6800-6900 XT (untested but kernel list should be similar)
709- strstr (deviceinfo. d_name , " gfx1031 " ) || // RX 6700 (XT)
710- strstr (deviceinfo. d_name , " gfx1032 " )) // lower end RDNA2
743+ else if (STM ( " gfx101 " ) || // RDNA1
744+ STM ( " gfx103 " ) || // RDNA2
745+
746+ PAT ( " RX [56][0-9][0-9][0-9] " ) // Model
747+ // Also known as 6[0-9]0M, but might be too vague to match
748+ )
711749 {
712750 mystuff.gpu_type = GPU_RDNA;
713751 }
714- else if (strstr (deviceinfo.d_name , " gfx1101" )) // 7800XT
752+ else if (STM (" gfx110" ) || // Catch-all RDNA3
753+ STM (" gfx115" ) || // Catch-all RDNA3.5
754+ STM (" gfx120" ) || // Catch-all RDNA4
755+
756+ PAT (" RX [79][0-9][0-9][0-9]" ) || // Model
757+ PAT (" 80[456]0S" ) // Strix Halo, huge APU
758+ // Also [78][0-9]0M, but might be too vague to match
759+ )
715760 {
716761 mystuff.gpu_type = GPU_RDNA3;
717762 }
718- else if (strstr (deviceinfo. d_name , " Cayman" ) || // 6950, 6970
719- strstr (deviceinfo. d_name , " Devastator" ) || // 7xx0D (iGPUs of A4/6/8/10)
720- strstr (deviceinfo. d_name , " Scrapper" ) || // 7xx0G (iGPUs of A4/6/8/10)
721- strstr (deviceinfo. d_name , " Antilles" )) // 6990
763+ else if (STM ( " Cayman" ) || // 6950, 6970
764+ STM ( " Devastator" ) || // 7xx0D (iGPUs of A4/6/8/10)
765+ STM ( " Scrapper" ) || // 7xx0G (iGPUs of A4/6/8/10)
766+ STM ( " Antilles" )) // 6990
722767 {
723768 mystuff.gpu_type = GPU_VLIW4;
724769 }
725- else if (strstr (deviceinfo. d_name , " WinterPark" ) || // 6370D (E2-3200), 6410D (A4-3300, A4-3400)
726- strstr (deviceinfo. d_name , " BeaverCreek" ) || // 6530D (A6-3500, A6-3600, A6-3650, A6-3670K), 6550D (A8-3800, A8-3850, A8-3870K)
727- strstr (deviceinfo. d_name , " Zacate" ) || // 6320 (E-450)
728- strstr (deviceinfo. d_name , " Ontario" ) || // 6290 (C-60)
729- strstr (deviceinfo. d_name , " Wrestler" )) // 6250 (C-30, C-50), 6310 (E-240, E-300, E-350)
770+ else if (STM ( " WinterPark" ) || // 6370D (E2-3200), 6410D (A4-3300, A4-3400)
771+ STM ( " BeaverCreek" ) || // 6530D (A6-3500, A6-3600, A6-3650, A6-3670K), 6550D (A8-3800, A8-3850, A8-3870K)
772+ STM ( " Zacate" ) || // 6320 (E-450)
773+ STM ( " Ontario" ) || // 6290 (C-60)
774+ STM ( " Wrestler" )) // 6250 (C-30, C-50), 6310 (E-240, E-300, E-350)
730775 {
731776 mystuff.gpu_type = GPU_APU;
732777 }
733- else if (strstr (deviceinfo. d_name , " Caicos" ) || // (6450, 8450, R5 230) 7450, 7470,
734- strstr (deviceinfo. d_name , " Cedar" ) || // 7350, 5450
735- strstr (deviceinfo. d_name , " Redwood" ) || // 5550, 5570, 5670
736- strstr (deviceinfo. d_name , " Turks" ) || // 6570, 6670, 7570, 7670
737- strstr (deviceinfo. d_name , " Juniper" ) || // 6750, 6770, 5750, 5770
738- strstr (deviceinfo. d_name , " Cypress" ) || // 5830, 5850, 5870
739- strstr (deviceinfo. d_name , " Hemlock" ) || // 5970
740- strstr (deviceinfo. d_name , " Barts" )) // 6790, 6850, 6870
778+ else if (STM ( " Caicos" ) || // (6450, 8450, R5 230) 7450, 7470,
779+ STM ( " Cedar" ) || // 7350, 5450
780+ STM ( " Redwood" ) || // 5550, 5570, 5670
781+ STM ( " Turks" ) || // 6570, 6670, 7570, 7670
782+ STM ( " Juniper" ) || // 6750, 6770, 5750, 5770
783+ STM ( " Cypress" ) || // 5830, 5850, 5870
784+ STM ( " Hemlock" ) || // 5970
785+ STM ( " Barts" )) // 6790, 6850, 6870
741786 {
742787 mystuff.gpu_type = GPU_VLIW5;
743788 }
744- else if (strstr (deviceinfo. d_name , " RV7" ) || // 4xxx (ATI RV 7xx)
745- strstr (deviceinfo. d_name , " Loveland" )) // e.g. 6310 as part of E350: it reports 2 compute units, but only has a total of 80 compute elements
789+ else if (STM ( " RV7" ) || // 4xxx (ATI RV 7xx)
790+ STM ( " Loveland" )) // e.g. 6310 as part of E350: it reports 2 compute units, but only has a total of 80 compute elements
746791 {
747792 mystuff.gpu_type = GPU_VLIW5;
748793 gpu_types[mystuff.gpu_type ].CE_per_multiprocessor = 40 ; // though VLIW5, only 40 instead of 80 compute elements
@@ -754,8 +799,8 @@ void set_gpu_type()
754799 }
755800 }
756801 else if (
757- strstr (deviceinfo. d_name , " CPU" ) ||
758- strstr (deviceinfo. d_name , " cpu" ) ||
802+ STM ( " CPU" ) ||
803+ STM ( " cpu" ) ||
759804 strstr (deviceinfo.v_name , " GenuineIntel" ) ||
760805 strstr (deviceinfo.v_name , " AuthenticAMD" ))
761806 {
@@ -764,11 +809,13 @@ void set_gpu_type()
764809 else if (strstr (deviceinfo.v_name , " NVIDIA" ))
765810 {
766811 mystuff.gpu_type = GPU_NVIDIA; // working only with VectorSize=1 and GPU sieving
812+ // NVIDIA uses a non-SIMD architecture. other special trait is fast int32 mul
767813 }
768- else if (strstr (deviceinfo. d_name , " Intel(R)" ) &&
769- strstr (deviceinfo. d_name , " Graphics" ))
814+ else if (STM ( " Intel(R)" ) &&
815+ STM ( " Graphics" ))
770816 {
771817 mystuff.gpu_type = GPU_INTEL; // IntelHD
818+ // Could be a good idea to split on the fancier Arc/Xe GPUs
772819 }
773820 else
774821 {
@@ -782,6 +829,8 @@ void set_gpu_type()
782829 mystuff.gpu_type = GPU_GCN;
783830 }
784831 }
832+ #undef PAT
833+ #undef STM
785834
786835 if (((mystuff.gpu_type >= GPU_GCN) && (mystuff.gpu_type <= GPU_GCN3)) && (mystuff.vectorsize > 3 ))
787836 {
0 commit comments