Skip to content

Commit 89e383f

Browse files
committed
Bunch of GPU detection work
* Add myfnmatch.c/h for simple wildcard matching. Unix systems have fnmatch() but Windows doesn't, so we provide a version from musl.
* Use fnmatch() to:
  * Match GPU models, which is what you get on non-Linux systems. Should improve detection on Windows and macOS.
  * Match gfx targets, which is what you get on Linux. Should cover more AMD GPUs. Also added matching for CDNA, which are chonkier versions of GCN5.
* Treat RDNA4 and RDNA3.5 as using RDNA3 features. Should help with #43, but definitely test before closing.
1 parent 9957d39 commit 89e383f

File tree

7 files changed

+461
-72
lines changed

7 files changed

+461
-72
lines changed

mfaktoVS12.vcxproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
<ClCompile Include="src\output.c" />
3535
<ClCompile Include="src\perftest.cpp" />
3636
<ClCompile Include="src\crc.c" />
37+
<ClCompile Include="src\myfnmatch.c" />
3738
</ItemGroup>
3839
<ItemGroup>
3940
<ClInclude Include="src\checkpoint.h" />
@@ -56,6 +57,7 @@
5657
<ClInclude Include="src\tf_debug.h" />
5758
<ClInclude Include="src\filelocking.h" />
5859
<ClInclude Include="src\crc.h" />
60+
<ClInclude Include="src\myfnmatch.h" />
5961
</ItemGroup>
6062
<ItemGroup>
6163
<None Include="Changelog-mfakto.txt" />

src/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ endif
9494
##############################################################################
9595

9696
CSRC = sieve.c timer.c parse.c read_config.c mfaktc.c checkpoint.c \
97-
crc.c signal_handler.c filelocking.c output.c
97+
crc.c signal_handler.c filelocking.c output.c myfnmatch.c
9898

9999
# CLSRC = barrett15.cl barrett.cl common.cl gpusieve.cl mfakto_Kernels.cl montgomery.cl mul24.cl
100100

src/mfakto.cpp

Lines changed: 111 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -638,111 +638,156 @@ int init_CL(int num_streams, cl_int *devnumber)
638638
/*
639639
* set_gpu_type
640640
* try to extract the GPU type from the device info
641+
* type informs our kernel selection due to perf and compilability issues and #defines
642+
* such as USE_DP
643+
*
644+
* broadly speaking, each group of architectures with a certain amount of int32 mul
645+
* relative to other parts should get its own type.
646+
*
647+
* the "APU" type is only for VLIW. newer GPUs seem to just match the arch. they mainly
648+
* lose out on memory performance due to having no L3, but we don't use much memory bandwidth anyway.
641649
*/
642650
void set_gpu_type()
643651
{
652+
#define PAT(b) patmatch(deviceinfo.d_name,b,0)
653+
#define STM(b) strstr(deviceinfo.d_name,b)
644654
if (mystuff.gpu_type == GPU_AUTO)
645655
{
646656
// try to auto-detect the type of GPU
647-
if (strstr(deviceinfo.d_name, "Capeverde") || // 7730, 7750, 7770, 8760, 8740, R7 250X
648-
strstr(deviceinfo.d_name, "Pitcairn") || // 7850, 7870, 8870
649-
strstr(deviceinfo.d_name, "Bonaire") || // 7790, R7 260, R7 260X
650-
strstr(deviceinfo.d_name, "Oland") || // 8670, 8570, R9 240, R9 250
651-
strstr(deviceinfo.d_name, "Sun") || // 85x0M
652-
strstr(deviceinfo.d_name, "Mars") || // 86x0M, 87x0M
653-
strstr(deviceinfo.d_name, "Venus") || // 88x0M
654-
strstr(deviceinfo.d_name, "Saturn") || // 8930M, 8950M
655-
strstr(deviceinfo.d_name, "Neptune") || // 8970M, 8990M
656-
strstr(deviceinfo.d_name, "Curacao") || // R9 265, R9 270, R9 270X
657-
strstr(deviceinfo.d_name, "Tonga") || // R9 285
658-
strstr(deviceinfo.d_name, "Hainan") || // R9 285
659-
strstr(deviceinfo.d_name, "Antigua") || // R9 380(X)
660-
strstr(deviceinfo.d_name, "Kalindi") || // GCN APU, Kabini, R7 ???
661-
strstr(deviceinfo.d_name, "D300") || // FirePro D-series
662-
strstr(deviceinfo.d_name, "D500") ||
663-
strstr(deviceinfo.d_name, "D700")
657+
// There are basically two styles of names:
658+
// 1) Model: "Radeon HD 7770", "Radeon R7 260X", "Radeon R9 290X", "Radeon RX 6800 XT"
659+
// 2) Codename: "Capeverde", "Pitcairn", "Tahiti", "Hawaii", "Ellesmere", "gfx900", "gfx1010"
660+
// The first type is typical of mac and Windows. The second type is typical of Linux.
661+
662+
if (STM("Capeverde") || // 7730, 7750, 7770, 8760, 8740, R7 250X
663+
STM("Pitcairn") || // 7850, 7870, 8870
664+
STM("Bonaire") || // 7790, R7 260, R7 260X
665+
STM("Oland") || // 8670, 8570, R9 240, R9 250
666+
STM("Sun") || // 85x0M
667+
STM("Mars") || // 86x0M, 87x0M
668+
STM("Venus") || // 88x0M
669+
STM("Saturn") || // 8930M, 8950M
670+
STM("Neptune") || // 8970M, 8990M
671+
STM("Curacao") || // R9 265, R9 270, R9 270X
672+
STM("Tonga") || // R9 285
673+
STM("Hainan") || // R9 285
674+
STM("Antigua") || // R9 380(X)
675+
STM("Kalindi") || // GCN APU, Kabini, R7 ???
676+
PAT("D[357]00") || // FirePro D-series
677+
PAT("HD [78][0-7][0-9][0-9]")
664678
)
665679
{
666680
mystuff.gpu_type = GPU_GCN;
667681
}
668-
else if (strstr(deviceinfo.d_name, "Malta") || // 7990
669-
strstr(deviceinfo.d_name, "Tahiti") // 7870XT, 7950, 7970, 8970, 8950, R9 280X
682+
else if (STM("Malta") || // 7990
683+
STM("Tahiti") || // 7870XT, 7950, 7970, 8970, 8950, R9 280X
684+
PAT("HD [78][89][0-9]0") ||
685+
STM("R9 280X") ||
686+
PAT("gfx60[0-9]") // GCN-GFX6 Southern Islands
670687
)
671688
{
672689
mystuff.gpu_type = GPU_GCN2; // these cards have faster DP performance, allowing to use it for the division algorithms
673690
}
674-
else if (strstr(deviceinfo.d_name, "Hawaii") || // R9 290, R9 290X
691+
else if (STM("Hawaii") || // R9 290, R9 290X
675692
// Hawaii is both desktop graphics (1:8) and workstation graphics (1:2) in W8100, W9100, S9150
676693
// 1:8 is just below the sweet spot for using DP. FirePro cards would run faster using DP
677-
strstr(deviceinfo.d_name, "Vesuvius") || // 295X2
678-
strstr(deviceinfo.d_name, "gfx803") // Fury X
694+
STM("Vesuvius") || // 295X2
695+
STM("gfx803") || // Fury X
696+
STM("gfx802") || // Iceland, Tonga (guess)
697+
STM("Polaris") || // (guess)
698+
PAT("R9 29") ||
699+
PAT("W[89]1[01][05]0")
679700
)
680701
{
681702
mystuff.gpu_type = GPU_GCN3; // these cards have improved int32 performance over the previous GCNs, making for a changed kernel selection
682703
}
683-
else if (strstr(deviceinfo.d_name, "Ellesmere") || // RX 470/480/570/580/590
684-
strstr(deviceinfo.d_name, "gfx804") || // RX 550
685-
strstr(deviceinfo.d_name, "Radeon Pro 560") || // Baffin, used on Macs
686-
strstr(deviceinfo.d_name, "Lexa") || // small GCN 4.0 - not tested, only assumption
687-
strstr(deviceinfo.d_name, "Baffin") // small GCN 4.0 - not tested, only assumption
704+
else if (STM("Ellesmere") || // RX 470/480/570/580/590
705+
STM("gfx804") || // RX 550
706+
STM("gfx810") || // "TBA" APU
707+
STM("gfx801") || // Carrizo APU
708+
STM("Radeon Pro 560") || // Baffin, used on Macs
709+
STM("Lexa") || // small GCN 4.0 - not tested, only assumption
710+
STM("Baffin") // small GCN 4.0 - not tested, only assumption
711+
712+
// The following are GCN3/4 APUs. Need to verify that they do not interfere with FX CPU names
713+
// PAT("A[468]-") || // APU (guess)
714+
// STM("E2-") || // APU (guess)
715+
// PAT("A1[02]-") || // APU (guess)
688716
)
689717
{
690718
mystuff.gpu_type = GPU_GCN4;
691719
}
692-
else if (strstr(deviceinfo.d_name, "gfx901") || // Vega 64(?)
693-
strstr(deviceinfo.d_name, "gfx900") || // Vega 56
694-
strstr(deviceinfo.d_name, "gfx902") || // Vega Ryzen 2xxx-3xxx iGPU
695-
strstr(deviceinfo.d_name, "gfx903") // Vega Ryzen 2xxx-3xxx iGPU
720+
else if (STM("gfx901") || // Vega 64(?)
721+
STM("gfx900") || // Vega 56
722+
STM("gfx902") || // Vega Ryzen 2xxx-3xxx iGPU
723+
STM("gfx903") // Vega Ryzen 2xxx-3xxx iGPU
696724
)
697725
{
698726
mystuff.gpu_type = GPU_GCN5;
699727
}
700-
else if (strstr(deviceinfo.d_name, "gfx906") // Radeon VII
728+
else if (STM("gfx906") || // Radeon VII and Pro and MI50/60 (1:4, 1:1)
729+
STM("Radeon Pro VII") || // Radeon VII
730+
STM("Radeon VII") || // Radeon VII
731+
732+
PAT("gfx90[89]") || // CDNA1: MI100
733+
PAT("gfx90[ac]") || // CDNA2: MI210, MI250(X)
734+
STM("gfx942") || // CDNA3: MI300(A/X)
735+
STM("gfx950") || // CDNA4: MI350/355
736+
737+
PAT("MI[5-6]0") || // MI50, MI60
738+
PAT("MI[0-9][0-9][0-9]") // Any MI with three digits
701739
)
702740
{
703741
mystuff.gpu_type = GPU_GCNF;
704742
}
705-
else if (strstr(deviceinfo.d_name, "gfx1010") || // RX 5600-5700 XT
706-
strstr(deviceinfo.d_name, "gfx1012") || // RX 5300-5500 XT
707-
strstr(deviceinfo.d_name, "gfx1011") || //
708-
strstr(deviceinfo.d_name, "gfx1030") || // RX 6800-6900 XT (untested but kernel list should be similar)
709-
strstr(deviceinfo.d_name, "gfx1031") || // RX 6700 (XT)
710-
strstr(deviceinfo.d_name, "gfx1032")) // lower end RDNA2
743+
else if (STM("gfx101") || // RDNA1
744+
STM("gfx103") || // RDNA2
745+
746+
PAT("RX [56][0-9][0-9][0-9]") // Model
747+
// Also known as 6[0-9]0M, but might be too vague to match
748+
)
711749
{
712750
mystuff.gpu_type = GPU_RDNA;
713751
}
714-
else if (strstr(deviceinfo.d_name, "gfx1101")) // 7800XT
752+
else if (STM("gfx110") || // Catch-all RDNA3
753+
STM("gfx115") || // Catch-all RDNA3.5
754+
STM("gfx120") || // Catch-all RDNA4
755+
756+
PAT("RX [79][0-9][0-9][0-9]") || // Model
757+
PAT("80[456]0S") // Strix Halo, huge APU
758+
// Also [78][0-9]0M, but might be too vague to match
759+
)
715760
{
716761
mystuff.gpu_type = GPU_RDNA3;
717762
}
718-
else if (strstr(deviceinfo.d_name, "Cayman") || // 6950, 6970
719-
strstr(deviceinfo.d_name, "Devastator") || // 7xx0D (iGPUs of A4/6/8/10)
720-
strstr(deviceinfo.d_name, "Scrapper") || // 7xx0G (iGPUs of A4/6/8/10)
721-
strstr(deviceinfo.d_name, "Antilles")) // 6990
763+
else if (STM("Cayman") || // 6950, 6970
764+
STM("Devastator") || // 7xx0D (iGPUs of A4/6/8/10)
765+
STM("Scrapper") || // 7xx0G (iGPUs of A4/6/8/10)
766+
STM("Antilles")) // 6990
722767
{
723768
mystuff.gpu_type = GPU_VLIW4;
724769
}
725-
else if (strstr(deviceinfo.d_name, "WinterPark") || // 6370D (E2-3200), 6410D (A4-3300, A4-3400)
726-
strstr(deviceinfo.d_name, "BeaverCreek") || // 6530D (A6-3500, A6-3600, A6-3650, A6-3670K), 6550D (A8-3800, A8-3850, A8-3870K)
727-
strstr(deviceinfo.d_name, "Zacate") || // 6320 (E-450)
728-
strstr(deviceinfo.d_name, "Ontario") || // 6290 (C-60)
729-
strstr(deviceinfo.d_name, "Wrestler")) // 6250 (C-30, C-50), 6310 (E-240, E-300, E-350)
770+
else if (STM("WinterPark") || // 6370D (E2-3200), 6410D (A4-3300, A4-3400)
771+
STM("BeaverCreek") || // 6530D (A6-3500, A6-3600, A6-3650, A6-3670K), 6550D (A8-3800, A8-3850, A8-3870K)
772+
STM("Zacate") || // 6320 (E-450)
773+
STM("Ontario") || // 6290 (C-60)
774+
STM("Wrestler")) // 6250 (C-30, C-50), 6310 (E-240, E-300, E-350)
730775
{
731776
mystuff.gpu_type = GPU_APU;
732777
}
733-
else if (strstr(deviceinfo.d_name, "Caicos") || // (6450, 8450, R5 230) 7450, 7470,
734-
strstr(deviceinfo.d_name, "Cedar") || // 7350, 5450
735-
strstr(deviceinfo.d_name, "Redwood") || // 5550, 5570, 5670
736-
strstr(deviceinfo.d_name, "Turks") || // 6570, 6670, 7570, 7670
737-
strstr(deviceinfo.d_name, "Juniper") || // 6750, 6770, 5750, 5770
738-
strstr(deviceinfo.d_name, "Cypress") || // 5830, 5850, 5870
739-
strstr(deviceinfo.d_name, "Hemlock") || // 5970
740-
strstr(deviceinfo.d_name, "Barts")) // 6790, 6850, 6870
778+
else if (STM("Caicos") || // (6450, 8450, R5 230) 7450, 7470,
779+
STM("Cedar") || // 7350, 5450
780+
STM("Redwood") || // 5550, 5570, 5670
781+
STM("Turks") || // 6570, 6670, 7570, 7670
782+
STM("Juniper") || // 6750, 6770, 5750, 5770
783+
STM("Cypress") || // 5830, 5850, 5870
784+
STM("Hemlock") || // 5970
785+
STM("Barts")) // 6790, 6850, 6870
741786
{
742787
mystuff.gpu_type = GPU_VLIW5;
743788
}
744-
else if (strstr(deviceinfo.d_name, "RV7") || // 4xxx (ATI RV 7xx)
745-
strstr(deviceinfo.d_name, "Loveland")) // e.g. 6310 as part of E350: it reports 2 compute units, but only has a total of 80 compute elements
789+
else if (STM("RV7") || // 4xxx (ATI RV 7xx)
790+
STM("Loveland")) // e.g. 6310 as part of E350: it reports 2 compute units, but only has a total of 80 compute elements
746791
{
747792
mystuff.gpu_type = GPU_VLIW5;
748793
gpu_types[mystuff.gpu_type].CE_per_multiprocessor = 40; // though VLIW5, only 40 instead of 80 compute elements
@@ -754,8 +799,8 @@ void set_gpu_type()
754799
}
755800
}
756801
else if (
757-
strstr(deviceinfo.d_name, "CPU") ||
758-
strstr(deviceinfo.d_name, "cpu") ||
802+
STM("CPU") ||
803+
STM("cpu") ||
759804
strstr(deviceinfo.v_name, "GenuineIntel") ||
760805
strstr(deviceinfo.v_name, "AuthenticAMD"))
761806
{
@@ -764,11 +809,13 @@ void set_gpu_type()
764809
else if (strstr(deviceinfo.v_name, "NVIDIA"))
765810
{
766811
mystuff.gpu_type = GPU_NVIDIA; // working only with VectorSize=1 and GPU sieving
812+
// NVIDIA uses a non-SIMD architecture. other special trait is fast int32 mul
767813
}
768-
else if (strstr(deviceinfo.d_name, "Intel(R)") &&
769-
strstr(deviceinfo.d_name, "Graphics"))
814+
else if (STM("Intel(R)") &&
815+
STM("Graphics"))
770816
{
771817
mystuff.gpu_type = GPU_INTEL; // IntelHD
818+
// Could be a good idea to split on the fancier Arc/Xe GPUs
772819
}
773820
else
774821
{
@@ -782,6 +829,8 @@ void set_gpu_type()
782829
mystuff.gpu_type = GPU_GCN;
783830
}
784831
}
832+
#undef PAT
833+
#undef STM
785834

786835
if (((mystuff.gpu_type >= GPU_GCN) && (mystuff.gpu_type <= GPU_GCN3)) && (mystuff.vectorsize > 3))
787836
{

src/mfakto.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ along with mfaktc (mfakto). If not, see <http://www.gnu.org/licenses/>.
2121
#define mfakto_H_
2222

2323
#include "my_types.h"
24+
#include "myfnmatch.h"
2425

2526
#define NUM_KERNELS (sizeof(kernel_info)/sizeof(kernel_info[0]))
2627
#define KERNEL_FILE "mfakto_Kernels.cl"

src/mfakto.ini

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,17 @@
2626
# GPUType=VLIW5 "Evergreen" GPUs, such as the Radeon HD 5000 series. Can
2727
# also improve performance on low-end APUs
2828
# GPUType=GCN all GPUs based on the GCN architecture. Assumed for unknown
29-
# devices
30-
# GPUType=GCN2 2nd generation GCN architecture
31-
# GPUType=GCN3 3rd generation GCN architecture
32-
# GPUType=GCN4 Polaris GPUs, such as the Radeon RX 460
33-
# GPUType=GCN5 14 nm Vega GPUs, such as the Radeon RX Vega 56
34-
# GPUType=GCNF 7 nm Vega GPUs, namely the Vega 20 series
29+
# devices. 1:16 DP
30+
# GPUType=GCN2 2nd generation GCN architecture (actually, any GCN2/3 with 1:4 or better DP)
31+
# GPUType=GCN3 3rd generation GCN architecture (actually, any GCN2/3 with improved int32 and 1:8 or better DP)
32+
# GPUType=GCN4 Polaris GPUs, such as the Radeon RX 460 (actually, GCN 3/4 with bad DP)
33+
# GPUType=GCN5 14 nm Vega GPUs, such as the Radeon RX Vega 56 (actually, any GCN5 with bad DP)
34+
# GPUType=GCNF 7 nm Vega GPUs, namely the Vega 20 series (actually, any GCN5 with 1:4 or better DP)
35+
# CDNA is provisionally included
3536
# GPUType=RDNA devices using the RDNA 1 and 2 microarchitectures, such as
36-
# the Radeon RX 5000 series
37-
# GPUType=RDNA3 devices using the RDNA 3 microarchitecture
38-
# GPUType=APU all APUs. For low-end devices, using GPUType=VLIW5 may
37+
# the Radeon RX 5000 and 6000 series
38+
# GPUType=RDNA3 devices using the RDNA 3, 3.5 and 4 microarchitectures
39+
# GPUType=APU VLIW APUs. For old low-end devices, using GPUType=VLIW5 may
3940
# result in better performance.
4041
# GPUType=CPU all CPUs. Used when no GPUs are available; also used when
4142
# the '-d c' option is specified

0 commit comments

Comments
 (0)