hand32@node0:~/llama.cpp-b3216$ make VERBOSE=1 llama-benchmark-matmult
I ccache not found. Consider installing it for faster compilation.
I llama.cpp build info:
I UNAME_S: Linux
I UNAME_P: x86_64
I UNAME_M: x86_64
I CFLAGS: -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -std=c11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion
I CXXFLAGS: -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE
I NVCCFLAGS: -std=c++11 -O3
I LDFLAGS:
I CC: cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
I CXX: c++ (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -c common/build-info.cpp -o build-info.o
cc -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -std=c11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion -c ggml.c -o ggml.o
c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -c sgemm.cpp -o sgemm.o
cc -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -std=c11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion -c ggml-alloc.c -o ggml-alloc.o
cc -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -std=c11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion -c ggml-backend.c -o ggml-backend.o
cc -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -std=c11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion -c ggml-quants.c -o ggml-quants.o
c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -c unicode.cpp -o unicode.o
c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -c unicode-data.cpp -o unicode-data.o
c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -c examples/benchmark/benchmark-matmult.cpp -o examples/benchmark/benchmark-matmult.o
c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE build-info.o ggml.o sgemm.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/benchmark/benchmark-matmult.o -o llama-benchmark-matmult
hand32@node0:~/EbbRT-llama.cpp/build$ ./benchmark-matmult -t 16
main: build = 0 (unknown)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
Starting Test
Allocating Memory of size 800194560 bytes, 763 MB
Creating new tensors
------ Test 1 - Matrix Mult via F32 code
n_threads=16
m11: type = 0 ( f32) ne = 11008 x 4096 x 1, nb = ( 4, 44032, 180355072) - Sum of tensor m11 is 45088768.00
m2: type = 0 ( f32) ne = 11008 x 128 x 1, nb = ( 4, 44032, 5636096) - Sum of tensor m2 is 2818048.00
gf->nodes[0]: type = 0 ( f32) ne = 4096 x 128 x 1, nb = ( 4, 16384, 2097152) - Sum of tensor gf->nodes[0] is 11542724608.00
------ Test 2 - Matrix Mult via q4_1 code
n_threads=16
Matrix Multiplication of (11008,4096,1) x (11008,128,1) - about 11.54 gFLOPS
Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS
=====================================================================================
0; 16; 11008; 4096; 128; 11542724608; 40343; 286.11
1; 16; 11008; 4096; 128; 11542724608; 38234; 301.90
2; 16; 11008; 4096; 128; 11542724608; 38167; 302.43
3; 16; 11008; 4096; 128; 11542724608; 38263; 301.67
4; 16; 11008; 4096; 128; 11542724608; 38230; 301.93
5; 16; 11008; 4096; 128; 11542724608; 38187; 302.27
6; 16; 11008; 4096; 128; 11542724608; 38310; 301.30
7; 16; 11008; 4096; 128; 11542724608; 38237; 301.87
8; 16; 11008; 4096; 128; 11542724608; 38227; 301.95
9; 16; 11008; 4096; 128; 11542724608; 38213; 302.06
Average 300.35
=====================================================================================
hand32@node0:~/EbbRT-llama.cpp/build$ ./benchmark-matmult -t 16
main: build = 0 (unknown)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
Starting Test
Allocating Memory of size 800194560 bytes, 763 MB
Creating new tensors
------ Test 1 - Matrix Mult via F32 code
n_threads=16
m11: type = 0 ( f32) ne = 11008 x 4096 x 1, nb = ( 4, 44032, 180355072) - Sum of tensor m11 is 45088768.00
m2: type = 0 ( f32) ne = 11008 x 128 x 1, nb = ( 4, 44032, 5636096) - Sum of tensor m2 is 2818048.00
gf->nodes[0]: type = 0 ( f32) ne = 4096 x 128 x 1, nb = ( 4, 16384, 2097152) - Sum of tensor gf->nodes[0] is 11542724608.00
------ Test 2 - Matrix Mult via q4_1 code
n_threads=16
Matrix Multiplication of (11008,4096,1) x (11008,128,1) - about 11.54 gFLOPS
Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS
=====================================================================================
0; 16; 11008; 4096; 128; 11542724608; 40359; 286.00
1; 16; 11008; 4096; 128; 11542724608; 38197; 302.19
2; 16; 11008; 4096; 128; 11542724608; 38226; 301.96
3; 16; 11008; 4096; 128; 11542724608; 38215; 302.05
4; 16; 11008; 4096; 128; 11542724608; 38192; 302.23
5; 16; 11008; 4096; 128; 11542724608; 38211; 302.08
6; 16; 11008; 4096; 128; 11542724608; 38184; 302.29
7; 16; 11008; 4096; 128; 11542724608; 38192; 302.23
8; 16; 11008; 4096; 128; 11542724608; 38213; 302.06
9; 16; 11008; 4096; 128; 11542724608; 38167; 302.43
Average 300.55
=====================================================================================
Allocating Memory of size 800194560 bytes, 763 MB
Creating new tensors
------ Test 1 - Matrix Mult via F32 code
n_threads=16
     m11: type = 0 (  f32) ne = 11008 x  4096 x     1, nb = (    4, 44032, 180355072) - Sum of tensor m11 is 45088768.00
      m2: type = 0 (  f32) ne = 11008 x   128 x     1, nb = (    4, 44032, 5636096) - Sum of tensor m2 is 2818048.00
gf->nodes[0]: type = 0 (  f32) ne =  4096 x   128 x     1, nb = (    4, 16384, 2097152) - Sum of tensor gf->nodes[0] is 11542272000.00
------ Test 2 - Matrix Mult via q4_1 code
n_threads=16
Matrix Multiplication of (11008,4096,1) x (11008,128,1) - about 11.54 gFLOPS
Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS
=====================================================================================
0; 16; 11008; 4096; 128; 11542724608; 34976; 330.02
1; 16; 11008; 4096; 128; 11542724608; 34597; 333.63
2; 16; 11008; 4096; 128; 11542724608; 34594; 333.66
3; 16; 11008; 4096; 128; 11542724608; 34589; 333.71
4; 16; 11008; 4096; 128; 11542724608; 34589; 333.71
5; 16; 11008; 4096; 128; 11542724608; 34594; 333.66
6; 16; 11008; 4096; 128; 11542724608; 34587; 333.73
7; 16; 11008; 4096; 128; 11542724608; 34594; 333.66
8; 16; 11008; 4096; 128; 11542724608; 34596; 333.64
9; 16; 11008; 4096; 128; 11542724608; 34598; 333.62
Average 333.31