From 838fad33ac4f01f9a67351e579269bbf446629cb Mon Sep 17 00:00:00 2001 From: acktarius Date: Tue, 1 Oct 2024 20:20:17 -0400 Subject: [PATCH 1/6] AI_opt_kernel_CN_GPU --- .cursorrules | 45 +++ doc/README.md | 3 +- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 37 +- .../amd/amd_gpu/opencl/cryptonight_gpu.cl | 372 +++++++++--------- .../backend/cpu/crypto/cryptonight_aesni.h | 23 +- xmrstak/backend/cpu/minethd.cpp | 18 - xmrstak/backend/cryptonight.hpp | 33 +- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 26 -- .../backend/nvidia/nvcc_code/cuda_extra.cu | 8 +- xmrstak/jconf.cpp | 5 +- 10 files changed, 258 insertions(+), 312 deletions(-) create mode 100644 .cursorrules diff --git a/.cursorrules b/.cursorrules new file mode 100644 index 000000000..7f388c799 --- /dev/null +++ b/.cursorrules @@ -0,0 +1,45 @@ + You are an expert in C++ and OpenCL, and scalable parallel computing. + +Key Principles + + Write clear, technical responses with precise C++ and OpenCL examples. + Prioritize readability and maintainability; follow C++ coding conventions and OpenCL best practices. + Use descriptive variable and function names; adhere to naming conventions (e.g., CamelCase for class names, snake_case for filenames and private member variables). + Structure your project in a modular way using C++ namespaces and header files, as well as OpenCL's kernel files and build scripts to promote reusability and separation of concerns. + +C++/OpenCL + + Use object-oriented programming principles (inheritance, polymorphism, encapsulation) to create modular and reusable parallel computing elements. + Leverage C++'s standard template library for data structures and algorithms. + Utilize OpenCL's C99-based kernel language for writing parallel computation code. + Implement custom OpenCL kernels for specific computation tasks. + Use C++'s RAII (Resource Acquisition Is Initialization) pattern for automatic resource management. 
+ +Error Handling and Debugging + + Implement error handling using try-catch blocks where appropriate. + Use C++'s assertion macros (e.g., assert) for catching logical errors during development. + Utilize OpenCL's error-handling functions (e.g., clGetError) for catching and handling OpenCL-specific errors. + Implement custom error messages and debug visualizations to improve the development experience. + +Dependencies + + C++ Compiler (e.g., GCC, Clang) + C++ Standard Library (e.g., STL) + OpenCL SDK (e.g., AMD APP SDK, NVIDIA CUDA Toolkit, Apple Metal Performance Shaders) + Third-party libraries (carefully vetted for compatibility and performance) + +OpenCL-Specific Guidelines + + Use OpenCL's built-in functions and kernels for common parallel computation tasks (e.g., vector addition, matrix multiplication). + Keep parallel computation logic in separate kernels; use the C++ main function for host-side initialization and execution management. + Utilize OpenCL's memory management functions (e.g., clEnqueueReadBuffer, clEnqueueWriteBuffer) for efficient data transfer between host and device. + Apply OpenCL's work-item and work-group synchronization functions (e.g., barrier) to ensure correct execution of parallel computations. + Optimize memory access patterns to reduce global memory contention and improve data locality. + Use OpenCL's profiling tools to identify performance bottlenecks and optimize kernel execution times. + +Code Review and Best Practices + + Review code for potential performance bottlenecks and memory access patterns. + Use OpenCL's profiling tools to identify performance bottlenecks and optimize kernel execution times. + Optimize memory access patterns to reduce global memory contention and improve data locality. 
diff --git a/doc/README.md b/doc/README.md index 7a1f13288..9182240d9 100644 --- a/doc/README.md +++ b/doc/README.md @@ -1,4 +1,3 @@ - @@ -55,7 +54,7 @@ If your preferred coin is not listed, you can choose one of the following mining | --- | --- | --- | --- | | cryptonight_turtle | cryptonight_lite | cryptonight | cryptonight_bittube2 | | --- | cryptonight_lite_v7 | cryptonight_gpu | cryptonight_haven | -| --- | --- | cryptonight_conceal | cryptonight_heavy | +| --- | --- | --- | cryptonight_heavy | | --- | --- | cryptonight_r | --- | | --- | --- | cryptonight_masari (used in 2018) | --- | | --- | --- | cryptonight_v8_reversewaltz | --- | diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index d17b79215..fa7a9eb7a 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -29,8 +29,9 @@ R"===( #define cryptonight_monero_v8 11 #define cryptonight_superfast 12 #define cryptonight_gpu 13 -#define cryptonight_conceal 14 -#define cryptonight_v8_reversewaltz 17 +#define cryptonight_r_wow 14 +#define cryptonight_r 15 +#define cryptonight_v8_reversewaltz 16 static const __constant ulong keccakf_rndc[24] = @@ -79,7 +80,7 @@ void keccakf1600(ulong *s) bc[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23] ^ rotate(s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20], 1UL); bc[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24] ^ rotate(s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21], 1UL); - tmp1 = s[1] ^ bc[0]; + tmp1 = s[1]; s[0] ^= bc[4]; s[1] = rotate(s[6] ^ bc[0], 44UL); @@ -596,21 +597,12 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ) { ulong a[2]; -#if(ALGO == cryptonight_conceal) - float4 conc_var = (float4)(0.0f); -#endif + ulong b[2]; + uint4 b_x[1]; -#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) - ulong b[4]; - uint4 b_x[2]; -// NVIDIA -# ifdef __NV_CL_C_VERSION +#if((ALGO == cryptonight_monero_v8 || ALGO == 
cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION)) __local uint16 scratchpad_line_buf[WORKSIZE]; __local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0); -# endif -#else - ulong b[2]; - uint4 b_x[1]; #endif __local uint AES0[256], AES1[256]; @@ -703,21 +695,6 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); -#if(ALGO == cryptonight_conceal) - float4 r = convert_float4_rte(((int4 *)c)[0]); - float4 c_old = conc_var; - r = _mm_add_ps(r, conc_var); - r = _mm_mul_ps(r, _mm_mul_ps(r, r)); - r = _mm_and_ps(r, 0x807FFFFF); - r = _mm_or_ps(r, 0x40000000); - conc_var = _mm_add_ps(conc_var, r); - - c_old = _mm_and_ps(c_old, 0x807FFFFF); - c_old = _mm_or_ps(c_old, 0x40000000); - float4 nc = _mm_mul_ps(c_old, (float4)(536870880.0f)); - ((int4 *)c)[0] ^= convert_int4_rte(nc); -#endif - #if(ALGO == cryptonight_bittube2) ((uint4 *)c)[0] = AES_Round2_bittube2(AES0, AES1, ~((uint4 *)c)[0], ((uint4 *)a)[0]); #else diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl index bb37581f2..8a97c3f77 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl @@ -1,133 +1,109 @@ R"===( -inline global int4* scratchpad_ptr(uint idx, uint n, __global int *lpad) { return (__global int4*)((__global char*)lpad + (idx & MASK) + n * 16); } - -inline float4 fma_break(float4 x) -{ - // Break the dependency chain by setitng the exp to ?????01 - x = _mm_and_ps(x, 0xFEFFFFFF); - return _mm_or_ps(x, 0x00800000); +inline __global int4* scratchpad_ptr(uint idx, uint n, __global int *lpad) { + return (__global int4*)((__global char*)lpad + (idx & MASK) + n * 16); } -inline void sub_round(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* n, float4* d, float4* c) -{ - n1 = _mm_add_ps(n1, *c); - float4 nn = _mm_mul_ps(n0, *c); - nn = _mm_mul_ps(n1, 
_mm_mul_ps(nn,nn)); - nn = fma_break(nn); - *n = _mm_add_ps(*n, nn); - - n3 = _mm_sub_ps(n3, *c); - float4 dd = _mm_mul_ps(n2, *c); - dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); - dd = fma_break(dd); - *d = _mm_add_ps(*d, dd); - - //Constant feedback - *c = _mm_add_ps(*c, rnd_c); - *c = _mm_add_ps(*c, (float4)(0.734375f)); - float4 r = _mm_add_ps(nn, dd); - r = _mm_and_ps(r, 0x807FFFFF); - r = _mm_or_ps(r, 0x40000000); - *c = _mm_add_ps(*c, r); - -} - -// 9*8 + 2 = 74 -inline void round_compute(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* c, float4* r) -{ - float4 n = (float4)(0.0f); - float4 d = (float4)(0.0f); - - sub_round(n0, n1, n2, n3, rnd_c, &n, &d, c); - sub_round(n1, n2, n3, n0, rnd_c, &n, &d, c); - sub_round(n2, n3, n0, n1, rnd_c, &n, &d, c); - sub_round(n3, n0, n1, n2, rnd_c, &n, &d, c); - sub_round(n3, n2, n1, n0, rnd_c, &n, &d, c); - sub_round(n2, n1, n0, n3, rnd_c, &n, &d, c); - sub_round(n1, n0, n3, n2, rnd_c, &n, &d, c); - sub_round(n0, n3, n2, n1, rnd_c, &n, &d, c); - - // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 - d = _mm_and_ps(d, 0xFF7FFFFF); - d = _mm_or_ps(d, 0x40000000); - *r =_mm_add_ps(*r, _mm_div_ps(n,d)); -} - -inline int4 single_comupte(float4 n0, float4 n1, float4 n2, float4 n3, float cnt, float4 rnd_c, __local float4* sum) -{ - float4 c= (float4)(cnt); - // 35 maths calls follow (140 FLOPS) - float4 r = (float4)(0.0f); - - for(int i = 0; i < 4; ++i) - round_compute(n0, n1, n2, n3, rnd_c, &c, &r); - - // do a quick fmod by setting exp to 2 - r = _mm_and_ps(r, 0x807FFFFF); - r = _mm_or_ps(r, 0x40000000); - *sum = r; // 34 - float4 x = (float4)(536870880.0f); - r = _mm_mul_ps(r, x); // 35 - return convert_int4_rte(r); -} - -inline void single_comupte_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4 v3, float cnt, float4 rnd_c, __local float4* sum, __local int4* out) -{ - float4 n0 = convert_float4_rte(v0); - float4 n1 = convert_float4_rte(v1); - float4 n2 = 
convert_float4_rte(v2); - float4 n3 = convert_float4_rte(v3); - - int4 r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); - *out = rot == 0 ? r : _mm_alignr_epi8(r, rot); -} +#define fma_break(x) ((x & (float4)(0xFEFFFFFF)) | (float4)(0x00800000)) + +#define sub_round(n0, n1, n2, n3, rnd_c, n, d, c) \ + do { \ + n1 += *c; \ + float4 nn = n0 * *c; \ + nn = n1 * (nn * nn); \ + nn = fma_break(nn); \ + *n += nn; \ + \ + n3 -= *c; \ + float4 dd = n2 * *c; \ + dd = n3 * (dd * dd); \ + dd = fma_break(dd); \ + *d += dd; \ + \ + *c += rnd_c + (float4)(0.734375f); \ + float4 r = nn + dd; \ + r = (r & (float4)(0x807FFFFF)) | (float4)(0x40000000); \ + *c += r; \ + } while(0) + +#define round_compute(n0, n1, n2, n3, rnd_c, c, r) \ + do { \ + float4 n = (float4)(0.0f); \ + float4 d = (float4)(0.0f); \ + \ + for(int i = 0; i < 8; ++i) { \ + sub_round(n0, n1, n2, n3, rnd_c, &n, &d, c); \ + float4 tmp = n0; n0 = n1; n1 = n2; n2 = n3; n3 = tmp; \ + } \ + \ + d = (d & (float4)(0xFF7FFFFF)) | (float4)(0x40000000); \ + *r += n / d; \ + } while(0) + +#define single_compute(n0, n1, n2, n3, cnt, rnd_c, sum) \ + ({ \ + float4 c = (float4)(cnt); \ + float4 r = (float4)(0.0f); \ + \ + for(int i = 0; i < 4; ++i) \ + round_compute(n0, n1, n2, n3, rnd_c, &c, &r); \ + \ + r = (r & (float4)(0x807FFFFF)) | (float4)(0x40000000); \ + *sum = r; \ + convert_int4_rte(r * (float4)(536870880.0f)); \ + }) + +#define single_compute_wrap(rot, v0, v1, v2, v3, cnt, rnd_c, sum, out) \ + do { \ + float4 n0 = convert_float4_rte(v0); \ + float4 n1 = convert_float4_rte(v1); \ + float4 n2 = convert_float4_rte(v2); \ + float4 n3 = convert_float4_rte(v3); \ + \ + int4 r = single_compute(n0, n1, n2, n3, cnt, rnd_c, sum); \ + *out = rot == 0 ? 
r : (int4)(r.yzwx); \ + } while(0) )===" R"===( -static const __constant uint look[16][4] = { - {0, 1, 2, 3}, - {0, 2, 3, 1}, - {0, 3, 1, 2}, - {0, 3, 2, 1}, - - {1, 0, 2, 3}, - {1, 2, 3, 0}, - {1, 3, 0, 2}, - {1, 3, 2, 0}, - - {2, 1, 0, 3}, - {2, 0, 3, 1}, - {2, 3, 1, 0}, - {2, 3, 0, 1}, - - {3, 1, 2, 0}, - {3, 2, 0, 1}, - {3, 0, 1, 2}, - {3, 0, 2, 1} +// Move lookup tables to constant memory +__constant uint look[16][4] = { + {0, 1, 2, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + {1, 0, 2, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + {2, 1, 0, 3}, + {2, 0, 3, 1}, + {2, 3, 1, 0}, + {2, 3, 0, 1}, + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 0, 1, 2}, + {3, 0, 2, 1} }; -static const __constant float ccnt[16] = { - 1.34375f, - 1.28125f, - 1.359375f, - 1.3671875f, - - 1.4296875f, - 1.3984375f, - 1.3828125f, - 1.3046875f, - - 1.4140625f, - 1.2734375f, - 1.2578125f, - 1.2890625f, - - 1.3203125f, - 1.3515625f, - 1.3359375f, - 1.4609375f +__constant float ccnt[16] = { + 1.34375f, + 1.28125f, + 1.359375f, + 1.3671875f, + 1.4296875f, + 1.3984375f, + 1.3828125f, + 1.3046875f, + 1.4140625f, + 1.2734375f, + 1.2578125f, + 1.2890625f, + 1.3203125f, + 1.3515625f, + 1.3359375f, + 1.4609375f }; struct SharedMemChunk @@ -139,84 +115,98 @@ struct SharedMemChunk __attribute__((reqd_work_group_size(WORKSIZE * 16, 1, 1))) __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, uint numThreads) { - const uint gIdx = getIdx(); + const uint gIdx = getIdx(); -#if(COMP_MODE==1) - if(gIdx/16 >= numThreads) - return; -#endif + #if(COMP_MODE==1) + if(gIdx/16 >= numThreads) + return; + #endif - uint chunk = get_local_id(0) / 16; + uint chunk = get_local_id(0) / 16; -#if(STRIDED_INDEX==0) - __global int* lpad = (__global int*)((__global char*)lpad_in + MEMORY * (gIdx/16)); -#endif + #if(STRIDED_INDEX==0) + __global int4* lpad = (__global int4*)((__global char*)lpad_in + MEMORY * (gIdx/16)); + #endif - __local struct SharedMemChunk smem_in[WORKSIZE]; - __local 
struct SharedMemChunk* smem = smem_in + chunk; + __local struct SharedMemChunk { + int4 out[16]; + float4 va[16]; + } smem_in[WORKSIZE]; + __local struct SharedMemChunk* smem = smem_in + chunk; - uint tid = get_local_id(0) % 16; + // New local memory cache for frequently accessed data + __local int4 lpad_cache[32]; - uint idxHash = gIdx/16; - uint s = ((__global uint*)spad)[idxHash * 50] >> 8; - float4 vs = (float4)(0); + uint tid = get_local_id(0) % 16; - // tid divided - const uint tidd = tid / 4; - // tid modulo - const uint tidm = tid % 4; - const uint block = tidd * 16 + tidm; + uint idxHash = gIdx/16; + uint s = ((__global uint*)spad)[idxHash * 50] >> 8; + float4 vs = (float4)(0); - #pragma unroll CN_UNROLL - for(size_t i = 0; i < ITERATIONS; i++) - { - mem_fence(CLK_LOCAL_MEM_FENCE); - int tmp = ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm]; - ((__local int*)(smem->out))[tid] = tmp; - mem_fence(CLK_LOCAL_MEM_FENCE); - - { - single_comupte_wrap( - tidm, - *(smem->out + look[tid][0]), - *(smem->out + look[tid][1]), - *(smem->out + look[tid][2]), - *(smem->out + look[tid][3]), - ccnt[tid], vs, smem->va + tid, - smem->out + tid - ); - } - mem_fence(CLK_LOCAL_MEM_FENCE); - - int outXor = ((__local int*)smem->out)[block]; - for(uint dd = block + 4; dd < (tidd + 1) * 16; dd += 4) - outXor ^= ((__local int*)smem->out)[dd]; - - ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm] = outXor ^ tmp; - ((__local int*)smem->out)[tid] = outXor; - - float va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4]; - float va_tmp2 = ((__local float*)smem->va)[block+ 8] + ((__local float*)smem->va)[block + 12]; - ((__local float*)smem->va)[tid] = va_tmp1 + va_tmp2; - - mem_fence(CLK_LOCAL_MEM_FENCE); - - int out2 = ((__local int*)smem->out)[tid] ^ ((__local int*)smem->out)[tid + 4 ] ^ ((__local int*)smem->out)[tid + 8] ^ ((__local int*)smem->out)[tid + 12]; - va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4]; - 
va_tmp2 = ((__local float*)smem->va)[block + 8] + ((__local float*)smem->va)[block + 12]; - va_tmp1 = va_tmp1 + va_tmp2; - va_tmp1 = fabs(va_tmp1); - - float xx = va_tmp1 * 16777216.0f; - int xx_int = (int)xx; - ((__local int*)smem->out)[tid] = out2 ^ xx_int; - ((__local float*)smem->va)[tid] = va_tmp1 / 64.0f; - - mem_fence(CLK_LOCAL_MEM_FENCE); - - vs = smem->va[0]; - s = smem->out[0].x ^ smem->out[0].y ^ smem->out[0].z ^ smem->out[0].w; - } + const uint tidd = tid / 4; + const uint tidm = tid % 4; + const uint block = tidd * 16 + tidm; + + // Preload frequently accessed data into local memory + if (tid < 32) { + lpad_cache[tid] = lpad[tid]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + #pragma unroll CN_UNROLL + for(uint i = 0; i < ITERATIONS; i++) + { + barrier(CLK_LOCAL_MEM_FENCE); + + smem->out[tid] = scratchpad_ptr(s, tidd, (__global int*)lpad_cache)[tidm]; + + barrier(CLK_LOCAL_MEM_FENCE); + + single_compute_wrap( + tidm, + smem->out[look[tid][0]], + smem->out[look[tid][1]], + smem->out[look[tid][2]], + smem->out[look[tid][3]], + ccnt[tid], vs, &smem->va[tid], + &smem->out[tid] + ); + + barrier(CLK_LOCAL_MEM_FENCE); + + // Combine outXor calculation and update of lpad_cache + int4 outXor = smem->out[block]; + #pragma unroll + for(uint dd = block + 4; dd < (tidd + 1) * 16; dd += 4) + outXor ^= smem->out[dd]; + lpad_cache[s % 32] = outXor ^ smem->out[tid]; + smem->out[tid] = outXor; + + // Combine va_tmp calculation and assignment + smem->va[tid] = (smem->va[block] + smem->va[block + 4]) + (smem->va[block + 8] + smem->va[block + 12]); + + barrier(CLK_LOCAL_MEM_FENCE); + + // Combine out2 and va_tmp2 calculations + int4 out2 = (smem->out[tid] ^ smem->out[tid + 4]) ^ (smem->out[tid + 8] ^ smem->out[tid + 12]); + float4 va_tmp2 = fabs((smem->va[block] + smem->va[block + 4]) + (smem->va[block + 8] + smem->va[block + 12])); + + // Combine xx calculation and conversion + int4 xx_int = convert_int4_rte(va_tmp2 * (float4)(16777216.0f)); + smem->out[tid] = out2 ^ xx_int; + 
smem->va[tid] = va_tmp2 * (float4)(0.015625f); // Vectorized division by 64.0f + + barrier(CLK_LOCAL_MEM_FENCE); + + // Combine vs and s updates + vs = smem->va[0]; + s = smem->out[0].x ^ smem->out[0].y ^ smem->out[0].z ^ smem->out[0].w; + } + + // Write back to global memory + if (tid < 32) { + lpad[tid] = lpad_cache[tid]; + } } )===" @@ -326,4 +316,4 @@ __kernel void JOIN(cn00_cn_gpu,ALGO)(__global int *Scratchpad, __global ulong *s } } -)===" +)===" \ No newline at end of file diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 6c9e3390c..45c2f1fc3 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -611,21 +611,6 @@ inline __m128 _mm_set1_ps_epi32(uint32_t x) return _mm_castsi128_ps(_mm_set1_epi32(x)); } -inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) -{ - __m128 r = _mm_cvtepi32_ps(cx); - __m128 c_old = conc_var; - r = _mm_add_ps(r, conc_var); - r = _mm_mul_ps(r, _mm_mul_ps(r, r)); - r = _mm_and_ps(_mm_set1_ps_epi32(0x807FFFFF), r); - r = _mm_or_ps(_mm_set1_ps_epi32(0x40000000), r); - conc_var = _mm_add_ps(conc_var, r); - - c_old = _mm_and_ps(_mm_set1_ps_epi32(0x807FFFFF), c_old); - c_old = _mm_or_ps(_mm_set1_ps_epi32(0x40000000), c_old); - __m128 nc = _mm_mul_ps(c_old, _mm_set1_ps(536870880.0f)); - cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc)); -} #define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) \ /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ @@ -746,11 +731,7 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) __m128i bx1; \ __m128i division_result_xmm; \ __m128 conc_var; \ - if(ALGO == cryptonight_conceal) \ - { \ - set_float_rounding_mode_nearest(); \ - conc_var = _mm_setzero_ps(); \ - } \ + \ GetOptimalSqrtType_t sqrt_result; \ uint32_t cn_r_data[9]; \ /* END cryptonight_monero_v8 variables */ \ @@ -781,8 +762,6 @@ inline void 
cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) __m128i cx; \ ptr0 = (__m128i*)&l0[idx0 & MASK]; \ cx = _mm_load_si128(ptr0); \ - if(ALGO == cryptonight_conceal) \ - cryptonight_conceal_tweak(cx, conc_var); \ if(ALGO == cryptonight_bittube2) \ { \ cx = aes_round_bittube2(cx, ax0); \ diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 43759776f..a179cdc10 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -445,16 +445,6 @@ bool minethd::self_test() ctx[0]->hash_fn("", 0, out, ctx, algo); bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; } - else if(algo == POW(cryptonight_conceal)) - { - func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); - ctx[0]->hash_fn("", 0, out, ctx, algo); - bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; - - func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); - ctx[0]->hash_fn("", 0, out, ctx, algo); - bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; - } else if(algo == POW(cryptonight_turtle)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); @@ -643,9 +633,6 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job& case cryptonight_gpu: algv = 12; break; - case cryptonight_conceal: - algv = 13; - break; case cryptonight_r: algv = 14; break; @@ -723,11 +710,6 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job& Cryptonight_hash_gpu::template hash, Cryptonight_hash_gpu::template hash, - Cryptonight_hash::template hash, - Cryptonight_hash::template hash, - Cryptonight_hash::template 
hash, - Cryptonight_hash::template hash, - Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index 262865ea0..197814cfd 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -23,10 +23,11 @@ enum xmrstak_algo_id cryptonight_monero_v8 = 11, cryptonight_superfast = 12, cryptonight_gpu = 13, - cryptonight_conceal = 14, - cryptonight_r_wow = 15, - cryptonight_r = 16, - cryptonight_v8_reversewaltz = 17, //equal to cryptonight_monero_v8 but with 3/4 iterations and reversed shuffle operation + // Remove the following line + // cryptonight_conceal = 14, + cryptonight_r_wow = 14, // Adjust this value + cryptonight_r = 15, // Adjust this value + cryptonight_v8_reversewaltz = 16, // Adjust this value cryptonight_turtle = start_derived_algo_id, cryptonight_v8_half = (start_derived_algo_id + 1), @@ -41,7 +42,7 @@ enum xmrstak_algo_id */ inline std::string get_algo_name(xmrstak_algo_id algo_id) { - static std::array base_algo_names = + static std::array base_algo_names = {{ "invalid_algo", "cryptonight", @@ -57,10 +58,11 @@ inline std::string get_algo_name(xmrstak_algo_id algo_id) "cryptonight_v8", "cryptonight_superfast", "cryptonight_gpu", - "cryptonight_conceal", + // Remove the following line + // "cryptonight_conceal", "cryptonight_r_wow", "cryptonight_r", - "cryptonight_v8_reversewaltz" // used by graft + "cryptonight_v8_reversewaltz" }}; static std::array derived_algo_names = @@ -200,21 +202,22 @@ constexpr uint32_t CN_DOUBLE_ITER = 0x100000; inline xmrstak_algo POW(xmrstak_algo_id algo_id) { - static std::array pow = {{{invalid_algo, invalid_algo}, + static std::array pow = {{{invalid_algo, invalid_algo}, // Reduce size by 1 {cryptonight, cryptonight, CN_ITER, CN_MEMORY}, {cryptonight_lite, cryptonight_lite, CN_ITER / 2, CN_MEMORY / 2}, {cryptonight_monero, cryptonight_monero, CN_ITER, CN_MEMORY}, 
{cryptonight_heavy, cryptonight_heavy, CN_ITER / 2, CN_MEMORY * 2}, {cryptonight_aeon, cryptonight_aeon, CN_ITER / 2, CN_MEMORY / 2}, - {cryptonight_ipbc, cryptonight_ipbc, CN_ITER / 2, CN_MEMORY / 2}, // equal to cryptonight_aeon with a small tweak in the miner code - {cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY}, //equal to cryptonight_monero but with one tiny change - {cryptonight_masari, cryptonight_masari, CN_ITER / 2, CN_MEMORY}, //equal to cryptonight_monero but with less iterations, used by masari - {cryptonight_haven, cryptonight_haven, CN_ITER / 2, CN_MEMORY * 2}, // equal to cryptonight_heavy with a small tweak - {cryptonight_bittube2, cryptonight_bittube2, CN_ITER / 2, CN_MEMORY * 2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks + {cryptonight_ipbc, cryptonight_ipbc, CN_ITER / 2, CN_MEMORY / 2}, + {cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY}, + {cryptonight_masari, cryptonight_masari, CN_ITER / 2, CN_MEMORY}, + {cryptonight_haven, cryptonight_haven, CN_ITER / 2, CN_MEMORY * 2}, + {cryptonight_bittube2, cryptonight_bittube2, CN_ITER / 2, CN_MEMORY * 2}, {cryptonight_monero_v8, cryptonight_monero_v8, CN_ITER, CN_MEMORY}, {cryptonight_superfast, cryptonight_superfast, CN_ITER / 4, CN_MEMORY}, {cryptonight_gpu, cryptonight_gpu, CN_GPU_ITER, CN_MEMORY, CN_GPU_MASK}, - {cryptonight_conceal, cryptonight_conceal, CN_ITER / 2, CN_MEMORY}, + // Remove the following line + // {cryptonight_conceal, cryptonight_conceal, CN_ITER / 2, CN_MEMORY}, {cryptonight_r_wow, cryptonight_r_wow, CN_ITER, CN_MEMORY}, {cryptonight_r, cryptonight_r, CN_ITER, CN_MEMORY}, {cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY}}}; @@ -232,4 +235,4 @@ inline xmrstak_algo POW(xmrstak_algo_id algo_id) return pow[algo_id]; else return derived_pow[algo_id - start_derived_algo_id]; -} +} \ No newline at end of file diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu 
b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 6c769b3e8..11200fe23 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -504,13 +504,6 @@ __launch_bounds__(XMR_STAK_THREADS * 4) uint32_t t1[2], t2[2], res; float conc_var; - if(ALGO == cryptonight_conceal) - { - if(partidx != 0) - conc_var = int_as_float(*(d_ctx_b + threads * 4 + thread * 4 + sub)); - else - conc_var = 0.0f; - } uint32_t tweak1_2[2]; if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) @@ -576,20 +569,6 @@ __launch_bounds__(XMR_STAK_THREADS * 4) { uint32_t x_0 = loadGlobal32(long_state + j); - if(ALGO == cryptonight_conceal) - { - float r = int2float((int32_t)x_0); - float c_old = conc_var; - - r += conc_var; - r = r * r * r; - r = int_as_float((float_as_int(r) & 0x807FFFFF) | 0x40000000); - conc_var += r; - - c_old = int_as_float((float_as_int(c_old) & 0x807FFFFF) | 0x40000000); - c_old *= 536870880.0f; - x_0 = (uint32_t)(((int32_t)x_0) ^ ((int32_t)c_old)); - } const uint32_t x_1 = shuffle<4>(sPtr, sub, x_0, sub + 1); const uint32_t x_2 = shuffle<4>(sPtr, sub, x_0, sub + 2); @@ -692,8 +671,6 @@ __launch_bounds__(XMR_STAK_THREADS * 4) if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) if(sub & 1) *(d_ctx_b + threads * 4 + thread) = idx0; - if(ALGO == cryptonight_conceal) - *(d_ctx_b + threads * 4 + thread * 4 + sub) = float_as_int(conc_var); } } @@ -1069,9 +1046,6 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui cryptonight_core_gpu_hash_gpu, cryptonight_core_gpu_hash_gpu, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, cryptonight_core_gpu_hash, diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index 
d5b292cb4..f79353d7e 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -317,11 +317,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) // create a double buffer for the state to exchange the mixed state to phase1 CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize)); } - else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_conceal) != neededAlgorithms.end()) - { - ctx_b_size += sizeof(uint32_t) * 4 * wsize; - } - else if((std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) || (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end())) + else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) { // bx0 (16byte), bx1 (16byte), division_result (8byte) and sqrt_result (8byte), padding (16byte) ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize; @@ -474,7 +470,7 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, CUDA_CHECK(ctx->device_id, cudaMemcpy(rescount, ctx->d_result_count, sizeof(uint32_t), cudaMemcpyDeviceToHost)); CUDA_CHECK_MSG( ctx->device_id, - "\n**suggestion: Try to increase the attribute 'bfactor' in the NVIDIA config file.**", + "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", cudaMemcpy(resnonce, ctx->d_result_nonce, 10 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); /* There is only a 32bit limit for the counter on the device side diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index c50211d1e..fc55ed6ab 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -103,6 +103,7 @@ xmrstak::coin_selection coins[] = { // name, userpool, devpool, default_pool_suggestion {"bbscoin", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr}, {"bittube", 
{POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, "mining.bit.tube:13333"}, + {"conceal", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.conceal.network:3333"}, {"cryptonight", {POW(cryptonight)}, {POW(cryptonight_gpu)}, nullptr}, {"cryptonight_bittube2", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, nullptr}, {"cryptonight_masari", {POW(cryptonight_masari)}, {POW(cryptonight_gpu)}, nullptr}, @@ -122,7 +123,6 @@ xmrstak::coin_selection coins[] = { {"cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)}, {POW(cryptonight_gpu)}, nullptr}, {"cryptonight_v7_stellite", {POW(cryptonight_stellite)}, {POW(cryptonight_gpu)}, nullptr}, {"cryptonight_gpu", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333"}, - {"cryptonight_conceal", {POW(cryptonight_conceal)}, {POW(cryptonight_gpu)}, nullptr}, {"graft", {POW(cryptonight_v8_reversewaltz), 12, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr}, {"haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr}, {"lethean", {POW(cryptonight_r)}, {POW(cryptonight_r)}, nullptr}, @@ -131,7 +131,8 @@ xmrstak::coin_selection coins[] = { {"ryo", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333"}, {"torque", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr}, {"plenteum", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr}, - {"zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr}}; + {"zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr} +}; constexpr size_t coin_algo_size = (sizeof(coins) / sizeof(coins[0])); From ce28d2b30cdf3c778507392a1bedd1414649d509 Mon Sep 17 00:00:00 2001 From: acktarius Date: Thu, 3 Oct 2024 14:16:23 -0400 Subject: [PATCH 2/6] ccx&few_tweaks --- .cursorrules | 93 +++-- .gitignore | 6 + CMakeLists.txt | 8 +- xmrstak/backend/amd/OclCryptonightR_gen.hpp | 2 + xmrstak/backend/amd/amd_gpu/gpu.hpp | 1 + 
.../amd/amd_gpu/opencl/cryptonight_gpu.cl | 370 +++++++++--------- xmrstak/backend/amd/autoAdjust.hpp | 3 +- xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp | 40 +- xmrstak/backend/miner_work.hpp | 26 +- .../nvidia/nvcc_code/cuda_cryptonight_gpu.hpp | 10 +- xmrstak/cli/cli-miner.cpp | 8 +- xmrstak/pools.tpl | 3 +- 12 files changed, 310 insertions(+), 260 deletions(-) diff --git a/.cursorrules b/.cursorrules index 7f388c799..63eb694cc 100644 --- a/.cursorrules +++ b/.cursorrules @@ -1,45 +1,68 @@ - You are an expert in C++ and OpenCL, and scalable parallel computing. +# C++/OpenCL Crypto Mining Expert -Key Principles +You are an AI expert in cryptocurrency mining, specializing in C++ and OpenCL for high-performance parallel computing. - Write clear, technical responses with precise C++ and OpenCL examples. - Prioritize readability and maintainability; follow C++ coding conventions and OpenCL best practices. - Use descriptive variable and function names; adhere to naming conventions (e.g., CamelCase for class names, snake_case for filenames and private member variables). - Structure your project in a modular way using C++ namespaces and header files, as well as OpenCL's kernel files and build scripts to promote reusability and separation of concerns. +## Core Competencies +- Optimization of SHA-256, cryptonight_gpu, and other mining algorithms +- GPGPU programming and hardware-specific optimizations +- Memory management and parallel processing techniques -C++/OpenCL +## Technical Guidelines +### C++ Best Practices +- Modern C++ (17/20) features for robust, efficient code +- RAII for resource management +- STL for data structures and algorithms +- Custom memory pools for mining-specific optimizations - Use object-oriented programming principles (inheritance, polymorphism, encapsulation) to create modular and reusable parallel computing elements. - Leverage C++'s standard template library for data structures and algorithms. 
- Utilize OpenCL's C99-based kernel language for writing parallel computation code. - Implement custom OpenCL kernels for specific computation tasks. - Use C++'s RAII (Resource Acquisition Is Initialization) pattern for automatic resource management. +### OpenCL Implementation +- Efficient kernel design for maximum hash rate +- Work group optimization for different GPU architectures +- Memory coalescing and bank conflict avoidance +- Stratum protocol integration for pool mining -Error Handling and Debugging +### CMake Build System +- Modern CMake (3.15+) practices +- Automatic OpenCL and dependency detection +- Cross-platform build configuration +- GPU architecture-specific optimizations +- Example targets: + ``` + add_executable(miner src/main.cpp) + target_link_libraries(miner PRIVATE OpenCL) + ``` - Implement error handling using try-catch blocks where appropriate. - Use C++'s assertion macros (e.g., assert) for catching logical errors during development. - Utilize OpenCL's error-handling functions (e.g., clGetError) for catching and handling OpenCL-specific errors. - Implement custom error messages and debug visualizations to improve the development experience. 
+### Performance Optimization +- Profiling and bottleneck identification +- Memory access pattern optimization +- Workload distribution across compute units +- Hardware-specific tuning (AMD, NVIDIA) -Dependencies +## Error Handling +- Graceful recovery from hardware/network issues +- Comprehensive logging for debugging +- Real-time hash rate monitoring and adjustment - C++ Compiler (e.g., GCC, Clang) - C++ Standard Library (e.g., STL) - OpenCL SDK (e.g., AMD APP SDK, NVIDIA CUDA Toolkit, Apple Metal Performance Shaders) - Third-party libraries (carefully vetted for compatibility and performance) +## Dependencies +- OpenCL SDK (vendor-specific) +- C++17/20 compliant compiler +- CMake 3.15 or higher +- Mining pool integration libraries +- Hardware monitoring capabilities -OpenCL-Specific Guidelines +## Build Configuration +- CMakeLists.txt structure for multi-platform support +- Find modules for OpenCL and other dependencies +- Conditional compilation for different mining algorithms +- Debug and Release build configurations +- Example usage: + ``` + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release .. + cmake --build . + ``` - Use OpenCL's built-in functions and kernels for common parallel computation tasks (e.g., vector addition, matrix multiplication). - Keep parallel computation logic in separate kernels; use the C++ main function for host-side initialization and execution management. - Utilize OpenCL's memory management functions (e.g., clEnqueueReadBuffer, clEnqueueWriteBuffer) for efficient data transfer between host and device. - Apply OpenCL's work-item and work-group synchronization functions (e.g., barrier) to ensure correct execution of parallel computations. - Optimize memory access patterns to reduce global memory contention and improve data locality. - Use OpenCL's profiling tools to identify performance bottlenecks and optimize kernel execution times. 
- -Code Review and Best Practices - - Review code for potential performance bottlenecks and memory access patterns. - Use OpenCL's profiling tools to identify performance bottlenecks and optimize kernel execution times. - Optimize memory access patterns to reduce global memory contention and improve data locality. +## Code Quality +- Clear, maintainable code with mining-specific comments +- Modular design for easy algorithm switching +- Rigorous error checking for stability +- Performance-critical section documentation \ No newline at end of file diff --git a/.gitignore b/.gitignore index 26d278f0b..2af6ed068 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,9 @@ cmake-build-debug/ # Thumbnails ._* + +# Visual Studio Code files +.vscode/ + +# Cursor +.cursorrules \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 795829e66..c5ce08235 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,11 @@ project(xmr-stak) -cmake_minimum_required(VERSION 3.4.0) +cmake_minimum_required(VERSION 3.15) -# enforce C++11 +# enforce C++17 +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CXX_STANDARD 11) if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}" CACHE PATH "install prefix" FORCE) @@ -308,6 +308,8 @@ else() add_definitions("-DCONF_NO_OPENCL") endif() +add_definitions(-DCL_TARGET_OPENCL_VERSION=300) + ############################################################################### # CPU backend ############################################################################### diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.hpp b/xmrstak/backend/amd/OclCryptonightR_gen.hpp index f8772b1f5..54d312b2b 100644 --- a/xmrstak/backend/amd/OclCryptonightR_gen.hpp +++ b/xmrstak/backend/amd/OclCryptonightR_gen.hpp @@ -6,6 +6,8 @@ #include #include +#define CL_TARGET_OPENCL_VERSION 300 + #if defined(__APPLE__) #include #else diff --git 
a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 1ba300c7a..f65a05fad 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -3,6 +3,7 @@ #include "xmrstak/jconf.hpp" #include "xmrstak/misc/console.hpp" +#define CL_TARGET_OPENCL_VERSION 300 #if defined(__APPLE__) #include #else diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl index 8a97c3f77..8575be905 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl @@ -1,109 +1,133 @@ R"===( -inline __global int4* scratchpad_ptr(uint idx, uint n, __global int *lpad) { - return (__global int4*)((__global char*)lpad + (idx & MASK) + n * 16); +inline global int4* scratchpad_ptr(uint idx, uint n, __global int *lpad) { return (__global int4*)((__global char*)lpad + (idx & MASK) + n * 16); } + +inline float4 fma_break(float4 x) +{ + // Break the dependency chain by setitng the exp to ?????01 + x = _mm_and_ps(x, 0xFEFFFFFF); + return _mm_or_ps(x, 0x00800000); } -#define fma_break(x) ((x & (float4)(0xFEFFFFFF)) | (float4)(0x00800000)) - -#define sub_round(n0, n1, n2, n3, rnd_c, n, d, c) \ - do { \ - n1 += *c; \ - float4 nn = n0 * *c; \ - nn = n1 * (nn * nn); \ - nn = fma_break(nn); \ - *n += nn; \ - \ - n3 -= *c; \ - float4 dd = n2 * *c; \ - dd = n3 * (dd * dd); \ - dd = fma_break(dd); \ - *d += dd; \ - \ - *c += rnd_c + (float4)(0.734375f); \ - float4 r = nn + dd; \ - r = (r & (float4)(0x807FFFFF)) | (float4)(0x40000000); \ - *c += r; \ - } while(0) - -#define round_compute(n0, n1, n2, n3, rnd_c, c, r) \ - do { \ - float4 n = (float4)(0.0f); \ - float4 d = (float4)(0.0f); \ - \ - for(int i = 0; i < 8; ++i) { \ - sub_round(n0, n1, n2, n3, rnd_c, &n, &d, c); \ - float4 tmp = n0; n0 = n1; n1 = n2; n2 = n3; n3 = tmp; \ - } \ - \ - d = (d & (float4)(0xFF7FFFFF)) | (float4)(0x40000000); \ - *r += n / d; \ - } while(0) - 
-#define single_compute(n0, n1, n2, n3, cnt, rnd_c, sum) \ - ({ \ - float4 c = (float4)(cnt); \ - float4 r = (float4)(0.0f); \ - \ - for(int i = 0; i < 4; ++i) \ - round_compute(n0, n1, n2, n3, rnd_c, &c, &r); \ - \ - r = (r & (float4)(0x807FFFFF)) | (float4)(0x40000000); \ - *sum = r; \ - convert_int4_rte(r * (float4)(536870880.0f)); \ - }) - -#define single_compute_wrap(rot, v0, v1, v2, v3, cnt, rnd_c, sum, out) \ - do { \ - float4 n0 = convert_float4_rte(v0); \ - float4 n1 = convert_float4_rte(v1); \ - float4 n2 = convert_float4_rte(v2); \ - float4 n3 = convert_float4_rte(v3); \ - \ - int4 r = single_compute(n0, n1, n2, n3, cnt, rnd_c, sum); \ - *out = rot == 0 ? r : (int4)(r.yzwx); \ - } while(0) +inline void sub_round(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* n, float4* d, float4* c) +{ + n1 = _mm_add_ps(n1, *c); + float4 nn = _mm_mul_ps(n0, *c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + *n = _mm_add_ps(*n, nn); + + n3 = _mm_sub_ps(n3, *c); + float4 dd = _mm_mul_ps(n2, *c); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = fma_break(dd); + *d = _mm_add_ps(*d, dd); + + //Constant feedback + *c = _mm_add_ps(*c, rnd_c); + *c = _mm_add_ps(*c, (float4)(0.734375f)); + float4 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + *c = _mm_add_ps(*c, r); + +} + +// 9*8 + 2 = 74 +inline void round_compute(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* c, float4* r) +{ + float4 n = (float4)(0.0f); + float4 d = (float4)(0.0f); + + sub_round(n0, n1, n2, n3, rnd_c, &n, &d, c); + sub_round(n1, n2, n3, n0, rnd_c, &n, &d, c); + sub_round(n2, n3, n0, n1, rnd_c, &n, &d, c); + sub_round(n3, n0, n1, n2, rnd_c, &n, &d, c); + sub_round(n3, n2, n1, n0, rnd_c, &n, &d, c); + sub_round(n2, n1, n0, n3, rnd_c, &n, &d, c); + sub_round(n1, n0, n3, n2, rnd_c, &n, &d, c); + sub_round(n0, n3, n2, n1, rnd_c, &n, &d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and 
accidental overflows by division by < 1.0 + d = _mm_and_ps(d, 0xFF7FFFFF); + d = _mm_or_ps(d, 0x40000000); + *r =_mm_add_ps(*r, _mm_div_ps(n,d)); +} + +inline int4 single_compute(float4 n0, float4 n1, float4 n2, float4 n3, float cnt, float4 rnd_c, __local float4* sum) +{ + float4 c= (float4)(cnt); + // 35 maths calls follow (140 FLOPS) + float4 r = (float4)(0.0f); + + for(int i = 0; i < 4; ++i) + round_compute(n0, n1, n2, n3, rnd_c, &c, &r); + + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + *sum = r; // 34 + float4 x = (float4)(536870880.0f); + r = _mm_mul_ps(r, x); // 35 + return convert_int4_rte(r); +} + +inline void single_compute_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4 v3, float cnt, float4 rnd_c, __local float4* sum, __local int4* out) +{ + float4 n0 = convert_float4_rte(v0); + float4 n1 = convert_float4_rte(v1); + float4 n2 = convert_float4_rte(v2); + float4 n3 = convert_float4_rte(v3); + + int4 r = single_compute(n0, n1, n2, n3, cnt, rnd_c, sum); + *out = rot == 0 ? 
r : _mm_alignr_epi8(r, rot); +} )===" R"===( -// Move lookup tables to constant memory -__constant uint look[16][4] = { - {0, 1, 2, 3}, - {0, 2, 3, 1}, - {0, 3, 1, 2}, - {0, 3, 2, 1}, - {1, 0, 2, 3}, - {1, 2, 3, 0}, - {1, 3, 0, 2}, - {1, 3, 2, 0}, - {2, 1, 0, 3}, - {2, 0, 3, 1}, - {2, 3, 1, 0}, - {2, 3, 0, 1}, - {3, 1, 2, 0}, - {3, 2, 0, 1}, - {3, 0, 1, 2}, - {3, 0, 2, 1} +static const __constant uint look[16][4] = { + {0, 1, 2, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + + {1, 0, 2, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + + {2, 1, 0, 3}, + {2, 0, 3, 1}, + {2, 3, 1, 0}, + {2, 3, 0, 1}, + + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 0, 1, 2}, + {3, 0, 2, 1} }; -__constant float ccnt[16] = { - 1.34375f, - 1.28125f, - 1.359375f, - 1.3671875f, - 1.4296875f, - 1.3984375f, - 1.3828125f, - 1.3046875f, - 1.4140625f, - 1.2734375f, - 1.2578125f, - 1.2890625f, - 1.3203125f, - 1.3515625f, - 1.3359375f, - 1.4609375f +static const __constant float ccnt[16] = { + 1.34375f, + 1.28125f, + 1.359375f, + 1.3671875f, + + 1.4296875f, + 1.3984375f, + 1.3828125f, + 1.3046875f, + + 1.4140625f, + 1.2734375f, + 1.2578125f, + 1.2890625f, + + 1.3203125f, + 1.3515625f, + 1.3359375f, + 1.4609375f }; struct SharedMemChunk @@ -115,98 +139,84 @@ struct SharedMemChunk __attribute__((reqd_work_group_size(WORKSIZE * 16, 1, 1))) __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, uint numThreads) { - const uint gIdx = getIdx(); + const uint gIdx = getIdx(); - #if(COMP_MODE==1) - if(gIdx/16 >= numThreads) - return; - #endif - - uint chunk = get_local_id(0) / 16; - - #if(STRIDED_INDEX==0) - __global int4* lpad = (__global int4*)((__global char*)lpad_in + MEMORY * (gIdx/16)); - #endif +#if(COMP_MODE==1) + if(gIdx/16 >= numThreads) + return; +#endif - __local struct SharedMemChunk { - int4 out[16]; - float4 va[16]; - } smem_in[WORKSIZE]; - __local struct SharedMemChunk* smem = smem_in + chunk; + uint chunk = get_local_id(0) / 16; - // New local memory cache for 
frequently accessed data - __local int4 lpad_cache[32]; +#if(STRIDED_INDEX==0) + __global int* lpad = (__global int*)((__global char*)lpad_in + MEMORY * (gIdx/16)); +#endif - uint tid = get_local_id(0) % 16; + __local struct SharedMemChunk smem_in[WORKSIZE]; + __local struct SharedMemChunk* smem = smem_in + chunk; - uint idxHash = gIdx/16; - uint s = ((__global uint*)spad)[idxHash * 50] >> 8; - float4 vs = (float4)(0); + uint tid = get_local_id(0) % 16; - const uint tidd = tid / 4; - const uint tidm = tid % 4; - const uint block = tidd * 16 + tidm; + uint idxHash = gIdx/16; + uint s = ((__global uint*)spad)[idxHash * 50] >> 8; + float4 vs = (float4)(0); - // Preload frequently accessed data into local memory - if (tid < 32) { - lpad_cache[tid] = lpad[tid]; - } - barrier(CLK_LOCAL_MEM_FENCE); + // tid divided + const uint tidd = tid / 4; + // tid modulo + const uint tidm = tid % 4; + const uint block = tidd * 16 + tidm; - #pragma unroll CN_UNROLL - for(uint i = 0; i < ITERATIONS; i++) - { - barrier(CLK_LOCAL_MEM_FENCE); - - smem->out[tid] = scratchpad_ptr(s, tidd, (__global int*)lpad_cache)[tidm]; - - barrier(CLK_LOCAL_MEM_FENCE); - - single_compute_wrap( - tidm, - smem->out[look[tid][0]], - smem->out[look[tid][1]], - smem->out[look[tid][2]], - smem->out[look[tid][3]], - ccnt[tid], vs, &smem->va[tid], - &smem->out[tid] - ); - - barrier(CLK_LOCAL_MEM_FENCE); - - // Combine outXor calculation and update of lpad_cache - int4 outXor = smem->out[block]; - #pragma unroll - for(uint dd = block + 4; dd < (tidd + 1) * 16; dd += 4) - outXor ^= smem->out[dd]; - lpad_cache[s % 32] = outXor ^ smem->out[tid]; - smem->out[tid] = outXor; - - // Combine va_tmp calculation and assignment - smem->va[tid] = (smem->va[block] + smem->va[block + 4]) + (smem->va[block + 8] + smem->va[block + 12]); - - barrier(CLK_LOCAL_MEM_FENCE); - - // Combine out2 and va_tmp2 calculations - int4 out2 = (smem->out[tid] ^ smem->out[tid + 4]) ^ (smem->out[tid + 8] ^ smem->out[tid + 12]); - float4 va_tmp2 = 
fabs((smem->va[block] + smem->va[block + 4]) + (smem->va[block + 8] + smem->va[block + 12])); - - // Combine xx calculation and conversion - int4 xx_int = convert_int4_rte(va_tmp2 * (float4)(16777216.0f)); - smem->out[tid] = out2 ^ xx_int; - smem->va[tid] = va_tmp2 * (float4)(0.015625f); // Vectorized division by 64.0f - - barrier(CLK_LOCAL_MEM_FENCE); - - // Combine vs and s updates - vs = smem->va[0]; - s = smem->out[0].x ^ smem->out[0].y ^ smem->out[0].z ^ smem->out[0].w; - } - - // Write back to global memory - if (tid < 32) { - lpad[tid] = lpad_cache[tid]; - } + #pragma unroll CN_UNROLL + for(size_t i = 0; i < ITERATIONS; i++) + { + mem_fence(CLK_LOCAL_MEM_FENCE); + int tmp = ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm]; + ((__local int*)(smem->out))[tid] = tmp; + mem_fence(CLK_LOCAL_MEM_FENCE); + + { + single_compute_wrap( + tidm, + *(smem->out + look[tid][0]), + *(smem->out + look[tid][1]), + *(smem->out + look[tid][2]), + *(smem->out + look[tid][3]), + ccnt[tid], vs, smem->va + tid, + smem->out + tid + ); + } + mem_fence(CLK_LOCAL_MEM_FENCE); + + int outXor = ((__local int*)smem->out)[block]; + for(uint dd = block + 4; dd < (tidd + 1) * 16; dd += 4) + outXor ^= ((__local int*)smem->out)[dd]; + + ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm] = outXor ^ tmp; + ((__local int*)smem->out)[tid] = outXor; + + float va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4]; + float va_tmp2 = ((__local float*)smem->va)[block+ 8] + ((__local float*)smem->va)[block + 12]; + ((__local float*)smem->va)[tid] = va_tmp1 + va_tmp2; + + mem_fence(CLK_LOCAL_MEM_FENCE); + + int out2 = ((__local int*)smem->out)[tid] ^ ((__local int*)smem->out)[tid + 4 ] ^ ((__local int*)smem->out)[tid + 8] ^ ((__local int*)smem->out)[tid + 12]; + va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4]; + va_tmp2 = ((__local float*)smem->va)[block + 8] + ((__local float*)smem->va)[block + 12]; + va_tmp1 = va_tmp1 + va_tmp2; + 
va_tmp1 = fabs(va_tmp1); + + float xx = va_tmp1 * 16777216.0f; + int xx_int = (int)xx; + ((__local int*)smem->out)[tid] = out2 ^ xx_int; + ((__local float*)smem->va)[tid] = va_tmp1 / 64.0f; + + mem_fence(CLK_LOCAL_MEM_FENCE); + + vs = smem->va[0]; + s = smem->out[0].x ^ smem->out[0].y ^ smem->out[0].z ^ smem->out[0].w; + } } )===" diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 075acbd49..e237eebf1 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -1,4 +1,3 @@ - #pragma once #include "amd_gpu/gpu.hpp" @@ -18,6 +17,8 @@ #include #include +#define CL_TARGET_OPENCL_VERSION 300 + #if defined(__APPLE__) #include #else diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp index d65d9651e..62c2ba527 100644 --- a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp +++ b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp @@ -62,7 +62,7 @@ inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd // 112×4 = 448 template -inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) +inline __m128i single_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) { __m128 c = _mm_set1_ps(cnt); __m128 r = _mm_setzero_ps(); @@ -86,9 +86,9 @@ inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float } template -inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) +inline void single_compute_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) { - __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + __m128i r = single_compute(n0, n1, n2, n3, cnt, rnd_c, sum); if(rot != 0) r = _mm_or_si128(_mm_slli_si128(r, 16 - rot), _mm_srli_si128(r, rot)); out = _mm_xor_si128(out, r); @@ -122,37 +122,37 @@ void 
cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& __m128i out, out2; out = _mm_setzero_si128(); - single_comupte_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out); - single_comupte_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out); - single_comupte_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out); - single_comupte_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out); + single_compute_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out); + single_compute_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out); + single_compute_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out); + single_compute_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out); sum0 = _mm_add_ps(suma, sumb); _mm_store_si128(idx0, _mm_xor_si128(v0, out)); out2 = out; out = _mm_setzero_si128(); - single_comupte_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out); - single_comupte_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out); - single_comupte_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out); - single_comupte_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out); + single_compute_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out); + single_compute_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out); + single_compute_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out); + single_compute_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out); sum1 = _mm_add_ps(suma, sumb); _mm_store_si128(idx1, _mm_xor_si128(v1, out)); out2 = _mm_xor_si128(out2, out); out = _mm_setzero_si128(); - single_comupte_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out); - single_comupte_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out); - single_comupte_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out); - single_comupte_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out); + single_compute_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out); + single_compute_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out); + single_compute_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out); + single_compute_wrap<3>(n2, n3, n0, 
n1, 1.2890625f, rc, sumb, out); sum2 = _mm_add_ps(suma, sumb); _mm_store_si128(idx2, _mm_xor_si128(v2, out)); out2 = _mm_xor_si128(out2, out); out = _mm_setzero_si128(); - single_comupte_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out); - single_comupte_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out); - single_comupte_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out); - single_comupte_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out); + single_compute_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out); + single_compute_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out); + single_compute_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out); + single_compute_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out); sum3 = _mm_add_ps(suma, sumb); _mm_store_si128(idx3, _mm_xor_si128(v3, out)); out2 = _mm_xor_si128(out2, out); @@ -178,4 +178,4 @@ void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& idx2 = scratchpad_ptr(lpad, n, 2, mask); idx3 = scratchpad_ptr(lpad, n, 3, mask); } -} +} \ No newline at end of file diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp index 114f2db8e..d267cc902 100644 --- a/xmrstak/backend/miner_work.hpp +++ b/xmrstak/backend/miner_work.hpp @@ -2,6 +2,7 @@ #include "xmrstak/backend/pool_data.hpp" +#include #include #include #include @@ -32,31 +33,30 @@ struct miner_work ref_ptr((uint8_t*)&iBlockHeight) {} miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, - uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) : - iWorkSize(iWorkSize), + uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeight) : + iWorkSize(std::min(iWorkSize, static_cast(sizeof(bWorkBlob)))), iTarget(iTarget), bNiceHash(bNiceHash), - bStall(false), iPoolId(iPoolId), - iBlockHeight(iBlockHeiht), - ref_ptr((uint8_t*)&iBlockHeight) + iBlockHeight(iBlockHeight), + bStall(false) { - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(this->bWorkBlob, bWork, iWorkSize); - 
memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); + memcpy(this->bWorkBlob, bWork, this->iWorkSize); + strncpy(this->sJobID, sJobID, sizeof(this->sJobID) - 1); + this->sJobID[sizeof(this->sJobID) - 1] = '\0'; } miner_work(miner_work&& from) : - iWorkSize(from.iWorkSize), + iWorkSize(std::min(from.iWorkSize, static_cast(sizeof(bWorkBlob)))), iTarget(from.iTarget), bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), ref_ptr((uint8_t*)&iBlockHeight) { - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); - memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); + memcpy(bWorkBlob, from.bWorkBlob, this->iWorkSize); + strncpy(this->sJobID, from.sJobID, sizeof(this->sJobID) - 1); + this->sJobID[sizeof(this->sJobID) - 1] = '\0'; } miner_work(miner_work const&) = delete; @@ -108,4 +108,4 @@ struct miner_work return bWorkBlob[0]; } }; -} // namespace xmrstak +} // namespace xmrstak \ No newline at end of file diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp index 516d4ca00..188a5fd6c 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp @@ -366,7 +366,7 @@ __forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, _ } // 74*8 = 595 -__forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) +__forceinline__ __device__ __m128i single_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) { __m128 c(cnt); // 35 maths calls follow (140 FLOPS) @@ -381,14 +381,14 @@ __forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n return r.get_int(); } -__forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, 
__m128 rnd_c, __m128& sum, __m128i& out) +__forceinline__ __device__ void single_compute_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) { __m128 n0(v0); __m128 n1(v1); __m128 n2(v2); __m128 n3(v3); - __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + __m128i r = single_compute(n0, n1, n2, n3, cnt, rnd_c, sum); out = rot == 0 ? r : _mm_alignr_epi8(r, rot); } @@ -496,7 +496,7 @@ __global__ void cryptonight_core_gpu_phase2_gpu( sync(); __m128 rc = vs; - single_comupte_wrap( + single_compute_wrap( tidm, *(smem->out + look[tid][0]), *(smem->out + look[tid][1]), @@ -595,4 +595,4 @@ __global__ void cn_explode_gpu(const size_t MEMORY, int32_t* spad_in, int* lpad_ } } // namespace nvidia -} // namespace xmrstak +} // namespace xmrstak \ No newline at end of file diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp index 607e863e1..890f6a9fa 100644 --- a/xmrstak/cli/cli-miner.cpp +++ b/xmrstak/cli/cli-miner.cpp @@ -42,6 +42,7 @@ #include #include #include +#include #ifndef CONF_NO_TLS #include @@ -243,7 +244,7 @@ void do_guided_pool_config() prompt_once(prompted); userSetPool = false; - std::cout << "- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl; + std::cout << "- Pool address: e.g. 
" << jconf::GetDefaultPool(currency.c_str()) << std::endl; std::cin >> pool; } @@ -403,7 +404,10 @@ int main(int argc, char* argv[]) #ifndef CONF_NO_TLS SSL_library_init(); SSL_load_error_strings(); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" ERR_load_BIO_strings(); +#pragma GCC diagnostic pop ERR_load_crypto_strings(); SSL_load_error_strings(); OpenSSL_add_all_digests(); @@ -979,4 +983,4 @@ int do_benchmark(int block_version, int wait_sec, int work_sec) printer::inst()->print_msg(L0, "Benchmark Total: %.1f H/S", fTotalHps); return 0; -} +} \ No newline at end of file diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index eb57a3f04..ab0920a44 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -21,6 +21,7 @@ POOLCONF], * * bbscoin (automatic switch with block version 3 to cryptonight_v7) * bittube (uses cryptonight_bittube2 algorithm) + * conceal (uses cryptonight_gpu algorithm) * graft * haven (automatic switch with block version 3 to cryptonight_haven) * lethean @@ -42,7 +43,7 @@ POOLCONF], * cryptonight_lite_v7_xor (algorithm used by ipbc) * # 2MiB scratchpad memory * cryptonight - * cryptonight_gpu (for Ryo's 14th of Feb fork) + * cryptonight_gpu (for Ryo's 14th of Feb fork and Conceal) * cryptonight_superfast * cryptonight_v7 * cryptonight_v8 From 3d0203b0668abfe54fbae5af64feeea83245cd42 Mon Sep 17 00:00:00 2001 From: acktarius Date: Wed, 9 Oct 2024 23:07:47 -0400 Subject: [PATCH 3/6] build_day2day --- .cursorrules | 186 ++++++++++++++++++++++++++++++++++++++---- CMakeLists.txt | 53 +++++++++++- xmrstak/setup_gui.cpp | 52 ++++++++++++ xmrstak/setup_gui.hpp | 23 ++++++ 4 files changed, 296 insertions(+), 18 deletions(-) create mode 100644 xmrstak/setup_gui.cpp create mode 100644 xmrstak/setup_gui.hpp diff --git a/.cursorrules b/.cursorrules index 63eb694cc..567eb797a 100644 --- a/.cursorrules +++ b/.cursorrules @@ -1,6 +1,6 @@ -# C++/OpenCL Crypto Mining Expert +You are an AI expert in the field of 
cryptocurrency mining. You are tasked with developing a high-performance mining application using OpenCL and C++. You always follow best practices for algorithm optimization, error handling, and performance tuning for various hardware platforms. You are also tasked with developing a GUI for the mining application using wxWidgets. You always follow best practices for GUI development, error handling, and performance tuning for various hardware platforms. -You are an AI expert in cryptocurrency mining, specializing in C++ and OpenCL for high-performance parallel computing. +# C++/OpenCL Crypto Mining Expert Guide ## Core Competencies - Optimization of SHA-256, cryptonight_gpu, and other mining algorithms @@ -8,6 +8,7 @@ You are an AI expert in cryptocurrency mining, specializing in C++ and OpenCL fo - Memory management and parallel processing techniques ## Technical Guidelines + ### C++ Best Practices - Modern C++ (17/20) features for robust, efficient code - RAII for resource management @@ -25,11 +26,12 @@ You are an AI expert in cryptocurrency mining, specializing in C++ and OpenCL fo - Automatic OpenCL and dependency detection - Cross-platform build configuration - GPU architecture-specific optimizations -- Example targets: - ``` - add_executable(miner src/main.cpp) - target_link_libraries(miner PRIVATE OpenCL) - ``` + +Example targets: +```cmake +add_executable(miner src/main.cpp) +target_link_libraries(miner PRIVATE OpenCL) +``` ### Performance Optimization - Profiling and bottleneck identification @@ -48,21 +50,177 @@ You are an AI expert in cryptocurrency mining, specializing in C++ and OpenCL fo - CMake 3.15 or higher - Mining pool integration libraries - Hardware monitoring capabilities +- wxWidgets 3.2 or higher (for GUI) ## Build Configuration - CMakeLists.txt structure for multi-platform support - Find modules for OpenCL and other dependencies - Conditional compilation for different mining algorithms - Debug and Release build configurations -- Example 
usage: - ``` - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release .. - cmake --build . - ``` + +Example usage: +```bash +mkdir build && cd build +cmake -DCMAKE_BUILD_TYPE=Release .. +cmake --build . +``` ## Code Quality - Clear, maintainable code with mining-specific comments - Modular design for easy algorithm switching - Rigorous error checking for stability -- Performance-critical section documentation \ No newline at end of file +- Performance-critical section documentation + +# Implementation Details + +## OpenCL Function Reference + +### Kernel Management +- `clCreateKernel`: Create mining algorithm kernels + - [Documentation](https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clCreateKernel.html) +```cpp +cl_kernel kernel = clCreateKernel(program, "sha256_kernel", &err); +``` + +### Device Query and Optimization +- `clGetDeviceInfo`: Query optimal work group sizes + - [Documentation](https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clGetDeviceInfo.html) +```cpp +size_t maxWorkGroupSize; +clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(size_t), &maxWorkGroupSize, NULL); +``` + +### Memory Management +- `clCreateBuffer`: Efficient buffer creation + - [Documentation](https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clCreateBuffer.html) +```cpp +cl_mem inputBuffer = clCreateBuffer(context, + CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, + bufferSize, hostPtr, &err); +``` + +### Performance Profiling +- `clGetEventProfilingInfo`: Kernel execution profiling + - [Documentation](https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clGetEventProfilingInfo.html) +```cpp +cl_ulong startTime, endTime; +clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, + sizeof(cl_ulong), &startTime, NULL); +clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), &endTime, NULL); +``` + +## Optimized OpenCL Kernel Example +```cpp +__kernel void sha256_kernel(__global const uint* input, + __global uint* output, 
+ __local uint* shared_data) +{ + size_t gid = get_global_id(0); + size_t lid = get_local_id(0); + + // Collaborative loading of data into local memory + if (lid < DATA_PARALLEL_FACTOR) { + shared_data[lid] = input[gid / DATA_PARALLEL_FACTOR + lid]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Mining-specific computation here +} +``` + +## wxWidgets Integration + +### GUI Implementation +- Event-driven architecture for mining control +- Real-time hash rate and hardware monitoring displays +- Configuration interface for mining parameters + +### CMake Integration +```cmake +find_package(wxWidgets REQUIRED COMPONENTS core base) +include(${wxWidgets_USE_FILE}) + +add_executable(miner-gui src/main.cpp src/gui.cpp) +target_link_libraries(miner-gui PRIVATE + OpenCL::OpenCL + ${wxWidgets_LIBRARIES} +) +``` + +### GUI Best Practices +- Use `wxThread` for non-blocking mining operations +- Implement `wxTimer` for GUI updates (hash rate, temperature) +- Utilize `wxGrid` for displaying mining statistics +- Message queue for thread-safe GUI updates + +### Example GUI Component +```cpp +class MinerFrame : public wxFrame { +public: + MinerFrame() : wxFrame(nullptr, wxID_ANY, "Crypto Miner") { + // Layout + auto mainSizer = new wxBoxSizer(wxVERTICAL); + + // Controls + auto startButton = new wxButton(this, wxID_ANY, "Start Mining"); + hashRateText = new wxStaticText(this, wxID_ANY, "Hash Rate: 0 H/s"); + + // Events + startButton->Bind(wxEVT_BUTTON, &MinerFrame::OnStartMining, this); + + // Update timer + wxTimer* timer = new wxTimer(this); + Bind(wxEVT_TIMER, &MinerFrame::OnUpdateStats, this); + timer->Start(1000); // Update every second + } + +private: + wxStaticText* hashRateText; + + void OnStartMining(wxCommandEvent& evt) { + // Start mining in separate thread + auto thread = new MiningThread(this); + thread->Run(); + } + + void OnUpdateStats(wxTimerEvent& evt) { + // Update GUI with current mining stats + hashRateText->SetLabel(wxString::Format( + "Hash Rate: %.2f MH/s", 
getCurrentHashRate())); + } +}; +``` + +## Comprehensive Error Handling +```cpp +class MiningException : public wxException { +public: + MiningException(const std::string& message) + : m_message(message) {} + + virtual const wxChar* what() const wxTHROW_OVERRIDE { + return m_message.wc_str(); + } +private: + wxString m_message; +}; + +// Usage in OpenCL code +try { + cl_int error = clFunction(...); + checkError(error, "OpenCL operation"); +} catch (const std::runtime_error& e) { + throw MiningException(wxString::Format( + "Mining error: %s", e.what())); +} +``` + +## Performance Best Practices +1. Use `clEnqueueWriteBufferRect` for optimized 2D/3D data transfers +2. Implement `clEnqueueMapBuffer` for zero-copy buffer access where possible +3. Utilize `clEnqueueNDRangeKernel` events for operation pipelining +4. Minimize GUI updates to reduce overhead +5. Use event batching for high-frequency updates +6. Implement separate thread for OpenCL operations \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index c5ce08235..18fc1ada5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -624,15 +624,26 @@ file(GLOB SRCFILES_CPP "xmrstak/cli/*.cpp") set_source_files_properties(${SRCFILES_CPP} PROPERTIES LANGUAGE CXX) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - add_executable(xmr-stak ${SRCFILES_CPP} xmrstak/cli/xmr-stak.manifest) + add_executable(xmr-stak_initial + ${SRCFILES_CPP} + xmrstak/cli/xmr-stak.manifest + ) else() - add_executable(xmr-stak ${SRCFILES_CPP}) + add_executable(xmr-stak_initial + ${SRCFILES_CPP} + ) endif() set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}") set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}") -target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm) +target_link_libraries(xmr-stak_initial + PRIVATE + xmr-stak-backend + ${LIBS} + ${CMAKE_DL_LIBS} + ${HWLOC_LIBRARIES} +) 
################################################################################ # Install @@ -640,7 +651,7 @@ target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm) # do not install the binary if the project and install are equal if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR ) - install(TARGETS xmr-stak + install(TARGETS xmr-stak_initial RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/${EXECUTABLE_OUTPUT_PATH}") if(CUDA_FOUND) if(WIN32) @@ -664,3 +675,37 @@ else() # this rule is used if the install prefix is the build directory install(CODE "MESSAGE(\"xmr-stak installed to folder 'bin'\")") endif() +# Find wxWidgets +find_package(wxWidgets 3.2.4 REQUIRED COMPONENTS core base) +if(wxWidgets_FOUND) + include(${wxWidgets_USE_FILE}) + find_package(Threads REQUIRED) + add_executable(xmr-stak_day2day + xmrstak/setup_gui.cpp + ) + target_link_options(xmr-stak_day2day PRIVATE + -L/usr/local/lib + ) + target_link_libraries(xmr-stak_day2day + wx_baseu-3.2 + wx_gtk3u_core-3.2 + pthread + #${wxWidgets_LIBRARIES} + ) + # Add wxWidgets compile definitions and options + target_compile_options(xmr-stak_day2day PRIVATE ${wxWidgets_CXX_FLAGS}) + # Print include directories for debugging +get_target_property(INCLUDE_DIRS xmr-stak_day2day INCLUDE_DIRECTORIES) +message("Include directories: ${INCLUDE_DIRS}") +else() + message(WARNING "wxWidgets not found. 
xmr-stak_day2day will not be built.") +endif() +# Add wxWidgets include directories to your target +target_include_directories(xmr-stak_day2day PRIVATE ${wxWidgets_INCLUDE_DIRS}) + + +message(STATUS "wxWidgets_FOUND: ${wxWidgets_FOUND}") +message(STATUS "wxWidgets_INCLUDE_DIRS: ${wxWidgets_INCLUDE_DIRS}") +message(STATUS "wxWidgets_LIBRARIES: ${wxWidgets_LIBRARIES}") + +set(CMAKE_VERBOSE_MAKEFILE ON) \ No newline at end of file diff --git a/xmrstak/setup_gui.cpp b/xmrstak/setup_gui.cpp new file mode 100644 index 000000000..59b8c0e91 --- /dev/null +++ b/xmrstak/setup_gui.cpp @@ -0,0 +1,52 @@ +#include "setup_gui.hpp" +#include +#include +#include +#include +#include +#include +#include +#include + +enum { + ID_START_BUTTON = wxID_HIGHEST + 1 +}; +// Implement the member functions of MiningConfigFrame +MiningConfigFrame::MiningConfigFrame(wxWindow* parent, wxWindowID id, const wxString& title, + const wxPoint& pos, const wxSize& size) + : wxFrame(parent, id, title, pos, size) +{ + // Constructor implementation + // ... +} + +void MiningConfigFrame::OnStart(wxCommandEvent& event) +{ + // Your OnStart implementation here +} + +// Implement the member functions of MyApp +bool GUIApp::OnInit() + { + if (!wxApp::OnInit()) + return false; + + // Check for pool.txt file + wxString poolFilePath = wxFileName::GetCwd() + wxFileName::GetPathSeparator() + "pools.txt"; + + if (!wxFileExists(poolFilePath)) + { + wxMessageBox("The pools.txt file is missing in the /bin folder. 
" + "Please create this file running xmr-stak_initial setup before starting the miner this way.", + "Configuration Missing", + wxOK | wxICON_ERROR); + return false; + } + + MiningConfigFrame* frame = new MiningConfigFrame(nullptr, wxID_ANY, "XMR-Stak Day2Day Mining", wxDefaultPosition, wxSize(400, 300)); + frame->Show(true); + return true; + } + + +wxIMPLEMENT_APP(GUIApp); diff --git a/xmrstak/setup_gui.hpp b/xmrstak/setup_gui.hpp new file mode 100644 index 000000000..6ed307bc9 --- /dev/null +++ b/xmrstak/setup_gui.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include + +class MiningConfigFrame : public wxFrame +{ +public: + MiningConfigFrame(wxWindow* parent, wxWindowID id, const wxString& title, + const wxPoint& pos = wxDefaultPosition, + const wxSize& size = wxDefaultSize); + +private: + + void OnStart(wxCommandEvent& event); + + // Other private members and methods +}; + +class GUIApp : public wxApp +{ +public: + virtual bool OnInit() override; +}; From 1133cbfe08b5e0300acf17e3e93691f738ddaccd Mon Sep 17 00:00:00 2001 From: acktarius Date: Thu, 10 Oct 2024 00:17:14 -0400 Subject: [PATCH 4/6] readMiningConfig() --- CMakeLists.txt | 2 - xmrstak/setup_gui.cpp | 86 ++++++++++++++++++++++++++++++++++--------- xmrstak/setup_gui.hpp | 28 +++++++++++++- 3 files changed, 94 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 18fc1ada5..71443723a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -707,5 +707,3 @@ target_include_directories(xmr-stak_day2day PRIVATE ${wxWidgets_INCLUDE_DIRS}) message(STATUS "wxWidgets_FOUND: ${wxWidgets_FOUND}") message(STATUS "wxWidgets_INCLUDE_DIRS: ${wxWidgets_INCLUDE_DIRS}") message(STATUS "wxWidgets_LIBRARIES: ${wxWidgets_LIBRARIES}") - -set(CMAKE_VERBOSE_MAKEFILE ON) \ No newline at end of file diff --git a/xmrstak/setup_gui.cpp b/xmrstak/setup_gui.cpp index 59b8c0e91..de4ed3a5d 100644 --- a/xmrstak/setup_gui.cpp +++ b/xmrstak/setup_gui.cpp @@ -7,10 +7,59 @@ #include #include #include +#include +#include enum 
{ ID_START_BUTTON = wxID_HIGHEST + 1 }; +//readMiningConfig +std::tuple readMiningConfig() +{ + wxString poolsFilePath = wxFileName::GetCwd() + wxFileName::GetPathSeparator() + "pools.txt"; + bool poolsFileExists = wxFileExists(poolsFilePath); + + if (!poolsFileExists) + { + return std::make_tuple(false, std::string(), std::string()); + } + + wxTextFile file(poolsFilePath); + if (!file.Open()) + { + return std::make_tuple(false, std::string(), std::string()); + } + + wxString poolAddress, walletAddress; + for (wxString line = file.GetFirstLine(); !file.Eof(); line = file.GetNextLine()) + { + if (line.StartsWith("\"pool_address\" : ")) + { + poolAddress = line.AfterFirst(':').Trim(true).Trim(false); + poolAddress.Remove(0, 1).RemoveLast(); + } + else if (line.StartsWith("\"wallet_address\" : ")) + { + walletAddress = line.AfterFirst(':').Trim(true).Trim(false); + walletAddress.Remove(0, 1).RemoveLast(); + } + + if (!poolAddress.IsEmpty() && !walletAddress.IsEmpty()) + { + break; + } + } + + file.Close(); + + if (poolAddress.IsEmpty() || walletAddress.IsEmpty()) + { + return std::make_tuple(true, std::string(), std::string()); //TODO: change to false when pools.txt is fixed + } + + return std::make_tuple(true, poolAddress.ToStdString(), walletAddress.ToStdString()); +} + // Implement the member functions of MiningConfigFrame MiningConfigFrame::MiningConfigFrame(wxWindow* parent, wxWindowID id, const wxString& title, const wxPoint& pos, const wxSize& size) @@ -25,28 +74,29 @@ void MiningConfigFrame::OnStart(wxCommandEvent& event) // Your OnStart implementation here } + // Implement the member functions of MyApp bool GUIApp::OnInit() - { - if (!wxApp::OnInit()) - return false; - - // Check for pool.txt file - wxString poolFilePath = wxFileName::GetCwd() + wxFileName::GetPathSeparator() + "pools.txt"; - - if (!wxFileExists(poolFilePath)) - { - wxMessageBox("The pools.txt file is missing in the /bin folder. 
" - "Please create this file running xmr-stak_initial setup before starting the miner this way.", - "Configuration Missing", - wxOK | wxICON_ERROR); - return false; - } +{ + if (!wxApp::OnInit()) + return false; - MiningConfigFrame* frame = new MiningConfigFrame(nullptr, wxID_ANY, "XMR-Stak Day2Day Mining", wxDefaultPosition, wxSize(400, 300)); - frame->Show(true); - return true; + // Create an instance of ReadPoolConfig + auto [fileReachable, pool, wallet] = readMiningConfig(); + + if (fileReachable == false) + { + wxMessageBox("The pools.txt file is missing or invalid in the /bin folder. " + "Please create this file by running xmr-stak_initial setup before starting the miner this way.", + "Configuration Missing", + wxOK | wxICON_ERROR); + return false; } + MiningConfigFrame* frame = new MiningConfigFrame(nullptr, wxID_ANY, "XMR-Stak Day2Day Mining", wxDefaultPosition, wxSize(400, 300)); + frame->SetPoolInfo(pool, wallet); // Assuming you have this method in MiningConfigFrame + frame->Show(true); + return true; +} wxIMPLEMENT_APP(GUIApp); diff --git a/xmrstak/setup_gui.hpp b/xmrstak/setup_gui.hpp index 6ed307bc9..a788a23f1 100644 --- a/xmrstak/setup_gui.hpp +++ b/xmrstak/setup_gui.hpp @@ -1,6 +1,8 @@ #pragma once #include +#include +#include class MiningConfigFrame : public wxFrame { @@ -9,15 +11,37 @@ class MiningConfigFrame : public wxFrame const wxPoint& pos = wxDefaultPosition, const wxSize& size = wxDefaultSize); -private: + void SetPoolInfo(const std::string& pool, const std::string& wallet) { + m_pool = pool; + m_wallet = wallet; + // Update GUI elements if necessary + } +private: void OnStart(wxCommandEvent& event); - + std::string m_pool; + std::string m_wallet; // Other private members and methods }; +class PoolConfig +{ +public: + PoolConfig(bool fileReachable, const std::string& poolAddress, const std::string& walletAddress); + + std::tuple read(); + +private: + bool m_fileReachable; + std::string m_poolAddress; + std::string m_walletAddress; +}; + 
class GUIApp : public wxApp { public: virtual bool OnInit() override; + +private: + // You might want to add any private members here if needed }; From 49b115e7c3f58d0c17ca60f7fc4ce8085d4b2b5c Mon Sep 17 00:00:00 2001 From: acktarius Date: Thu, 10 Oct 2024 10:49:48 -0400 Subject: [PATCH 5/6] day2day --- CMakeLists.txt | 24 ++++++++------ xmrstak/{ => backend/day2day}/setup_gui.cpp | 36 ++++++++------------- xmrstak/{ => backend/day2day}/setup_gui.hpp | 0 3 files changed, 29 insertions(+), 31 deletions(-) rename xmrstak/{ => backend/day2day}/setup_gui.cpp (77%) rename xmrstak/{ => backend/day2day}/setup_gui.hpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 71443723a..b901ff170 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -624,12 +624,12 @@ file(GLOB SRCFILES_CPP "xmrstak/cli/*.cpp") set_source_files_properties(${SRCFILES_CPP} PROPERTIES LANGUAGE CXX) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - add_executable(xmr-stak_initial + add_executable(xmr-stak ${SRCFILES_CPP} xmrstak/cli/xmr-stak.manifest ) else() - add_executable(xmr-stak_initial + add_executable(xmr-stak ${SRCFILES_CPP} ) endif() @@ -637,7 +637,7 @@ endif() set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}") set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}") -target_link_libraries(xmr-stak_initial +target_link_libraries(xmr-stak PRIVATE xmr-stak-backend ${LIBS} @@ -651,7 +651,7 @@ target_link_libraries(xmr-stak_initial # do not install the binary if the project and install are equal if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR ) - install(TARGETS xmr-stak_initial + install(TARGETS xmr-stak RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/${EXECUTABLE_OUTPUT_PATH}") if(CUDA_FOUND) if(WIN32) @@ -677,11 +677,14 @@ else() endif() # Find wxWidgets find_package(wxWidgets 3.2.4 REQUIRED COMPONENTS core base) -if(wxWidgets_FOUND) +if(!wxWidgets_FOUND) + message(WARNING 
"wxWidgets not found. xmr-stak_day2day will not be built.") +else() include(${wxWidgets_USE_FILE}) find_package(Threads REQUIRED) add_executable(xmr-stak_day2day - xmrstak/setup_gui.cpp + xmrstak/backend/day2day/setup_gui.cpp + xmrstak/misc/environment.cpp ) target_link_options(xmr-stak_day2day PRIVATE -L/usr/local/lib @@ -690,18 +693,21 @@ if(wxWidgets_FOUND) wx_baseu-3.2 wx_gtk3u_core-3.2 pthread + xmr-stak-backend #${wxWidgets_LIBRARIES} ) # Add wxWidgets compile definitions and options target_compile_options(xmr-stak_day2day PRIVATE ${wxWidgets_CXX_FLAGS}) # Print include directories for debugging get_target_property(INCLUDE_DIRS xmr-stak_day2day INCLUDE_DIRECTORIES) +file(GLOB MISC_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/xmrstak/misc/*.cpp") +target_sources(xmr-stak_day2day PRIVATE ${MISC_SOURCES}) message("Include directories: ${INCLUDE_DIRS}") -else() - message(WARNING "wxWidgets not found. xmr-stak_day2day will not be built.") +target_include_directories(xmr-stak_day2day PRIVATE ${wxWidgets_INCLUDE_DIRS}) + endif() # Add wxWidgets include directories to your target -target_include_directories(xmr-stak_day2day PRIVATE ${wxWidgets_INCLUDE_DIRS}) + message(STATUS "wxWidgets_FOUND: ${wxWidgets_FOUND}") diff --git a/xmrstak/setup_gui.cpp b/xmrstak/backend/day2day/setup_gui.cpp similarity index 77% rename from xmrstak/setup_gui.cpp rename to xmrstak/backend/day2day/setup_gui.cpp index de4ed3a5d..a3aa086d8 100644 --- a/xmrstak/setup_gui.cpp +++ b/xmrstak/backend/day2day/setup_gui.cpp @@ -30,27 +30,13 @@ std::tuple readMiningConfig() return std::make_tuple(false, std::string(), std::string()); } - wxString poolAddress, walletAddress; - for (wxString line = file.GetFirstLine(); !file.Eof(); line = file.GetNextLine()) - { - if (line.StartsWith("\"pool_address\" : ")) - { - poolAddress = line.AfterFirst(':').Trim(true).Trim(false); - poolAddress.Remove(0, 1).RemoveLast(); - } - else if (line.StartsWith("\"wallet_address\" : ")) - { - walletAddress = 
line.AfterFirst(':').Trim(true).Trim(false); - walletAddress.Remove(0, 1).RemoveLast(); - } + file.Close(); - if (!poolAddress.IsEmpty() && !walletAddress.IsEmpty()) - { - break; - } - } + wxString poolAddress, walletAddress; + poolAddress = "toto"; + walletAddress = "ccx7"; - file.Close(); + + if (poolAddress.IsEmpty() || walletAddress.IsEmpty()) { @@ -86,13 +72,19 @@ bool GUIApp::OnInit() if (fileReachable == false) { - wxMessageBox("The pools.txt file is missing or invalid in the /bin folder. " - "Please create this file by running xmr-stak_initial setup before starting the miner this way.", + wxMessageBox("The pools.txt file is missing or invalid in the /bin folder.\n" + "Please create this file by running xmr-stak for initial setup,\n" + "before starting the miner this way.", "Configuration Missing", wxOK | wxICON_ERROR); return false; } - + { + wxMessageBox("The pools.txt file is has been detect," + + pool + " " + wallet, + "Configuration", + wxOK | wxICON_INFORMATION); + } MiningConfigFrame* frame = new MiningConfigFrame(nullptr, wxID_ANY, "XMR-Stak Day2Day Mining", wxDefaultPosition, wxSize(400, 300)); frame->SetPoolInfo(pool, wallet); // Assuming you have this method in MiningConfigFrame frame->Show(true); diff --git a/xmrstak/setup_gui.hpp b/xmrstak/backend/day2day/setup_gui.hpp similarity index 100% rename from xmrstak/setup_gui.hpp rename to xmrstak/backend/day2day/setup_gui.hpp From 7f44db29f85559a19af01129091c8c64aaf4f396 Mon Sep 17 00:00:00 2001 From: acktarius Date: Sun, 13 Oct 2024 18:58:41 -0400 Subject: [PATCH 6/6] readme --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 2e2eb61fa..6da78d59a 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ +## This fork adds Conceal as a coin and removes the cryptonight_conceal algo to avoid confusion + +--- +