From 838fad33ac4f01f9a67351e579269bbf446629cb Mon Sep 17 00:00:00 2001
From: acktarius <xchaps@aol.com>
Date: Tue, 1 Oct 2024 20:20:17 -0400
Subject: [PATCH 1/6] AI_opt_kernel_CN_GPU

---
 .cursorrules                                  |  45 +++
 doc/README.md                                 |   3 +-
 .../backend/amd/amd_gpu/opencl/cryptonight.cl |  37 +-
 .../amd/amd_gpu/opencl/cryptonight_gpu.cl     | 372 +++++++++---------
 .../backend/cpu/crypto/cryptonight_aesni.h    |  23 +-
 xmrstak/backend/cpu/minethd.cpp               |  18 -
 xmrstak/backend/cryptonight.hpp               |  33 +-
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu |  26 --
 .../backend/nvidia/nvcc_code/cuda_extra.cu    |   8 +-
 xmrstak/jconf.cpp                             |   5 +-
 10 files changed, 258 insertions(+), 312 deletions(-)
 create mode 100644 .cursorrules
diff --git a/.cursorrules b/.cursorrules
new file mode 100644
index 000000000..7f388c799
--- /dev/null
+++ b/.cursorrules
@@ -0,0 +1,45 @@
+You are an expert in C++, OpenCL, and scalable parallel computing.
+
+Key Principles
+
+    Write clear, technical responses with precise C++ and OpenCL examples.
+    Prioritize readability and maintainability; follow C++ coding conventions and OpenCL best practices.
+    Use descriptive variable and function names; adhere to naming conventions (e.g., CamelCase for class names, snake_case for filenames and private member variables).
+    Structure your project in a modular way using C++ namespaces and header files, as well as OpenCL's kernel files and build scripts to promote reusability and separation of concerns.
+
+C++/OpenCL
+
+    Use object-oriented programming principles (inheritance, polymorphism, encapsulation) to create modular and reusable parallel computing elements.
+    Leverage C++'s standard template library for data structures and algorithms.
+    Utilize OpenCL's C99-based kernel language for writing parallel computation code.
+    Implement custom OpenCL kernels for specific computation tasks.
+    Use C++'s RAII (Resource Acquisition Is Initialization) pattern for automatic resource management.
+
+Error Handling and Debugging
+
+    Implement error handling using try-catch blocks where appropriate.
+    Use C++'s assertion macros (e.g., assert) for catching logical errors during development.
+    Check the cl_int status code returned by every OpenCL API call (compare it against CL_SUCCESS, and read kernel build logs with clGetProgramBuildInfo) to catch and handle OpenCL-specific errors.
+    Implement custom error messages and debug visualizations to improve the development experience.
+
+Dependencies
+
+    C++ Compiler (e.g., GCC, Clang)
+    C++ Standard Library (e.g., STL)
+    OpenCL SDK (e.g., AMD APP SDK, NVIDIA CUDA Toolkit, Apple's OpenCL framework)
+    Third-party libraries (carefully vetted for compatibility and performance)
+
+OpenCL-Specific Guidelines
+
+    Use OpenCL's built-in functions and kernels for common parallel computation tasks (e.g., vector addition, matrix multiplication).
+    Keep parallel computation logic in separate kernels; use the C++ main function for host-side initialization and execution management.
+    Utilize OpenCL's memory management functions (e.g., clEnqueueReadBuffer, clEnqueueWriteBuffer) for efficient data transfer between host and device.
+    Apply OpenCL's work-item and work-group synchronization functions (e.g., barrier) to ensure correct execution of parallel computations.
+    Optimize memory access patterns to reduce global memory contention and improve data locality.
+    Use OpenCL's profiling tools to identify performance bottlenecks and optimize kernel execution times.
+
+Code Review and Best Practices
+
+    Review code for potential performance bottlenecks and memory access patterns.
+    Use OpenCL's profiling tools to identify performance bottlenecks and optimize kernel execution times.
+    Optimize memory access patterns to reduce global memory contention and improve data locality.
diff --git a/doc/README.md b/doc/README.md
index 7a1f13288..9182240d9 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -1,4 +1,3 @@
-
 <table>
     <tr>
         <td align="center"><a href=https://github.com/fireice-uk/xmr-stak/tree/xmr-stak-rx/doc/README.md><img src="_img/xmr-stak-rx-btn-inactive.png"></a></td>
@@ -55,7 +54,7 @@ If your preferred coin is not listed, you can choose one of the following mining
 | --- | --- | --- | --- | 
 | cryptonight_turtle  | cryptonight_lite  | cryptonight  | cryptonight_bittube2  | 
 | ---  | cryptonight_lite_v7  | cryptonight_gpu  | cryptonight_haven  | 
-| ---  | ---  | cryptonight_conceal  | cryptonight_heavy  | 
+| ---  | ---  | ---  | cryptonight_heavy  | 
 | ---  | ---  | cryptonight_r  | ---  | 
 | ---  | ---  | cryptonight_masari (used in 2018)  | ---  | 
 | ---  | ---  | cryptonight_v8_reversewaltz  | ---  | 
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index d17b79215..fa7a9eb7a 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -29,8 +29,9 @@ R"===(
 #define cryptonight_monero_v8 11
 #define cryptonight_superfast 12
 #define cryptonight_gpu 13
-#define cryptonight_conceal 14
-#define cryptonight_v8_reversewaltz 17
+#define cryptonight_r_wow 14 // ids 14..16 mirror enum xmrstak_algo_id in xmrstak/backend/cryptonight.hpp
+#define cryptonight_r 15
+#define cryptonight_v8_reversewaltz 16
 
 
 static const __constant ulong keccakf_rndc[24] =
@@ -79,7 +80,7 @@ void keccakf1600(ulong *s)
 		bc[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23] ^ rotate(s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20], 1UL);
 		bc[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24] ^ rotate(s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21], 1UL);
 
-		tmp1 = s[1] ^ bc[0];
+		tmp1 = s[1] ^ bc[0]; // save s[1] with bc[0] already mixed in; s[1] is overwritten two lines below
 
 		s[0] ^= bc[4];
 		s[1] = rotate(s[6] ^ bc[0], 44UL);
@@ -596,21 +597,18 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 )
 {
 	ulong a[2];
-#if(ALGO == cryptonight_conceal)
-	float4 conc_var = (float4)(0.0f);
-#endif
 
 #if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 	ulong b[4];
 	uint4 b_x[2];
 // NVIDIA
 #	ifdef __NV_CL_C_VERSION
 	__local uint16 scratchpad_line_buf[WORKSIZE];
  	__local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0);
 #	endif
 #else
 	ulong b[2];
 	uint4 b_x[1];
 #endif
 	__local uint AES0[256], AES1[256];
 
@@ -703,21 +695,6 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 			((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0);
 
-#if(ALGO == cryptonight_conceal)
-			float4 r  = convert_float4_rte(((int4 *)c)[0]);
-			float4 c_old = conc_var;
-			r = _mm_add_ps(r, conc_var);
-			r = _mm_mul_ps(r, _mm_mul_ps(r, r));
-			r = _mm_and_ps(r, 0x807FFFFF);
-			r = _mm_or_ps(r, 0x40000000);
-			conc_var = _mm_add_ps(conc_var, r);
-
-			c_old = _mm_and_ps(c_old, 0x807FFFFF);
-			c_old = _mm_or_ps(c_old, 0x40000000);
-			float4 nc = _mm_mul_ps(c_old, (float4)(536870880.0f));
-			((int4 *)c)[0] ^= convert_int4_rte(nc);
-#endif
-
 #if(ALGO == cryptonight_bittube2)
 			((uint4 *)c)[0] = AES_Round2_bittube2(AES0, AES1, ~((uint4 *)c)[0], ((uint4 *)a)[0]);
 #else
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
index bb37581f2..8a97c3f77 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
@@ -54,7 +54,7 @@
 	*r =_mm_add_ps(*r, _mm_div_ps(n,d));
 }
 
-inline int4 single_comupte(float4 n0, float4 n1, float4 n2, float4 n3, float cnt, float4 rnd_c, __local float4* sum)
+inline int4 single_compute(float4 n0, float4 n1, float4 n2, float4 n3, float cnt, float4 rnd_c, __local float4* sum)
 {
 	float4 c= (float4)(cnt);
 	// 35 maths calls follow (140 FLOPS)
@@ -72,14 +72,15 @@
 	return convert_int4_rte(r);
 }
 
-inline void single_comupte_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4 v3, float cnt, float4 rnd_c, __local float4* sum, __local int4* out)
+// Converts v0..v3 to float, runs single_compute and stores the result in *out (passed through _mm_alignr_epi8 when rot != 0).
+inline void single_compute_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4 v3, float cnt, float4 rnd_c, __local float4* sum, __local int4* out)
 {
 	float4 n0 = convert_float4_rte(v0);
 	float4 n1 = convert_float4_rte(v1);
 	float4 n2 = convert_float4_rte(v2);
 	float4 n3 = convert_float4_rte(v3);
 
-	int4 r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum);
+	int4 r = single_compute(n0, n1, n2, n3, cnt, rnd_c, sum);
 	*out = rot == 0 ? r : _mm_alignr_epi8(r, rot);
 }
 
@@ -176,7 +177,7 @@ __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, uint numThreads)
 		mem_fence(CLK_LOCAL_MEM_FENCE);
 
 		{
-			single_comupte_wrap(
+			single_compute_wrap(
 				tidm,
 				*(smem->out + look[tid][0]),
 				*(smem->out + look[tid][1]),
@@ -326,4 +316,4 @@ __kernel void JOIN(cn00_cn_gpu,ALGO)(__global int *Scratchpad, __global ulong *s
 	}
 }
 
-)==="
+)==="
\ No newline at end of file
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 6c9e3390c..45c2f1fc3 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -611,21 +611,6 @@ inline __m128 _mm_set1_ps_epi32(uint32_t x)
 	return _mm_castsi128_ps(_mm_set1_epi32(x));
 }
 
-inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
-{
-	__m128 r = _mm_cvtepi32_ps(cx);
-	__m128 c_old = conc_var;
-	r = _mm_add_ps(r, conc_var);
-	r = _mm_mul_ps(r, _mm_mul_ps(r, r));
-	r = _mm_and_ps(_mm_set1_ps_epi32(0x807FFFFF), r);
-	r = _mm_or_ps(_mm_set1_ps_epi32(0x40000000), r);
-	conc_var = _mm_add_ps(conc_var, r);
-
-	c_old = _mm_and_ps(_mm_set1_ps_epi32(0x807FFFFF), c_old);
-	c_old = _mm_or_ps(_mm_set1_ps_epi32(0x40000000), c_old);
-	__m128 nc = _mm_mul_ps(c_old, _mm_set1_ps(536870880.0f));
-	cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc));
-}
 
 #define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx)                              \
 	/* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */              \
@@ -746,11 +731,7 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
 	__m128i bx1;                                                                                                                                                                         \
 	__m128i division_result_xmm;                                                                                                                                                         \
 	__m128 conc_var;                                                                                                                                                                     \
-	if(ALGO == cryptonight_conceal)                                                                                                                                                      \
-	{                                                                                                                                                                                    \
-		set_float_rounding_mode_nearest();                                                                                                                                               \
-		conc_var = _mm_setzero_ps();                                                                                                                                                     \
-	}                                                                                                                                                                                    \
+	                                                                                                                                                                                 \
 	GetOptimalSqrtType_t<N> sqrt_result;                                                                                                                                                 \
 	uint32_t cn_r_data[9];                                                                                                                                                               \
 	/* END cryptonight_monero_v8 variables */                                                                                                                                            \
@@ -781,8 +762,6 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
 	__m128i cx;                                                                \
 	ptr0 = (__m128i*)&l0[idx0 & MASK];                                         \
 	cx = _mm_load_si128(ptr0);                                                 \
-	if(ALGO == cryptonight_conceal)                                            \
-		cryptonight_conceal_tweak(cx, conc_var);                               \
 	if(ALGO == cryptonight_bittube2)                                           \
 	{                                                                          \
 		cx = aes_round_bittube2(cx, ax0);                                      \
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 43759776f..a179cdc10 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -445,16 +445,6 @@ bool minethd::self_test()
 			ctx[0]->hash_fn("", 0, out, ctx, algo);
 			bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0;
 		}
-		else if(algo == POW(cryptonight_conceal))
-		{
-			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
-			ctx[0]->hash_fn("", 0, out, ctx, algo);
-			bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0;
-
-			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo);
-			ctx[0]->hash_fn("", 0, out, ctx, algo);
-			bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0;
-		}
 		else if(algo == POW(cryptonight_turtle))
 		{
 			func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo);
@@ -643,9 +633,6 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job&
 	case cryptonight_gpu:
 		algv = 12;
 		break;
-	case cryptonight_conceal:
-		algv = 13;
-		break;
 	case cryptonight_r:
 		algv = 14;
 		break;
@@ -723,11 +710,13 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job&
 		Cryptonight_hash_gpu::template hash<cryptonight_gpu, false, true>,
 		Cryptonight_hash_gpu::template hash<cryptonight_gpu, true, true>,
 
-		Cryptonight_hash<N>::template hash<cryptonight_conceal, false, false>,
-		Cryptonight_hash<N>::template hash<cryptonight_conceal, true, false>,
-		Cryptonight_hash<N>::template hash<cryptonight_conceal, false, true>,
-		Cryptonight_hash<N>::template hash<cryptonight_conceal, true, true>,
+		// Placeholder slots for the removed cryptonight_conceal entries (algv 13): the algv
+		// constants assigned earlier in this function are unchanged, so the table layout must not shift.
+		nullptr,
+		nullptr,
+		nullptr,
+		nullptr,
 
 		Cryptonight_hash<N>::template hash<cryptonight_r, false, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_r, true, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_r, false, true>,
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index 262865ea0..197814cfd 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -23,10 +23,11 @@ enum xmrstak_algo_id
 	cryptonight_monero_v8 = 11,
 	cryptonight_superfast = 12,
 	cryptonight_gpu = 13,
-	cryptonight_conceal = 14,
-	cryptonight_r_wow = 15,
-	cryptonight_r = 16,
-	cryptonight_v8_reversewaltz = 17, //equal to cryptonight_monero_v8 but with 3/4 iterations and reversed shuffle operation
+	// cryptonight_conceal (formerly id 14) was removed; the ids below were shifted
+	// down by one so they stay contiguous (POW() looks its table up as pow[algo_id]).
+	cryptonight_r_wow = 14, // was 15
+	cryptonight_r = 15, // was 16
+	cryptonight_v8_reversewaltz = 16, // was 17; equal to cryptonight_monero_v8 but with 3/4 iterations and reversed shuffle operation
 
 	cryptonight_turtle = start_derived_algo_id,
 	cryptonight_v8_half = (start_derived_algo_id + 1),
@@ -41,7 +42,7 @@ enum xmrstak_algo_id
  */
 inline std::string get_algo_name(xmrstak_algo_id algo_id)
 {
-	static std::array<std::string, 18> base_algo_names =
+	static std::array<std::string, 17> base_algo_names =
 		{{
 			"invalid_algo",
 			"cryptonight",
@@ -57,10 +58,11 @@ inline std::string get_algo_name(xmrstak_algo_id algo_id)
 			"cryptonight_v8",
 			"cryptonight_superfast",
 			"cryptonight_gpu",
-			"cryptonight_conceal",
+			// "cryptonight_conceal" was removed here together with its enum id;
+			// NOTE(review): presumably indexed by xmrstak_algo_id - keep entries in enum order.
 			"cryptonight_r_wow",
 			"cryptonight_r",
-			"cryptonight_v8_reversewaltz" // used by graft
+			"cryptonight_v8_reversewaltz" // used by graft
 		}};
 
 	static std::array<std::string, 4> derived_algo_names =
@@ -200,21 +202,22 @@ constexpr uint32_t CN_DOUBLE_ITER = 0x100000;
 
 inline xmrstak_algo POW(xmrstak_algo_id algo_id)
 {
-	static std::array<xmrstak_algo, 18> pow = {{{invalid_algo, invalid_algo},
+	static std::array<xmrstak_algo, 17> pow = {{{invalid_algo, invalid_algo}, // one entry per base algo id, looked up below as pow[algo_id]
 		{cryptonight, cryptonight, CN_ITER, CN_MEMORY},
 		{cryptonight_lite, cryptonight_lite, CN_ITER / 2, CN_MEMORY / 2},
 		{cryptonight_monero, cryptonight_monero, CN_ITER, CN_MEMORY},
 		{cryptonight_heavy, cryptonight_heavy, CN_ITER / 2, CN_MEMORY * 2},
 		{cryptonight_aeon, cryptonight_aeon, CN_ITER / 2, CN_MEMORY / 2},
-		{cryptonight_ipbc, cryptonight_ipbc, CN_ITER / 2, CN_MEMORY / 2},		  // equal to cryptonight_aeon with a small tweak in the miner code
-		{cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY},		  //equal to cryptonight_monero but with one tiny change
-		{cryptonight_masari, cryptonight_masari, CN_ITER / 2, CN_MEMORY},		  //equal to cryptonight_monero but with less iterations, used by masari
-		{cryptonight_haven, cryptonight_haven, CN_ITER / 2, CN_MEMORY * 2},		  // equal to cryptonight_heavy with a small tweak
-		{cryptonight_bittube2, cryptonight_bittube2, CN_ITER / 2, CN_MEMORY * 2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
+		{cryptonight_ipbc, cryptonight_ipbc, CN_ITER / 2, CN_MEMORY / 2}, // equal to cryptonight_aeon with a small tweak in the miner code
+		{cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY}, // equal to cryptonight_monero but with one tiny change
+		{cryptonight_masari, cryptonight_masari, CN_ITER / 2, CN_MEMORY}, // equal to cryptonight_monero but with less iterations, used by masari
+		{cryptonight_haven, cryptonight_haven, CN_ITER / 2, CN_MEMORY * 2}, // equal to cryptonight_heavy with a small tweak
+		{cryptonight_bittube2, cryptonight_bittube2, CN_ITER / 2, CN_MEMORY * 2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
 		{cryptonight_monero_v8, cryptonight_monero_v8, CN_ITER, CN_MEMORY},
 		{cryptonight_superfast, cryptonight_superfast, CN_ITER / 4, CN_MEMORY},
 		{cryptonight_gpu, cryptonight_gpu, CN_GPU_ITER, CN_MEMORY, CN_GPU_MASK},
-		{cryptonight_conceal, cryptonight_conceal, CN_ITER / 2, CN_MEMORY},
+		// The cryptonight_conceal entry (formerly index 14) was removed together with its enum id,
+		// so the three entries below each moved up one slot.
 		{cryptonight_r_wow, cryptonight_r_wow, CN_ITER, CN_MEMORY},
 		{cryptonight_r, cryptonight_r, CN_ITER, CN_MEMORY},
 		{cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY}}};
@@ -232,4 +235,4 @@ inline xmrstak_algo POW(xmrstak_algo_id algo_id)
 		return pow[algo_id];
 	else
 		return derived_pow[algo_id - start_derived_algo_id];
-}
+}
\ No newline at end of file
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 6c769b3e8..11200fe23 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -504,13 +504,4 @@ __launch_bounds__(XMR_STAK_THREADS * 4)
 	uint32_t t1[2], t2[2], res;
 
-	float conc_var;
-	if(ALGO == cryptonight_conceal)
-	{
-		if(partidx != 0)
-			conc_var = int_as_float(*(d_ctx_b + threads * 4 + thread * 4 + sub));
-		else
-			conc_var = 0.0f;
-	}
-
 	uint32_t tweak1_2[2];
 	if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
@@ -576,20 +569,6 @@ __launch_bounds__(XMR_STAK_THREADS * 4)
 			{
 				uint32_t x_0 = loadGlobal32<uint32_t>(long_state + j);
 
-				if(ALGO == cryptonight_conceal)
-				{
-					float r = int2float((int32_t)x_0);
-					float c_old = conc_var;
-
-					r += conc_var;
-					r = r * r * r;
-					r = int_as_float((float_as_int(r) & 0x807FFFFF) | 0x40000000);
-					conc_var += r;
-
-					c_old = int_as_float((float_as_int(c_old) & 0x807FFFFF) | 0x40000000);
-					c_old *= 536870880.0f;
-					x_0 = (uint32_t)(((int32_t)x_0) ^ ((int32_t)c_old));
-				}
 
 				const uint32_t x_1 = shuffle<4>(sPtr, sub, x_0, sub + 1);
 				const uint32_t x_2 = shuffle<4>(sPtr, sub, x_0, sub + 2);
@@ -692,8 +671,6 @@ __launch_bounds__(XMR_STAK_THREADS * 4)
 		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			if(sub & 1)
 				*(d_ctx_b + threads * 4 + thread) = idx0;
-		if(ALGO == cryptonight_conceal)
-			*(d_ctx_b + threads * 4 + thread * 4 + sub) = float_as_int(conc_var);
 	}
 }
 
@@ -1069,9 +1046,6 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui
 		cryptonight_core_gpu_hash_gpu<cryptonight_gpu, 0>,
 		cryptonight_core_gpu_hash_gpu<cryptonight_gpu, 1>,
 
-		cryptonight_core_gpu_hash<cryptonight_conceal, 0>,
-		cryptonight_core_gpu_hash<cryptonight_conceal, 1>,
-
 		cryptonight_core_gpu_hash<cryptonight_r_wow, 0>,
 		cryptonight_core_gpu_hash<cryptonight_r_wow, 1>,
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index d5b292cb4..f79353d7e 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -317,11 +317,8 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 		// create a double buffer for the state to exchange the mixed state to phase1
 		CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize));
 	}
-	else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_conceal) != neededAlgorithms.end())
-	{
-		ctx_b_size += sizeof(uint32_t) * 4 * wsize;
-	}
 	else if((std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) || (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end()))
 	{
+		// this size applies to cryptonight_monero_v8 and cryptonight_v8_reversewaltz alike; keep both in the condition above
 		// bx0 (16byte), bx1 (16byte), division_result (8byte) and sqrt_result (8byte), padding (16byte)
 		ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize;
@@ -474,7 +471,7 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 	CUDA_CHECK(ctx->device_id, cudaMemcpy(rescount, ctx->d_result_count, sizeof(uint32_t), cudaMemcpyDeviceToHost));
 	CUDA_CHECK_MSG(
 		ctx->device_id,
-		"\n**suggestion: Try to increase the attribute 'bfactor' in the NVIDIA config file.**",
+		"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
 		cudaMemcpy(resnonce, ctx->d_result_nonce, 10 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
 
 	/* There is only a 32bit limit for the counter on the device side
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index c50211d1e..fc55ed6ab 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -103,6 +103,7 @@ xmrstak::coin_selection coins[] = {
 	// name, userpool, devpool, default_pool_suggestion
 	{"bbscoin", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr},
 	{"bittube", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, "mining.bit.tube:13333"},
+	{"conceal", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.conceal.network:3333"},
 	{"cryptonight", {POW(cryptonight)}, {POW(cryptonight_gpu)}, nullptr},
 	{"cryptonight_bittube2", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, nullptr},
 	{"cryptonight_masari", {POW(cryptonight_masari)}, {POW(cryptonight_gpu)}, nullptr},
@@ -122,7 +123,6 @@ xmrstak::coin_selection coins[] = {
 	{"cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)}, {POW(cryptonight_gpu)}, nullptr},
 	{"cryptonight_v7_stellite", {POW(cryptonight_stellite)}, {POW(cryptonight_gpu)}, nullptr},
 	{"cryptonight_gpu", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333"},
-	{"cryptonight_conceal", {POW(cryptonight_conceal)}, {POW(cryptonight_gpu)}, nullptr},
 	{"graft", {POW(cryptonight_v8_reversewaltz), 12, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr},
 	{"haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr},
 	{"lethean", {POW(cryptonight_r)}, {POW(cryptonight_r)}, nullptr},
@@ -131,7 +131,8 @@ xmrstak::coin_selection coins[] = {
 	{"ryo", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333"},
 	{"torque", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr},
 	{"plenteum", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr},
-	{"zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr}};
+	{"zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr}
+};
 
 constexpr size_t coin_algo_size = (sizeof(coins) / sizeof(coins[0]));
 

From ce28d2b30cdf3c778507392a1bedd1414649d509 Mon Sep 17 00:00:00 2001
From: acktarius <xchaps@aol.com>
Date: Thu, 3 Oct 2024 14:16:23 -0400
Subject: [PATCH 2/6] ccx&few_tweaks

---
 .cursorrules                                  |  93 +++--
 .gitignore                                    |   6 +
 CMakeLists.txt                                |   8 +-
 xmrstak/backend/amd/OclCryptonightR_gen.hpp   |   2 +
 xmrstak/backend/amd/amd_gpu/gpu.hpp           |   1 +
 .../amd/amd_gpu/opencl/cryptonight_gpu.cl     | 370 +++++++++---------
 xmrstak/backend/amd/autoAdjust.hpp            |   3 +-
 xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp   |  40 +-
 xmrstak/backend/miner_work.hpp                |  26 +-
 .../nvidia/nvcc_code/cuda_cryptonight_gpu.hpp |  10 +-
 xmrstak/cli/cli-miner.cpp                     |   8 +-
 xmrstak/pools.tpl                             |   3 +-
 12 files changed, 310 insertions(+), 260 deletions(-)

diff --git a/.cursorrules b/.cursorrules
index 7f388c799..63eb694cc 100644
--- a/.cursorrules
+++ b/.cursorrules
@@ -1,45 +1,68 @@
- You are an expert in C++ and OpenCL, and scalable parallel computing.
+# C++/OpenCL Crypto Mining Expert
 
-Key Principles
+You are an AI expert in cryptocurrency mining, specializing in C++ and OpenCL for high-performance parallel computing.
 
-    Write clear, technical responses with precise C++ and OpenCL examples.
-    Prioritize readability and maintainability; follow C++ coding conventions and OpenCL best practices.
-    Use descriptive variable and function names; adhere to naming conventions (e.g., CamelCase for class names, snake_case for filenames and private member variables).
-    Structure your project in a modular way using C++ namespaces and header files, as well as OpenCL's kernel files and build scripts to promote reusability and separation of concerns.
+## Core Competencies
+- Optimization of SHA-256, cryptonight_gpu, and other mining algorithms
+- GPGPU programming and hardware-specific optimizations
+- Memory management and parallel processing techniques
 
-C++/OpenCL
+## Technical Guidelines
+### C++ Best Practices
+- Modern C++ (17/20) features for robust, efficient code
+- RAII for resource management
+- STL for data structures and algorithms
+- Custom memory pools for mining-specific optimizations
 
-    Use object-oriented programming principles (inheritance, polymorphism, encapsulation) to create modular and reusable parallel computing elements.
-    Leverage C++'s standard template library for data structures and algorithms.
-    Utilize OpenCL's C99-based kernel language for writing parallel computation code.
-    Implement custom OpenCL kernels for specific computation tasks.
-    Use C++'s RAII (Resource Acquisition Is Initialization) pattern for automatic resource management.
+### OpenCL Implementation
+- Efficient kernel design for maximum hash rate
+- Work group optimization for different GPU architectures
+- Memory coalescing and bank conflict avoidance
+- Stratum protocol integration for pool mining
 
-Error Handling and Debugging
+### CMake Build System
+- Modern CMake (3.15+) practices
+- Automatic OpenCL and dependency detection
+- Cross-platform build configuration
+- GPU architecture-specific optimizations
+- Example targets:
+  ```
+  add_executable(miner src/main.cpp)
+  target_link_libraries(miner PRIVATE OpenCL)
+  ```
 
-    Implement error handling using try-catch blocks where appropriate.
-    Use C++'s assertion macros (e.g., assert) for catching logical errors during development.
-    Utilize OpenCL's error-handling functions (e.g., clGetError) for catching and handling OpenCL-specific errors.
-    Implement custom error messages and debug visualizations to improve the development experience.
+### Performance Optimization
+- Profiling and bottleneck identification
+- Memory access pattern optimization
+- Workload distribution across compute units
+- Hardware-specific tuning (AMD, NVIDIA)
 
-Dependencies
+## Error Handling
+- Graceful recovery from hardware/network issues
+- Comprehensive logging for debugging
+- Real-time hash rate monitoring and adjustment
 
-    C++ Compiler (e.g., GCC, Clang)
-    C++ Standard Library (e.g., STL)
-    OpenCL SDK (e.g., AMD APP SDK, NVIDIA CUDA Toolkit, Apple Metal Performance Shaders)
-    Third-party libraries (carefully vetted for compatibility and performance)
+## Dependencies
+- OpenCL SDK (vendor-specific)
+- C++17/20 compliant compiler
+- CMake 3.15 or higher
+- Mining pool integration libraries
+- Hardware monitoring capabilities
 
-OpenCL-Specific Guidelines
+## Build Configuration
+- CMakeLists.txt structure for multi-platform support
+- Find modules for OpenCL and other dependencies
+- Conditional compilation for different mining algorithms
+- Debug and Release build configurations
+- Example usage:
+  ```
+  mkdir build && cd build
+  cmake -DCMAKE_BUILD_TYPE=Release ..
+  cmake --build .
+  ```
 
-    Use OpenCL's built-in functions and kernels for common parallel computation tasks (e.g., vector addition, matrix multiplication).
-    Keep parallel computation logic in separate kernels; use the C++ main function for host-side initialization and execution management.
-    Utilize OpenCL's memory management functions (e.g., clEnqueueReadBuffer, clEnqueueWriteBuffer) for efficient data transfer between host and device.
-    Apply OpenCL's work-item and work-group synchronization functions (e.g., barrier) to ensure correct execution of parallel computations.
-    Optimize memory access patterns to reduce global memory contention and improve data locality.
-    Use OpenCL's profiling tools to identify performance bottlenecks and optimize kernel execution times.
-
-Code Review and Best Practices
-
-    Review code for potential performance bottlenecks and memory access patterns.
-    Use OpenCL's profiling tools to identify performance bottlenecks and optimize kernel execution times.
-    Optimize memory access patterns to reduce global memory contention and improve data locality.
+## Code Quality
+- Clear, maintainable code with mining-specific comments
+- Modular design for easy algorithm switching
+- Rigorous error checking for stability
+- Performance-critical section documentation
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 26d278f0b..2af6ed068 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,3 +30,9 @@ cmake-build-debug/
 
 # Thumbnails
 ._*
+
+# Visual Studio Code files
+.vscode/
+
+# Cursor    
+.cursorrules
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 795829e66..c5ce08235 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,11 +1,11 @@
 project(xmr-stak)
 
-cmake_minimum_required(VERSION 3.4.0)
+cmake_minimum_required(VERSION 3.15)
 
-# enforce C++11
+# enforce C++17
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CXX_STANDARD 11)
 
 if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
     set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}" CACHE PATH "install prefix" FORCE)
@@ -308,6 +308,8 @@ else()
     add_definitions("-DCONF_NO_OPENCL")
 endif()
 
+add_definitions(-DCL_TARGET_OPENCL_VERSION=300)
+
 ###############################################################################
 # CPU backend
 ###############################################################################
diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.hpp b/xmrstak/backend/amd/OclCryptonightR_gen.hpp
index f8772b1f5..54d312b2b 100644
--- a/xmrstak/backend/amd/OclCryptonightR_gen.hpp
+++ b/xmrstak/backend/amd/OclCryptonightR_gen.hpp
@@ -6,6 +6,8 @@
 #include <string>
 #include <vector>
 
+#define CL_TARGET_OPENCL_VERSION 300
+
 #if defined(__APPLE__)
 #include <OpenCL/cl.h>
 #else
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index 1ba300c7a..f65a05fad 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -3,6 +3,7 @@
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/misc/console.hpp"
 
+#define CL_TARGET_OPENCL_VERSION 300
 #if defined(__APPLE__)
 #include <OpenCL/cl.h>
 #else
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
index 8a97c3f77..8575be905 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl
@@ -1,109 +1,133 @@
 R"===(
 
-inline __global int4* scratchpad_ptr(uint idx, uint n, __global int *lpad) {
-    return (__global int4*)((__global char*)lpad + (idx & MASK) + n * 16);
+inline global int4* scratchpad_ptr(uint idx, uint n, __global int *lpad) { return (__global int4*)((__global char*)lpad + (idx & MASK) + n * 16); }
+
+inline float4 fma_break(float4 x)
+{
+	// Break the dependency chain by setting the exp to ?????01
+	x = _mm_and_ps(x, 0xFEFFFFFF);
+	return _mm_or_ps(x, 0x00800000);
 }
 
-#define fma_break(x) ((x & (float4)(0xFEFFFFFF)) | (float4)(0x00800000))
-
-#define sub_round(n0, n1, n2, n3, rnd_c, n, d, c) \
-    do { \
-        n1 += *c; \
-        float4 nn = n0 * *c; \
-        nn = n1 * (nn * nn); \
-        nn = fma_break(nn); \
-        *n += nn; \
-        \
-        n3 -= *c; \
-        float4 dd = n2 * *c; \
-        dd = n3 * (dd * dd); \
-        dd = fma_break(dd); \
-        *d += dd; \
-        \
-        *c += rnd_c + (float4)(0.734375f); \
-        float4 r = nn + dd; \
-        r = (r & (float4)(0x807FFFFF)) | (float4)(0x40000000); \
-        *c += r; \
-    } while(0)
-
-#define round_compute(n0, n1, n2, n3, rnd_c, c, r) \
-    do { \
-        float4 n = (float4)(0.0f); \
-        float4 d = (float4)(0.0f); \
-        \
-        for(int i = 0; i < 8; ++i) { \
-            sub_round(n0, n1, n2, n3, rnd_c, &n, &d, c); \
-            float4 tmp = n0; n0 = n1; n1 = n2; n2 = n3; n3 = tmp; \
-        } \
-        \
-        d = (d & (float4)(0xFF7FFFFF)) | (float4)(0x40000000); \
-        *r += n / d; \
-    } while(0)
-
-#define single_compute(n0, n1, n2, n3, cnt, rnd_c, sum) \
-    ({ \
-        float4 c = (float4)(cnt); \
-        float4 r = (float4)(0.0f); \
-        \
-        for(int i = 0; i < 4; ++i) \
-            round_compute(n0, n1, n2, n3, rnd_c, &c, &r); \
-        \
-        r = (r & (float4)(0x807FFFFF)) | (float4)(0x40000000); \
-        *sum = r; \
-        convert_int4_rte(r * (float4)(536870880.0f)); \
-    })
-
-#define single_compute_wrap(rot, v0, v1, v2, v3, cnt, rnd_c, sum, out) \
-    do { \
-        float4 n0 = convert_float4_rte(v0); \
-        float4 n1 = convert_float4_rte(v1); \
-        float4 n2 = convert_float4_rte(v2); \
-        float4 n3 = convert_float4_rte(v3); \
-        \
-        int4 r = single_compute(n0, n1, n2, n3, cnt, rnd_c, sum); \
-        *out = rot == 0 ? r : (int4)(r.yzwx); \
-    } while(0)
+inline void sub_round(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* n, float4* d, float4* c)
+{
+	n1 = _mm_add_ps(n1, *c);
+	float4 nn = _mm_mul_ps(n0, *c);
+	nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn));
+	nn = fma_break(nn);
+	*n = _mm_add_ps(*n, nn);
+
+	n3 = _mm_sub_ps(n3, *c);
+	float4 dd = _mm_mul_ps(n2, *c);
+	dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd));
+	dd = fma_break(dd);
+	*d = _mm_add_ps(*d, dd);
+
+	//Constant feedback
+	*c = _mm_add_ps(*c, rnd_c);
+	*c = _mm_add_ps(*c, (float4)(0.734375f));
+	float4 r = _mm_add_ps(nn, dd);
+	r = _mm_and_ps(r, 0x807FFFFF);
+	r = _mm_or_ps(r, 0x40000000);
+	*c = _mm_add_ps(*c, r);
+
+}
+
+// 9*8 + 2 = 74
+inline void round_compute(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* c, float4* r)
+{
+	float4 n = (float4)(0.0f);
+	float4 d = (float4)(0.0f);
+
+	sub_round(n0, n1, n2, n3, rnd_c, &n, &d, c);
+	sub_round(n1, n2, n3, n0, rnd_c, &n, &d, c);
+	sub_round(n2, n3, n0, n1, rnd_c, &n, &d, c);
+	sub_round(n3, n0, n1, n2, rnd_c, &n, &d, c);
+	sub_round(n3, n2, n1, n0, rnd_c, &n, &d, c);
+	sub_round(n2, n1, n0, n3, rnd_c, &n, &d, c);
+	sub_round(n1, n0, n3, n2, rnd_c, &n, &d, c);
+	sub_round(n0, n3, n2, n1, rnd_c, &n, &d, c);
+
+	// Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0
+	d = _mm_and_ps(d, 0xFF7FFFFF);
+	d = _mm_or_ps(d, 0x40000000);
+	*r =_mm_add_ps(*r, _mm_div_ps(n,d));
+}
+
+inline int4 single_compute(float4 n0, float4 n1, float4 n2, float4 n3, float cnt, float4 rnd_c, __local float4* sum)
+{
+	float4 c= (float4)(cnt);
+	// 35 maths calls follow (140 FLOPS)
+	float4 r = (float4)(0.0f);
+
+	for(int i = 0; i < 4; ++i)
+		round_compute(n0, n1, n2, n3, rnd_c, &c, &r);
+
+	// do a quick fmod by setting exp to 2
+	r = _mm_and_ps(r, 0x807FFFFF);
+	r = _mm_or_ps(r, 0x40000000);
+	*sum = r; // 34
+	float4 x = (float4)(536870880.0f);
+	r = _mm_mul_ps(r, x); // 35
+	return convert_int4_rte(r);
+}
+
+inline void single_compute_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4 v3, float cnt, float4 rnd_c, __local float4* sum, __local int4* out)
+{
+	float4 n0 = convert_float4_rte(v0);
+	float4 n1 = convert_float4_rte(v1);
+	float4 n2 = convert_float4_rte(v2);
+	float4 n3 = convert_float4_rte(v3);
+
+	int4 r = single_compute(n0, n1, n2, n3, cnt, rnd_c, sum);
+	*out = rot == 0 ? r : _mm_alignr_epi8(r, rot);
+}
 
 )==="
 	R"===(
 
-// Move lookup tables to constant memory
-__constant uint look[16][4] = {
-    {0, 1, 2, 3},
-    {0, 2, 3, 1},
-    {0, 3, 1, 2},
-    {0, 3, 2, 1},
-    {1, 0, 2, 3},
-    {1, 2, 3, 0},
-    {1, 3, 0, 2},
-    {1, 3, 2, 0},
-    {2, 1, 0, 3},
-    {2, 0, 3, 1},
-    {2, 3, 1, 0},
-    {2, 3, 0, 1},
-    {3, 1, 2, 0},
-    {3, 2, 0, 1},
-    {3, 0, 1, 2},
-    {3, 0, 2, 1}
+static const __constant uint look[16][4] = {
+	{0, 1, 2, 3},
+	{0, 2, 3, 1},
+	{0, 3, 1, 2},
+	{0, 3, 2, 1},
+
+	{1, 0, 2, 3},
+	{1, 2, 3, 0},
+	{1, 3, 0, 2},
+	{1, 3, 2, 0},
+
+	{2, 1, 0, 3},
+	{2, 0, 3, 1},
+	{2, 3, 1, 0},
+	{2, 3, 0, 1},
+
+	{3, 1, 2, 0},
+	{3, 2, 0, 1},
+	{3, 0, 1, 2},
+	{3, 0, 2, 1}
 };
 
-__constant float ccnt[16] = {
-    1.34375f,
-    1.28125f,
-    1.359375f,
-    1.3671875f,
-    1.4296875f,
-    1.3984375f,
-    1.3828125f,
-    1.3046875f,
-    1.4140625f,
-    1.2734375f,
-    1.2578125f,
-    1.2890625f,
-    1.3203125f,
-    1.3515625f,
-    1.3359375f,
-    1.4609375f
+static const __constant float ccnt[16] = {
+	1.34375f,
+	1.28125f,
+	1.359375f,
+	1.3671875f,
+
+	1.4296875f,
+	1.3984375f,
+	1.3828125f,
+	1.3046875f,
+
+	1.4140625f,
+	1.2734375f,
+	1.2578125f,
+	1.2890625f,
+
+	1.3203125f,
+	1.3515625f,
+	1.3359375f,
+	1.4609375f
 };
 
 struct SharedMemChunk
@@ -115,98 +139,84 @@ struct SharedMemChunk
 __attribute__((reqd_work_group_size(WORKSIZE * 16, 1, 1)))
 __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, uint numThreads)
 {
-    const uint gIdx = getIdx();
+	const uint gIdx = getIdx();
 
-    #if(COMP_MODE==1)
-    if(gIdx/16 >= numThreads)
-        return;
-    #endif
-
-    uint chunk = get_local_id(0) / 16;
-
-    #if(STRIDED_INDEX==0)
-    __global int4* lpad = (__global int4*)((__global char*)lpad_in + MEMORY * (gIdx/16));
-    #endif
+#if(COMP_MODE==1)
+	if(gIdx/16 >= numThreads)
+		return;
+#endif
 
-    __local struct SharedMemChunk {
-        int4 out[16];
-        float4 va[16];
-    } smem_in[WORKSIZE];
-    __local struct SharedMemChunk* smem = smem_in + chunk;
+	uint chunk = get_local_id(0) / 16;
 
-    // New local memory cache for frequently accessed data
-    __local int4 lpad_cache[32];
+#if(STRIDED_INDEX==0)
+	__global int* lpad = (__global int*)((__global char*)lpad_in + MEMORY * (gIdx/16));
+#endif
 
-    uint tid = get_local_id(0) % 16;
+	__local struct SharedMemChunk smem_in[WORKSIZE];
+	__local struct SharedMemChunk* smem = smem_in + chunk;
 
-    uint idxHash = gIdx/16;
-    uint s = ((__global uint*)spad)[idxHash * 50] >> 8;
-    float4 vs = (float4)(0);
+	uint tid = get_local_id(0) % 16;
 
-    const uint tidd = tid / 4;
-    const uint tidm = tid % 4;
-    const uint block = tidd * 16 + tidm;
+	uint idxHash = gIdx/16;
+	uint s = ((__global uint*)spad)[idxHash * 50] >> 8;
+	float4 vs = (float4)(0);
 
-    // Preload frequently accessed data into local memory
-    if (tid < 32) {
-        lpad_cache[tid] = lpad[tid];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
+	// tid divided
+	const uint tidd = tid / 4;
+	// tid modulo
+	const uint tidm = tid % 4;
+	const uint block = tidd * 16 + tidm;
 
-    #pragma unroll CN_UNROLL
-    for(uint i = 0; i < ITERATIONS; i++)
-    {
-        barrier(CLK_LOCAL_MEM_FENCE);
-        
-        smem->out[tid] = scratchpad_ptr(s, tidd, (__global int*)lpad_cache)[tidm];
-        
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        single_compute_wrap(
-            tidm,
-            smem->out[look[tid][0]],
-            smem->out[look[tid][1]],
-            smem->out[look[tid][2]],
-            smem->out[look[tid][3]],
-            ccnt[tid], vs, &smem->va[tid],
-            &smem->out[tid]
-        );
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // Combine outXor calculation and update of lpad_cache
-        int4 outXor = smem->out[block];
-        #pragma unroll
-        for(uint dd = block + 4; dd < (tidd + 1) * 16; dd += 4)
-            outXor ^= smem->out[dd];
-        lpad_cache[s % 32] = outXor ^ smem->out[tid];
-        smem->out[tid] = outXor;
-
-        // Combine va_tmp calculation and assignment
-        smem->va[tid] = (smem->va[block] + smem->va[block + 4]) + (smem->va[block + 8] + smem->va[block + 12]);
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // Combine out2 and va_tmp2 calculations
-        int4 out2 = (smem->out[tid] ^ smem->out[tid + 4]) ^ (smem->out[tid + 8] ^ smem->out[tid + 12]);
-        float4 va_tmp2 = fabs((smem->va[block] + smem->va[block + 4]) + (smem->va[block + 8] + smem->va[block + 12]));
-
-        // Combine xx calculation and conversion
-        int4 xx_int = convert_int4_rte(va_tmp2 * (float4)(16777216.0f));
-        smem->out[tid] = out2 ^ xx_int;
-        smem->va[tid] = va_tmp2 * (float4)(0.015625f);  // Vectorized division by 64.0f
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // Combine vs and s updates
-        vs = smem->va[0];
-        s = smem->out[0].x ^ smem->out[0].y ^ smem->out[0].z ^ smem->out[0].w;
-    }
-
-    // Write back to global memory
-    if (tid < 32) {
-        lpad[tid] = lpad_cache[tid];
-    }
+	#pragma unroll CN_UNROLL
+	for(size_t i = 0; i < ITERATIONS; i++)
+	{
+		mem_fence(CLK_LOCAL_MEM_FENCE);
+		int tmp = ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm];
+		((__local int*)(smem->out))[tid] = tmp;
+		mem_fence(CLK_LOCAL_MEM_FENCE);
+
+		{
+			single_compute_wrap(
+				tidm,
+				*(smem->out + look[tid][0]),
+				*(smem->out + look[tid][1]),
+				*(smem->out + look[tid][2]),
+				*(smem->out + look[tid][3]),
+				ccnt[tid], vs, smem->va + tid,
+				smem->out + tid
+			);
+		}
+		mem_fence(CLK_LOCAL_MEM_FENCE);
+
+		int outXor = ((__local int*)smem->out)[block];
+		for(uint dd = block + 4; dd < (tidd + 1) * 16; dd += 4)
+			outXor ^= ((__local int*)smem->out)[dd];
+
+		((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm] = outXor ^ tmp;
+		((__local int*)smem->out)[tid] = outXor;
+
+		float va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4];
+		float va_tmp2 = ((__local float*)smem->va)[block+ 8] + ((__local float*)smem->va)[block + 12];
+		((__local float*)smem->va)[tid] = va_tmp1 + va_tmp2;
+
+		mem_fence(CLK_LOCAL_MEM_FENCE);
+
+		int out2 = ((__local int*)smem->out)[tid] ^ ((__local int*)smem->out)[tid + 4 ] ^ ((__local int*)smem->out)[tid + 8] ^ ((__local int*)smem->out)[tid + 12];
+		va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4];
+		va_tmp2 = ((__local float*)smem->va)[block + 8] + ((__local float*)smem->va)[block + 12];
+		va_tmp1 = va_tmp1 + va_tmp2;
+		va_tmp1 = fabs(va_tmp1);
+
+		float xx = va_tmp1 * 16777216.0f;
+		int xx_int = (int)xx;
+		((__local int*)smem->out)[tid] = out2 ^ xx_int;
+		((__local float*)smem->va)[tid] = va_tmp1 / 64.0f;
+
+		mem_fence(CLK_LOCAL_MEM_FENCE);
+
+		vs = smem->va[0];
+		s = smem->out[0].x ^ smem->out[0].y ^ smem->out[0].z ^ smem->out[0].w;
+	}
 }
 
 )==="
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 075acbd49..e237eebf1 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -1,4 +1,3 @@
-
 #pragma once
 
 #include "amd_gpu/gpu.hpp"
@@ -18,6 +17,8 @@
 #include <string>
 #include <vector>
 
+#define CL_TARGET_OPENCL_VERSION 300
+
 #if defined(__APPLE__)
 #include <OpenCL/cl.h>
 #else
diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
index d65d9651e..62c2ba527 100644
--- a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
+++ b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp
@@ -62,7 +62,7 @@ inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd
 
 // 112×4 = 448
 template <bool add>
-inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
+inline __m128i single_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
 {
 	__m128 c = _mm_set1_ps(cnt);
 	__m128 r = _mm_setzero_ps();
@@ -86,9 +86,9 @@ inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float
 }
 
 template <size_t rot>
-inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
+inline void single_compute_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
 {
-	__m128i r = single_comupte<rot % 2 != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);
+	__m128i r = single_compute<rot % 2 != 0>(n0, n1, n2, n3, cnt, rnd_c, sum);
 	if(rot != 0)
 		r = _mm_or_si128(_mm_slli_si128(r, 16 - rot), _mm_srli_si128(r, rot));
 	out = _mm_xor_si128(out, r);
@@ -122,37 +122,37 @@ void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo&
 
 		__m128i out, out2;
 		out = _mm_setzero_si128();
-		single_comupte_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out);
-		single_comupte_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out);
-		single_comupte_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out);
-		single_comupte_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out);
+		single_compute_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out);
+		single_compute_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out);
+		single_compute_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out);
+		single_compute_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out);
 		sum0 = _mm_add_ps(suma, sumb);
 		_mm_store_si128(idx0, _mm_xor_si128(v0, out));
 		out2 = out;
 
 		out = _mm_setzero_si128();
-		single_comupte_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out);
-		single_comupte_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out);
-		single_comupte_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out);
-		single_comupte_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out);
+		single_compute_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out);
+		single_compute_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out);
+		single_compute_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out);
+		single_compute_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out);
 		sum1 = _mm_add_ps(suma, sumb);
 		_mm_store_si128(idx1, _mm_xor_si128(v1, out));
 		out2 = _mm_xor_si128(out2, out);
 
 		out = _mm_setzero_si128();
-		single_comupte_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out);
-		single_comupte_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out);
-		single_comupte_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out);
-		single_comupte_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out);
+		single_compute_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out);
+		single_compute_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out);
+		single_compute_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out);
+		single_compute_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out);
 		sum2 = _mm_add_ps(suma, sumb);
 		_mm_store_si128(idx2, _mm_xor_si128(v2, out));
 		out2 = _mm_xor_si128(out2, out);
 
 		out = _mm_setzero_si128();
-		single_comupte_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out);
-		single_comupte_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out);
-		single_comupte_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out);
-		single_comupte_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out);
+		single_compute_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out);
+		single_compute_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out);
+		single_compute_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out);
+		single_compute_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out);
 		sum3 = _mm_add_ps(suma, sumb);
 		_mm_store_si128(idx3, _mm_xor_si128(v3, out));
 		out2 = _mm_xor_si128(out2, out);
@@ -178,4 +178,4 @@ void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo&
 		idx2 = scratchpad_ptr(lpad, n, 2, mask);
 		idx3 = scratchpad_ptr(lpad, n, 3, mask);
 	}
-}
+}
\ No newline at end of file
diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp
index 114f2db8e..d267cc902 100644
--- a/xmrstak/backend/miner_work.hpp
+++ b/xmrstak/backend/miner_work.hpp
@@ -2,6 +2,7 @@
 
 #include "xmrstak/backend/pool_data.hpp"
 
+#include <algorithm>
 #include <atomic>
 #include <cassert>
 #include <cstdint>
@@ -32,31 +33,35 @@ struct miner_work
 		ref_ptr((uint8_t*)&iBlockHeight) {}
 
 	miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize,
-		uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) :
-		iWorkSize(iWorkSize),
+		uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeight) :
+		iWorkSize(std::min(iWorkSize, static_cast<uint32_t>(sizeof(bWorkBlob)))),
 		iTarget(iTarget),
 		bNiceHash(bNiceHash),
 		bStall(false),
 		iPoolId(iPoolId),
-		iBlockHeight(iBlockHeiht),
-		ref_ptr((uint8_t*)&iBlockHeight)
+		iBlockHeight(iBlockHeight),
+		// take the address of the member explicitly: the parameter of the same name shadows it here
+		ref_ptr((uint8_t*)&this->iBlockHeight)
 	{
-		assert(iWorkSize <= sizeof(bWorkBlob));
-		memcpy(this->bWorkBlob, bWork, iWorkSize);
-		memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID));
+		// this->iWorkSize was clamped to sizeof(bWorkBlob) in the initializer list, so the copy stays inside bWorkBlob
+		memcpy(this->bWorkBlob, bWork, this->iWorkSize);
+		// copy at most sizeof(sJobID) - 1 characters and always null-terminate
+		strncpy(this->sJobID, sJobID, sizeof(this->sJobID) - 1);
+		this->sJobID[sizeof(this->sJobID) - 1] = '\0';
 	}
 
 	miner_work(miner_work&& from) :
-		iWorkSize(from.iWorkSize),
+		iWorkSize(std::min(from.iWorkSize, static_cast<uint32_t>(sizeof(bWorkBlob)))),
 		iTarget(from.iTarget),
+		bNiceHash(from.bNiceHash),
 		bStall(from.bStall),
 		iPoolId(from.iPoolId),
 		iBlockHeight(from.iBlockHeight),
 		ref_ptr((uint8_t*)&iBlockHeight)
 	{
-		assert(iWorkSize <= sizeof(bWorkBlob));
-		memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
-		memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID));
+		memcpy(bWorkBlob, from.bWorkBlob, this->iWorkSize);
+		strncpy(this->sJobID, from.sJobID, sizeof(this->sJobID) - 1);
+		this->sJobID[sizeof(this->sJobID) - 1] = '\0';
 	}
 
 	miner_work(miner_work const&) = delete;
@@ -108,4 +113,4 @@ struct miner_work
 		return bWorkBlob[0];
 	}
 };
-} // namespace xmrstak
+} // namespace xmrstak
\ No newline at end of file
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
index 516d4ca00..188a5fd6c 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp
@@ -366,7 +366,7 @@ __forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, _
 }
 
 // 74*8 = 595
-__forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
+__forceinline__ __device__ __m128i single_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum)
 {
 	__m128 c(cnt);
 	// 35 maths calls follow (140 FLOPS)
@@ -381,14 +381,14 @@ __forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n
 	return r.get_int();
 }
 
-__forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
+__forceinline__ __device__ void single_compute_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out)
 {
 	__m128 n0(v0);
 	__m128 n1(v1);
 	__m128 n2(v2);
 	__m128 n3(v3);
 
-	__m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum);
+	__m128i r = single_compute(n0, n1, n2, n3, cnt, rnd_c, sum);
 	out = rot == 0 ? r : _mm_alignr_epi8(r, rot);
 }
 
@@ -496,7 +496,7 @@ __global__ void cryptonight_core_gpu_phase2_gpu(
 		sync();
 
 		__m128 rc = vs;
-		single_comupte_wrap(
+		single_compute_wrap(
 			tidm,
 			*(smem->out + look[tid][0]),
 			*(smem->out + look[tid][1]),
@@ -595,4 +595,4 @@ __global__ void cn_explode_gpu(const size_t MEMORY, int32_t* spad_in, int* lpad_
 }
 
 } // namespace nvidia
-} // namespace xmrstak
+} // namespace xmrstak
\ No newline at end of file
diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp
index 607e863e1..890f6a9fa 100644
--- a/xmrstak/cli/cli-miner.cpp
+++ b/xmrstak/cli/cli-miner.cpp
@@ -42,6 +42,7 @@
 #include <stdlib.h>
 #include <string>
 #include <time.h>
+#include <utility>
 
 #ifndef CONF_NO_TLS
 #include <openssl/err.h>
@@ -243,7 +244,7 @@ void do_guided_pool_config()
 		prompt_once(prompted);
 
 		userSetPool = false;
-		std::cout << "- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl;
+		std::cout << "- Pool address: e.g. " << jconf::GetDefaultPool(currency.c_str()) << std::endl;
 		std::cin >> pool;
 	}
 
@@ -403,7 +404,10 @@ int main(int argc, char* argv[])
 #ifndef CONF_NO_TLS
 	SSL_library_init();
 	SSL_load_error_strings();
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 	ERR_load_BIO_strings();
+#pragma GCC diagnostic pop
 	ERR_load_crypto_strings();
 	SSL_load_error_strings();
 	OpenSSL_add_all_digests();
@@ -979,4 +983,4 @@ int do_benchmark(int block_version, int wait_sec, int work_sec)
 
 	printer::inst()->print_msg(L0, "Benchmark Total: %.1f H/S", fTotalHps);
 	return 0;
-}
+}
\ No newline at end of file
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index eb57a3f04..ab0920a44 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -21,6 +21,7 @@ POOLCONF],
  *
  *    bbscoin (automatic switch with block version 3 to cryptonight_v7)
  *    bittube (uses cryptonight_bittube2 algorithm)
+ *    conceal (uses cryptonight_gpu algorithm)
  *    graft
  *    haven (automatic switch with block version 3 to cryptonight_haven)
  *    lethean
@@ -42,7 +43,7 @@ POOLCONF],
  *    cryptonight_lite_v7_xor (algorithm used by ipbc)
  *    # 2MiB scratchpad memory
  *    cryptonight
- *    cryptonight_gpu (for Ryo's 14th of Feb fork)
+ *    cryptonight_gpu (for Ryo's 14th of Feb fork and Conceal)
  *    cryptonight_superfast
  *    cryptonight_v7
  *    cryptonight_v8

From 3d0203b0668abfe54fbae5af64feeea83245cd42 Mon Sep 17 00:00:00 2001
From: acktarius <xchaps@aol.com>
Date: Wed, 9 Oct 2024 23:07:47 -0400
Subject: [PATCH 3/6] build_day2day

---
 .cursorrules          | 186 ++++++++++++++++++++++++++++++++++++++----
 CMakeLists.txt        |  53 +++++++++++-
 xmrstak/setup_gui.cpp |  52 ++++++++++++
 xmrstak/setup_gui.hpp |  23 ++++++
 4 files changed, 296 insertions(+), 18 deletions(-)
 create mode 100644 xmrstak/setup_gui.cpp
 create mode 100644 xmrstak/setup_gui.hpp

diff --git a/.cursorrules b/.cursorrules
index 63eb694cc..567eb797a 100644
--- a/.cursorrules
+++ b/.cursorrules
@@ -1,6 +1,6 @@
-# C++/OpenCL Crypto Mining Expert
+You are an AI expert in the field of cryptocurrency mining. You are tasked with developing a high-performance mining application using OpenCL and C++. You always follow best practices for algorithm optimization, error handling, and performance tuning for various hardware platforms. You are also tasked with developing a GUI for the mining application using wxWidgets. You always follow best practices for GUI development, error handling, and performance tuning for various hardware platforms.
 
-You are an AI expert in cryptocurrency mining, specializing in C++ and OpenCL for high-performance parallel computing.
+# C++/OpenCL Crypto Mining Expert Guide
 
 ## Core Competencies
 - Optimization of SHA-256, cryptonight_gpu, and other mining algorithms
@@ -8,6 +8,7 @@ You are an AI expert in cryptocurrency mining, specializing in C++ and OpenCL fo
 - Memory management and parallel processing techniques
 
 ## Technical Guidelines
+
 ### C++ Best Practices
 - Modern C++ (17/20) features for robust, efficient code
 - RAII for resource management
@@ -25,11 +26,12 @@ You are an AI expert in cryptocurrency mining, specializing in C++ and OpenCL fo
 - Automatic OpenCL and dependency detection
 - Cross-platform build configuration
 - GPU architecture-specific optimizations
-- Example targets:
-  ```
-  add_executable(miner src/main.cpp)
-  target_link_libraries(miner PRIVATE OpenCL)
-  ```
+
+Example targets:
+```cmake
+add_executable(miner src/main.cpp)
+target_link_libraries(miner PRIVATE OpenCL)
+```
 
 ### Performance Optimization
 - Profiling and bottleneck identification
@@ -48,21 +50,177 @@ You are an AI expert in cryptocurrency mining, specializing in C++ and OpenCL fo
 - CMake 3.15 or higher
 - Mining pool integration libraries
 - Hardware monitoring capabilities
+- wxWidgets 3.2 or higher (for GUI)
 
 ## Build Configuration
 - CMakeLists.txt structure for multi-platform support
 - Find modules for OpenCL and other dependencies
 - Conditional compilation for different mining algorithms
 - Debug and Release build configurations
-- Example usage:
-  ```
-  mkdir build && cd build
-  cmake -DCMAKE_BUILD_TYPE=Release ..
-  cmake --build .
-  ```
+
+Example usage:
+```bash
+mkdir build && cd build
+cmake -DCMAKE_BUILD_TYPE=Release ..
+cmake --build .
+```
 
 ## Code Quality
 - Clear, maintainable code with mining-specific comments
 - Modular design for easy algorithm switching
 - Rigorous error checking for stability
-- Performance-critical section documentation
\ No newline at end of file
+- Performance-critical section documentation
+
+# Implementation Details
+
+## OpenCL Function Reference
+
+### Kernel Management
+- `clCreateKernel`: Create mining algorithm kernels
+  - [Documentation](https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clCreateKernel.html)
+```cpp
+cl_kernel kernel = clCreateKernel(program, "sha256_kernel", &err);
+```
+
+### Device Query and Optimization
+- `clGetDeviceInfo`: Query optimal work group sizes
+  - [Documentation](https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clGetDeviceInfo.html)
+```cpp
+size_t maxWorkGroupSize;
+clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, 
+                sizeof(size_t), &maxWorkGroupSize, NULL);
+```
+
+### Memory Management
+- `clCreateBuffer`: Efficient buffer creation
+  - [Documentation](https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clCreateBuffer.html)
+```cpp
+cl_mem inputBuffer = clCreateBuffer(context, 
+                                   CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+                                   bufferSize, hostPtr, &err);
+```
+
+### Performance Profiling
+- `clGetEventProfilingInfo`: Kernel execution profiling
+  - [Documentation](https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clGetEventProfilingInfo.html)
+```cpp
+cl_ulong startTime, endTime;
+clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, 
+                        sizeof(cl_ulong), &startTime, NULL);
+clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, 
+                        sizeof(cl_ulong), &endTime, NULL);
+```
+
+## Optimized OpenCL Kernel Example
+```cpp
+__kernel void sha256_kernel(__global const uint* input,
+                           __global uint* output,
+                           __local uint* shared_data)
+{
+    size_t gid = get_global_id(0);
+    size_t lid = get_local_id(0);
+    
+    // Collaborative loading of data into local memory
+    if (lid < DATA_PARALLEL_FACTOR) {
+        shared_data[lid] = input[gid / DATA_PARALLEL_FACTOR + lid];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    // Mining-specific computation here
+}
+```
+
+## wxWidgets Integration
+
+### GUI Implementation
+- Event-driven architecture for mining control
+- Real-time hash rate and hardware monitoring displays
+- Configuration interface for mining parameters
+
+### CMake Integration
+```cmake
+find_package(wxWidgets REQUIRED COMPONENTS core base)
+include(${wxWidgets_USE_FILE})
+
+add_executable(miner-gui src/main.cpp src/gui.cpp)
+target_link_libraries(miner-gui PRIVATE 
+    OpenCL::OpenCL
+    ${wxWidgets_LIBRARIES}
+)
+```
+
+### GUI Best Practices
+- Use `wxThread` for non-blocking mining operations
+- Implement `wxTimer` for GUI updates (hash rate, temperature)
+- Utilize `wxGrid` for displaying mining statistics
+- Message queue for thread-safe GUI updates
+
+### Example GUI Component
+```cpp
+class MinerFrame : public wxFrame {
+public:
+    MinerFrame() : wxFrame(nullptr, wxID_ANY, "Crypto Miner") {
+        // Layout
+        auto mainSizer = new wxBoxSizer(wxVERTICAL);
+        
+        // Controls
+        auto startButton = new wxButton(this, wxID_ANY, "Start Mining");
+        hashRateText = new wxStaticText(this, wxID_ANY, "Hash Rate: 0 H/s");
+        
+        // Events
+        startButton->Bind(wxEVT_BUTTON, &MinerFrame::OnStartMining, this);
+        
+        // Update timer
+        wxTimer* timer = new wxTimer(this);
+        Bind(wxEVT_TIMER, &MinerFrame::OnUpdateStats, this);
+        timer->Start(1000); // Update every second
+    }
+
+private:
+    wxStaticText* hashRateText;
+    
+    void OnStartMining(wxCommandEvent& evt) {
+        // Start mining in separate thread
+        auto thread = new MiningThread(this);
+        thread->Run();
+    }
+    
+    void OnUpdateStats(wxTimerEvent& evt) {
+        // Update GUI with current mining stats
+        hashRateText->SetLabel(wxString::Format(
+            "Hash Rate: %.2f MH/s", getCurrentHashRate()));
+    }
+};
+```
+
+## Comprehensive Error Handling
+```cpp
+class MiningException : public wxException {
+public:
+    MiningException(const std::string& message) 
+        : m_message(message) {}
+    
+    virtual const wxChar* what() const wxTHROW_OVERRIDE {
+        return m_message.wc_str();
+    }
+private:
+    wxString m_message;
+};
+
+// Usage in OpenCL code
+try {
+    cl_int error = clFunction(...);
+    checkError(error, "OpenCL operation");
+} catch (const std::runtime_error& e) {
+    throw MiningException(wxString::Format(
+        "Mining error: %s", e.what()));
+}
+```
+
+## Performance Best Practices
+1. Use `clEnqueueWriteBufferRect` for optimized 2D/3D data transfers
+2. Implement `clEnqueueMapBuffer` for zero-copy buffer access where possible
+3. Utilize `clEnqueueNDRangeKernel` events for operation pipelining
+4. Minimize GUI updates to reduce overhead
+5. Use event batching for high-frequency updates
+6. Implement separate thread for OpenCL operations
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c5ce08235..18fc1ada5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -624,15 +624,26 @@ file(GLOB SRCFILES_CPP "xmrstak/cli/*.cpp")
 set_source_files_properties(${SRCFILES_CPP} PROPERTIES LANGUAGE CXX)
 
 if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
-    add_executable(xmr-stak ${SRCFILES_CPP} xmrstak/cli/xmr-stak.manifest)
+    add_executable(xmr-stak_initial 
+        ${SRCFILES_CPP}
+        xmrstak/cli/xmr-stak.manifest
+    )
 else()
-    add_executable(xmr-stak ${SRCFILES_CPP})
+    add_executable(xmr-stak_initial 
+        ${SRCFILES_CPP}
+    )
 endif()
 
 set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}")
 set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}")
 
-target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm)
+target_link_libraries(xmr-stak_initial 
+    PRIVATE
+    xmr-stak-backend
+    ${LIBS}
+    ${CMAKE_DL_LIBS}
+    ${HWLOC_LIBRARIES}
+)
 
 ################################################################################
 # Install
@@ -640,7 +651,7 @@ target_link_libraries(xmr-stak ${LIBS} xmr-stak-c xmr-stak-backend xmr-stak-asm)
 
 # do not install the binary if the project and install are equal
 if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR )
-    install(TARGETS xmr-stak
+    install(TARGETS xmr-stak_initial
             RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/${EXECUTABLE_OUTPUT_PATH}")
     if(CUDA_FOUND)
         if(WIN32)
@@ -664,3 +675,37 @@ else()
     # this rule is used if the install prefix is the build directory
     install(CODE "MESSAGE(\"xmr-stak installed to folder 'bin'\")")
 endif()
+# Find wxWidgets
+find_package(wxWidgets 3.2.4 REQUIRED COMPONENTS core base)
+if(wxWidgets_FOUND)
+    include(${wxWidgets_USE_FILE})
+    find_package(Threads REQUIRED)
+    add_executable(xmr-stak_day2day 
+        xmrstak/setup_gui.cpp
+    )
+    target_link_options(xmr-stak_day2day PRIVATE
+    -L/usr/local/lib
+    )
+    target_link_libraries(xmr-stak_day2day 
+        wx_baseu-3.2
+        wx_gtk3u_core-3.2
+        pthread
+        #${wxWidgets_LIBRARIES}
+    )
+    # Add wxWidgets compile definitions and options
+    target_compile_options(xmr-stak_day2day PRIVATE ${wxWidgets_CXX_FLAGS})
+    # Print include directories for debugging
+get_target_property(INCLUDE_DIRS xmr-stak_day2day INCLUDE_DIRECTORIES)
+message("Include directories: ${INCLUDE_DIRS}")
+else()
+    message(WARNING "wxWidgets not found. xmr-stak_day2day will not be built.")
+endif()
+# Add wxWidgets include directories to your target
+target_include_directories(xmr-stak_day2day PRIVATE ${wxWidgets_INCLUDE_DIRS})
+
+
+message(STATUS "wxWidgets_FOUND: ${wxWidgets_FOUND}")
+message(STATUS "wxWidgets_INCLUDE_DIRS: ${wxWidgets_INCLUDE_DIRS}")
+message(STATUS "wxWidgets_LIBRARIES: ${wxWidgets_LIBRARIES}")
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
\ No newline at end of file
diff --git a/xmrstak/setup_gui.cpp b/xmrstak/setup_gui.cpp
new file mode 100644
index 000000000..59b8c0e91
--- /dev/null
+++ b/xmrstak/setup_gui.cpp
@@ -0,0 +1,52 @@
+#include "setup_gui.hpp"
+#include <wx/sizer.h>
+#include <wx/button.h>
+#include <wx/textctrl.h>
+#include <wx/stattext.h>
+#include <wx/wx.h>
+#include <wx/filename.h>
+#include <wx/msgdlg.h>
+#include <wx/stdpaths.h>
+
+enum {
+    ID_START_BUTTON = wxID_HIGHEST + 1
+};
+// Implement the member functions of MiningConfigFrame
+MiningConfigFrame::MiningConfigFrame(wxWindow* parent, wxWindowID id, const wxString& title, 
+                                     const wxPoint& pos, const wxSize& size)
+    : wxFrame(parent, id, title, pos, size)
+{
+    // Constructor implementation
+    // ...
+}
+
+void MiningConfigFrame::OnStart(wxCommandEvent& event)
+{
+    // Your OnStart implementation here
+}
+
+// Implement the member functions of MyApp
+bool GUIApp::OnInit()
+    {
+        if (!wxApp::OnInit())
+            return false;
+
+        // Check for pool.txt file
+        wxString poolFilePath = wxFileName::GetCwd() + wxFileName::GetPathSeparator() + "pools.txt";
+
+        if (!wxFileExists(poolFilePath))
+        {
+            wxMessageBox("The pools.txt file is missing in the /bin folder. "
+                         "Please create this file running xmr-stak_initial setup before starting the miner this way.",
+                         "Configuration Missing",
+                         wxOK | wxICON_ERROR);
+            return false;
+        }
+
+        MiningConfigFrame* frame = new MiningConfigFrame(nullptr, wxID_ANY, "XMR-Stak Day2Day Mining", wxDefaultPosition, wxSize(400, 300));
+        frame->Show(true);
+        return true;
+    }
+
+
+wxIMPLEMENT_APP(GUIApp);
diff --git a/xmrstak/setup_gui.hpp b/xmrstak/setup_gui.hpp
new file mode 100644
index 000000000..6ed307bc9
--- /dev/null
+++ b/xmrstak/setup_gui.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <wx/wx.h>
+
+class MiningConfigFrame : public wxFrame
+{
+public:
+    MiningConfigFrame(wxWindow* parent, wxWindowID id, const wxString& title, 
+                    const wxPoint& pos = wxDefaultPosition, 
+                    const wxSize& size = wxDefaultSize);
+
+private:
+
+    void OnStart(wxCommandEvent& event);
+
+    // Other private members and methods
+};
+
+class GUIApp : public wxApp
+{
+public:
+    virtual bool OnInit() override;
+};

From 1133cbfe08b5e0300acf17e3e93691f738ddaccd Mon Sep 17 00:00:00 2001
From: acktarius <xchaps@aol.com>
Date: Thu, 10 Oct 2024 00:17:14 -0400
Subject: [PATCH 4/6] readMiningConfig()

---
 CMakeLists.txt        |  2 -
 xmrstak/setup_gui.cpp | 86 ++++++++++++++++++++++++++++++++++---------
 xmrstak/setup_gui.hpp | 28 +++++++++++++-
 3 files changed, 94 insertions(+), 22 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 18fc1ada5..71443723a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -707,5 +707,3 @@ target_include_directories(xmr-stak_day2day PRIVATE ${wxWidgets_INCLUDE_DIRS})
 message(STATUS "wxWidgets_FOUND: ${wxWidgets_FOUND}")
 message(STATUS "wxWidgets_INCLUDE_DIRS: ${wxWidgets_INCLUDE_DIRS}")
 message(STATUS "wxWidgets_LIBRARIES: ${wxWidgets_LIBRARIES}")
-
-set(CMAKE_VERBOSE_MAKEFILE ON)
\ No newline at end of file
diff --git a/xmrstak/setup_gui.cpp b/xmrstak/setup_gui.cpp
index 59b8c0e91..de4ed3a5d 100644
--- a/xmrstak/setup_gui.cpp
+++ b/xmrstak/setup_gui.cpp
@@ -7,10 +7,59 @@
 #include <wx/filename.h>
 #include <wx/msgdlg.h>
 #include <wx/stdpaths.h>
+#include <wx/textfile.h>
+#include <wx/string.h>
 
 enum {
     ID_START_BUTTON = wxID_HIGHEST + 1
 };
+//readMiningConfig
+std::tuple<bool, std::string, std::string> readMiningConfig()
+{
+    wxString poolsFilePath = wxFileName::GetCwd() + wxFileName::GetPathSeparator() + "pools.txt";
+    bool poolsFileExists = wxFileExists(poolsFilePath);
+
+    if (!poolsFileExists)
+    {
+        return std::make_tuple(false, std::string(), std::string());
+    }
+
+    wxTextFile file(poolsFilePath);
+    if (!file.Open())
+    {
+        return std::make_tuple(false, std::string(), std::string());
+    }
+
+    wxString poolAddress, walletAddress;
+    for (wxString line = file.GetFirstLine(); !file.Eof(); line = file.GetNextLine())
+    {
+        if (line.StartsWith("\"pool_address\" : "))
+        {
+            poolAddress = line.AfterFirst(':').Trim(true).Trim(false);
+            poolAddress.Remove(0, 1).RemoveLast();
+        }
+        else if (line.StartsWith("\"wallet_address\" : "))
+        {
+            walletAddress = line.AfterFirst(':').Trim(true).Trim(false);
+            walletAddress.Remove(0, 1).RemoveLast();
+        }
+
+        if (!poolAddress.IsEmpty() && !walletAddress.IsEmpty())
+        {
+            break;
+        }
+    }
+
+    file.Close();
+
+    if (poolAddress.IsEmpty() || walletAddress.IsEmpty())
+    {
+        return std::make_tuple(true, std::string(), std::string());  //TODO: change to false when pools.txt is fixed
+    }
+
+    return std::make_tuple(true, poolAddress.ToStdString(), walletAddress.ToStdString());
+}
+
 // Implement the member functions of MiningConfigFrame
 MiningConfigFrame::MiningConfigFrame(wxWindow* parent, wxWindowID id, const wxString& title, 
                                      const wxPoint& pos, const wxSize& size)
@@ -25,28 +74,29 @@ void MiningConfigFrame::OnStart(wxCommandEvent& event)
     // Your OnStart implementation here
 }
 
+
 // Implement the member functions of MyApp
 bool GUIApp::OnInit()
-    {
-        if (!wxApp::OnInit())
-            return false;
-
-        // Check for pool.txt file
-        wxString poolFilePath = wxFileName::GetCwd() + wxFileName::GetPathSeparator() + "pools.txt";
-
-        if (!wxFileExists(poolFilePath))
-        {
-            wxMessageBox("The pools.txt file is missing in the /bin folder. "
-                         "Please create this file running xmr-stak_initial setup before starting the miner this way.",
-                         "Configuration Missing",
-                         wxOK | wxICON_ERROR);
-            return false;
-        }
+{
+    if (!wxApp::OnInit())
+        return false;
 
-        MiningConfigFrame* frame = new MiningConfigFrame(nullptr, wxID_ANY, "XMR-Stak Day2Day Mining", wxDefaultPosition, wxSize(400, 300));
-        frame->Show(true);
-        return true;
+    // Load the pool and wallet settings via readMiningConfig()
+    auto [fileReachable, pool, wallet] = readMiningConfig();
+    
+    if (fileReachable == false)
+    {
+        wxMessageBox("The pools.txt file is missing or invalid in the /bin folder. "
+                     "Please create this file by running xmr-stak_initial setup before starting the miner this way.",
+                     "Configuration Missing",
+                     wxOK | wxICON_ERROR);
+        return false;
     }
 
+    MiningConfigFrame* frame = new MiningConfigFrame(nullptr, wxID_ANY, "XMR-Stak Day2Day Mining", wxDefaultPosition, wxSize(400, 300));
+    frame->SetPoolInfo(pool, wallet);  // Assuming you have this method in MiningConfigFrame
+    frame->Show(true);
+    return true;
+}
 
 wxIMPLEMENT_APP(GUIApp);
diff --git a/xmrstak/setup_gui.hpp b/xmrstak/setup_gui.hpp
index 6ed307bc9..a788a23f1 100644
--- a/xmrstak/setup_gui.hpp
+++ b/xmrstak/setup_gui.hpp
@@ -1,6 +1,8 @@
 #pragma once
 
 #include <wx/wx.h>
+#include <tuple>
+#include <string>
 
 class MiningConfigFrame : public wxFrame
 {
@@ -9,15 +11,37 @@ class MiningConfigFrame : public wxFrame
                     const wxPoint& pos = wxDefaultPosition, 
                     const wxSize& size = wxDefaultSize);
 
-private:
+    void SetPoolInfo(const std::string& pool, const std::string& wallet) {
+        m_pool = pool;
+        m_wallet = wallet;
+        // Update GUI elements if necessary
+    }
 
+private:
     void OnStart(wxCommandEvent& event);
-
+    std::string m_pool;
+    std::string m_wallet;
     // Other private members and methods
 };
 
+class PoolConfig
+{
+public:
+    PoolConfig(bool fileReachable, const std::string& poolAddress, const std::string& walletAddress);
+
+    std::tuple<bool, std::string, std::string> read();
+
+private:
+    bool m_fileReachable;
+    std::string m_poolAddress;
+    std::string m_walletAddress;
+};
+
 class GUIApp : public wxApp
 {
 public:
     virtual bool OnInit() override;
+
+private:
+    // You might want to add any private members here if needed
 };

From 49b115e7c3f58d0c17ca60f7fc4ce8085d4b2b5c Mon Sep 17 00:00:00 2001
From: acktarius <xchaps@aol.com>
Date: Thu, 10 Oct 2024 10:49:48 -0400
Subject: [PATCH 5/6] day2day

---
 CMakeLists.txt                              | 24 ++++++++------
 xmrstak/{ => backend/day2day}/setup_gui.cpp | 36 ++++++++-------------
 xmrstak/{ => backend/day2day}/setup_gui.hpp |  0
 3 files changed, 29 insertions(+), 31 deletions(-)
 rename xmrstak/{ => backend/day2day}/setup_gui.cpp (77%)
 rename xmrstak/{ => backend/day2day}/setup_gui.hpp (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 71443723a..b901ff170 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -624,12 +624,12 @@ file(GLOB SRCFILES_CPP "xmrstak/cli/*.cpp")
 set_source_files_properties(${SRCFILES_CPP} PROPERTIES LANGUAGE CXX)
 
 if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
-    add_executable(xmr-stak_initial 
+    add_executable(xmr-stak 
         ${SRCFILES_CPP}
         xmrstak/cli/xmr-stak.manifest
     )
 else()
-    add_executable(xmr-stak_initial 
+    add_executable(xmr-stak
         ${SRCFILES_CPP}
     )
 endif()
@@ -637,7 +637,7 @@ endif()
 set(EXECUTABLE_OUTPUT_PATH "bin" CACHE STRING "Path to place executables relative to ${CMAKE_INSTALL_PREFIX}")
 set(LIBRARY_OUTPUT_PATH "bin" CACHE STRING "Path to place libraries relative to ${CMAKE_INSTALL_PREFIX}")
 
-target_link_libraries(xmr-stak_initial 
+target_link_libraries(xmr-stak
     PRIVATE
     xmr-stak-backend
     ${LIBS}
@@ -651,7 +651,7 @@ target_link_libraries(xmr-stak_initial
 
 # do not install the binary if the project and install are equal
 if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR )
-    install(TARGETS xmr-stak_initial
+    install(TARGETS xmr-stak
             RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/${EXECUTABLE_OUTPUT_PATH}")
     if(CUDA_FOUND)
         if(WIN32)
@@ -677,11 +677,14 @@ else()
 endif()
 # Find wxWidgets
 find_package(wxWidgets 3.2.4 REQUIRED COMPONENTS core base)
-if(wxWidgets_FOUND)
+if(NOT wxWidgets_FOUND) # CMake negates with the NOT keyword; "!" is not an operator
+    message(WARNING "wxWidgets not found. xmr-stak_day2day will not be built.")
+else()
     include(${wxWidgets_USE_FILE})
     find_package(Threads REQUIRED)
     add_executable(xmr-stak_day2day 
-        xmrstak/setup_gui.cpp
+        xmrstak/backend/day2day/setup_gui.cpp
+        xmrstak/misc/environment.cpp
     )
     target_link_options(xmr-stak_day2day PRIVATE
     -L/usr/local/lib
@@ -690,18 +693,21 @@ if(wxWidgets_FOUND)
         wx_baseu-3.2
         wx_gtk3u_core-3.2
         pthread
+        xmr-stak-backend
         #${wxWidgets_LIBRARIES}
     )
     # Add wxWidgets compile definitions and options
     target_compile_options(xmr-stak_day2day PRIVATE ${wxWidgets_CXX_FLAGS})
     # Print include directories for debugging
 get_target_property(INCLUDE_DIRS xmr-stak_day2day INCLUDE_DIRECTORIES)
+file(GLOB MISC_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/xmrstak/misc/*.cpp")
+target_sources(xmr-stak_day2day PRIVATE ${MISC_SOURCES})
 message("Include directories: ${INCLUDE_DIRS}")
-else()
-    message(WARNING "wxWidgets not found. xmr-stak_day2day will not be built.")
+target_include_directories(xmr-stak_day2day PRIVATE ${wxWidgets_INCLUDE_DIRS})
+    
 endif()
 # Add wxWidgets include directories to your target
-target_include_directories(xmr-stak_day2day PRIVATE ${wxWidgets_INCLUDE_DIRS})
+
 
 
 message(STATUS "wxWidgets_FOUND: ${wxWidgets_FOUND}")
diff --git a/xmrstak/setup_gui.cpp b/xmrstak/backend/day2day/setup_gui.cpp
similarity index 77%
rename from xmrstak/setup_gui.cpp
rename to xmrstak/backend/day2day/setup_gui.cpp
index de4ed3a5d..a3aa086d8 100644
--- a/xmrstak/setup_gui.cpp
+++ b/xmrstak/backend/day2day/setup_gui.cpp
@@ -30,27 +30,13 @@ std::tuple<bool, std::string, std::string> readMiningConfig()
         return std::make_tuple(false, std::string(), std::string());
     }
 
-    wxString poolAddress, walletAddress;
-    for (wxString line = file.GetFirstLine(); !file.Eof(); line = file.GetNextLine())
-    {
-        if (line.StartsWith("\"pool_address\" : "))
-        {
-            poolAddress = line.AfterFirst(':').Trim(true).Trim(false);
-            poolAddress.Remove(0, 1).RemoveLast();
-        }
-        else if (line.StartsWith("\"wallet_address\" : "))
-        {
-            walletAddress = line.AfterFirst(':').Trim(true).Trim(false);
-            walletAddress.Remove(0, 1).RemoveLast();
-        }
+    file.Close();
 
-        if (!poolAddress.IsEmpty() && !walletAddress.IsEmpty())
-        {
-            break;
-        }
-    }
+    wxString poolAddress, walletAddress;
+    poolAddress = "toto";
+    walletAddress = "ccx7";
 
-    file.Close();
+    
 
     if (poolAddress.IsEmpty() || walletAddress.IsEmpty())
     {
@@ -86,13 +72,19 @@ bool GUIApp::OnInit()
     
     if (fileReachable == false)
     {
-        wxMessageBox("The pools.txt file is missing or invalid in the /bin folder. "
-                     "Please create this file by running xmr-stak_initial setup before starting the miner this way.",
+        wxMessageBox("The pools.txt file is missing or invalid in the /bin folder.\n"
+                     "Please create this file by running xmr-stak for initial setup,\n"
+                     "before starting the miner this way.",
                      "Configuration Missing",
                      wxOK | wxICON_ERROR);
         return false;
     }
-
+    {   // Informational only: show the values returned by readMiningConfig().
+        wxMessageBox("The pools.txt file has been detected: "
+                     + pool + " " + wallet,
+                     "Configuration",
+                     wxOK | wxICON_INFORMATION);
+    }
     MiningConfigFrame* frame = new MiningConfigFrame(nullptr, wxID_ANY, "XMR-Stak Day2Day Mining", wxDefaultPosition, wxSize(400, 300));
     frame->SetPoolInfo(pool, wallet);  // Assuming you have this method in MiningConfigFrame
     frame->Show(true);
diff --git a/xmrstak/setup_gui.hpp b/xmrstak/backend/day2day/setup_gui.hpp
similarity index 100%
rename from xmrstak/setup_gui.hpp
rename to xmrstak/backend/day2day/setup_gui.hpp

From 7f44db29f85559a19af01129091c8c64aaf4f396 Mon Sep 17 00:00:00 2001
From: acktarius <xchaps@aol.com>
Date: Sun, 13 Oct 2024 18:58:41 -0400
Subject: [PATCH 6/6] readme

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 2e2eb61fa..6da78d59a 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
+## This fork adds Conceal as a coin and removes the cryptonight_conceal algorithm to avoid confusion
+
+---
+
 <a href="doc/README.md" _target="blank"><img src="doc/_img/gpu.png"></a>
 <a href="#select_coin" _target="blank"><img src="doc/_img/cpu.png"></a>
 <table>