Skip to content

Commit

Permalink
Debug
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Nov 6, 2024
1 parent b2a37e9 commit 3963dee
Show file tree
Hide file tree
Showing 9 changed files with 45 additions and 19 deletions.
3 changes: 0 additions & 3 deletions backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ private:
uint32_t base_log;
uint32_t mask;
uint32_t num_poly;
int current_level;
T mask_mod_b;
T *state;

Expand All @@ -33,8 +32,6 @@ public:
state(state) {

mask_mod_b = (1ll << base_log) - 1ll;
current_level = level_count;
synchronize_threads_in_block();
}

// Decomposes all polynomials at once
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ __host__ void host_integer_radix_bitop_kb(

auto lut = mem_ptr->lut;

printf("Here\n");
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
bsks, ksks, num_radix_blocks, lut, lut->params.message_modulus);
Expand Down
39 changes: 30 additions & 9 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,13 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

Torus *in_ks_lwe = (Torus *)malloc(sizeof(Torus)*(big_lwe_dimension + 1)*num_radix_blocks);
cuda_memcpy_async_to_cpu(in_ks_lwe, lwe_array_pbs_in, sizeof(Torus)*(big_lwe_dimension + 1)*num_radix_blocks, streams[0], gpu_indexes[0]);
cudaDeviceSynchronize();
for (uint i = 0; i < num_radix_blocks; i++) {
printf("in body cuda before ks: %lu\n", in_ks_lwe[i * (big_lwe_dimension + 1) + big_lwe_dimension]);
}
free(in_ks_lwe);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
Expand All @@ -389,6 +396,13 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks);

Torus *out_ks_lwe = (Torus *)malloc(sizeof(Torus)*(small_lwe_dimension + 1)*num_radix_blocks);
cuda_memcpy_async_to_cpu(out_ks_lwe, lwe_after_ks_vec[0], sizeof(Torus)*(small_lwe_dimension + 1)*num_radix_blocks, streams[0], gpu_indexes[0]);
cudaDeviceSynchronize();
for (uint i = 0; i < num_radix_blocks; i++) {
printf("out body cuda after ks: %lu\n", out_ks_lwe[i * (small_lwe_dimension + 1) + small_lwe_dimension]);
}
free(out_ks_lwe);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
Expand All @@ -397,12 +411,19 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
Torus *out_lwe = (Torus *)malloc(sizeof(Torus)*(big_lwe_dimension + 1)*num_radix_blocks);
cuda_memcpy_async_to_cpu(out_lwe, lwe_array_out, sizeof(Torus)*(big_lwe_dimension + 1)*num_radix_blocks, streams[0], gpu_indexes[0]);
cudaDeviceSynchronize();
for (uint i = 0; i < num_radix_blocks; i++) {
printf("out body cuda: %lu\n", out_lwe[i * (big_lwe_dimension + 1) + big_lwe_dimension]);
}
free(out_lwe);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
lwe_array_pbs_in, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
lwe_array_pbs_in, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
Expand All @@ -414,12 +435,12 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
lut_stride);
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
lut_stride);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();

// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ __global__ void device_programmable_bootstrap_amortized(
res_fft[pos].y = 0;
pos += params::degree / params::opt;
}
synchronize_threads_in_block();

GadgetMatrix<Torus, params> gadget(base_log, level_count,
accumulator_rotated, glwe_dimension + 1);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
global_slice, accumulator);
}
synchronize_threads_in_block();

// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
Expand Down
4 changes: 2 additions & 2 deletions tfhe/src/integer/server_key/radix_parallel/bitwise_op.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ impl ServerKey {
{
ct_left
.blocks_mut()
.par_iter_mut()
.zip(ct_right.blocks().par_iter())
.iter_mut()
.zip(ct_right.blocks().iter())
.for_each(|(ct_left_i, ct_right_i)| {
self.key.unchecked_bitand_assign(ct_left_i, ct_right_i);
});
Expand Down
2 changes: 2 additions & 0 deletions tfhe/src/shortint/server_key/bivariate_pbs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ impl ServerKey {

unchecked_add_assign(ct_left, ct_right);

println!("CPU input to KS PBS");
println!("{}", &ct_left.ct.get_body().data);
// Compute the PBS
self.apply_lookup_table_assign(ct_left, &acc.acc);
}
Expand Down
12 changes: 8 additions & 4 deletions tfhe/src/shortint/server_key/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -800,10 +800,10 @@ impl ServerKey {
}

pub fn apply_lookup_table_assign(&self, ct: &mut Ciphertext, acc: &LookupTableOwned) {
if ct.is_trivial() {
self.trivial_pbs_assign(ct, acc);
return;
}
//if ct.is_trivial() {
// self.trivial_pbs_assign(ct, acc);
// return;
//}

ShortintEngine::with_thread_local_mut(|engine| {
let (mut ciphertext_buffers, buffers) = engine.get_buffers(self);
Expand All @@ -814,6 +814,8 @@ impl ServerKey {
&ct.ct,
&mut ciphertext_buffers.buffer_lwe_after_ks,
);
println!("CPU after keyswitch");
println!("{}", &ct.ct.get_body().data);

apply_programmable_bootstrap(
&self.bootstrapping_key,
Expand All @@ -822,6 +824,8 @@ impl ServerKey {
&acc.acc,
buffers,
);
println!("CPU after PBS");
println!("{}", &ct.ct.get_body().data);
}
PBSOrder::BootstrapKeyswitch => {
apply_programmable_bootstrap(
Expand Down

0 comments on commit 3963dee

Please sign in to comment.