Skip to content

Commit 4e19ddb

Browse files
Implement chunked buffering for output to optimize speed and memory usage in similarity computation
1 parent ec330b9 commit 4e19ddb

File tree

1 file changed

+36
-15
lines changed

1 file changed

+36
-15
lines changed

src/subcommand/similarity_main.cpp

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
#include "split.hpp"
55
#include <omp.h>
66
#include "utils.hpp"
7+
#include <sstream>
8+
#include <iomanip>
79

810
namespace odgi {
911

@@ -371,6 +373,13 @@ int main_similarity(int argc, char** argv) {
371373
}
372374

373375
std::cout << std::endl;
376+
377+
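// Accumulate output lines in an in-memory buffer and write them to stdout in fixed-size chunks, so results are neither written line by line nor held in memory all at once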
378+
std::ostringstream output_buffer;
379+
output_buffer << std::fixed << std::setprecision(8);
380+
const size_t buffer_chunk_size = 100000; // Lines per chunk
381+
size_t lines_written = 0;
382+
374383
for (auto& p : path_intersection_length) {
375384
uint32_t id_a, id_b;
376385
decode_pair(p.first, &id_a, &id_b);
@@ -383,28 +392,40 @@ int main_similarity(int argc, char** argv) {
383392
const double dice = 2.0 * ((double) intersection / (double)(bp_count[id_a] + bp_count[id_b]));
384393
const double estimated_identity = 2.0 * jaccard / (1.0 + jaccard);
385394

386-
std::cout << get_path_name(id_a) << "\t"
387-
<< get_path_name(id_b) << "\t"
388-
<< bp_count[id_a] << "\t"
389-
<< bp_count[id_b] << "\t"
390-
<< intersection << "\t";
395+
output_buffer << get_path_name(id_a) << "\t"
396+
<< get_path_name(id_b) << "\t"
397+
<< bp_count[id_a] << "\t"
398+
<< bp_count[id_b] << "\t"
399+
<< intersection << "\t";
391400

392401
if (emit_distances) {
393402
const double euclidian_distance = std::sqrt((double)((bp_count[id_a] + bp_count[id_b] - intersection) - intersection));
394403
const uint64_t manhattan_distance = (bp_count[id_a] + bp_count[id_b] - intersection) - intersection;
395-
std::cout << (1.0 - jaccard) << "\t"
396-
<< (1.0 - cosine) << "\t"
397-
<< (1.0 - dice) << "\t"
398-
<< (1.0 - estimated_identity) << "\t"
399-
<< euclidian_distance << "\t"
400-
<< manhattan_distance << std::endl;
404+
output_buffer << (1.0 - jaccard) << "\t"
405+
<< (1.0 - cosine) << "\t"
406+
<< (1.0 - dice) << "\t"
407+
<< (1.0 - estimated_identity) << "\t"
408+
<< euclidian_distance << "\t"
409+
<< manhattan_distance << "\n";
401410
} else {
402-
std::cout << jaccard << "\t"
403-
<< cosine << "\t"
404-
<< dice << "\t"
405-
<< estimated_identity << std::endl;
411+
output_buffer << jaccard << "\t"
412+
<< cosine << "\t"
413+
<< dice << "\t"
414+
<< estimated_identity << "\n";
415+
}
416+
417+
// Every buffer_chunk_size lines, write the buffered text to stdout and reset the buffer
418+
if (++lines_written % buffer_chunk_size == 0) {
419+
std::cout << output_buffer.str();
420+
output_buffer.str("");
421+
output_buffer.clear();
406422
}
407423
}
424+
425+
// Write remaining buffer
426+
if (!output_buffer.str().empty()) {
427+
std::cout << output_buffer.str();
428+
}
408429

409430
return 0;
410431
}

0 commit comments

Comments
 (0)