Skip to content

Commit

Permalink
Compress painting output using zlib
Browse files Browse the repository at this point in the history
  • Loading branch information
scwatts committed Mar 28, 2018
1 parent b46c320 commit 9d797d3
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 8 deletions.
3 changes: 3 additions & 0 deletions src/lib/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
#include <sys/stat.h>


// Upper bound, in characters, on the length of a FASTA description (record name);
// the painter refuses input whose description is longer than this
#define MAX_FASTA_DESC_LEN 65536


namespace common {


Expand Down
10 changes: 9 additions & 1 deletion src/painter/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ int main(int argc, char *argv[]) {
#pragma omp critical
fprintf(stdout, "Painting %s\n", options.genome_fps[i].c_str());
for (auto& fasta : fastas) {
// Check that the FASTA name is not excessively large. Simplifies zlib buffering
// TODO: move this before database loading so that errors are raised early
if (fasta.name.size() > MAX_FASTA_DESC_LEN) {
    // Emit the message with a single call so that output from other threads
    // cannot be interleaved into the middle of it
    fprintf(stderr,
            "Input file %s has an excessively large FASTA description (over %d characters), please truncate and try again\n",
            options.genome_fps[i].c_str(), MAX_FASTA_DESC_LEN);
    // TODO: confirm how exits are handled by openmp
    exit(1);
}
// Skip sequences less than size of kmer
if (fasta.sequence.size() < KMER_SIZE) {
continue;
Expand All @@ -46,7 +54,7 @@ int main(int argc, char *argv[]) {
// Write painted genome
#pragma omp critical
fprintf(stdout, "Writing results for %s\n", options.genome_fps[i].c_str());
std::string output_suffix = "_painted.tsv";
std::string output_suffix = "_painted.tsv.gz";
std::string output_fp = output::construct_output_fp(options.genome_fps[i], output_suffix, options.output_dir);
output::write_painted_genome(fasta_painting, database.header.species_counts, output_fp);
}
Expand Down
41 changes: 34 additions & 7 deletions src/painter/output.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
#include "output.h"




#include "zlib.h"


namespace output {


Expand All @@ -25,31 +30,53 @@ std::string construct_output_fp(std::string &genome_fp, std::string &suffix, std


void write_painted_genome(std::vector<paint::FastaPaint> &fasta_painting, std::vector<file::SpeciesCount> species_counts, std::string &output_fp) {
FILE *output_fh = fopen(output_fp.c_str(), "w");
gzFile output_fh = gzopen(output_fp.c_str(), "wb");
char *buffer = (char *)malloc(CHUNK_SIZE);
int buffer_size = 0;
size_t line_size = (2 + species_counts.size()) + /* separators */
MAX_FASTA_DESC_LEN + /* contig name */
sizeof(size_t) + /* position */
(PROB_FIELD_SIZE * species_counts.size()); /* probabilities */

// Header
for (const auto& species_count: species_counts) {
fprintf(output_fh, "#%s\n", species_count.name.c_str());
// Using very conservative line size here but must ensure we don't discard bytes
if ( (buffer_size + line_size) > CHUNK_SIZE) {
gzwrite(output_fh, buffer, buffer_size);
buffer_size = 0;
}
buffer_size += snprintf(buffer+buffer_size, CHUNK_SIZE-buffer_size, "#%s\n", species_count.name.c_str());
}

// Data
for (const auto& fasta_paint : fasta_painting) {
size_t position = 0;
for (const auto& paint_bucket : fasta_paint.paint) {
position++;

// Write only if we have probabilities
if (!paint_bucket.probabilities.empty()) {
fprintf(output_fh, "%s\t%lu", fasta_paint.name.c_str(), position);
// Empty buffer if it's full
if ( (buffer_size + line_size) > CHUNK_SIZE) {
gzwrite(output_fh, buffer, buffer_size);
buffer_size = 0;
}

// Add to buffer
buffer_size += snprintf(buffer+buffer_size, 5000, "%s\t%zd", fasta_paint.name.c_str(), position);
for (auto probability : paint_bucket.probabilities) {
fprintf(output_fh, "\t%f", probability);
buffer_size += snprintf(buffer+buffer_size, CHUNK_SIZE-buffer_size, "\t%f", probability);
}
fprintf(output_fh, "\n");
buffer[buffer_size++] = '\n';
}
}
// Flush buffer
gzwrite(output_fh, buffer, buffer_size);
buffer_size = 0;
}

fclose(output_fh);
// Deallocate resources
free(buffer);
gzclose(output_fh);
}


Expand Down
3 changes: 3 additions & 0 deletions src/painter/output.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
#include "merger/file.h"


// Number of bytes of painting output buffered in memory between zlib writes.
// Parenthesised so the macro expands as a single value inside any expression.
#define CHUNK_SIZE (1024 * 25)


namespace output {


Expand Down

0 comments on commit 9d797d3

Please sign in to comment.