From 31fdac3080ecd5a0d79c56386cfd318f32c7920d Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Tue, 14 Jun 2016 09:48:37 -0400 Subject: [PATCH] Output the MD tag for each XA record with -D. --- bwase.c | 27 ++++++++++++++++++++++----- bwase.h | 1 + bwtaln.c | 11 +++++++---- bwtaln.h | 2 ++ 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/bwase.c b/bwase.c index 20cf53f..9188ff9 100644 --- a/bwase.c +++ b/bwase.c @@ -291,11 +291,17 @@ void bwa_correct_trimmed(bwa_seq_t *s) s->len = s->full_len; } -void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) + +void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) +{ + bwa_refine_gapped2(bns, n_seqs, seqs, _pacseq, 0); +} + +void bwa_refine_gapped2(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, int with_md) { ubyte_t *pacseq; int i, j, k; - kstring_t *str; + kstring_t *str = (kstring_t*)calloc(1, sizeof(kstring_t)); if (!_pacseq) { pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); @@ -312,7 +318,18 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar); q->n_cigar = n_cigar; if (q->cigar) s->multi[k++] = *q; - } else s->multi[k++] = *q; + } else { + s->multi[k++] = *q; + if (with_md) { // create the cigar, needed for bwa_cal_md1 below + q->n_cigar = 1; + q->cigar = calloc(q->n_cigar, sizeof(uint32_t)); + q->cigar[0] = __cigar_create(FROM_M, s->len); + } + } + if (with_md) { + int nm; + q->md = bwa_cal_md1(q->n_cigar, q->cigar, s->len, q->pos, q->strand? s->rseq : s->seq, bns->l_pac, pacseq, str, &nm); + } } s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; @@ -320,7 +337,6 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH; } // generate MD tag - str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = 0; i != n_seqs; ++i) { bwa_seq_t *s = seqs + i; if (s->type != BWA_TYPE_NO_MATCH) { @@ -480,7 +496,8 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in for (k = 0; k < q->n_cigar; ++k) err_printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]); } else err_printf("%dM", p->len); - err_printf(",%d;", q->gap + q->mm); + if (q->md) err_printf(",%d,%s;", q->gap + q->mm, q->md); + else err_printf(",%d,.;", q->gap + q->mm); } } } diff --git a/bwase.h b/bwase.h index e9a2a23..34314a7 100644 --- a/bwase.h +++ b/bwase.h @@ -26,6 +26,7 @@ extern "C" { void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr); void bwa_cal_pac_pos_with_bwt(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr, bwt_t *bwt); void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq); + void bwa_refine_gapped2(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, int with_md); void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2); #ifdef __cplusplus diff --git a/bwtaln.c b/bwtaln.c index 08a1ffd..80ff48b 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -37,10 +37,11 @@ gap_opt_t *gap_init_opt() o->n_threads = 1; o->max_top2 = 30; o->trim_qual = 0; + o->n_occ = 3; o->sam = 0; - o->interactive_mode = 0; o->rg_line = NULL; - o->n_occ = 3; + o->interactive_mode = 0; + o->with_md = 0; return o; } @@ -247,7 +248,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); - bwa_refine_gapped(bns, n_seqs, seqs, 0); + bwa_refine_gapped2(bns, n_seqs, seqs, 0, opt->with_md); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] print alignments... "); @@ -283,7 +284,7 @@ int bwa_aln(int argc, char *argv[]) char *prefix; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:Sr:X:Z")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:Sr:X:ZD")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -318,6 +319,7 @@ int bwa_aln(int argc, char *argv[]) break; case 'X': opt->n_occ = atoi(optarg); break; case 'Z': opt->interactive_mode = 1; break; + case 'D': opt->with_md = 1; break; default: return 1; } } @@ -359,6 +361,7 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -r STR read group line for SAM output\n"); fprintf(stderr, " -X INT maximum # of hits to report (SAM output only, equivalent to -n in samse)\n"); fprintf(stderr, " -Z interactive mode (no input buffer and empty lines between records force processing)\n"); + fprintf(stderr, " -D output the MD to each alignment in the XA tag, otherwise use \".\"\n"); fprintf(stderr, "\n"); return 1; } diff --git a/bwtaln.h b/bwtaln.h index 7021087..6c398bc 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -61,6 +61,7 @@ typedef struct { int ref_shift; bwtint_t pos; bwa_cigar_t *cigar; + char *md; } bwt_multi1_t; typedef struct { @@ -117,6 +118,7 @@ typedef struct { char *rg_line; int n_occ; int interactive_mode; + int with_md; } gap_opt_t; #define BWA_PET_STD 1