Skip to content

Commit 5bee475

Browse files
committed
Fix identity calculation by reading X records from .1aln files
Problem: - Identity values from .1aln files didn't match ALNtoPAF output - Was using D record (trace-point metadata) instead of actual edit distances - Caused filtering inconsistencies between .1aln and PAF workflows Solution: - Read X records (INT_LIST of per-tracepoint edit distances) - Sum X values to get total edit distance - Apply correct ALNtoPAF formula: divergence = (sum(X) - del) / query_span / 2.0 - Calculate matches from corrected identity values The key insight is that sum(X) represents "symmetric" divergence and must be divided by 2 to match ALNtoPAF's calculation. Validation: - 100% match rate on 100 test records (max diff: 0.000098) - Format-preserving .1aln filtering now works correctly - See: IDENTITY_CALCULATION_SOLUTION.md in sweepga repo This enables format-agnostic filtering where .1aln → .1aln produces identical results to .1aln → PAF → filter → PAF conversion.
1 parent 6c2a38a commit 5bee475

File tree

2 files changed

+43
-10
lines changed

2 files changed

+43
-10
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/onelib.rs

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -145,10 +145,12 @@ impl AlnReader {
145145

146146
// Read associated data lines until we hit 'T' (trace) or next 'A'
147147
let mut matches = 0u64;
148-
let mut diffs = 0u64;
148+
let mut diffs_d_record = 0u64; // D record (trace-related, not actual diffs)
149+
let mut x_list_sum = 0u64; // Sum of X records (actual edit distances)
149150
let mut is_reverse = false;
150151
let mut query_len = 0u64;
151152
let mut target_len = 0u64;
153+
let mut has_x_records = false;
152154

153155
loop {
154156
let next_type = self.file.read_line();
@@ -165,7 +167,15 @@ impl AlnReader {
165167
break;
166168
}
167169
'M' => matches = self.file.int(0) as u64,
168-
'D' => diffs = self.file.int(0) as u64,
170+
'D' => diffs_d_record = self.file.int(0) as u64,
171+
'X' => {
172+
// X record contains INT_LIST of per-tracepoint edit distances
173+
// Sum them to get total edit distance (used by ALNtoPAF)
174+
if let Some(x_values) = self.file.int_list() {
175+
x_list_sum = x_values.iter().map(|&v| v as u64).sum();
176+
has_x_records = true;
177+
}
178+
}
169179
'R' => is_reverse = true,
170180
'L' => {
171181
query_len = self.file.int(0) as u64;
@@ -177,23 +187,45 @@ impl AlnReader {
177187
break;
178188
}
179189
_ => {
180-
// Skip other records (X, C, Q, etc.)
190+
// Skip other records (C, Q, etc.)
181191
continue;
182192
}
183193
}
184194
}
185195

186-
// Calculate block length and identity
196+
// Calculate identity using the SAME formula as ALNtoPAF
197+
// Key insight: ALNtoPAF uses sum(X) values, NOT the D record!
198+
//
199+
// From ALNtoPAF.c:
200+
// del = target_span - query_span (deletions in query)
201+
// divergence = (diffs - del) / query_span
202+
// BUT: diffs comes from sum(X), which is "symmetric" divergence
203+
// So we need to divide by 2: divergence = sum(X) / query_span / 2.0
204+
//
205+
// Identity: 1 - divergence
187206
let block_len = a_end - a_beg;
188-
let alignment_len = matches + diffs;
207+
let query_span = block_len as i64;
208+
let target_span = (b_end - b_beg) as i64;
209+
210+
// Use X records if available, otherwise fall back to D record
211+
let diffs = if has_x_records { x_list_sum } else { diffs_d_record };
189212

190-
// Identity: matches / total_aligned_bases
191-
let identity = if alignment_len > 0 {
192-
matches as f64 / alignment_len as f64
213+
// Calculate divergence: (diffs - del) / query_span / 2.0
214+
// where del = target_span - query_span
215+
let del = target_span - query_span;
216+
let divergence = if query_span > 0 {
217+
((diffs as i64 - del) as f64 / query_span as f64) / 2.0
193218
} else {
194219
0.0
195220
};
196221

222+
let identity = (1.0 - divergence).max(0.0).min(1.0); // Clamp to [0, 1]
223+
224+
// Calculate matches from identity and query_span
225+
// (since M record may not be present)
226+
let calculated_matches = (identity * query_span as f64) as u64;
227+
let final_matches = if matches > 0 { matches } else { calculated_matches };
228+
197229
// Create alignment using sequence IDs
198230
// Filtering doesn't need actual names, just unique identifiers
199231
let alignment = Alignment {
@@ -206,12 +238,12 @@ impl AlnReader {
206238
target_len: target_len as usize,
207239
target_start: b_beg as usize,
208240
target_end: b_end as usize,
209-
matches: matches as usize,
241+
matches: final_matches as usize, // Use calculated matches
210242
block_len: block_len as usize,
211243
mapping_quality: 60, // Default quality
212244
cigar: String::new(), // Don't extract CIGAR for filtering
213245
tags: Vec::new(),
214-
mismatches: diffs as usize, // Differences include substitutions + indels
246+
mismatches: diffs as usize, // Total diffs from X records (or D as fallback)
215247
gap_opens: 0, // Not tracked in basic .1aln
216248
gap_len: 0,
217249
};

0 commit comments

Comments
 (0)