-
-
Notifications
You must be signed in to change notification settings - Fork 73
feat: add --umi-prefix to CopyUmiFromReadName #958
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 7 commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
4454321
feat: add --umi-prefix to CopyUmiFromReadName
msto 2fd68b2
introduce umiDelimiter, rcPrefix, and normalizeRcUmis options; make c…
jdidion 4d056ec
add codeowners
jdidion c17ed1e
fix typo
jdidion e58a576
Merge branch 'main' into ms_add-umi-prefix
jdidion 51369ba
Update src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala
jdidion 41882df
update tests
jdidion 700741e
remove option to normalize reverse-complemented UMIs
jdidion 56b4d75
change to having an option that disables default behavior
jdidion File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,29 +36,51 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger} | |
""" | ||
|Copies the UMI at the end of the BAM's read name to the RX tag. | ||
| | ||
|The read name is split on `:` characters with the last field is assumed to be the UMI sequence. The UMI | ||
|The read name is split on `:` characters with the last field assumed to be the UMI sequence. The UMI | ||
|will be copied to the `RX` tag as per the SAM specification. If any read does not have a UMI composed of | ||
|valid bases (ACGTN), the program will report the error and fail. | ||
| | ||
|If a read name contains multiple UMIs they may be delimited by either hyphens (`-`) or pluses (`+`). The | ||
|resulting UMI in the `RX` tag will always be hyphen delimited. | ||
|If a read name contains multiple UMIs they may be delimited (typically by a hyphen (`-`) or plus (`+`)). | ||
|The `--umi-delimiter` option specifies the delimiter on which to split. The resulting UMI in the `RX` tag | ||
|will always be hyphen delimited. | ||
| | ||
|Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add a prefix to indicate that the sequence | ||
|has been reverse-complemented. The `--reverse-complement-prefix` option specifies the prefix character(s) and causes them to | |
|be removed. Additionally, if the `--normalize-reverse-complement-umis` flag is specified, any reverse-complemented UMIs will | |
|be normalized (i.e., reverse-complemented back to be in the forward orientation). | ||
| | ||
|To obtain behavior similar to `umi_tools`' `--umi-separator=":r"`, specify the delimiter and | ||
|prefix separately, i.e. `--field-delimiter=":"` and `--reverse-complement-prefix="r"`. | ||
""") | ||
class CopyUmiFromReadName | ||
( @arg(flag='i', doc="The input BAM file") input: PathToBam, | ||
@arg(flag='o', doc="The output BAM file") output: PathToBam, | ||
@arg(doc="Remove the UMI from the read name") removeUmi: Boolean = false | ||
( @arg(flag='i', doc="The input BAM file.") input: PathToBam, | ||
@arg(flag='o', doc="The output BAM file.") output: PathToBam, | ||
@arg(doc="Remove the UMI from the read name.") removeUmi: Boolean = false, | ||
@arg(doc="Delimiter between the read name and UMI.") fieldDelimiter: Char = ':', | ||
@arg(doc="Delimiter between UMI sequences.") umiDelimiter: Char = '+', | ||
@arg(flag='p', doc="The prefix to a UMI sequence that indicates it is reverse-complemented.") reverseComplementPrefix: Option[String] = None, | ||
@arg(flag='r', doc="Whether to reverse-complement UMI sequences with the '--reverse-complement-prefix'.") normalizeReverseComplementUmis: Boolean = false, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please remove, and condition on if — There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done |
||
) extends FgBioTool with LazyLogging { | ||
|
||
nh13 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Io.assertReadable(input) | ||
Io.assertCanWriteFile(output) | ||
validate(reverseComplementPrefix.forall(_.nonEmpty), "--reverse-complement-prefix cannot be an empty string") | ||
|
||
override def execute(): Unit = { | ||
val source = SamSource(input) | ||
val writer = SamWriter(output, source.header) | ||
val progress = new ProgressLogger(logger) | ||
source.foreach { rec => | ||
progress.record(rec) | ||
writer += Umis.copyUmiFromReadName(rec=rec, removeUmi=removeUmi) | ||
|
||
writer += Umis.copyUmiFromReadName( | ||
rec = rec, | ||
removeUmi = removeUmi, | ||
fieldDelimiter = fieldDelimiter, | ||
umiDelimiter = umiDelimiter, | ||
reverseComplementPrefix = reverseComplementPrefix, | ||
normalizeReverseComplementUmis = normalizeReverseComplementUmis | ||
) | ||
} | ||
progress.logLast() | ||
source.safelyClose() | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
usage needs to be updated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done