diff --git a/CHANGES.md b/CHANGES.md index b7cd2923f..ddb21b0d9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,11 +2,14 @@ ## __NEXT__ +* align: Added `--alignment-args` and `--override-default-args` options for passing arguments to the alignment program, either appended to or replacing the defaults. [#1789][] (@vbadelita) + ### Bug fixes * export v2: Improved the error message that is displayed when the metadata index column has duplicated values [#1791][] (@genehack) * tree: Improved help text for `--tree-builder-args` to explain some IQ-TREE options won't work because of defline rewriting [#875][] (@genehack) +[#1789]: https://github.com/nextstrain/augur/issues/1789 [#875]: https://github.com/nextstrain/augur/issues/875 [#1791]: https://github.com/nextstrain/augur/issues/1791 diff --git a/augur/align.py b/augur/align.py index 37551903f..bf7a45543 100644 --- a/augur/align.py +++ b/augur/align.py @@ -14,6 +14,10 @@ from .utils import nthreads_value from collections import defaultdict +DEFAULT_ARGS = { + "mafft": "--reorder --anysymbol --nomemsave --adjustdirection", +} + class AlignmentError(Exception): # TODO: this exception should potentially be renamed and made augur-wide # thus allowing any module to raise it and have the message printed & augur @@ -31,6 +35,9 @@ def register_arguments(parser): parser.add_argument('--nthreads', type=nthreads_value, default=1, help="number of threads to use; specifying the value 'auto' will cause the number of available CPU cores on your system, if determinable, to be used") parser.add_argument('--method', default='mafft', choices=["mafft"], help="alignment program to use") + parser.add_argument('--alignment-args', help="arguments to pass to the alignment program (except for threads, keeplength if `--existing-alignment` is passed), overriding defaults. 
" + + f"mafft defaults: `{DEFAULT_ARGS['mafft']}`") + parser.add_argument('--override-default-args', action="store_true", help="override default alignment program arguments with the values provided by the user in `--alignment-args` instead of augmenting the existing defaults.") parser.add_argument('--reference-name', metavar="NAME", type=str, help="strip insertions relative to reference sequence; use if the reference is already in the input sequences") parser.add_argument('--reference-sequence', metavar="PATH", type=str, help="Add this reference sequence to the dataset & strip insertions relative to this. Use if the reference is NOT already in the input sequences") parser.add_argument('--remove-reference', action="store_true", default=False, help="remove reference sequence from the alignment") @@ -132,7 +139,7 @@ def run(args): # generate alignment command & run log = args.output + ".log" - cmd = generate_alignment_cmd(args.method, args.nthreads, existing_aln_fname, seqs_to_align_fname, args.output, log) + cmd = generate_alignment_cmd(args.method, args.nthreads, existing_aln_fname, seqs_to_align_fname, args.output, log, alignment_args=args.alignment_args, override_default_args=args.override_default_args) success = run_shell_command(cmd) if not success: raise AlignmentError(f"Error during alignment: please see the log file {log!r} for more details") @@ -248,17 +255,32 @@ def read_reference(ref_fname): "\n\tmake sure the file %s contains one sequence in genbank or fasta format"%ref_fname) return ref_seq -def generate_alignment_cmd(method, nthreads, existing_aln_fname, seqs_to_align_fname, aln_fname, log_fname): +def generate_alignment_cmd(method, nthreads, existing_aln_fname, seqs_to_align_fname, aln_fname, log_fname, alignment_args=None, override_default_args=False): + if method not in DEFAULT_ARGS: + raise AlignmentError('ERROR: alignment method %s not implemented'%method) + + if alignment_args is None: + alignment_args = DEFAULT_ARGS[method] + elif 
override_default_args: + alignment_args = alignment_args + else: + alignment_args = f"{DEFAULT_ARGS[method]} {alignment_args}" + if method=='mafft': + files_to_align = shquote(seqs_to_align_fname) if existing_aln_fname: - cmd = "mafft --add %s --keeplength --reorder --anysymbol --nomemsave --adjustdirection --thread %d %s 1> %s 2> %s"%(shquote(seqs_to_align_fname), nthreads, shquote(existing_aln_fname), shquote(aln_fname), shquote(log_fname)) - else: - cmd = "mafft --reorder --anysymbol --nomemsave --adjustdirection --thread %d %s 1> %s 2> %s"%(nthreads, shquote(seqs_to_align_fname), shquote(aln_fname), shquote(log_fname)) + # With an existing alignment, seqs_to_align_fname is passed as the argument of --add + # and existing_aln_fname is passed as mafft's positional input file + files_to_align = f"--add {shquote(seqs_to_align_fname)} {shquote(existing_aln_fname)}" + alignment_args = " ".join(["--keeplength", alignment_args]) + + cmd = f"mafft {alignment_args} --thread {nthreads} {files_to_align} 1> {shquote(aln_fname)} 2> {shquote(log_fname)}" print("\nusing mafft to align via:\n\t" + cmd + " \n\n\tKatoh et al, Nucleic Acid Research, vol 30, issue 14" "\n\thttps://doi.org/10.1093%2Fnar%2Fgkf436\n") else: raise AlignmentError('ERROR: alignment method %s not implemented'%method) + return cmd diff --git a/tests/test_align.py b/tests/test_align.py index 1f6c2eb83..0d0957c58 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -187,7 +187,7 @@ def test_prettify_alignment(self): def test_generate_alignment_cmd_non_mafft(self): with pytest.raises(align.AlignmentError): - assert align.generate_alignment_cmd('no-mafft', 1, None, None, None, None) + assert align.generate_alignment_cmd('no-mafft', 1, None, None, None, None, alignment_args=None) def test_generate_alignment_cmd_mafft_existing_aln_fname(self): existing_aln_fname = "existing_aln" @@ -199,9 +199,10 @@ def test_generate_alignment_cmd_mafft_existing_aln_fname(self): existing_aln_fname, seqs_to_align_fname, 
aln_fname, - log_fname) + log_fname, + alignment_args=None) - expected = "mafft --add %s --keeplength --reorder --anysymbol --nomemsave --adjustdirection --thread %d %s 1> %s 2> %s" % (quote(seqs_to_align_fname), 1, quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) + expected = "mafft --keeplength --reorder --anysymbol --nomemsave --adjustdirection --thread %d --add %s %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) assert result == expected @@ -214,12 +215,66 @@ def test_generate_alignment_cmd_mafft_no_existing_aln_fname(self): None, seqs_to_align_fname, aln_fname, - log_fname) + log_fname, + alignment_args=None) expected = "mafft --reorder --anysymbol --nomemsave --adjustdirection --thread %d %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(aln_fname), quote(log_fname)) assert result == expected + + def test_generate_alignment_cmd_mafft_custom_args_existing_aln_fname(self): + existing_aln_fname = "existing_aln" + seqs_to_align_fname = "seqs_to_align" + aln_fname = "aln_fname" + log_fname = "log_fname" + + result = align.generate_alignment_cmd("mafft", 1, + existing_aln_fname, + seqs_to_align_fname, + aln_fname, + log_fname, + alignment_args="--auto", + override_default_args=False) + + expected = "mafft --keeplength --reorder --anysymbol --nomemsave --adjustdirection --auto --thread %d --add %s %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) + + assert result == expected + + def test_generate_alignment_cmd_mafft_custom_args_no_existing_aln_fname(self): + seqs_to_align_fname = "seqs_to_align" + aln_fname = "aln_fname" + log_fname = "log_fname" + + result = align.generate_alignment_cmd("mafft", 1, + None, + seqs_to_align_fname, + aln_fname, + log_fname, + alignment_args="--auto", + override_default_args=False) + expected = "mafft --reorder --anysymbol --nomemsave --adjustdirection --auto --thread %d %s 1> %s 2> %s" % 
(1, quote(seqs_to_align_fname), quote(aln_fname), quote(log_fname)) + + assert result == expected + + def test_generate_alignment_cmd_mafft_custom_args_override_args(self): + existing_aln_fname = "existing_aln" + seqs_to_align_fname = "seqs_to_align" + aln_fname = "aln_fname" + log_fname = "log_fname" + + result = align.generate_alignment_cmd("mafft", 1, + existing_aln_fname, + seqs_to_align_fname, + aln_fname, + log_fname, + alignment_args="--auto", + override_default_args=True) + + expected = "mafft --keeplength --auto --thread %d --add %s %s 1> %s 2> %s" % (1, quote(seqs_to_align_fname), quote(existing_aln_fname), quote(aln_fname), quote(log_fname)) + + assert result == expected + def test_read_alignment(self): data_file = pathlib.Path('tests/data/align/test_aligned_sequences.fasta') result = align.read_alignment(str(data_file.resolve()))