neherlab
diff --git a/‎CHANGELOG.md‎
Lines changed: 4 additions & 1 deletion b/‎CHANGELOG.md‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎Manifest.toml‎
Lines changed: 2 additions & 2 deletions b/‎Manifest.toml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎Project.toml‎
Lines changed: 3 additions & 2 deletions b/‎Project.toml‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎docs/src/cli/build.md‎
Lines changed: 14 additions & 12 deletions b/‎docs/src/cli/build.md‎
Lines changed: 14 additions & 12 deletions
diff --git a/‎docs/src/cli/marginalize.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/src/cli/marginalize.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/PanGraph.jl‎
Lines changed: 2 additions & 0 deletions b/‎src/PanGraph.jl‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/align.jl‎
Lines changed: 79 additions & 17 deletions b/‎src/align.jl‎
Lines changed: 79 additions & 17 deletions
@@ -1,8 +1,11 @@
 # PanGraph Changelog
 
-## v0.6.4 (draft)
+## v0.7.0
 
 - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55).
+- PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57). For the build command, a random seed can be set with the `-r` flag.
+- introduced the `-t` flag in the `build` and `merge` command. This activates consistency checks to verify that the input genomes can be exactly reconstructed. See [#57](https://github.com/neherlab/pangraph/pull/57).
+- Fixed [#56](https://github.com/neherlab/pangraph/issues/56)
 
 ## v0.6.3
 
 
@@ -247,9 +247,9 @@ uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
 version = "0.5.5+0"
 
 [[deps.OrderedCollections]]
-git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
+git-tree-sha1 = "d321bf2de576bf25ec4d3e4360faca399afca282"
 uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-version = "1.4.1"
+version = "1.6.0"
 
 [[deps.PDMats]]
 deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"]
 
@@ -1,7 +1,7 @@
 name = "PanGraph"
 uuid = "0f9f61ca-f32c-45e1-b3bc-00138f4f8814"
 authors = ["Nicholas Noll <[email protected]>"]
-version = "0.6.3"
+version = "0.7.0"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
@@ -12,6 +12,7 @@ JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
+OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
@@ -23,4 +24,4 @@ TreeTools = "62f0eae3-8c0e-4032-a621-7756092209e5"
 minimap2_jll = "d341526d-637d-5003-8fc4-9c6812cd2b55"
 
 [compat]
-TreeTools = ">= 0.6.2"
+TreeTools = ">= 0.6.2"
@@ -4,18 +4,20 @@
 Build a multiple sequence alignment pangraph.
 
 ## Options
-| Name                 | Type    | Short Flag | Long Flag        | Description                                                                                               |
-| :------------------- | :------ | :--------- | :--------------- | :-------------------------------------------------------------------------------------------------------- |
-| minimum length       | Integer | l          | len              | minimum block size for alignment graph (in nucleotides)                                                   |
-| block junction cost  | Float   | a          | alpha            | energy cost for introducing block partitions due to alignment merger                                      |
-| block diversity cost | Float   | b          | beta             | energy cost for interblock diversity due to alignment merger                                              |
-| circular genomes     | Boolean | c          | circular         | toggle if input genomes are circular                                                                      |
-| pairwise sensitivity | String  | s          | sensitivity      | controls the pairwise genome alignment sensitivity of minimap 2. Currently only accepts "5", "10" or "20" |
-| maximum self-maps    | Integer | x          | max-self-map     | maximum number of iterations to perform block self maps per pairwise graph merger                         |
-| enforce uppercase    | Boolean | u          | upper-case       | toggle to force genomes to uppercase characters                                                           |
-| distance calculator  | String  | d          | distance-backend | only accepts "native" or "mash"                                                                           |
-| alignment kernel     | String  | k          | alignment-kernel | only accepts "minimap2" or "mmseqs"                                                                       |
-| kmer length (mmseqs) | Integer | K          | kmer-length      | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default.            |
+| Name                 | Type    | Short Flag | Long Flag        | Description                                                                                                   |
+| :------------------- | :------ | :--------- | :--------------- | :------------------------------------------------------------------------------------------------------------ |
+| minimum length       | Integer | l          | len              | minimum block size for alignment graph (in nucleotides)                                                       |
+| block junction cost  | Float   | a          | alpha            | energy cost for introducing block partitions due to alignment merger                                          |
+| block diversity cost | Float   | b          | beta             | energy cost for interblock diversity due to alignment merger                                                  |
+| circular genomes     | Boolean | c          | circular         | toggle if input genomes are circular                                                                          |
+| pairwise sensitivity | String  | s          | sensitivity      | controls the pairwise genome alignment sensitivity of minimap 2. Currently only accepts "5", "10" or "20"     |
+| maximum self-maps    | Integer | x          | max-self-map     | maximum number of iterations to perform block self maps per pairwise graph merger                             |
+| enforce uppercase    | Boolean | u          | upper-case       | toggle to force genomes to uppercase characters                                                               |
+| distance calculator  | String  | d          | distance-backend | only accepts "native" or "mash"                                                                               |
+| alignment kernel     | String  | k          | alignment-kernel | only accepts "minimap2" or "mmseqs"                                                                           |
+| kmer length (mmseqs) | Integer | K          | kmer-length      | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default.                |
+| consistency check    | Boolean | t          | test             | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph |
+| random seed          | Int     | r          | random-seed      | random seed for pangraph construction.                                                                        |
 
 ## Arguments
 Expects one or more fasta files.
 
@@ -9,6 +9,7 @@ Compute all pairwise marginalizations of a multiple sequence alignment pangraph.
 | Output path        | String  | o          | output-path    | Path to direcotry where the output of all pairwise mariginalizations will be stored if supplied                                        |
 | Reduce paralogs    | Boolean | r          | reduce-paralog | Collapses coparallel paths through duplicated blocks.                                                                                  |
 | Projection strains | String  | s          | Strains        | Collapses the graph structure to only blocks and edges contained by the paths of the supplied strain names. comma seperated, no spaces |
+| Consistency check  | Boolean | t          | test           | toggle to activate consistency check: verifies that output genomes are exactly equal to input genomes                                  |
 
 ## Arguments
 Zero or one pangraph file which must be formatted as a JSON.
 
@@ -163,6 +163,8 @@ function main(args)
 end
 
 function julia_main()::Cint
+    # initialize random seed for reproducibility
+    seed!(0)
     try
         main(ARGS)
     catch
 
@@ -3,6 +3,7 @@ module Align
 using Rematch, Dates
 using LinearAlgebra
 using ProgressMeter
+using Random
 
 using Base.Threads: @spawn, @threads
 
@@ -26,28 +27,32 @@ end
 # ------------------------------------------------------------------------
 # guide tree for order of pairwise comparison for multiple alignments
 
+
 # TODO: distance?
 """
 	mutable struct Clade
 		name   :: String
 		parent :: Union{Clade,Nothing}
 		left   :: Union{Clade,Nothing}
 		right  :: Union{Clade,Nothing}
-		graph  :: Channel{Graph}
+		graph  :: Channel{Tuple{Graph,Int}}
 	end
 
 Clade is a node (internal or leaf) of a binary guide tree used to order pairwise alignments
 associated to a multiple genome alignment in progress.
 `name` is only non-empty for leaf nodes.
 `parent` is `nothing` for the root node.
 `graph` is a 0-sized channel that is used as a message passing primitive in alignment.
+    It contains the graph and an index used to decide the order of items in a pair in
+    pairwise graph merge.
 """
+Message=Tuple{Graph,Int}
 mutable struct Clade
     name   :: String
     parent :: Union{Clade,Nothing}
     left   :: Union{Clade,Nothing}
     right  :: Union{Clade,Nothing}
-    graph  :: Channel{Graph}
+    graph  :: Channel{Message}
 end
 
 # ---------------------------
@@ -58,20 +63,20 @@ end
 
 Generate an empty, disconnected clade.
 """
-Clade()     = Clade("",nothing,nothing,nothing,Channel{Graph}(2))
+Clade() = Clade("", nothing, nothing, nothing, Channel{Message}(2))
 """
 	Clade(name)
 
 Generate an empty, disconnected clade with name `name`.
 """
-Clade(name) = Clade(name,nothing,nothing,nothing,Channel{Graph}(2))
+Clade(name) = Clade(name, nothing, nothing, nothing, Channel{Message}(2))
 """
 	Clade(left::Clade, right::Clade)
 
 Generate an nameless clade with `left` and `right` children.
 """
 function Clade(left::Clade, right::Clade)
-    parent = Clade("",nothing,left,right,Channel{Graph}(2))
+    parent = Clade("", nothing, left, right, Channel{Message}(2))
 
     left.parent = parent
     right.parent = parent
@@ -351,6 +356,31 @@ function preprocess(hits, skip, energy, blocks!)
     return hits
 end
 
+# DEBUG
+function log_alignment(G₁::Graph, G₂::Graph, hits, fname::String)
+    open(fname, "w") do io
+        for G in (G₁, G₂)
+            write(io, "------------ G ------------\n")
+            PC = pancontigs(G)
+            for (name, seq) in zip(PC.name, PC.sequence)
+                write(io, ">$name\n")
+                write(io, seq, "\n")
+            end
+        end
+        write(io, "------------ hits ------------\n")
+        for h in hits
+            write(io, """
+            .........................
+            qry -> $(h.qry.name) | $(h.qry.start) -> $(h.qry.stop) | $(h.qry.length)
+            ref -> $(h.ref.name) | $(h.ref.start) -> $(h.ref.stop) | $(h.ref.length)
+            len -> $(h.length)
+            strand -> $(h.orientation)
+            cigar -> $(h.cigar)
+            """)
+        end
+    end
+end
+
 function do_align(G₁::Graph, G₂::Graph, energy::Function, aligner::Function)
     hits = if G₁ == G₂
         self = pancontigs(G₁)
@@ -359,6 +389,9 @@ function do_align(G₁::Graph, G₂::Graph, energy::Function, aligner::Function)
         aligner(pancontigs(G₁), pancontigs(G₂))
     end
     sort!(hits; by=energy)
+    
+    # DEBUG
+    # log_alignment(G₁, G₂, hits, "issue/minimap/$(randstring(10)).log")
 
     return hits
 end
@@ -398,7 +431,7 @@ The _lower_ the score, the _better_ the alignment. Only negative energies are co
 `minblock` is the minimum size block that will be produced from the algorithm.
 `maxiter` is maximum number of duplications that will be considered during this alignment.
 """
-function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Function, verify::Function, verbose::Bool; maxiter=100, sensitivity="asm10")
+function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Function, verify::Function, verbose::Bool; maxiter=100)
     G₀ = G₁
 
     for niter in 1:maxiter
@@ -440,6 +473,9 @@ function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Funct
         detransitive!(G₀)
         purge!(G₀)
         prune!(G₀)
+
+        # verify that isolates are correctly reconstructed (-v flag)
+        verify(G₀, msg="verify align-self $niter")
     end
 
     return G₀
@@ -573,6 +609,9 @@ function align_pair(G₁::Graph, G₂::Graph, energy::Function, minblock::Int, a
     purge!(G)
     prune!(G)
 
+    # verify that isolates are correctly reconstructed (-v flag)
+    verify(G, msg="verify align-pair")
+
     return G
 end
 
@@ -592,8 +631,8 @@ The _lower_ the score, the _better_ the alignment. Only negative energies are co
 
 `compare` is the function to be used to generate pairwise distances that generate the internal guide tree.
 """
-function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100)
-    function verify(graph, msg="")
+function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, rand_seed=0)
+    function verify(graph; msg="")
         if reference !== nothing
             for (name,path) ∈ graph.sequence
                 seq = sequence(path)
@@ -630,7 +669,7 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
                     println("--> insert:           $(path.node[i].block.insert[path.node[i]])")
                     println("--> delete:           $(path.node[i].block.delete[path.node[i]])")
 
-                    error("--> isolate '$name' incorrectly reconstructed")
+                    error("$msg\n--> isolate '$name' incorrectly reconstructed")
                 end
             end
         end
@@ -655,30 +694,53 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
     meter_lock = ReentrantLock()
 
     G = nothing
-    for clade ∈ postorder(tree)
+    for (n_clade, clade) ∈ enumerate(postorder(tree))
         @spawn try
+
+            # random seed for the thread - to ensure deterministic reproducibility
+            # in block names
+            Random.seed!(rand_seed+n_clade)
+
             if isleaf(clade)
                 close(clade.graph)
-                put!(clade.parent.graph, tips[clade.name])
+                msg = (tips[clade.name], n_clade)
+                put!(clade.parent.graph, msg)
             else
-                Gₗ = take!(clade.graph)
-                Gᵣ = take!(clade.graph)
+                Gₗ, Pₗ = take!(clade.graph)
+                Gᵣ, Pᵣ = take!(clade.graph)
                 close(clade.graph)
-
+                # ensure a consistent ordering of the two graphs,
+                # irrespective of which process is sending the message first.
+                if Pₗ > Pᵣ
+                    Gₗ, Gᵣ = Gᵣ, Gₗ
+                end
+                
                 # the lock ensures that at most N=Threads.nthreads() processes are
                 # spawning run(`cmd`) instances at the same time
                 G₀ = lock_semaphore(s) do
-                    G₀ = align_pair(Gₗ, Gᵣ, energy, minblock, aligner, verify, false)
-                    align_self(G₀, energy, minblock, aligner, verify, false)
+                    verbose && log("--> align-pair for clade n. $n_clade")
+                    G₀ = align_pair(Gₗ, Gᵣ, energy, minblock, aligner, verify, verbose)
+                    verbose && log("--> align-self for clade n. $n_clade")
+                    G₀ = align_self(G₀, energy, minblock, aligner, verify, verbose, maxiter=maxiter)
+                    verbose && log("--> graph merging for clade n. $n_clade completed")
+                    G₀
                 end
 
+                # DEBUG : save graph at each iteration in a file
+                # open("issue/comp/graph_iteration_$(n_clade).json", "w") do io
+                #     finalize!(G₀)
+                #     marshal(io, G₀; fmt=:json)
+                # end
+
+
                 # advance progress bar in a thread-safe way
                 lock(meter_lock) do
                     next!(meter)
                 end
 
                 if clade.parent !== nothing
-                    put!(clade.parent.graph, G₀)
+                    msg = (G₀, n_clade)
+                    put!(clade.parent.graph, msg)
                 else
                     G = G₀
                     close(error_channel)