Skip to content

Commit c99aa41

Browse files
committed
chore: updated mash documentation
1 parent 72b4f5e commit c99aa41

File tree

2 files changed

+32
-0
lines changed

2 files changed

+32
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ viewer
44
figs
55
log
66
docs/build
7+
docs/repo
78
notes/
89
vendor
910
pangraph

src/mash.jl

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,28 @@ const map = UInt64[
2323

2424
const maxU64 = typemax(UInt64)
2525

26+
"""
27+
struct Minimizer
28+
value :: UInt64
29+
position :: UInt64
30+
end
31+
32+
A minimizer is the kmer whose hash value is the smallest within a given set of kmers, under a hash function that maps kmers to integers.
33+
The value is the result of applying the hash function to the kmer.
34+
The position is a bitpacked integer that encodes the reference ID, locus, and strand.
35+
"""
2636
struct Minimizer
2737
value :: UInt64
2838
position :: UInt64
2939
end
3040

3141
# transliteration of the invertible hash function found in minimap
42+
"""
43+
hash(x::UInt64, mask::UInt64)
44+
45+
A transliteration of Jenkins' invertible hash function for 64-bit integers.
46+
Bijectively maps any kmer to an integer.
47+
"""
3248
function hash(x::UInt64, mask::UInt64)::UInt64
3349
x = (~x + (x << 21)) & mask
3450
x = x ⊻ x >> 24
@@ -40,6 +56,14 @@ function hash(x::UInt64, mask::UInt64)::UInt64
4056
return x
4157
end
4258

59+
"""
60+
sketch(seq::Array{UInt8}, k::Int, w::Int, id::Int)
61+
62+
Sketch a linear sequence into a vector of minimizers.
63+
`k` sets the kmer size.
64+
`w` sets the number of contiguous kmers that will be used in the window minimizer comparison.
65+
`id` is a unique integer that corresponds to the sequence. It will be bitpacked into the minimizer position.
66+
"""
4367
function sketch(seq::Array{UInt8}, k::Int, w::Int, id::Int)
4468
(k < 0 || k > 32) && error("k='$(k)' must be ∈ [0,32]")
4569
(w < 0 || w > 255) && error("w='$(w)' must be ∈ [0,255]")
@@ -147,6 +171,13 @@ end
147171

148172
tuples(iter) = ((x,y) for (i,x) in enumerate(iter) for y in iter[i:end])
149173

174+
"""
175+
distance(graphs...; k=15, w=100)
176+
177+
Compute the pairwise distance between all input graphs.
178+
Distance is the set distance between minimizers.
179+
Linear-time algorithm using hash collisions.
180+
"""
150181
function distance(graphs...; k=15, w=100)
151182
sequences = Dict(seq for graph in graphs for seq in sequence(graph))
152183

0 commit comments

Comments
 (0)