Skip to content

Commit f47ce04

Browse files
committed
duplicate vcf pos deleted [run ci]
1 parent 99dde85 commit f47ce04

File tree

11 files changed

+2373
-2
lines changed

11 files changed

+2373
-2
lines changed

.github/workflows/linux.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ jobs:
2626
2727
- name: Build selscan
2828
working-directory: src/
29-
run: make -f ../unused/static_makefiles/Makefile_linux -j
29+
run: make -f Makefile_linux -j
3030

3131
- name: Test selscan installation
3232
working-directory: src/

README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,15 @@ in the construction of your haplotypes please use the --keep-low-freq flag.
238238

239239
## 📝 Change Log
240240
```
241-
241+
26SEP2025 - selscan v2.1.1 - Bug fixes for the fast and memory-efficient version introduced in v2.1:
242+
243+
- Refined cutoff handling for edge cases, improving correlation with v2.0 outputs.
244+
- Corrected reporting of sites with low minor allele counts (avoids zero-area and infinite nSL/iHS).
245+
- Fixed nSL distance cutoff in multi-parameter mode.
246+
- XP-EHH now uses genetic distance from map files instead of defaulting to physical distance.
247+
- Binaries updated for compatibility with older GLIBC.
248+
- Improvements in manual: added a section describing norm usage.
249+
242250
09APR2025 - selscan v2.1.0 - Introducing fast and memory-efficient versions for all statistics. See Rahman, et al. (2025) Biorxiv for details.
243251
244252
There is a new batch option for efficient processing of multiple statistics or parameters at once. See the manual for full details.

Selscan_Manual_v2_1.pdf

-246 KB
Binary file not shown.

Selscan_Manual_v2_1_1.pdf

321 KB
Binary file not shown.

example/multiallele2.vcf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
1 100 rs10 G A . PASS . GT 0|1 0|0 0|1 1|1 0|0 1|1 0|1 0|0 0|1 1|1
2+
1 1000 rs10 G A . PASS . GT 0|1 0|0 0|1 1|1 0|0 1|1 0|1 0|0 0|1 1|1
3+
1 1000 rs10 G A . PASS . GT 0|1 0|0 0|1 1|1 0|0 1|1 0|1 0|0 0|1 1|1
4+
1 1000 rs10 G A . PASS . GT 0|1 0|0 0|1 1|1 0|0 1|1 0|1 0|0 0|1 1|1
5+
1 1000 rs10 G A . PASS . GT 0|1 0|0 0|1 1|1 0|0 1|1 0|1 0|0 0|1 1|1
6+
1 1000 rs10 G A . PASS . GT 0|1 0|0 0|1 1|1 0|0 1|1 0|1 0|0 0|1 1|1
7+
1 1000 rs10 G A . PASS . GT 0|1 0|0 0|1 1|1 0|0 1|1 0|1 0|0 0|1 1|1
8+
1 1100 rs10 G A . PASS . GT 0|1 0|0 0|1 1|1 0|0 1|1 0|1 0|0 0|1 1|1

scripts/colormap.plotting.R

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
plotHaplotypes <- function(filename)
2+
{
3+
#filename <- "outfile.rs7215219.ihs.out";
4+
5+
# filename.der <- paste(filename,".der.colormap",sep="")
6+
# filename.anc <- paste(filename,".anc.colormap",sep="")
7+
# data<-as.matrix(read.table(filename.der))
8+
rawData<-read.table(filename)
9+
10+
11+
print("Plotting EHH decay...")
12+
setEPS()
13+
postscript(file=paste(filename,".ehh.eps",sep=""))
14+
plot(rawData$V1/1000000,rawData$V2,pch=" ",xlab="Distance from locus (Mb)",ylab="EHH",ylim=c(0,1))
15+
lines(rawData$V1/1000000,rawData$V3,col="red")
16+
lines(rawData$V1/1000000,rawData$V4,col="blue")
17+
legend("topright",legend=c("Derived","Ancestral"),col=c("red","blue"),lty=1)
18+
dev.off()
19+
20+
numHaps <- dim(data)[1]
21+
numSites <- dim(data)[2]
22+
numCols<-range(data)[2]-range(data)[1]
23+
24+
pos <- rawData$V1
25+
hap <- seq(1,numHaps)
26+
27+
numSortCols <- min(numCols,5)
28+
29+
ordering <- as.data.frame(matrix(rep(0,numSortCols*numHaps),nrow=numHaps,ncol=numSortCols))
30+
31+
print("Sorting derived colors...")
32+
33+
for (h in 1:numSortCols)
34+
{
35+
for (i in 1:numHaps)
36+
{
37+
for (j in 1:numSites)
38+
{
39+
if ( data[i,j] == h-1 )
40+
{
41+
ordering[i,h] <- ordering[i,h]+1
42+
}
43+
}
44+
}
45+
}
46+
47+
order.string <- "sorted.order<-order("
48+
for (h in 1:numSortCols)
49+
{
50+
order.string <- paste(order.string,"ordering$V",h,",",sep="")
51+
}
52+
substr(order.string,nchar(order.string),nchar(order.string)) <- ")"
53+
eval(parse(text = order.string))
54+
55+
blank<-rep("",length(pos))
56+
57+
58+
data2<-as.matrix(read.table(filename.anc))
59+
numHaps2 <- dim(data2)[1]
60+
numSites2 <- dim(data2)[2]
61+
numCols2 <- range(data2)[2]-range(data2)[1]
62+
63+
pos2 <- rawData$V1
64+
hap2 <- seq(1,numHaps2)
65+
66+
numSortCols2 <- min(numCols2,5)
67+
68+
ordering2 <- as.data.frame(matrix(rep(0,numSortCols2*numHaps2),nrow=numHaps2,ncol=numSortCols2))
69+
70+
print("Sorting ancestral colors...")
71+
72+
for (h in 1:numSortCols2)
73+
{
74+
for (i in 1:numHaps2)
75+
{
76+
for (j in 1:numSites2)
77+
{
78+
if ( data2[i,j] == h-1 )
79+
{
80+
ordering2[i,h] <- ordering2[i,h]+1
81+
}
82+
}
83+
}
84+
}
85+
86+
order.string2 <- "sorted.order2<-order("
87+
for (h in 1:numSortCols2)
88+
{
89+
order.string2 <- paste(order.string2,"ordering2$V",h,",",sep="")
90+
}
91+
substr(order.string2,nchar(order.string2),nchar(order.string2)) <- ")"
92+
eval(parse(text = order.string2))
93+
94+
blank2<-rep("",length(pos2))
95+
space <- rep(-1,numSites)
96+
97+
padding <- as.integer((numHaps+numHaps2) * 0.05)
98+
if(padding < 2)
99+
{
100+
padding <- 2
101+
} else if(padding %% 2 == 1)
102+
{
103+
padding <- padding-1
104+
}
105+
106+
combined.data <- rbind(data[sorted.order,],space);
107+
108+
for (i in 1:padding)
109+
{
110+
combined.data <- rbind(combined.data,space);
111+
}
112+
113+
combined.data <- rbind(combined.data,data2[sorted.order2,])
114+
combined.numHaps <- dim(combined.data)[1]
115+
combined.hap <- seq(1,combined.numHaps)
116+
combined.numCols <- max(numCols,numCols2)
117+
118+
col.der <- c(rgb(227/255, 26/255, 28/255),rgb(51/255, 160/255, 44/255),rgb(31/255, 120/255, 180/255),rgb(255/255, 127/255, 0/255),rgb(106/255, 61/255, 154/255),rgb(251/255, 154/255, 153/255),rgb(178/255, 223/255, 138/255),rgb(166/255, 206/255, 227/255),rgb(253/255, 191/255, 111/255),rgb(202/255, 178/255, 214/255))
119+
colors.der <- rep("",combined.numCols)
120+
121+
col.anc <- c(rgb(31/255, 120/255, 180/255),rgb(255/255, 127/255, 0/255),rgb(106/255, 61/255, 154/255),rgb(227/255, 26/255, 28/255),rgb(51/255, 160/255, 44/255),rgb(166/255, 206/255, 227/255),rgb(253/255, 191/255, 111/255),rgb(202/255, 178/255, 214/255),rgb(251/255, 154/255, 153/255),rgb(178/255, 223/255, 138/255))
122+
colors.anc <- rep("",combined.numCols)
123+
124+
index <- 1;
125+
for(i in 0:(combined.numCols-1))
126+
{
127+
colors.der[i+1] <- col.der[(i %% 10) + 1]
128+
colors.anc[i+1] <- col.anc[(i %% 10) + 1]
129+
}
130+
131+
pos <- pos/1000000
132+
133+
print("Plotting haplotype colors...")
134+
setEPS()
135+
postscript(file=paste(filename,".hapcolor.eps",sep=""))
136+
137+
image(pos,combined.hap,t(combined.data),zlim=c(-0.1,-0.01),yaxt="n",xaxt="n",ylab="",xlab="Distance from locus (Mb)",ylim=c(1-(padding/2),combined.numHaps+(padding/2)))
138+
axis(3,at=pos,lab=blank,tck=(1/(combined.numHaps) * padding/4))
139+
axis(3)
140+
axis(1,at=pos,lab=blank,tck=(1/(combined.numHaps) * padding/4))
141+
axis(1)
142+
abline(h=numHaps+(padding/2)+1)
143+
144+
image(pos,combined.hap[1:numHaps],t(combined.data[1:numHaps,]),zlim=c(0,max(combined.data)),col=colors.der,add=TRUE)#,yaxt="n",xaxt="n",ylab="",ylim=c(1-(padding/10),combined.numHaps+(padding/10)),xlab="")
145+
mtext("Derived",2,at=(numHaps/2 + padding/2))
146+
image(pos,combined.hap[(numHaps+1):(combined.numHaps)],t(combined.data[(numHaps+1):(combined.numHaps),]),zlim=c(0,max(combined.data)),col=colors.anc,add=TRUE)#,yaxt="n",xaxt="n",ylab="",ylim=c(-1,combined.numHaps+2),xlab="")
147+
mtext("Ancestral",2,at= (numHaps + (padding+1) + numHaps2/2 ) )
148+
149+
dev.off()
150+
151+
return
152+
}
153+
154+
plotHaplotypes("/Users/amatur/code/selscan/src/out/outfile.ihs.out")

scripts/get_map.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""
2+
get_map.py
3+
4+
Generate a 4-column space-delimited map file accepted by selscan from ms or VCF input.
5+
6+
Output columns:
7+
1) Chromosome name (default: "chr1")
8+
2) SNP serial number starting at 1
9+
3) Genetic position (coordinate used for integrating iHH decay in iHS)
10+
4) Physical position (bp)
11+
12+
Notes:
13+
- Column 3: genetic position used for iHH integration (here equal to physical position).
14+
- Column 4: physical position used by selscan to determine gap thresholds for EHH truncation.
15+
"""
16+
17+
import gzip
18+
19+
def open_maybe_gz(file_path):
20+
if file_path.endswith(".gz"):
21+
return gzip.open(file_path, "rt")
22+
else:
23+
return open(file_path, "r")
24+
25+
26+
import argparse
27+
import sys
28+
29+
def parse_ms_positions(ms_file, genome_length):
30+
physical_positions = []
31+
with open_maybe_gz(ms_file) as f:
32+
for line in f:
33+
line = line.strip()
34+
if line.startswith("positions:"):
35+
parts = line.split()
36+
rel_pos = parts[1:] # skip 'positions:'
37+
physical_positions = [int(float(p) * genome_length) for p in rel_pos]
38+
break
39+
if not physical_positions:
40+
raise ValueError("No positions line found in ms file.")
41+
return physical_positions
42+
43+
def parse_vcf_positions(vcf_file):
44+
physical_positions = []
45+
with open_maybe_gz(vcf_file) as f:
46+
for line in f:
47+
if line.startswith("#"):
48+
continue
49+
parts = line.strip().split('\t')
50+
pos = int(parts[1])
51+
physical_positions.append(pos)
52+
if not physical_positions:
53+
raise ValueError("No variant positions found in VCF file.")
54+
return physical_positions
55+
56+
def write_map(positions, chrom="chr1", out_file="output.map"):
57+
with open(out_file, "w") as f:
58+
## Write header line describing columns (space-delimited)
59+
# f.write("chrom snp_index genetic_pos physical_pos\n")
60+
for i, pos in enumerate(positions, start=0):
61+
f.write(f"{chrom} rs{i} {pos} {pos}\n")
62+
63+
def main():
64+
parser = argparse.ArgumentParser(
65+
description="Generate a 4-column (<chrom> <snp_index> <genetic_pos> <physical_pos>) space-delimited map file from ms or VCF input.",
66+
epilog="Note: Col3=genetic pos for iHH integration (here equals physical pos, assuming constant recombination rate), Col4=physical pos for maximum allowable EHH gap detection."
67+
)
68+
group = parser.add_mutually_exclusive_group(required=True)
69+
group.add_argument("--ms", help="Input ms file with positions line")
70+
group.add_argument("--vcf", help="Input VCF file")
71+
72+
parser.add_argument("--len", type=int,
73+
help="Genome length for scaling relative positions in ms file (required if --ms is used)")
74+
parser.add_argument("--chrom", default="chr1", help="Chromosome name to use in output (default: chr1)")
75+
parser.add_argument("--out", default="output.map", help="Output map file name (default: output.map)")
76+
77+
args = parser.parse_args()
78+
79+
if args.ms and args.len is None:
80+
print("Error: --len is required when using --ms", file=sys.stderr)
81+
sys.exit(1)
82+
83+
if args.ms:
84+
positions = parse_ms_positions(args.ms, args.len)
85+
else:
86+
positions = parse_vcf_positions(args.vcf)
87+
88+
write_map(positions, chrom=args.chrom, out_file=args.out)
89+
print(f"Map file written to {args.out}")
90+
91+
if __name__ == "__main__":
92+
main()

scripts/visualize_ehh.R

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
library(ggplot2)
2+
library(reshape2)
3+
4+
# --- Read the data ---
5+
ehh_data <- read.table("outfile.ehh.1943.out", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
6+
7+
# Rename derEHH to derived for plotting
8+
colnames(ehh_data)[colnames(ehh_data) == "derEHH"] <- "derived"
9+
10+
# Choose x-axis: pdist or gdist
11+
x_axis <- "pdist" # Change to "gdist" if needed
12+
13+
ehh_data$pdist <- ehh_data$pdist / 1e6
14+
15+
# Reshape data to long format
16+
ehh_long <- melt(ehh_data, id.vars = x_axis,
17+
measure.vars = c("derived", "ancEHH", "EHH"),
18+
variable.name = "Type", value.name = "EHH_Value")
19+
20+
# Define custom colors
21+
ehh_colors <- c("derived" = "red", "ancEHH" = "blue", "EHH" = "gray50")
22+
23+
# Plot
24+
ggplot(ehh_long, aes_string(x = x_axis, y = "EHH_Value", color = "Type")) +
25+
geom_line(size = 1) +
26+
scale_color_manual(values = ehh_colors) +
27+
labs(x = ifelse(x_axis == "pdist", "Physical Distance (bp)", "Genetic Distance (cM)"),
28+
y = "EHH",
29+
title = "EHH Decay Plot") +
30+
theme_minimal() +
31+
theme(text = element_text(size = 14))

unused/Makefile_norm2

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Makefile for selscan (macOS ARM) -
2+
3+
# Compiler
4+
CC = g++
5+
#clang++ and g++ are both supported
6+
7+
# Optimization and architecture flags and language standard setup
8+
G++FLAG = -O3 -m64 -ftree-vectorize -w -Wno-psabi -std=c++17
9+
#G++FLAG = -DDEBUG -O0 -g -w -Wno-psabi -std=c++17
10+
# -O3 : Maximum optimization level
11+
# -m64 : Generate 64-bit code
12+
# -arch arm64 : Target Apple Silicon (ARM64) architecture
13+
# -ftree-vectorize : Enable automatic vectorization of loops
14+
# -w : Suppress all warnings
15+
# -Wno-psabi : Disable warnings about ABI differences (useful for ARM cross-compilation)
16+
# -std=c++17 : Use the C++17 language standard
17+
18+
19+
# Include and library paths
20+
I_PATH = -I../include # Header file search path
21+
L_PATH = ../lib/macos-arm # Path to GSL libraries
22+
23+
# Linking options
24+
LINK_OPTS_SELSCAN = -lpthread -lz # Link with pthread and zlib for selscan
25+
LINK_OPTS_NORM = -L$(L_PATH) -lgsl -lgslcblas # Link with GSL and CBLAS for norm #LINK_OPTS_NORM = $(L_PATH)/libgsl.a $(L_PATH)/libgslcblas.a #For static linking of norm program to gsl libs
26+
27+
# Object file groups
28+
OBJ_STATS = xpihh.o ehh.o ihs.o ihh12.o pi.o ehh12.o
29+
OBJ_HAPMAP = hapdata.o mapdata.o bitset.o
30+
31+
# Build all targets
32+
all: selscan norm2
33+
34+
# Build selscan binary
35+
selscan: selscan-main.o $(OBJ_STATS) selscan-data.o $(OBJ_HAPMAP) selscan-cli.o binom.o param_t.o gzstream.o thread_pool.o
36+
$(CC) $(G++FLAG) -o selscan $^ $(LINK_OPTS_SELSCAN)
37+
38+
# Build norm binary
39+
norm2: norm2.o param_t.o
40+
$(CC) $(G++FLAG) -o norm2 $^ $(LINK_OPTS_NORM)
41+
42+
# Compilation rules
43+
selscan-main.o: selscan-main.cpp
44+
$(CC) $(G++FLAG) -c $< $(I_PATH)
45+
46+
selscan-data.o: selscan-data.cpp selscan-data.h
47+
$(CC) $(G++FLAG) -c $< $(I_PATH)
48+
49+
selscan-cli.o: selscan-cli.cpp selscan-cli.h
50+
$(CC) $(G++FLAG) -c $< $(I_PATH)
51+
52+
binom.o: binom.cpp binom.h
53+
$(CC) $(G++FLAG) -c $<
54+
55+
norm2.o: norm2.cpp
56+
$(CC) $(G++FLAG) -c $< $(I_PATH)
57+
58+
param_t.o: param_t.cpp param_t.h
59+
$(CC) $(G++FLAG) -c $<
60+
61+
gzstream.o: gzstream.cpp gzstream.h
62+
$(CC) $(G++FLAG) -c $< $(I_PATH)
63+
64+
thread_pool.o: thread_pool.cpp thread_pool.h
65+
$(CC) $(G++FLAG) -c $< $(LINK_OPTS_SELSCAN)
66+
67+
# Pattern rules for hapmap and stats modules
68+
%.o: hapmap/%.cpp hapmap/%.h
69+
$(CC) $(G++FLAG) -c $< -o $@ $(I_PATH)
70+
71+
%.o: stats/%.cpp stats/%.h
72+
$(CC) $(G++FLAG) -c $< -o $@ $(I_PATH)
73+
74+
# Optional: filetype modules (currently disabled)
75+
# %.o: filetype/%.cpp filetype/%.h
76+
# $(CC) $(G++FLAG) -c $< -o $@ $(I_PATH)
77+
78+
# Cleanup rules
79+
.PHONY: clean
80+
clean:
81+
rm -rf *.o selscan norm2 norm
82+
# rm -rf outfile*

0 commit comments

Comments
 (0)