Skip to content

Commit 5232b3d

Browse files
UniverseUniverse
authored andcommitted
update scripts
1 parent 2699fa7 commit 5232b3d

File tree

1,138 files changed

+401912
-1624559
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,138 files changed

+401912
-1624559
lines changed

00_INSTALL_conda_ENV.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
conda config --add channels defaults
2+
conda config --add channels bioconda
3+
conda config --add channels conda-forge
4+
conda config --add channels mvdbeek
5+
6+
conda create -n snapshot python=3 r=3.6 bedtools ucsc_tools numpy scikit-learn r-ggplot2 r-pheatmap r-igraph r-networkD3
7+
conda activate snapshot
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
##########################################
2+
###### Generate input_data folders ######
3+
cd input_data/
4+
mkdir function_label
5+
6+
##########################################
7+
###### Download IDEAS state matrix ######
8+
wget https://usevision.org/data/hg38/IDEASstates/ideasJointMay2021/S3V2_IDEAS_hg38_r3_withHg38Mm10prior.state
9+
input_IDEAS_state_file='S3V2_IDEAS_hg38_r3_withHg38Mm10prior.state'
10+
#
11+
### Extract IDEAS state bed files per cell type from the state matrix
12+
output_file='function_label/CMP.J_IDEAS.bed'
13+
target_cell_col=12
14+
cat $input_IDEAS_state_file | awk -F ' ' -v OFS='\t' -v used_col=$target_cell_col '{if ($2=="CHR") print "#"$2,$3,$4, $used_col; else print $2,$3,$4, $used_col}' > $output_file
15+
#
16+
output_file='function_label/GMP.J_IDEAS.bed'
17+
target_cell_col=18
18+
cat $input_IDEAS_state_file | awk -F ' ' -v OFS='\t' -v used_col=$target_cell_col '{if ($2=="CHR") print "#"$2,$3,$4, $used_col; else print $2,$3,$4, $used_col}' > $output_file
19+
#
20+
output_file='function_label/HSC.J_IDEAS.bed'
21+
target_cell_col=20
22+
cat $input_IDEAS_state_file | awk -F ' ' -v OFS='\t' -v used_col=$target_cell_col '{if ($2=="CHR") print "#"$2,$3,$4, $used_col; else print $2,$3,$4, $used_col}' > $output_file
23+
#
24+
output_file='function_label/MEP.J_IDEAS.bed'
25+
target_cell_col=30
26+
cat $input_IDEAS_state_file | awk -F ' ' -v OFS='\t' -v used_col=$target_cell_col '{if ($2=="CHR") print "#"$2,$3,$4, $used_col; else print $2,$3,$4, $used_col}' > $output_file
27+
#
28+
output_file='function_label/ERY.J_IDEAS.bed'
29+
target_cell_col=16
30+
cat $input_IDEAS_state_file | awk -F ' ' -v OFS='\t' -v used_col=$target_cell_col '{if ($2=="CHR") print "#"$2,$3,$4, $used_col; else print $2,$3,$4, $used_col}' > $output_file
31+
##########################################
32+
33+
34+

README.md

Lines changed: 124 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -28,194 +28,202 @@
2828

2929

3030
## Dependencies:
31-
#### Python/2.7
31+
#### Python
3232
###### numpy
3333
###### sklearn
3434
#### R
3535
###### ggplot2; pheatmap; igraph; networkD3
3636
#### bedtools
37-
######(http://bedtools.readthedocs.io/en/latest/content/installation.html)
38-
3937

4038
## Install Snapshot
4139
#### (1) clone the github repository
42-
#### (2) run the INSTALL.sh command
4340
```
4441
git clone https://github.com/guanjue/snapshot.git
45-
cd snapshot
46-
bash INSTALL.sh
4742
```
48-
49-
## Input data
50-
##### The cell type peak binary label file list: 1st column is the foldername and the filename in input folder; 2nd column is the cell type label in output figures
51-
###### The peak binary label of each cell type in bed format is in the 'atac_pk/' folder
43+
#### (2) Set up a conda environment named "snapshot".
44+
##### Details about how to install conda can be found in (https://docs.conda.io/projects/conda/en/latest/user-guide/index.html)
5245
```
53-
peak_list.txt
54-
>>> head peak_list.txt
55-
atac_pk/LSK.pk.bed LSK
56-
atac_pk/CMP.pk.bed CMP
57-
atac_pk/MEP.pk.bed MEP
58-
atac_pk/GMP.pk.bed GMP
46+
>>> cat 00_INSTALL_conda_ENV.sh
47+
conda config --add channels defaults
48+
conda config --add channels bioconda
49+
conda config --add channels conda-forge
50+
conda config --add channels mvdbeek
51+
52+
conda create -n snapshot python=3 r=3.6 bedtools ucsc_tools numpy scikit-learn r-ggplot2 r-pheatmap r-igraph r-networkD3
53+
conda activate snapshot
5954
```
6055

61-
##### The cell type peak signal file list: 1st column is the foldername and the filename in input folder; 2nd column is the cell type label in output figures
62-
###### The signal track of each cell type in bed format is in the 'atac_sig/' folder
56+
## Input data
57+
##### The cell type peak & signal file list: 1st column is the cell type label; 2nd column is the cell-type specific peak bed file path; 3rd column is the cell-type specific signal bigWig file path
6358
```
64-
signal_list.txt
65-
>>> head signal_list.txt
66-
atac_sig/LSK.atac.sig.bed LSK
67-
atac_sig/CMP.atac.sig.bed CMP
68-
atac_sig/MEP.atac.sig.bed MEP
69-
atac_sig/GMP.atac.sig.bed GMP
59+
>>> head input_data/peak_signal_list.txt
60+
LSK atac_pk/LSK.pk.bed atac_sig/LSK.atac.sig.bw
61+
CMP atac_pk/CMP.pk.bed atac_sig/CMP.atac.sig.bw
62+
MEP atac_pk/MEP.pk.bed atac_sig/MEP.atac.sig.bw
63+
GMP atac_pk/GMP.pk.bed atac_sig/GMP.atac.sig.bw
7064
```
7165

72-
##### The cell type functional state file list: 1st column is the foldername and the filename in input folder; 2nd column is the cell type label in output figures
73-
###### The functional state label of each cell type in bed format is in the 'function_label/' folder
66+
##### The cell type functional state file list: 1st column is the cell type label; 2nd column is the epigenetic state bedgraph file path
7467
```
75-
function_list.txt
76-
>>> head function_list.txt
77-
function_label/LSK.ideas.bed LSK
78-
function_label/CMP.ideas.bed CMP
79-
function_label/MEP.ideas.bed MEP
80-
function_label/GMP.ideas.bed GMP
68+
>>> head input_data/function_list.txt
69+
LSK function_label/LSK.ideas.bedgraph
70+
CMP function_label/CMP.ideas.bedgraph
71+
MEP function_label/MEP.ideas.bedgraph
72+
GMP function_label/GMP.ideas.bedgraph
8173
```
8274

8375
##### The cell type differentiation tree: Each row represents one edge in the cell type differentiation tree. The 1st cell type is the progenitor cell type and the 2nd cell type is the differentiated cell type
8476
```
85-
cd_tree.txt
86-
head cd_tree.txt
77+
>>> head input_data/cd_tree.txt
8778
LSK,CMP
8879
CMP,MEP
8980
CMP,GMP
9081
```
9182

92-
##### The functional state color list
83+
##### The functional state color list: 1st column is the epigenetic state label; 2nd column is the epigenetic state RGB color
9384
```
94-
>>> head function_color_list.txt
95-
36 35 194,7,153 250,151,3
96-
35 34 250,151,3 136,53,241
97-
34 33 136,53,241 197,151,0
98-
33 32 197,151,0 138,177,89
99-
32 31 138,177,89 191,0,84
100-
31 30 191,0,84 176,0,93
101-
30 29 176,0,93 252,48,50
102-
29 28 252,48,50 0,0,172
103-
28 27 0,0,172 219,8,0
104-
27 26 219,8,0 241,198,171
85+
>>> head input_data/function_color_list.txt
86+
0 255,255,255
87+
1 180,180,180
88+
2 25,160,25
89+
3 126,126,240
90+
4 253,253,157
91+
5 240,185,254
92+
6 253,213,154
93+
7 0,0,212
94+
8 0,146,0
95+
9 250,248,0
10596
```
10697

10798
## RUN Snapshot
108-
##### (1) for command line version, change the folder names (script_folder, input_folder, output_folder) in 'runall.sh'
99+
##### (1) The user needs to change the script_folder, input_folder, and output_folder in the 'run_Snapshot.sh' file.
100+
##### "min_number_per_index_set" is the only parameter the user needs to choose for Snapshot. It is the minimum number of peaks per index-set. Index-sets with fewer peaks will be merged into the last X_X_X_... index-set
109101
```
110-
head -4 runall_commandline.sh
102+
>>> cat run_Snapshot.sh
111103
##################################
112-
script_folder='/Users/universe/Documents/2018_BG/snapshot/bin/'
113-
input_folder='/Users/universe/Documents/2018_BG/snapshot/test_data/input_data/'
114-
output_folder='/Users/universe/Documents/2018_BG/snapshot/test_data/output_result/'
104+
script_folder='/Users/universe/Documents/2022_Independent/snapshot/bin/'
105+
input_folder='/Users/universe/Documents/2022_Independent/00_Independent_analysis/snapshot_test_data/input_data/'
106+
output_folder='/Users/universe/Documents/2022_Independent/00_Independent_analysis/snapshot_test_data/output_result/'
107+
master_peak_bed='/Users/universe/Documents/2022_Independent/00_Independent_analysis/snapshot_test_data/input_data/atac_pk/cCRE.Pool.Merged.bed'
108+
109+
peak_signal_list_file='peak_signal_list.txt'
110+
IDEAS_state_200bp_bed_files_list_file='function_list.txt'
111+
IDEAS_state_color_list_file='function_color_list.txt'
112+
cell_type_tree_file='cd_tree.txt'
113+
114+
output_name='snapshot_test_run'
115+
min_number_per_index_set=10
116+
117+
118+
### run snapshot (CORE!!!)
119+
echo 'run snapshot :o'
120+
cd $input_folder
121+
time python $script_folder'snapshot_v1.py' -p $peak_signal_list_file \
122+
-n $output_name -t $min_number_per_index_set \
123+
-f $IDEAS_state_200bp_bed_files_list_file \
124+
-c $IDEAS_state_color_list_file \
125+
-e $cell_type_tree_file \
126+
-i $input_folder -o $output_folder -s $script_folder \
127+
-m $master_peak_bed
128+
echo 'complete :)'
115129
```
116130
##### (2) use the 'run_Snapshot.sh' script to run Snapshot
117131
```
118-
bash runall_commandline.sh
132+
time bash run_Snapshot.sh
119133
```
120-
##### (3) for graphical user interface (GUI) version, see:
121-
https://github.com/guanjue/snapshot/blob/master/gui_manual.md
122-
##### note: current GUI version do not have peak rescuing step. This step will be added soon.
134+
123135

124136
## Output results for test data
125137
### All output files will be written to the 'output_folder'
126138

127139
## The heatmap for index set
128140
##### Average atac-seq signal heatmap (left). Most abundant functional state heatmap (right).
129-
<img src="https://github.com/guanjue/snapshot/blob/master/test_data/output_result/snapshot.meansig.png" width="350"/> <img src="https://github.com/guanjue/snapshot/blob/master/test_data/output_result/snapshot.indexset_fun.png" width="350"/>
141+
<img src="https://github.com/guanjue/snapshot/blob/master/test_data/output_result/snapshot_test_run.meansig.png" width="350"/> <img src="https://github.com/guanjue/snapshot/blob/master/test_data/output_result/snapshot_test_run.indexset_fun.png" width="350"/>
130142

131143
##### Functional state epigenetic patterns.
132-
<img src="https://github.com/guanjue/snapshot/blob/master/test_data/input_data/function_label/functional_state_epigenetic_pattern.png" width="350"/>
144+
<img src="https://github.com/guanjue/snapshot/blob/master/test_data/example/functional_state_epigenetic_pattern.png" width="350"/>
133145

134146
## The cell differentiation tree for index set 6
135147
##### Average signal tree (left). Most abundant functional state tree (right).
136-
<img src="https://github.com/guanjue/snapshot/blob/master/test_data/output_result/signal_tree/5.signal_list.txt0_1_1_0.tree.png" width="400"/> <img src="https://github.com/guanjue/snapshot/blob/master/test_data/output_result/fun_tree/5.function_list.txt0_1_1_0.tree.png" width="400"/>
148+
<img src="https://github.com/guanjue/snapshot/blob/master/test_data/example/6.peak_signal_list.txt1_0_0_1.tree.pdf" width="400"/> <img src="https://github.com/guanjue/snapshot/blob/master/test_data/example/6.function_list.txt1_0_0_1.tree.pdf" width="400"/>
137149

138150
##### Cell type differentiation mean signal violin plot & functional state bar plot
139-
<img src="https://github.com/guanjue/snapshot/blob/master/test_data/output_result/signal_violin/5.0_1_1_0.violin.png" width="400"/> <img src="https://github.com/guanjue/snapshot/blob/master/test_data/output_result/fun_bar/5.0_1_1_0.bar.png" width="400"/>
151+
<img src="https://github.com/guanjue/snapshot/blob/master/test_data/example/6.1_0_0_1.violin.pdf" width="400"/> <img src="https://github.com/guanjue/snapshot/blob/master/test_data/example/6.1_0_0_1.bar.pdf" width="400"/>
140152

141153

142154
##### Merged peak file (bed format)
143155
```
144-
>>> head atac_4cell.sort.bed
145-
chr1 3445639 3446478 chr1_3445639_3446478
146-
chr1 3531951 3532124 chr1_3531951_3532124
147-
chr1 3670451 3671268 chr1_3670451_3671268
148-
chr1 3672091 3672710 chr1_3672091_3672710
149-
chr1 3915538 3915756 chr1_3915538_3915756
150-
chr1 4247201 4247354 chr1_4247201_4247354
151-
chr1 4332543 4332767 chr1_4332543_4332767
152-
chr1 4351941 4352297 chr1_4351941_4352297
153-
chr1 4405847 4406057 chr1_4405847_4406057
154-
chr1 4412515 4412820 chr1_4412515_4412820
156+
>>> head output_result/snapshot_test_run.sort.bed
157+
chr1 3445639 3446478 1
158+
chr1 3531951 3532124 2
159+
chr1 3670451 3671268 3
160+
chr1 3672091 3672710 4
161+
chr1 3915538 3915756 5
162+
chr1 4247201 4247354 6
163+
chr1 4332543 4332767 7
164+
chr1 4351941 4352297 8
165+
chr1 4405847 4406057 9
166+
chr1 4412515 4412820 10
155167
```
156168

157169
##### Index set mean signal matrix (bed format)
158170
```
159-
>>> head atac_4cell.meansig.txt
160-
0_0_0_1 1.0613451945782413 1.1849577200056323 0.5294839829940231 2.928835175525287
161-
0_0_1_0 1.0671246828371792 1.2664917833388463 4.556785451444871 0.9371314051064104
162-
0_0_1_1 1.50932230122 0.9744568211300001 1.94595219793 1.8584097730899998
163-
0_1_0_0 1.4409803894962963 2.8776475408259254 1.4850287042000003 1.4514520181962964
164-
0_1_0_1 1.1662457074078123 3.1474988663281254 0.5663094448585938 4.87743563546875
165-
0_1_1_0 1.5020250353344822 3.7340121203655174 6.566498942344828 1.2791282110241378
166-
0_1_1_1 1.543835688075 3.2044735005000002 4.55608701975 1.8168650337750003
167-
1_0_0_0 3.0386503686410262 1.1333250568846154 0.8024231362858975 1.2217726620948723
168-
1_0_0_1 3.243331680793104 1.4948123895824135 0.7377843294110347 5.471914041772415
169-
1_0_1_0 2.8438892549 2.0149830489600005 3.2194045138 1.9493869683600002
170-
171+
>>> head output_result/snapshot_test_run.meansig.txt
172+
0_0_0_1 1.1942193919753075 1.2288909759259259 0.5143000398765433 2.2558045432098757
173+
0_0_1_0 1.1964813092783506 1.4681236288659794 4.559413402061856 0.8977813195876289
174+
0_1_0_0 2.3508093636363636 5.0732800000000005 2.2966802727272726 2.4899857272727273
175+
0_1_0_1 1.2273509024999998 3.1057296249999986 0.5968926647500001 4.654063825
176+
0_1_1_0 1.4231207 4.145504333333332 6.617822999999999 1.2878787333333332
177+
1_0_0_0 3.55514962962963 1.6250158148148148 1.213055225925926 1.6773348888888886
178+
1_0_0_1 3.7572421052631584 1.427374894736842 0.6267781684210526 7.191793684210525
179+
1_0_1_0 2.780161818181819 2.0187935000000006 2.9209631363636364 1.9096209545454548
180+
1_1_0_0 5.526250833333332 3.49659475 0.6552302083333333 1.249175055555556
181+
1_1_0_1 6.688364330508474 7.341211720338984 0.9215638576271191 8.187531830508476
171182
```
172183

173184
##### Index signal matrix (bed format)
174185
```
175-
>>> head atac_4cell.sig.txt
176-
chr1_13592001_13592161 0_0_0_1 2.822207311 1.115653708 0.2218317345 8.850424476
177-
chr1_6975366_6975635 0_0_0_1 0.5263196312 0.7616906814 0.2218317345 1.920846614
178-
chr1_7053436_7053652 0_0_0_1 2.527397664 0.7896970943 0.8979158313 2.495271575
179-
chr1_13119493_13119701 0_0_0_1 1.301046319 0.6508195845 0.2326848409 1.060591172
180-
chr1_7109537_7109689 0_0_0_1 0.0 0.0 0.0 0.0
181-
chr1_13050969_13051145 0_0_0_1 0.197709028 0.8361873878 0.665887475 1.311687159
182-
chr1_16563017_16563247 0_0_0_1 1.195682715 1.589561904 0.4430346537 2.032925361
183-
chr1_13125691_13125919 0_0_0_1 2.450325898 2.278048049 1.294899657 1.557048213
184-
chr1_12985993_12986340 0_0_0_1 0.7858383109 0.4782726234 0.251386741 2.945028392
185-
chr1_7589470_7589648 0_0_0_1 1.198632728 0.5693356928 0.1489476907 2.255937684
186-
186+
>>> head output_result/snapshot_test_run.sig.txt
187+
158 0_0_0_1 0.461536 0.97629 0.420611 1.17384
188+
122 0_0_0_1 1.05508 1.59315 0.332113 3.5154
189+
124 0_0_0_1 0.570329 1.321 0.197305 5.40436
190+
590 0_0_0_1 0.495322 0.82881 0.202169 1.4323
191+
126 0_0_0_1 1.05409 0.395961 0.25931 2.14795
192+
589 0_0_0_1 2.23673 1.88691 0.730118 3.32344
193+
587 0_0_0_1 1.57064 0.631929 0.13965 0.517572
194+
129 0_0_0_1 0.5076 0.613323 0.229215 1.54835
195+
130 0_0_0_1 1.38835 2.22171 0.0815782 3.49794
196+
121 0_0_0_1 2.41654 3.42985 0.510925 5.30627
187197
```
188198

189199
##### Index set most abundant functional state matrix (bed format)
190200
```
191-
>>> head atac_4cell.indexset_fun.txt
192-
0_0_0_1 0 0 0 0
201+
>>> head output_result/snapshot_test_run.indexset_fun.txt
202+
0_0_0_1 0 0 0 20
193203
0_0_1_0 0 0 20 0
194-
0_0_1_1 0 0 0 0
195-
0_1_0_0 0 0 0 0
196-
0_1_0_1 0 20 0 20
197-
0_1_1_0 0 20 12 0
198-
0_1_1_1 0 11 11 0
204+
0_1_0_0 0 20 11 20
205+
0_1_0_1 0 20 0 12
206+
0_1_1_0 0 20 12 1
199207
1_0_0_0 20 0 0 0
200-
1_0_0_1 20 0 0 12
201-
1_0_1_0 20 0 25 0
202-
208+
1_0_0_1 20 4 0 12
209+
1_0_1_0 20 0 0 0
210+
1_1_0_0 12 20 0 0
211+
1_1_0_1 12 12 0 12
203212
```
204213

205214
##### Index functional state matrix (bed format)
206215
```
207-
>>> head atac_4cell.fun.txt
208-
chr1_13592001_13592161 0_0_0_1 12 4 0 12
209-
chr1_6975366_6975635 0_0_0_1 0 0 0 0
210-
chr1_7053436_7053652 0_0_0_1 20 0 0 20
211-
chr1_13119493_13119701 0_0_0_1 0 0 0 0
212-
chr1_7109537_7109689 0_0_0_1 0 4 4 11
213-
chr1_13050969_13051145 0_0_0_1 0 0 0 0
214-
chr1_16563017_16563247 0_0_0_1 0 0 0 20
215-
chr1_13125691_13125919 0_0_0_1 20 20 20 20
216-
chr1_12985993_12986340 0_0_0_1 0 5 10 20
217-
chr1_7589470_7589648 0_0_0_1 0 0 0 0
218-
216+
>>> head output_result/snapshot_test_run.fun.txt
217+
158 0_0_0_1 7 4 0 0
218+
122 0_0_0_1 0 11 7 11
219+
124 0_0_0_1 4 4 0 20
220+
590 0_0_0_1 0 0 0 0
221+
126 0_0_0_1 0 0 0 4
222+
589 0_0_0_1 25 13 0 20
223+
587 0_0_0_1 7 7 7 7
224+
129 0_0_0_1 10 0 0 0
225+
130 0_0_0_1 4 20 0 20
226+
121 0_0_0_1 20 20 0 15
219227
```
220228

221229

bin/.DS_Store

0 Bytes
Binary file not shown.

bin/plot_fun_bar.R

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ signal_matrix = signal_matrix_od[ , c(3:dim(signal_matrix_od)[2]) ]
2525
class(signal_matrix) = 'numeric'
2626
###### read colnames file
2727
colname_file = read.table(signal_input_list, header=F)
28-
colname = colname_file[,2]
28+
colname = colname_file[,1]
2929
colnames(signal_matrix) = colname
3030

3131
### index set
@@ -45,7 +45,9 @@ ideas_state_matrix_uniq_sort = sort(ideas_state_matrix_uniq)
4545
### set heatmap colors
4646
print('set heatmap colors')
4747
rgb_col_num = read.table(ideas_state_color,header=F)
48-
rgb_col_num = rgb_col_num[,3]
48+
rgb_col_num = as.character(rev(rgb_col_num[,2]))
49+
print(rgb_col_num)
50+
rgb_col_num = c(rgb_col_num, rgb_col_num[length(rgb_col_num)])
4951
rgb_col_num = rgb_col_num[c((length(rgb_col_num)-1):1,length(rgb_col_num))]
5052
rgb_col_num = as.matrix(rgb_col_num)
5153
#print(rgb_col_num)
@@ -77,7 +79,7 @@ for (k in c(1:length(index_set_id_uniq_sort))){
7779
colnames(counts_matrix_t) = colnames(signal_matrix)
7880

7981
### save figure
80-
png(paste(toString(k-1), '.', toString(index_set_id_uniq_sort[k]), '.', cREs_IDEASpro_outfile, sep=''), dim(signal_matrix)[2]*100+5, dim(signal_matrix)[2]*100+5)
82+
pdf(paste(toString(k-1), '.', toString(index_set_id_uniq_sort[k]), '.', cREs_IDEASpro_outfile, sep=''), dim(signal_matrix)[2]*1+5, dim(signal_matrix)[2]*1+5)
8183
barplot(counts_matrix_t, col=my_colorbar)
8284
dev.off()
8385
}

0 commit comments

Comments
 (0)