Skip to content

Commit 171b08c

Browse files
committed
[BENCHMARK] Switch the reference genome used for E. coli
1 parent c18eecd commit 171b08c

27 files changed

+374
-434
lines changed

bucket_map/benchmark/benchmark.sh

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#!/bin/bash
22

3-
GENOME_FILE="/mnt/d/genome/Egu.v3.genome_f.fasta"
3+
GENOME_FILE="/mnt/d/genome/GCA_000005845.2.fasta"
44
BENCHMARK_PATH="/home/zhenhao/bucket-map/bucket_map/benchmark"
5-
QUERY_FILE="/mnt/d/genome/TS1.81.90.001.fq"
6-
INDICATOR="egu"
5+
QUERY_FILE="/mnt/d/genome/DRR035999.fastq"
6+
INDICATOR="EColi"
77

88
# initialize benchmark directory if it doesn't exist
99
mkdir -p ${BENCHMARK_PATH}

bucket_map/benchmark/benchmark_index.sh

+4-4
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,16 @@ echo "Indexing using bowtie2"
1313

1414
# run indexing for bwa
1515
echo "Indexing using bwa"
16-
#/usr/bin/time -o "${BENCHMARK_PATH}/log/bwa_index.time" -v bwa index -p "${INDEX_INDICATOR}_bwa" ${FASTA_PATH} &> "${BENCHMARK_PATH}/log/bwa_index.log"
16+
/usr/bin/time -o "${BENCHMARK_PATH}/log/bwa_index.time" -v bwa index -p "${INDEX_INDICATOR}_bwa" ${FASTA_PATH} &> "${BENCHMARK_PATH}/log/bwa_index.log"
1717

1818
# run indexing for subread
1919
echo "Indexing using subread"
20-
#/usr/bin/time -o "${BENCHMARK_PATH}/log/subread_index.time" -v subread-buildindex -o "${INDEX_INDICATOR}_subread" ${FASTA_PATH} &> "${BENCHMARK_PATH}/log/subread_index.log"
20+
/usr/bin/time -o "${BENCHMARK_PATH}/log/subread_index.time" -v subread-buildindex -o "${INDEX_INDICATOR}_subread" ${FASTA_PATH} &> "${BENCHMARK_PATH}/log/subread_index.log"
2121

2222
# run indexing for minimap2
2323
echo "Indexing using minimap2"
24-
#/usr/bin/time -o "${BENCHMARK_PATH}/log/minimap2_index.time" -v minimap2 -d "${INDEX_INDICATOR}_minimap.mmi" ${FASTA_PATH} &> "${BENCHMARK_PATH}/log/minimap2_index.log"
24+
/usr/bin/time -o "${BENCHMARK_PATH}/log/minimap2_index.time" -v minimap2 -d "${INDEX_INDICATOR}_minimap.mmi" ${FASTA_PATH} &> "${BENCHMARK_PATH}/log/minimap2_index.log"
2525

2626
# run indexing for BucketMap
2727
echo "Indexing using BucketMap"
28-
#/usr/bin/time -o "${BENCHMARK_PATH}/log/bucketmap_index.time" -v bucketmap -x -i "${INDEX_INDICATOR}_bucketmap" &> "${BENCHMARK_PATH}/log/bucketmap_index.log"
28+
/usr/bin/time -o "${BENCHMARK_PATH}/log/bucketmap_index.time" -v bucketmap -x -i "${INDEX_INDICATOR}_bucketmap" &> "${BENCHMARK_PATH}/log/bucketmap_index.log"

bucket_map/benchmark/benchmark_map.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ echo "Mapping using minimap2"
2828

2929
# run bucketmap
3030
echo "Mapping using BucketMap"
31-
/usr/bin/time -o "${BENCHMARK_PATH}/log/bucketmap_map.time" -v bucketmap --version-check 0 -r 150 -i "${INDEX_INDICATOR}_bucketmap" -q ${FASTQ_PATH} -o "${BENCHMARK_PATH}/output/bucketmap_map.sam" &> "${BENCHMARK_PATH}/log/bucketmap_map.log"
31+
/usr/bin/time -o "${BENCHMARK_PATH}/log/bucketmap_map.time" -v bucketmap --version-check 0 -r 310 -s 30 -e 0.6 -i "${INDEX_INDICATOR}_bucketmap" -q ${FASTQ_PATH} -o "${BENCHMARK_PATH}/output/bucketmap_map.sam" &> "${BENCHMARK_PATH}/log/bucketmap_map.log"
3232

3333
echo "Mapping using BucketMap_align"
34-
/usr/bin/time -o "${BENCHMARK_PATH}/log/bucketmap_align_map.time" -v bucketmap_align --version-check 0 -r 150 -u 30 -i "${INDEX_INDICATOR}_bucketmap" -q ${FASTQ_PATH} -o "${BENCHMARK_PATH}/output/bucketmap_align_map.sam" &> "${BENCHMARK_PATH}/log/bucketmap_align_map.log"
34+
/usr/bin/time -o "${BENCHMARK_PATH}/log/bucketmap_align_map.time" -v bucketmap_align --version-check 0 -r 310 -s 30 -e 0.6 -u 30 -i "${INDEX_INDICATOR}_bucketmap" -q ${FASTQ_PATH} -o "${BENCHMARK_PATH}/output/bucketmap_align_map.sam" &> "${BENCHMARK_PATH}/log/bucketmap_align_map.log"

bucket_map/benchmark/log/bowtie2_index.log

+48-48
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Settings:
1616
Random seed: 0
1717
Sizeofs: void*:8, int:4, long:8, size_t:8
1818
Input files DNA, FASTA:
19-
/mnt/d/genome/GCA_900166955.1.fasta
19+
/mnt/d/genome/GCA_000005845.2.fasta
2020
Building a SMALL index
2121
Reading reference sizes
2222
Time reading reference sizes: 00:00:00
@@ -25,10 +25,10 @@ Writing header
2525
Reserving space for joined string
2626
Joining reference sequences
2727
Time to join reference sequences: 00:00:00
28-
bmax according to bmaxDivN setting: 1383039
29-
Using parameters --bmax 1037280 --dcv 1024
28+
bmax according to bmaxDivN setting: 1160413
29+
Using parameters --bmax 870310 --dcv 1024
3030
Doing ahead-of-time memory usage test
31-
Passed! Constructing with these parameters: --bmax 1037280 --dcv 1024
31+
Passed! Constructing with these parameters: --bmax 870310 --dcv 1024
3232
Constructing suffix-array element generator
3333
Building DifferenceCoverSample
3434
Building sPrime
@@ -52,33 +52,33 @@ Multikey QSorting 12 samples
5252
Calculating bucket sizes
5353
Splitting and merging
5454
Splitting and merging time: 00:00:00
55-
Avg bucket size: 5.53216e+06 (target: 1037279)
55+
Avg bucket size: 4.64165e+06 (target: 870309)
5656
Converting suffix-array elements to index image
5757
Allocating ftab, absorbFtab
5858
Entering Ebwt loop
5959
Getting block 1 of 1
6060
No samples; assembling all-inclusive block
61-
Sorting block of length 5532158 for bucket 1
61+
Sorting block of length 4641652 for bucket 1
6262
(Using difference cover)
6363
Sorting block time: 00:00:00
64-
Returning block of 5532159 for bucket 1
64+
Returning block of 4641653 for bucket 1
6565
Exited Ebwt loop
6666
fchr[A]: 0
67-
fchr[C]: 1368477
68-
fchr[G]: 2761687
69-
fchr[T]: 4159906
70-
fchr[$]: 5532158
67+
fchr[C]: 1142742
68+
fchr[G]: 2322833
69+
fchr[T]: 3500270
70+
fchr[$]: 4641652
7171
Exiting Ebwt::buildToDisk()
7272
Returning from initFromVector
73-
Wrote 6039197 bytes to primary EBWT file: EColi_bowtie.1.bt2
74-
Wrote 1383044 bytes to secondary EBWT file: EColi_bowtie.2.bt2
73+
Wrote 5741809 bytes to primary EBWT file: EColi_bowtie.1.bt2
74+
Wrote 1160420 bytes to secondary EBWT file: EColi_bowtie.2.bt2
7575
Re-opening _in1 and _in2 as input streams
7676
Returning from Ebwt constructor
7777
Headers:
78-
len: 5532158
79-
bwtLen: 5532159
80-
sz: 1383040
81-
bwtSz: 1383040
78+
len: 4641652
79+
bwtLen: 4641653
80+
sz: 1160413
81+
bwtSz: 1160414
8282
lineRate: 6
8383
offRate: 4
8484
offMask: 0xfffffff0
@@ -87,19 +87,19 @@ Headers:
8787
eftabSz: 80
8888
ftabLen: 1048577
8989
ftabSz: 4194308
90-
offsLen: 345760
91-
offsSz: 1383040
90+
offsLen: 290104
91+
offsSz: 1160416
9292
lineSz: 64
9393
sideSz: 64
9494
sideBwtSz: 48
9595
sideBwtLen: 192
96-
numSides: 28814
97-
numLines: 28814
98-
ebwtTotLen: 1844096
99-
ebwtTotSz: 1844096
96+
numSides: 24176
97+
numLines: 24176
98+
ebwtTotLen: 1547264
99+
ebwtTotSz: 1547264
100100
color: 0
101101
reverse: 0
102-
Total time for call to driver() for forward index: 00:00:01
102+
Total time for call to driver() for forward index: 00:00:00
103103
Reading reference sizes
104104
Time reading reference sizes: 00:00:00
105105
Calculating joined length
@@ -108,10 +108,10 @@ Reserving space for joined string
108108
Joining reference sequences
109109
Time to join reference sequences: 00:00:00
110110
Time to reverse reference sequence: 00:00:00
111-
bmax according to bmaxDivN setting: 1383039
112-
Using parameters --bmax 1037280 --dcv 1024
111+
bmax according to bmaxDivN setting: 1160413
112+
Using parameters --bmax 870310 --dcv 1024
113113
Doing ahead-of-time memory usage test
114-
Passed! Constructing with these parameters: --bmax 1037280 --dcv 1024
114+
Passed! Constructing with these parameters: --bmax 870310 --dcv 1024
115115
Constructing suffix-array element generator
116116
Building DifferenceCoverSample
117117
Building sPrime
@@ -135,33 +135,33 @@ Multikey QSorting 12 samples
135135
Calculating bucket sizes
136136
Splitting and merging
137137
Splitting and merging time: 00:00:00
138-
Avg bucket size: 5.53216e+06 (target: 1037279)
138+
Avg bucket size: 4.64165e+06 (target: 870309)
139139
Converting suffix-array elements to index image
140140
Allocating ftab, absorbFtab
141141
Entering Ebwt loop
142142
Getting block 1 of 1
143143
No samples; assembling all-inclusive block
144-
Sorting block of length 5532158 for bucket 1
144+
Sorting block of length 4641652 for bucket 1
145145
(Using difference cover)
146-
Sorting block time: 00:00:00
147-
Returning block of 5532159 for bucket 1
146+
Sorting block time: 00:00:01
147+
Returning block of 4641653 for bucket 1
148148
Exited Ebwt loop
149149
fchr[A]: 0
150-
fchr[C]: 1368477
151-
fchr[G]: 2761687
152-
fchr[T]: 4159906
153-
fchr[$]: 5532158
150+
fchr[C]: 1142742
151+
fchr[G]: 2322833
152+
fchr[T]: 3500270
153+
fchr[$]: 4641652
154154
Exiting Ebwt::buildToDisk()
155155
Returning from initFromVector
156-
Wrote 6039197 bytes to primary EBWT file: EColi_bowtie.rev.1.bt2
157-
Wrote 1383044 bytes to secondary EBWT file: EColi_bowtie.rev.2.bt2
156+
Wrote 5741809 bytes to primary EBWT file: EColi_bowtie.rev.1.bt2
157+
Wrote 1160420 bytes to secondary EBWT file: EColi_bowtie.rev.2.bt2
158158
Re-opening _in1 and _in2 as input streams
159159
Returning from Ebwt constructor
160160
Headers:
161-
len: 5532158
162-
bwtLen: 5532159
163-
sz: 1383040
164-
bwtSz: 1383040
161+
len: 4641652
162+
bwtLen: 4641653
163+
sz: 1160413
164+
bwtSz: 1160414
165165
lineRate: 6
166166
offRate: 4
167167
offMask: 0xfffffff0
@@ -170,16 +170,16 @@ Headers:
170170
eftabSz: 80
171171
ftabLen: 1048577
172172
ftabSz: 4194308
173-
offsLen: 345760
174-
offsSz: 1383040
173+
offsLen: 290104
174+
offsSz: 1160416
175175
lineSz: 64
176176
sideSz: 64
177177
sideBwtSz: 48
178178
sideBwtLen: 192
179-
numSides: 28814
180-
numLines: 28814
181-
ebwtTotLen: 1844096
182-
ebwtTotSz: 1844096
179+
numSides: 24176
180+
numLines: 24176
181+
ebwtTotLen: 1547264
182+
ebwtTotSz: 1547264
183183
color: 0
184184
reverse: 1
185-
Total time for backward call to driver() for mirror index: 00:00:00
185+
Total time for backward call to driver() for mirror index: 00:00:01

bucket_map/benchmark/log/bowtie2_index.time

+12-12
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
1-
Command being timed: "bowtie2-build /mnt/d/genome/GCA_900166955.1.fasta EColi_bowtie"
2-
User time (seconds): 1.51
3-
System time (seconds): 0.13
4-
Percent of CPU this job got: 94%
5-
Elapsed (wall clock) time (h:mm:ss or m:ss): 0:01.74
1+
Command being timed: "bowtie2-build /mnt/d/genome/GCA_000005845.2.fasta EColi_bowtie"
2+
User time (seconds): 1.28
3+
System time (seconds): 0.09
4+
Percent of CPU this job got: 97%
5+
Elapsed (wall clock) time (h:mm:ss or m:ss): 0:01.41
66
Average shared text size (kbytes): 0
77
Average unshared data size (kbytes): 0
88
Average stack size (kbytes): 0
99
Average total size (kbytes): 0
10-
Maximum resident set size (kbytes): 103104
10+
Maximum resident set size (kbytes): 101960
1111
Average resident set size (kbytes): 0
12-
Major (requiring I/O) page faults: 109
13-
Minor (reclaiming a frame) page faults: 12988
14-
Voluntary context switches: 418
15-
Involuntary context switches: 3
12+
Major (requiring I/O) page faults: 0
13+
Minor (reclaiming a frame) page faults: 15265
14+
Voluntary context switches: 193
15+
Involuntary context switches: 1
1616
Swaps: 0
17-
File system inputs: 25488
18-
File system outputs: 31736
17+
File system inputs: 0
18+
File system outputs: 29272
1919
Socket messages sent: 0
2020
Socket messages received: 0
2121
Signals delivered: 0
+6-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
4922564 reads; of these:
2-
4922564 (100.00%) were unpaired; of these:
3-
462017 (9.39%) aligned 0 times
4-
3478827 (70.67%) aligned exactly 1 time
5-
981720 (19.94%) aligned >1 times
6-
90.61% overall alignment rate
1+
1302395 reads; of these:
2+
1302395 (100.00%) were unpaired; of these:
3+
238893 (18.34%) aligned 0 times
4+
1032371 (79.27%) aligned exactly 1 time
5+
31131 (2.39%) aligned >1 times
6+
81.66% overall alignment rate

bucket_map/benchmark/log/bowtie2_map.time

+11-11
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
1-
Command being timed: "bowtie2 -x egu_bowtie -U /mnt/d/genome/TS1.81.90.001.fq -S /home/zhenhao/bucket-map/bucket_map/benchmark/output/bowtie2_map.sam"
2-
User time (seconds): 743.00
3-
System time (seconds): 7.05
1+
Command being timed: "bowtie2 -x EColi_bowtie -U /mnt/d/genome/DRR035999.fastq -S /home/zhenhao/bucket-map/bucket_map/benchmark/output/bowtie2_map.sam"
2+
User time (seconds): 196.40
3+
System time (seconds): 2.52
44
Percent of CPU this job got: 98%
5-
Elapsed (wall clock) time (h:mm:ss or m:ss): 12:41.37
5+
Elapsed (wall clock) time (h:mm:ss or m:ss): 3:21.23
66
Average shared text size (kbytes): 0
77
Average unshared data size (kbytes): 0
88
Average stack size (kbytes): 0
99
Average total size (kbytes): 0
10-
Maximum resident set size (kbytes): 1961608
10+
Maximum resident set size (kbytes): 30520
1111
Average resident set size (kbytes): 0
12-
Major (requiring I/O) page faults: 115
13-
Minor (reclaiming a frame) page faults: 8232
14-
Voluntary context switches: 100365
15-
Involuntary context switches: 797
12+
Major (requiring I/O) page faults: 0
13+
Minor (reclaiming a frame) page faults: 6632
14+
Voluntary context switches: 31347
15+
Involuntary context switches: 223
1616
Swaps: 0
17-
File system inputs: 3912920
18-
File system outputs: 3700552
17+
File system inputs: 8
18+
File system outputs: 1688680
1919
Socket messages sent: 0
2020
Socket messages received: 0
2121
Signals delivered: 0
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
[INFO] Allowing Smith-Waterman for alignment verifications.
2-
[INFO] Initializing indexer and mapper with bucket length: 65536, and number of buckets: 26507.
2+
[INFO] Initializing indexer and mapper with bucket length: 65536, and number of buckets: 71.
33
[INFO] Set q-gram shape to be: [1,1,1,1,1,1,1,1,1] with number of effective characters: 9
4-
[ERROR] The specified file already exists in directory: "/home/zhenhao/bucket-map/bucket_map/benchmark/index/egu_bucketmap.qgram".
5-
[BENCHMARK] Number of Q-grams with distinguishability >= 0.499981: 251645 (95.9949%).
6-
[BENCHMARK] Elapsed time for loading index files: 16.253 s.
7-
[INFO] Successfully loaded "/home/zhenhao/bucket-map/bucket_map/benchmark/index/egu_bucketmap.qgram".
8-
[BENCHMARK] Elapsed time for bucket mapping: 269.231 s (54.6932 μs/seq).
9-
[BENCHMARK] Number of reads that have at least one candidate bucket: 4841636 (98.356%).
10-
[BENCHMARK] Average number of buckets an original read is mapped to: 2.04986.
11-
[BENCHMARK] Average number of buckets a reverse complement of the read is mapped to: 2.07859.
12-
[BENCHMARK] Total time used for building k-mer index for each bucket: 204.802 s.
13-
[BENCHMARK] Total time used for finding exact location of the sequences: 92.906 s (18.8735 μs/seq).
14-
[BENCHMARK] Total mapped locations: 5422347 (1.10153 per sequence).
15-
[BENCHMARK] Total time used for alignment verification and output: 231.415 s (42.678 μs per pairwise alignment).
4+
[ERROR] The specified file already exists in directory: "/home/zhenhao/bucket-map/bucket_map/benchmark/index/EColi_bucketmap.qgram".
5+
[BENCHMARK] Number of Q-grams with distinguishability >= 0.492958: 249872 (95.3186%).
6+
[BENCHMARK] Elapsed time for loading index files: 0.057 s.
7+
[INFO] Successfully loaded "/home/zhenhao/bucket-map/bucket_map/benchmark/index/EColi_bucketmap.qgram".
8+
[BENCHMARK] Elapsed time for bucket mapping: 14.789 s (11.3552 μs/seq).
9+
[BENCHMARK] Number of reads that have at least one candidate bucket: 1254472 (96.3204%).
10+
[BENCHMARK] Average number of buckets an original read is mapped to: 1.12067.
11+
[BENCHMARK] Average number of buckets a reverse complement of the read is mapped to: 1.13216.
12+
[BENCHMARK] Total time used for building k-mer index for each bucket: 0.245 s.
13+
[BENCHMARK] Total time used for finding exact location of the sequences: 6.599 s (5.06682 μs/seq).
14+
[BENCHMARK] Total mapped locations: 1181663 (0.9073 per sequence).
15+
[BENCHMARK] Total time used for alignment verification and output: 63.521 s (53.7556 μs per pairwise alignment).

bucket_map/benchmark/log/bucketmap_align_map.time

+12-12
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
1-
Command being timed: "bucketmap_align --version-check 0 -r 150 -u 30 -i egu_bucketmap -q /mnt/d/genome/TS1.81.90.001.fq -o /home/zhenhao/bucket-map/bucket_map/benchmark/output/bucketmap_align_map.sam"
2-
User time (seconds): 852.20
3-
System time (seconds): 3.88
4-
Percent of CPU this job got: 99%
5-
Elapsed (wall clock) time (h:mm:ss or m:ss): 14:21.91
1+
Command being timed: "bucketmap_align --version-check 0 -r 310 -s 30 -e 0.6 -u 30 -i EColi_bucketmap -q /mnt/d/genome/DRR035999.fastq -o /home/zhenhao/bucket-map/bucket_map/benchmark/output/bucketmap_align_map.sam"
2+
User time (seconds): 87.01
3+
System time (seconds): 1.73
4+
Percent of CPU this job got: 97%
5+
Elapsed (wall clock) time (h:mm:ss or m:ss): 1:31.01
66
Average shared text size (kbytes): 0
77
Average unshared data size (kbytes): 0
88
Average stack size (kbytes): 0
99
Average total size (kbytes): 0
10-
Maximum resident set size (kbytes): 1081252
10+
Maximum resident set size (kbytes): 169976
1111
Average resident set size (kbytes): 0
12-
Major (requiring I/O) page faults: 2
13-
Minor (reclaiming a frame) page faults: 192708
14-
Voluntary context switches: 24820
15-
Involuntary context switches: 596
12+
Major (requiring I/O) page faults: 0
13+
Minor (reclaiming a frame) page faults: 17050
14+
Voluntary context switches: 9248
15+
Involuntary context switches: 132
1616
Swaps: 0
17-
File system inputs: 16
18-
File system outputs: 3484984
17+
File system inputs: 0
18+
File system outputs: 1445960
1919
Socket messages sent: 0
2020
Socket messages received: 0
2121
Signals delivered: 0
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
[INFO] Not using Smith-Waterman for alignment verifications.
2-
[INFO] Initializing indexer and mapper with bucket length: 32768, and number of buckets: 52426.
2+
[INFO] Initializing indexer and mapper with bucket length: 65536, and number of buckets: 71.
33
[INFO] Set q-gram shape to be: [1,1,1,1,1,1,1,1,1] with number of effective characters: 9
4-
[ERROR] The specified file already exists in directory: "/home/zhenhao/bucket-map/bucket_map/benchmark/index/egu_bucketmap_2.qgram".
4+
[INFO] The bucket q-gram index is stored in: "/home/zhenhao/bucket-map/bucket_map/benchmark/index/EColi_bucketmap.qgram".
5+
[INFO] The number of buckets: 71.
6+
[INFO] The bucket ids are stored in: "/home/zhenhao/bucket-map/bucket_map/benchmark/index/EColi_bucketmap.bucket_id".
7+
[BENCHMARK] Elapsed time for creating and storing index files: 0.101 s.

0 commit comments

Comments
 (0)