-
Notifications
You must be signed in to change notification settings - Fork 33
Open
Description
Dear developer, hello, I have some questions I'd like to ask!
I am currently using verkko to assemble a homologous hexaploid using only HiFi data. The verkko strategy combining HiFi and ONT has already completed its run much earlier. But my HIFI-only strategy never seems to finish running.
Why does the script 5-untip/untip.sh take such a long time to run? It's been stuck at the following point for 3-4 days now:
Combine mappings
Combine edges
Find lengths
Fix coverage
Pop bubbles based on coverageIs my genome too fragmented? The file nodecov_hifi_fix.csv has a lot of lines. What should I do to complete untip.sh faster?
cat nodecov_hifi_fix.csv|wc -l
1508215-untip/untip.sh:
#!/bin/sh
set -e
/home//anaconda3/envs/verkko/lib/verkko/scripts/untip_relative.py 15000 5000 0.1 <../2-processGraph/unitig-unrolled-hifi-resolved.gfa > connected-tip.gfa
/home//anaconda3/envs/verkko/lib/verkko/scripts/unitigify.py utig3a- unitig-mapping-3a.txt \
< connected-tip.gfa \
> unitig-connected-tip.gfa
echo Combine mappings
cat ../2-processGraph/unitig-mapping-1.txt ../2-processGraph/haplofix-mapping.txt ../2-processGraph/unroll_mapping_1.txt unitig-mapping-3a.txt \
> combined-nodemap-1.txt
echo Combine edges
cat ../1-buildGraph/hifi-resolved.gfa ../2-processGraph/gapped-once-hifi-resolved.gfa ../2-processGraph/gapped-twice-hifi-resolved.gfa ../2-processGraph/gapped-hifi-resolved.gfa ../2-processGraph/fixed-hifi-resolved.gfa ../2-processGraph/unrolled-hifi-resolved.gfa ../2-processGraph/unitig-unrolled-hifi-resolved.gfa connected-tip.gfaunitig-connected-tip.gfa \
| grep '^L' \
> combined-edges-1.gfa
# this used to have grep -v \*
#
echo Find lengths
cat ../1-buildGraph/hifi-resolved.gfa ../2-processGraph/gapped-once-hifi-resolved.gfa ../2-processGraph/gapped-twice-hifi-resolved.gfa ../2-processGraph/gapped-hifi-resolved.gfa ../2-processGraph/fixed-hifi-resolved.gfa ../2-processGraph/unrolled-hifi-resolved.gfa ../2-processGraph/unitig-unrolled-hifi-resolved.gfa connected-tip.gfaunitig-connected-tip.gfa \
| awk 'BEGIN \
{ \
FS="[ \t]+"; OFS="\t"; \
} \
$1 == "S" \
{ \
print $2, length($3); \
}' \
> nodelens-1.txt
echo Fix coverage
/home//anaconda3/envs/verkko/lib/verkko/scripts/get_original_coverage.py \
unitig-connected-tip.gfa \
combined-nodemap-1.txt \
combined-edges-1.gfa \
nodelens-1.txt \
../1-buildGraph/hifi_nodecov.csv \
> ./nodecov_hifi_fix.csv
echo Pop bubbles based on coverage
/home//anaconda3/envs/verkko/lib/verkko/scripts/pop_bubbles_coverage_based.py \
unitig-connected-tip.gfa \
./nodecov_hifi_fix.csv False \
> popped-connected-tip.gfa \
2> popinfo_round1.err
# consider doing this in a loop to avoid code redundancy
echo Unroll simple loops
/home//anaconda3/envs/verkko/lib/verkko/scripts/unroll_simple_loops.py popped-connected-tip.gfa ./nodecov_hifi_fix.csv > unrolled-popped-connected-tip.gfa 2>> unitig-mapping-3a.txt
echo Unitigify 3
/home//anaconda3/envs/verkko/lib/verkko/scripts/unitigify.py utig3b- unitig-mapping-3b.txt \
< unrolled-popped-connected-tip.gfa \
> unitig-unrolled-popped-connected-tip.gfa
echo Combine mappings
cat ../2-processGraph/unitig-mapping-1.txt ../2-processGraph/haplofix-mapping.txt ../2-processGraph/unroll_mapping_1.txt unitig-mapping-3a.txt unitig-mapping-3b.txt \
> combined-nodemap-1.txt
echo Combine edges
cat ../1-buildGraph/hifi-resolved.gfa ../2-processGraph/gapped-once-hifi-resolved.gfa ../2-processGraph/gapped-twice-hifi-resolved.gfa ../2-processGraph/gapped-hifi-resolved.gfa ../2-processGraph/fixed-hifi-resolved.gfa ../2-processGraph/unrolled-hifi-resolved.gfa ../2-processGraph/unitig-unrolled-hifi-resolved.gfa connected-tip.gfaunitig-connected-tip.gfa popped-connected-tip.gfa unrolled-popped-connected-tip.gfaunitig-unrolled-popped-connected-tip.gfa \
| grep '^L' \
> combined-edges-1.gfa
# this used to have grep -v \*
#
echo Find lengths
cat ../1-buildGraph/hifi-resolved.gfa ../2-processGraph/gapped-once-hifi-resolved.gfa ../2-processGraph/gapped-twice-hifi-resolved.gfa ../2-processGraph/gapped-hifi-resolved.gfa ../2-processGraph/fixed-hifi-resolved.gfa ../2-processGraph/unrolled-hifi-resolved.gfa ../2-processGraph/unitig-unrolled-hifi-resolved.gfa connected-tip.gfaunitig-connected-tip.gfa popped-connected-tip.gfa unrolled-popped-connected-tip.gfaunitig-unrolled-popped-connected-tip.gfa \
| awk 'BEGIN \
{ \
FS="[ \t]+"; OFS="\t"; \
} \
$1 == "S" \
{ \
print $2, length($3); \
}' \
> nodelens-1.txt
echo Fix coverage
/home//anaconda3/envs/verkko/lib/verkko/scripts/get_original_coverage.py \
../5-untip/unitig-unrolled-popped-connected-tip.gfa \
combined-nodemap-1.txt \
combined-edges-1.gfa \
nodelens-1.txt \
../1-buildGraph/hifi_nodecov.csv \
>> ./nodecov_hifi_fix.csv
echo Pop bubbles based on coverage round 2
/home//anaconda3/envs/verkko/lib/verkko/scripts/pop_bubbles_coverage_based.py \
unitig-unrolled-popped-connected-tip.gfa \
./nodecov_hifi_fix.csv False \
> popped-unitig-unrolled-popped-connected-tip.gfa \
2> popinfo_round2.err
echo Unroll simple loops round 2
/home//anaconda3/envs/verkko/lib/verkko/scripts/unroll_simple_loops.py popped-unitig-unrolled-popped-connected-tip.gfa ./nodecov_hifi_fix.csv > unrolled-popped-unitig-unrolled-popped-connected-tip.gfa 2>> unitig-mapping-3b.txt
echo Unitigify 4
/home//anaconda3/envs/verkko/lib/verkko/scripts/unitigify.py utig4- unitig-mapping-4.txt < unrolled-popped-unitig-unrolled-popped-connected-tip.gfa > ../5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.gfa
# disable until we support jump links
#
# > pregap-popped-unitig-normal-connected-tip.gfa
#
#echo Add homozygous node scaffold edges
#cat combined-nodemap-1.txt unitig-mapping-3a.txt unitig-mapping-3b.txt unitig-mapping-4.txt > combined-nodemap-2.txt
# /home//anaconda3/envs/verkko/lib/verkko/scripts/add_hom_node_scaffold_edges.py pregap-popped-unitig-normal-connected-tip.gfa combined-nodemap-2.txt homgap > ../5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.gfa
echo Generate Coverage
#
# Generates files used to compute coverage tables. It combines all the
# various bits and pieces of the assembly into three outputs. Which files
# are combined depends on which stage we're at in the assembly process
# (and if ONT data is included); getAllMappings() and getAllGraphs()
# figures this out.
# 5-untip/combined-edges-final.gfa # Edges in the graphs
# 5-untip/nodelens-final.txt # Sequence lines that have actual sequence
# 5-untip/combined-nodemap-final.txt # Maps from nodes to components
#
# Then computes coverage of either HiFi or ONT reads for unitigs at various stages.
# hifi2cov = '2-processGraph/unitig-unrolled-hifi-resolved.hifi-coverage.csv',
# ont2cov = '2-processGraph/unitig-unrolled-hifi-resolved.ont-coverage.csv' if config['withONT'] == "True" else rules.emptyfile.output,
# hifi4cov = '4-processONT/unitig-unrolled-ont-resolved.hifi-coverage.csv' if config['withONT'] == "True" else rules.emptyfile.output,
# ont4cov = '4-processONT/unitig-unrolled-ont-resolved.ont-coverage.csv' if config['withONT'] == "True" else rules.emptyfile.output,
# hifi5cov = '5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.hifi-coverage.csv',
# ont5cov = '5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.ont-coverage.csv' if config['withONT'] == "True" else rules.emptyfile.output,
cat ../2-processGraph/unitig-mapping-1.txt ../2-processGraph/haplofix-mapping.txt ../2-processGraph/unroll_mapping_1.txt ../5-untip/unitig-mapping-3a.txt ../5-untip/unitig-mapping-3b.txt ../5-untip/unitig-mapping-4.txt \
> ./combined-nodemap-final.txt
cat ../1-buildGraph/hifi-resolved.gfa ../2-processGraph/gapped-once-hifi-resolved.gfa ../2-processGraph/gapped-twice-hifi-resolved.gfa ../2-processGraph/gapped-hifi-resolved.gfa ../2-processGraph/fixed-hifi-resolved.gfa ../2-processGraph/unrolled-hifi-resolved.gfa ../2-processGraph/unitig-unrolled-hifi-resolved.gfa ../5-untip/connected-tip.gfa ../5-untip/unitig-connected-tip.gfa ../5-untip/popped-connected-tip.gfa ../5-untip/unrolled-popped-connected-tip.gfa ../5-untip/unitig-unrolled-popped-connected-tip.gfa ../5-untip/popped-unitig-unrolled-popped-connected-tip.gfa ../5-untip/unrolled-popped-unitig-unrolled-popped-connected-tip.gfa ../5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.gfa \
| grep '^L' \
> ./combined-edges-final.gfa
cat ../1-buildGraph/hifi-resolved.gfa ../2-processGraph/gapped-once-hifi-resolved.gfa ../2-processGraph/gapped-twice-hifi-resolved.gfa ../2-processGraph/gapped-hifi-resolved.gfa ../2-processGraph/fixed-hifi-resolved.gfa ../2-processGraph/unrolled-hifi-resolved.gfa ../2-processGraph/unitig-unrolled-hifi-resolved.gfa ../5-untip/connected-tip.gfa ../5-untip/unitig-connected-tip.gfa ../5-untip/popped-connected-tip.gfa ../5-untip/unrolled-popped-connected-tip.gfa ../5-untip/unitig-unrolled-popped-connected-tip.gfa ../5-untip/popped-unitig-unrolled-popped-connected-tip.gfa ../5-untip/unrolled-popped-unitig-unrolled-popped-connected-tip.gfa ../5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip.gfa \
| awk 'BEGIN \
{ \
FS="[ \t]+"; OFS="\t"; \
} \
($1 == "S") && ($3 != "*") \
{ \
print $2, length($3); \
}' \
> ./nodelens-final.txt
for gg in 2-processGraph/unitig-unrolled-hifi-resolved \
4-processONT/unitig-unrolled-ont-resolved \
5-untip/unitig-unrolled-unitig-unrolled-popped-unitig-normal-connected-tip ; do
if [ -e ../$gg.gfa ] ; then
/home//anaconda3/envs/verkko/lib/verkko/scripts/get_original_coverage.py ../$gg.gfa ./combined-nodemap-final.txt ./combined-edges-final.gfa ./nodelens-final.txt ../1-buildGraph/hifi_nodecov.csv > ../$gg.hifi-coverage.csv
fi
if [ -e ../$gg.gfa ] ; then
/home//anaconda3/envs/verkko/lib/verkko/scripts/get_original_coverage.py ../$gg.gfa ./combined-nodemap-final.txt ./combined-edges-final.gfa ./nodelens-final.txt ../emptyfile > ../$gg.ont-coverage.csv
fi
doneMetadata
Metadata
Assignees
Labels
No labels