From c2315ad3ea8fd1b886c4ab3476ef990db43e6a47 Mon Sep 17 00:00:00 2001
From: pmitev
Date: Mon, 18 Jan 2021 17:29:19 +0100
Subject: [PATCH] fixes

---
 docs/Bio/NCBI-taxonomy.md | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/docs/Bio/NCBI-taxonomy.md b/docs/Bio/NCBI-taxonomy.md
index 5bad557d..f963d416 100644
--- a/docs/Bio/NCBI-taxonomy.md
+++ b/docs/Bio/NCBI-taxonomy.md
@@ -171,6 +171,39 @@ $ ./01.tabulate-names.awk <(bzcat names.dmp.bz2) | bzip2 -c > names.tab.bz2
     function Cap (string) { return toupper(substr(string,0,1))substr(string,2) }
     ```
 
+Note that this script keeps only the last matching entry for each ID. To keep the first one instead, every subsequent match for the same ID must be ignored, as in the variant below.
+
+
+??? note "01.tabulate-names-first.awk"
+    ``` awk
+    #!/usr/bin/awk -f
+    BEGIN{
+      FS="|"
+      # print "#taxonID scientific_name common_name genbank_common_name"
+    }
+    # Only the first name of each class is stored per ID; $1*1 makes the ID numeric.
+    $4 ~ "scientific name"     { if (! sciname[$1*1] )  sciname[$1*1]= unds(Clean($2)); next}
+    # "genbank common name" must be tested first: ~ "common name" matches it too.
+    $4 ~ "genbank common name" { if (! genbank[$1*1] )  genbank[$1*1]= unds(Clean($2)); next}
+
+    $4 ~ "common name"         { if (! com_name[$1*1] ) com_name[$1*1]= Cap(Clean($2)); next}
+    # Print ID|scientific|common|genbank for every ID that has a scientific name.
+    END{
+      for(i in sciname) print i"|"sciname[i]"|"com_name[i]"|"genbank[i]
+    }
+    # Clean: strip leading and trailing spaces/tabs.
+    function Clean (string){
+      sub(/^[ \t]+/, "",string)
+      sub(/[ \t]+$/, "",string)
+      return string
+    }
+    # unds: replace every space with an underscore.
+    function unds(string) { gsub(" ","_",string); return string}
+    # Cap: upper-case the first character; awk string positions start at 1, not 0.
+    function Cap (string) { return toupper(substr(string,1,1))substr(string,2) }
+    ```
+
+
 ---
 
 ### Step 2