Updated phyTools::extract_intergenic_from_genbank and phyTools::extract_CDSs_from_genbank to use protein_id instead of GI numbers, which NCBI will phase out in Sept 2016

eead-csic-compbio · eead-csic-compbio · commit e525725de579 · 2016-06-23T17:30:04.000+02:00
https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -138,3 +138,5 @@
 20052016: make_nr_pangenome_matrix.pl can now take all references matching nr clusters (see var $ONLYBESTREFHIT)
 27052016: improved the description of $MIN_PERSEQID_HOM_EST and $MIN_COVERAGE_HOM_EST and added a FAQ
 27052016: improved the description of $MIN_PERSEQID_HOM and $MIN_COVERAGE_HOM and added a FAQ
+23062016: updated phyTools extract_intergenic_from_genbank & extract_CDSs_from_genbank as GIs will be deprecated in Sept2016 (https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers)
+23062016: updated manuals
diff --git a/lib/marfil_homology.pm b/lib/marfil_homology.pm
@@ -3219,7 +3219,7 @@ sub find_OMCL_clusters
 }
 
 # returns: adds all orthologies found by orth all against all comparisons
-# uses globals. %graph,%weight,$bpofile,$last_graph_item
+# uses globals: %graph,%weight,$bpofile,$last_graph_item
 # modified from original orthomcl.pl, last change in May2012
 # jan2015 added $ref_full_sequence_taxa
 sub findAllOrthologiesORTHMCL
diff --git a/lib/phyTools.pm b/lib/phyTools.pm
@@ -620,14 +620,15 @@ sub add_labels2newick_tree
   return join(";\n",split(/;/,$fully_labelled_tree));
 }
 
+# Updated Jun2016
 sub extract_intergenic_from_genbank
 {
-
-# takes a genbank input file and creates a FNA file containing all intergenic sequences found
-# inspired by http://bioperl.org/pipermail/bioperl-l/2006-March/021065.html
-# use $min_intergenic_size = 0 to skip minimal length tests
-# use $max_intergenic_size = 0 to skip maximum length test
-# use $length_flanking_ORFs > 0 if you want to cut oligonucleotides from both flanking ORFs to be used as PCR anchors
+  # takes a genbank input file and creates a FNA file containing all intergenic sequences found
+  # inspired by http://bioperl.org/pipermail/bioperl-l/2006-March/021065.html
+  # use $min_intergenic_size = 0 to skip minimal length tests
+  # use $max_intergenic_size = 0 to skip maximum length test
+  # use $length_flanking_ORFs > 0 if you want to cut oligonucleotides from both flanking ORFs to be used as PCR anchors
+  
   my ($infile,$out_intergenic_file,$min_intergenic_size,$max_intergenic_size,$length_flanking_ORFs) = @_;
   if(!defined($length_flanking_ORFs) || $length_flanking_ORFs < 0){ $length_flanking_ORFs = 0 }
   my ($n_of_intergenic,$gi,$start,$end,$length,$strand,$genename,$taxon) = (0);
@@ -643,17 +644,24 @@ sub extract_intergenic_from_genbank
     $taxon = '';
     for my $f ($seq->get_SeqFeatures)
     {
-      if($f->primary_tag =~ /CDS|rRNA|tRNA/) # campos de 'genes'
+      if($f->primary_tag =~ /CDS|rRNA|tRNA/) # ~genes
       {
+        #GI deprecated on Sept2016 https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers/
         $gi = $genename = ''; # compatible con subrutina extract_CDS_from_genbank
-        if($f->has_tag('db_xref'))
+        #if($f->has_tag('db_xref'))
+        #{
+        #  my $crossrefs = join(',',sort $f->each_tag_value('db_xref'));
+        #  if($crossrefs =~ /(GI\:\d+)/){ $gi = $1 }
+        #}
+        
+        if($f->has_tag('protein_id'))
         {
-          my $crossrefs = join(',',sort $f->each_tag_value('db_xref'));
-          if($crossrefs =~ /(GI\:\d+)/){ $gi = $1 }
-        }
-        elsif($f->has_tag('locus_tag'))
+          $gi = "ID:".join(',',sort $f->each_tag_value('protein_id')); #print "$gi\n";
+        } 
+  
+        if($f->has_tag('locus_tag') && $gi eq '')
         {
-          if($gi eq '' && $f->has_tag('locus_tag')){ $gi = "ID:".join(',',sort $f->each_tag_value('locus_tag')); }
+          $gi = "ID:".join(',',sort $f->each_tag_value('locus_tag'));
         }
 
         if($f->has_tag('gene'))
@@ -834,7 +842,7 @@ sub extract_features_from_genbank
         }
         elsif($f->has_tag('db_xref') && $gi eq '')
         {
-          $gi = "ID:".join(',',sort $f->each_tag_value('locus_tag'));
+          $gi = "ID:".join(',',sort $f->each_tag_value('db_xref'));
         }
 
         if($f->has_tag('gene'))
@@ -880,9 +888,9 @@ sub extract_features_from_genbank
   return \%already_seen;
 }
 
+# Updated Jun2016
 sub extract_CDSs_from_genbank
 {
-
  # takes a genbank input file and creates two FASTA files containing all CDSs in
  # aminoacid and dna sequences, respectively
  # returns number of CDSs found
@@ -920,12 +928,12 @@ sub extract_CDSs_from_genbank
       }
       elsif($f->primary_tag() =~ /CDS/)
       {
+        #GI deprecated on Sept2016 https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers/
         $gene=$gi=$crossrefs=$genelength=$protsequence=$CDSseq=$rev=''; 
         $CDScoords = $f->spliced_seq();
 
         if($f->location->isa('Bio::Location::SplitLocationI'))
         {
-          # Bruno Jul2014
           #LOCUS       NC_002505            2961149 bp    DNA     circular BCT 24-MAY-2010
           #ACCESSION   NC_002505
           #VERSION     NC_002505.1  GI:1564003
@@ -950,13 +958,18 @@ sub extract_CDSs_from_genbank
         {
           $CDSseq = $CDScoords->{'seq'};
         }
-
+        
         if($f->has_tag('db_xref'))
         {
           $crossrefs = join(',',sort $f->each_tag_value('db_xref'));
-          if($crossrefs =~ /(GI\:\d+)/){ $gi = $1; $crossrefs =~ s/$gi// }
+          #if($crossrefs =~ /(GI\:\d+)/){ $gi = $1; $crossrefs =~ s/$gi// }
           next if($crossrefs =~ /PSEUDO:/); # no sabemos si es universal, funciona con Bradyrizobium_ORS278.gb
         }
+        
+        if($f->has_tag('protein_id'))
+        {
+          $gi = "ID:".join(',',sort $f->each_tag_value('protein_id')); #print "$gi\n";
+        }        
         if($f->has_tag('translation'))
         {
           $protsequence = join('',sort $f->each_tag_value('translation'));
diff --git a/manual_get_homologues-est.pdf b/manual_get_homologues-est.pdf
diff --git a/manual_get_homologues.pdf b/manual_get_homologues.pdf
diff --git a/plot_pancore_matrix.pl b/plot_pancore_matrix.pl
@@ -687,7 +687,7 @@ sub plot_pan_genome
 	}
   
 	png(file="$PNGfile");
-	plot(pan\$genomes,pan\$genes,xaxt='n',xlab='genomes (g)',ylab='pan genome size (genes)',pch=20);
+	plot(pan\$genomes,pan\$genes,xaxt='n',xlab='genomes (g)',ylab='pan genome size (genes)',pch=20); #,ylim=c(25000,35000));
 	axis(side=1,at=xaxis_labels);	
 	if(converged == TRUE) 
         {

Original file line number	Diff line number	Diff line change
`@@ -3219,7 +3219,7 @@ sub find_OMCL_clusters`
`3219`	`3219`	`}`
`3220`	`3220`
`3221`	`3221`	`# returns: adds all orthologies found by orth all against all comparisons`
`3222`		`-# uses globals. %graph,%weight,$bpofile,$last_graph_item`
	`3222`	`+# uses globals: %graph,%weight,$bpofile,$last_graph_item`
`3223`	`3223`	`# modified from original orthomcl.pl, last change in May2012`
`3224`	`3224`	`# jan2015 added $ref_full_sequence_taxa`
`3225`	`3225`	`sub findAllOrthologiesORTHMCL`
Original file line number	Diff line number	Diff line change
`@@ -620,14 +620,15 @@ sub add_labels2newick_tree`
`620`	`620`	`return join(";\n",split(/;/,$fully_labelled_tree));`
`621`	`621`	`}`
`622`	`622`
	`623`	`+# Updated Jun2016`
`623`	`624`	`sub extract_intergenic_from_genbank`
`624`	`625`	`{`
`625`		`-`
`626`		`-# takes a genbank input file and creates a FNA file containing all intergenic sequences found`
`627`		`-# inspired by http://bioperl.org/pipermail/bioperl-l/2006-March/021065.html`
`628`		`-# use $min_intergenic_size = 0 to skip minimal length tests`
`629`		`-# use $max_intergenic_size = 0 to skip maximum length test`
`630`		`-# use $length_flanking_ORFs > 0 if you want to cut oligonucleotides from both flanking ORFs to be used as PCR anchors`
	`626`	`+ # takes a genbank input file and creates a FNA file containing all intergenic sequences found`
	`627`	`+ # inspired by http://bioperl.org/pipermail/bioperl-l/2006-March/021065.html`
	`628`	`+ # use $min_intergenic_size = 0 to skip minimal length tests`
	`629`	`+ # use $max_intergenic_size = 0 to skip maximum length test`
	`630`	`+ # use $length_flanking_ORFs > 0 if you want to cut oligonucleotides from both flanking ORFs to be used as PCR anchors`
	`631`	`+`
`631`	`632`	`my ($infile,$out_intergenic_file,$min_intergenic_size,$max_intergenic_size,$length_flanking_ORFs) = @_;`
`632`	`633`	`if(!defined($length_flanking_ORFs) \|\| $length_flanking_ORFs < 0){ $length_flanking_ORFs = 0 }`
`633`	`634`	`my ($n_of_intergenic,$gi,$start,$end,$length,$strand,$genename,$taxon) = (0);`
`@@ -643,17 +644,24 @@ sub extract_intergenic_from_genbank`
`643`	`644`	`$taxon = '';`
`644`	`645`	`for my $f ($seq->get_SeqFeatures)`
`645`	`646`	`{`
`646`		`- if($f->primary_tag =~ /CDS\|rRNA\|tRNA/) # campos de 'genes'`
	`647`	`+ if($f->primary_tag =~ /CDS\|rRNA\|tRNA/) # ~genes`
`647`	`648`	`{`
	`649`	`+ #GI deprecated on Sept2016 https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers/`
`648`	`650`	`$gi = $genename = ''; # compatible con subrutina extract_CDS_from_genbank`
`649`		`- if($f->has_tag('db_xref'))`
	`651`	`+ #if($f->has_tag('db_xref'))`
	`652`	`+ #{`
	`653`	`+ # my $crossrefs = join(',',sort $f->each_tag_value('db_xref'));`
	`654`	`+ # if($crossrefs =~ /(GI\:\d+)/){ $gi = $1 }`
	`655`	`+ #}`
	`656`	`+`
	`657`	`+ if($f->has_tag('protein_id'))`
`650`	`658`	`{`
`651`		`- my $crossrefs = join(',',sort $f->each_tag_value('db_xref'));`
`652`		`- if($crossrefs =~ /(GI\:\d+)/){ $gi = $1 }`
`653`		`- }`
`654`		`- elsif($f->has_tag('locus_tag'))`
	`659`	`+ $gi = "ID:".join(',',sort $f->each_tag_value('protein_id')); #print "$gi\n";`
	`660`	`+ }`
	`661`	`+`
	`662`	`+ if($f->has_tag('locus_tag') && $gi eq '')`
`655`	`663`	`{`
`656`		`- if($gi eq '' && $f->has_tag('locus_tag')){ $gi = "ID:".join(',',sort $f->each_tag_value('locus_tag')); }`
	`664`	`+ $gi = "ID:".join(',',sort $f->each_tag_value('locus_tag'));`
`657`	`665`	`}`
`658`	`666`
`659`	`667`	`if($f->has_tag('gene'))`
`@@ -834,7 +842,7 @@ sub extract_features_from_genbank`
`834`	`842`	`}`
`835`	`843`	`elsif($f->has_tag('db_xref') && $gi eq '')`
`836`	`844`	`{`
`837`		`- $gi = "ID:".join(',',sort $f->each_tag_value('locus_tag'));`
	`845`	`+ $gi = "ID:".join(',',sort $f->each_tag_value('db_xref'));`
`838`	`846`	`}`
`839`	`847`
`840`	`848`	`if($f->has_tag('gene'))`
`@@ -880,9 +888,9 @@ sub extract_features_from_genbank`
`880`	`888`	`return \%already_seen;`
`881`	`889`	`}`
`882`	`890`
	`891`	`+# Updated Jun2016`
`883`	`892`	`sub extract_CDSs_from_genbank`
`884`	`893`	`{`
`885`		`-`
`886`	`894`	`# takes a genbank input file and creates two FASTA files containing all CDSs in`
`887`	`895`	`# aminoacid and dna sequences, respectively`
`888`	`896`	`# returns number of CDSs found`
`@@ -920,12 +928,12 @@ sub extract_CDSs_from_genbank`
`920`	`928`	`}`
`921`	`929`	`elsif($f->primary_tag() =~ /CDS/)`
`922`	`930`	`{`
	`931`	`+ #GI deprecated on Sept2016 https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers/`
`923`	`932`	`$gene=$gi=$crossrefs=$genelength=$protsequence=$CDSseq=$rev='';`
`924`	`933`	`$CDScoords = $f->spliced_seq();`
`925`	`934`
`926`	`935`	`if($f->location->isa('Bio::Location::SplitLocationI'))`
`927`	`936`	`{`
`928`		`- # Bruno Jul2014`
`929`	`937`	`#LOCUS NC_002505 2961149 bp DNA circular BCT 24-MAY-2010`
`930`	`938`	`#ACCESSION NC_002505`
`931`	`939`	`#VERSION NC_002505.1 GI:1564003`
`@@ -950,13 +958,18 @@ sub extract_CDSs_from_genbank`
`950`	`958`	`{`
`951`	`959`	`$CDSseq = $CDScoords->{'seq'};`
`952`	`960`	`}`
`953`		`-`
	`961`	`+`
`954`	`962`	`if($f->has_tag('db_xref'))`
`955`	`963`	`{`
`956`	`964`	`$crossrefs = join(',',sort $f->each_tag_value('db_xref'));`
`957`		`- if($crossrefs =~ /(GI\:\d+)/){ $gi = $1; $crossrefs =~ s/$gi// }`
	`965`	`+ #if($crossrefs =~ /(GI\:\d+)/){ $gi = $1; $crossrefs =~ s/$gi// }`
`958`	`966`	`next if($crossrefs =~ /PSEUDO:/); # no sabemos si es universal, funciona con Bradyrizobium_ORS278.gb`
`959`	`967`	`}`
	`968`	`+`
	`969`	`+ if($f->has_tag('protein_id'))`
	`970`	`+ {`
	`971`	`+ $gi = "ID:".join(',',sort $f->each_tag_value('protein_id')); #print "$gi\n";`
	`972`	`+ }`
`960`	`973`	`if($f->has_tag('translation'))`
`961`	`974`	`{`
`962`	`975`	`$protsequence = join('',sort $f->each_tag_value('translation'));`
Original file line number	Diff line number	Diff line change
`@@ -687,7 +687,7 @@ sub plot_pan_genome`
`687`	`687`	`}`
`688`	`688`
`689`	`689`	`png(file="$PNGfile");`
`690`		`- plot(pan\$genomes,pan\$genes,xaxt='n',xlab='genomes (g)',ylab='pan genome size (genes)',pch=20);`
	`690`	`+ plot(pan\$genomes,pan\$genes,xaxt='n',xlab='genomes (g)',ylab='pan genome size (genes)',pch=20); #,ylim=c(25000,35000));`
`691`	`691`	`axis(side=1,at=xaxis_labels);`
`692`	`692`	`if(converged == TRUE)`
`693`	`693`	`{`