@@ -620,14 +620,15 @@ sub add_labels2newick_tree
620
620
return join (" ;\n " ,split (/ ;/ ,$fully_labelled_tree ));
621
621
}
622
622
623
+ # Updated Jun2016
623
624
sub extract_intergenic_from_genbank
624
625
{
625
-
626
- # takes a genbank input file and creates a FNA file containing all intergenic sequences found
627
- # inspired by http://bioperl.org/pipermail/bioperl-l/2006-March/021065.html
628
- # use $min_intergenic_size = 0 to skip minimal length tests
629
- # use $max_intergenic_size = 0 to skip maximum length test
630
- # use $length_flanking_ORFs > 0 if you want to cut oligonucleotides from both flanking ORFs to be used as PCR anchors
626
+ # takes a genbank input file and creates a FNA file containing all intergenic sequences found
627
+ # inspired by http://bioperl.org/pipermail/bioperl-l/2006-March/021065.html
628
+ # use $min_intergenic_size = 0 to skip minimal length tests
629
+ # use $max_intergenic_size = 0 to skip maximum length test
630
+ # use $length_flanking_ORFs > 0 if you want to cut oligonucleotides from both flanking ORFs to be used as PCR anchors
631
+
631
632
my ($infile ,$out_intergenic_file ,$min_intergenic_size ,$max_intergenic_size ,$length_flanking_ORFs ) = @_ ;
632
633
if (!defined ($length_flanking_ORFs ) || $length_flanking_ORFs < 0){ $length_flanking_ORFs = 0 }
633
634
my ($n_of_intergenic ,$gi ,$start ,$end ,$length ,$strand ,$genename ,$taxon ) = (0);
@@ -643,17 +644,24 @@ sub extract_intergenic_from_genbank
643
644
$taxon = ' ' ;
644
645
for my $f ($seq -> get_SeqFeatures)
645
646
{
646
- if ($f -> primary_tag =~ / CDS|rRNA|tRNA/ ) # campos de ' genes'
647
+ if ($f -> primary_tag =~ / CDS|rRNA|tRNA/ ) # ~ genes
647
648
{
649
+ # GI deprecated on Sept2016 https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers/
648
650
$gi = $genename = ' ' ; # compatible con subrutina extract_CDS_from_genbank
649
- if ($f -> has_tag(' db_xref' ))
651
+ # if($f->has_tag('db_xref'))
652
+ # {
653
+ # my $crossrefs = join(',',sort $f->each_tag_value('db_xref'));
654
+ # if($crossrefs =~ /(GI\:\d+)/){ $gi = $1 }
655
+ # }
656
+
657
+ if ($f -> has_tag(' protein_id' ))
650
658
{
651
- my $crossrefs = join (' ,' ,sort $f -> each_tag_value(' db_xref ' ));
652
- if ( $crossrefs =~ / (GI \:\d +) / ){ $gi = $1 }
653
- }
654
- elsif ($f -> has_tag(' locus_tag' ))
659
+ $gi = " ID: " . join (' ,' ,sort $f -> each_tag_value(' protein_id ' )); # print "$gi\n" ;
660
+ }
661
+
662
+ if ($f -> has_tag(' locus_tag' ) && $gi eq ' ' )
655
663
{
656
- if ( $gi eq ' ' && $f -> has_tag( ' locus_tag ' )){ $gi = " ID:" .join (' ,' ,sort $f -> each_tag_value(' locus_tag' )); }
664
+ $gi = " ID:" .join (' ,' ,sort $f -> each_tag_value(' locus_tag' ));
657
665
}
658
666
659
667
if ($f -> has_tag(' gene' ))
@@ -834,7 +842,7 @@ sub extract_features_from_genbank
834
842
}
835
843
elsif ($f -> has_tag(' db_xref' ) && $gi eq ' ' )
836
844
{
837
- $gi = " ID:" .join (' ,' ,sort $f -> each_tag_value(' locus_tag ' ));
845
+ $gi = " ID:" .join (' ,' ,sort $f -> each_tag_value(' db_xref ' ));
838
846
}
839
847
840
848
if ($f -> has_tag(' gene' ))
@@ -880,9 +888,9 @@ sub extract_features_from_genbank
880
888
return \%already_seen ;
881
889
}
882
890
891
+ # Updated Jun2016
883
892
sub extract_CDSs_from_genbank
884
893
{
885
-
886
894
# takes a genbank input file and creates two FASTA files containing all CDSs in
887
895
# aminoacid and dna sequences, respectively
888
896
# returns number of CDSs found
@@ -920,12 +928,12 @@ sub extract_CDSs_from_genbank
920
928
}
921
929
elsif ($f -> primary_tag() =~ / CDS/ )
922
930
{
931
+ # GI deprecated on Sept2016 https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers/
923
932
$gene =$gi =$crossrefs =$genelength =$protsequence =$CDSseq =$rev =' ' ;
924
933
$CDScoords = $f -> spliced_seq();
925
934
926
935
if ($f -> location-> isa(' Bio::Location::SplitLocationI' ))
927
936
{
928
- # Bruno Jul2014
929
937
# LOCUS NC_002505 2961149 bp DNA circular BCT 24-MAY-2010
930
938
# ACCESSION NC_002505
931
939
# VERSION NC_002505.1 GI:1564003
@@ -950,13 +958,18 @@ sub extract_CDSs_from_genbank
950
958
{
951
959
$CDSseq = $CDScoords -> {' seq' };
952
960
}
953
-
961
+
954
962
if ($f -> has_tag(' db_xref' ))
955
963
{
956
964
$crossrefs = join (' ,' ,sort $f -> each_tag_value(' db_xref' ));
957
- if ($crossrefs =~ / (GI\:\d +)/ ){ $gi = $1 ; $crossrefs =~ s / $gi// }
965
+ # if($crossrefs =~ /(GI\:\d+)/){ $gi = $1; $crossrefs =~ s/$gi// }
958
966
next if ($crossrefs =~ / PSEUDO:/ ); # no sabemos si es universal, funciona con Bradyrizobium_ORS278.gb
959
967
}
968
+
969
+ if ($f -> has_tag(' protein_id' ))
970
+ {
971
+ $gi = " ID:" .join (' ,' ,sort $f -> each_tag_value(' protein_id' )); # print "$gi\n";
972
+ }
960
973
if ($f -> has_tag(' translation' ))
961
974
{
962
975
$protsequence = join (' ' ,sort $f -> each_tag_value(' translation' ));
0 commit comments