Skip to content

Commit e525725

Browse files
updated phyTools::extract_[intergenic|CDSs]_from_genbank as GIs will be deprecated in Sept2016
https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers
1 parent 972fca0 commit e525725

6 files changed

+35
-20
lines changed

CHANGES.txt

+2
Original file line numberDiff line numberDiff line change
@@ -138,3 +138,5 @@
138138
20052016: make_nr_pangenome_matrix.pl can now take all references matching nr clusters (see var $ONLYBESTREFHIT)
139139
27052016: improved the description of $MIN_PERSEQID_HOM_EST and $MIN_COVERAGE_HOM_EST and added a FAQ
140140
27052016: improved the description of $MIN_PERSEQID_HOM and $MIN_COVERAGE_HOM and added a FAQ
141+
23062016: updated phyTools extract_intergenic_from_genbank & extract_CDSs_from_genbank as GIs will be deprecated in Sept2016 (https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers)
142+
23062016: updated manuals

lib/marfil_homology.pm

+1-1
Original file line numberDiff line numberDiff line change
@@ -3219,7 +3219,7 @@ sub find_OMCL_clusters
32193219
}
32203220

32213221
# returns: adds all orthologies found by orth all against all comparisons
3222-
# uses globals. %graph,%weight,$bpofile,$last_graph_item
3222+
# uses globals: %graph,%weight,$bpofile,$last_graph_item
32233223
# modified from original orthomcl.pl, last change in May2012
32243224
# jan2015 added $ref_full_sequence_taxa
32253225
sub findAllOrthologiesORTHMCL

lib/phyTools.pm

+31-18
Original file line numberDiff line numberDiff line change
@@ -620,14 +620,15 @@ sub add_labels2newick_tree
620620
return join(";\n",split(/;/,$fully_labelled_tree));
621621
}
622622

623+
# Updated Jun2016
623624
sub extract_intergenic_from_genbank
624625
{
625-
626-
# takes a genbank input file and creates a FNA file containing all intergenic sequences found
627-
# inspired by http://bioperl.org/pipermail/bioperl-l/2006-March/021065.html
628-
# use $min_intergenic_size = 0 to skip minimal length tests
629-
# use $max_intergenic_size = 0 to skip maximum length test
630-
# use $length_flanking_ORFs > 0 if you want to cut oligonucleotides from both flanking ORFs to be used as PCR anchors
626+
# takes a genbank input file and creates a FNA file containing all intergenic sequences found
627+
# inspired by http://bioperl.org/pipermail/bioperl-l/2006-March/021065.html
628+
# use $min_intergenic_size = 0 to skip minimal length tests
629+
# use $max_intergenic_size = 0 to skip maximum length test
630+
# use $length_flanking_ORFs > 0 if you want to cut oligonucleotides from both flanking ORFs to be used as PCR anchors
631+
631632
my ($infile,$out_intergenic_file,$min_intergenic_size,$max_intergenic_size,$length_flanking_ORFs) = @_;
632633
if(!defined($length_flanking_ORFs) || $length_flanking_ORFs < 0){ $length_flanking_ORFs = 0 }
633634
my ($n_of_intergenic,$gi,$start,$end,$length,$strand,$genename,$taxon) = (0);
@@ -643,17 +644,24 @@ sub extract_intergenic_from_genbank
643644
$taxon = '';
644645
for my $f ($seq->get_SeqFeatures)
645646
{
646-
if($f->primary_tag =~ /CDS|rRNA|tRNA/) # campos de 'genes'
647+
if($f->primary_tag =~ /CDS|rRNA|tRNA/) # ~genes
647648
{
649+
#GI deprecated on Sept2016 https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers/
648650
$gi = $genename = ''; # compatible con subrutina extract_CDS_from_genbank
649-
if($f->has_tag('db_xref'))
651+
#if($f->has_tag('db_xref'))
652+
#{
653+
# my $crossrefs = join(',',sort $f->each_tag_value('db_xref'));
654+
# if($crossrefs =~ /(GI\:\d+)/){ $gi = $1 }
655+
#}
656+
657+
if($f->has_tag('protein_id'))
650658
{
651-
my $crossrefs = join(',',sort $f->each_tag_value('db_xref'));
652-
if($crossrefs =~ /(GI\:\d+)/){ $gi = $1 }
653-
}
654-
elsif($f->has_tag('locus_tag'))
659+
$gi = "ID:".join(',',sort $f->each_tag_value('protein_id')); #print "$gi\n";
660+
}
661+
662+
if($f->has_tag('locus_tag') && $gi eq '')
655663
{
656-
if($gi eq '' && $f->has_tag('locus_tag')){ $gi = "ID:".join(',',sort $f->each_tag_value('locus_tag')); }
664+
$gi = "ID:".join(',',sort $f->each_tag_value('locus_tag'));
657665
}
658666

659667
if($f->has_tag('gene'))
@@ -834,7 +842,7 @@ sub extract_features_from_genbank
834842
}
835843
elsif($f->has_tag('db_xref') && $gi eq '')
836844
{
837-
$gi = "ID:".join(',',sort $f->each_tag_value('locus_tag'));
845+
$gi = "ID:".join(',',sort $f->each_tag_value('db_xref'));
838846
}
839847

840848
if($f->has_tag('gene'))
@@ -880,9 +888,9 @@ sub extract_features_from_genbank
880888
return \%already_seen;
881889
}
882890

891+
# Updated Jun2016
883892
sub extract_CDSs_from_genbank
884893
{
885-
886894
# takes a genbank input file and creates two FASTA files containing all CDSs in
887895
# aminoacid and dna sequences, respectively
888896
# returns number of CDSs found
@@ -920,12 +928,12 @@ sub extract_CDSs_from_genbank
920928
}
921929
elsif($f->primary_tag() =~ /CDS/)
922930
{
931+
#GI deprecated on Sept2016 https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers/
923932
$gene=$gi=$crossrefs=$genelength=$protsequence=$CDSseq=$rev='';
924933
$CDScoords = $f->spliced_seq();
925934

926935
if($f->location->isa('Bio::Location::SplitLocationI'))
927936
{
928-
# Bruno Jul2014
929937
#LOCUS NC_002505 2961149 bp DNA circular BCT 24-MAY-2010
930938
#ACCESSION NC_002505
931939
#VERSION NC_002505.1 GI:1564003
@@ -950,13 +958,18 @@ sub extract_CDSs_from_genbank
950958
{
951959
$CDSseq = $CDScoords->{'seq'};
952960
}
953-
961+
954962
if($f->has_tag('db_xref'))
955963
{
956964
$crossrefs = join(',',sort $f->each_tag_value('db_xref'));
957-
if($crossrefs =~ /(GI\:\d+)/){ $gi = $1; $crossrefs =~ s/$gi// }
965+
#if($crossrefs =~ /(GI\:\d+)/){ $gi = $1; $crossrefs =~ s/$gi// }
958966
next if($crossrefs =~ /PSEUDO:/); # no sabemos si es universal, funciona con Bradyrizobium_ORS278.gb
959967
}
968+
969+
if($f->has_tag('protein_id'))
970+
{
971+
$gi = "ID:".join(',',sort $f->each_tag_value('protein_id')); #print "$gi\n";
972+
}
960973
if($f->has_tag('translation'))
961974
{
962975
$protsequence = join('',sort $f->each_tag_value('translation'));

manual_get_homologues-est.pdf

14.1 KB
Binary file not shown.

manual_get_homologues.pdf

-49.7 KB
Binary file not shown.

plot_pancore_matrix.pl

+1-1
Original file line numberDiff line numberDiff line change
@@ -687,7 +687,7 @@ sub plot_pan_genome
687687
}
688688
689689
png(file="$PNGfile");
690-
plot(pan\$genomes,pan\$genes,xaxt='n',xlab='genomes (g)',ylab='pan genome size (genes)',pch=20);
690+
plot(pan\$genomes,pan\$genes,xaxt='n',xlab='genomes (g)',ylab='pan genome size (genes)',pch=20); #,ylim=c(25000,35000));
691691
axis(side=1,at=xaxis_labels);
692692
if(converged == TRUE)
693693
{

0 commit comments

Comments
 (0)