From 48eadb6149af156306a26d3c7a93a2f6a55ba487 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Fri, 5 Jul 2024 13:47:29 +0200 Subject: [PATCH 1/7] indentation --- bin/agat_sp_filter_feature_by_attribute_presence.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/agat_sp_filter_feature_by_attribute_presence.pl b/bin/agat_sp_filter_feature_by_attribute_presence.pl index 431c8aa2..657adfc3 100755 --- a/bin/agat_sp_filter_feature_by_attribute_presence.pl +++ b/bin/agat_sp_filter_feature_by_attribute_presence.pl @@ -314,7 +314,8 @@ =head1 OPTIONS =item B<--attribute>, B<--att>, B<-a> -String - Attributes tag specified will be used to filter the feature type (feature type can also be specified by the option -p). List of attribute tags must be coma separated. +String - Attributes tag specified will be used to filter the feature type (feature type can also be specified by the option -p). +List of attribute tags must be coma separated. =item B<--flip> From 9b9fa05d0473ce1c6074ad1ac5cafc2377099942 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Fri, 5 Jul 2024 14:01:05 +0200 Subject: [PATCH 2/7] =?UTF-8?q?Update=20remove=5Fl2=5Fand=5Frelatives=20an?= =?UTF-8?q?d=20remove=5Fl3=5Fand=5Frelatives=20to=20be=20able=20to=20print?= =?UTF-8?q?=20the=20complete=C2=A0features=20instead=20of=20the=20ID=20onl?= =?UTF-8?q?y.=20Add=20print=5Ffeature=5Fsafe=20to=20be=20able=20to=20print?= =?UTF-8?q?=20GFF=20object=20file=20handler=20as=20well=20as=20IO::File?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/AGAT/OmniscientTool.pm | 59 +++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/lib/AGAT/OmniscientTool.pm b/lib/AGAT/OmniscientTool.pm index 2203c795..65ba6acf 100644 --- a/lib/AGAT/OmniscientTool.pm +++ b/lib/AGAT/OmniscientTool.pm @@ -878,6 +878,31 @@ sub remove_element_from_omniscient { } } +# Print a feature safely. +# +# It ensures that the feature is printed correctly without causing any errors according the output type. +# +# Parameters: +# $feature : the feature to print +# $out : AGAT::BioperlGFF / Bio::Tools::GFF object, or IO::File object or empty value +# Returns: +# None +# +# Note: +# This function should be used when printing features to avoid any potential errors. +sub print_feature_safe{ + my ($feature, $out)=@_; + + if($out){ + if($out->isa('AGAT::BioperlGFF' ) or $out->isa('Bio::Tools::GFF')){ + $out->write_feature( $feature ); + } + else{ + print $out $feature->gff_string()."\n"; + } + } +} + # @Purpose: remove from omniscient l1 feature and all subfeatures # @input: 3 => hash(omniscient hash), feature L1, optional fh to write case removed # @output: 1 => hash (nb feature removed) @@ -899,19 +924,19 @@ sub remove_l1_and_relatives{ foreach my $ptag_l3 (keys %{$omniscient->{'level3'}}){ # primary_tag_key_level3 = cds or exon or start_codon or utr etc... if ( exists_keys( $omniscient, ('level3', $ptag_l3, $level2_ID) ) ){ foreach my $feature_l3 ( @{$omniscient->{'level3'}{$ptag_l3}{$level2_ID}}) { - $cases_l3++; - print $fh_removed $feature_l3->_tag_value('ID')."\n" if ($fh_removed); + $cases_l3++; + print_feature_safe( $feature_l3, $fh_removed ); } delete $omniscient->{'level3'}{$ptag_l3}{$level2_ID} # delete level3 } } $cases_l2++; - print $fh_removed $feature_l2->_tag_value('ID')."\n" if($fh_removed); + print_feature_safe( $feature_l2, $fh_removed ); } delete $omniscient->{'level2'}{$ptag_l2}{$id_l1} # delete level2 } } - print $fh_removed $feature->gff_string()."\n" if $fh_removed; + print_feature_safe( $feature, $fh_removed ); delete $omniscient->{'level1'}{$tag_l1}{$id_l1}; # delete level1 $cases_l1++; @@ -930,8 +955,8 @@ sub remove_l2_and_relatives{ my ($omniscient, $feature, $ptag_l1, $id_l1, $fh_removed)=@_; my %cases; - my $cases_l1 = 0; my $cases_l2 = 0; my $cases_l3 = 0; - my $ptag_l2 = lc($feature->primary_tag); + my $cases_l1 = 0; my $cases_l2 = 0; my $cases_l3 = 0; + my $ptag_l2 = lc($feature->primary_tag); my $level2_Parent_ID = lc($feature->_tag_value('Parent')); my $level2_ID = lc($feature->_tag_value('ID')); @@ -943,8 +968,8 @@ sub remove_l2_and_relatives{ foreach my $ptag_l3 (keys %{$omniscient->{'level3'}}){ # primary_tag_key_level3 = cds or exon or start_codon or utr etc... if ( exists_keys( $omniscient, ('level3', $ptag_l3, $level2_ID) ) ){ foreach my $feature_l3 ( @{$omniscient->{'level3'}{$ptag_l3}{$level2_ID}}) { - $cases_l3++; - print $fh_removed $feature_l3->_tag_value('ID')."\n" if ($fh_removed); + $cases_l3++; + print_feature_safe( $feature_l3, $fh_removed ); } delete $omniscient->{'level3'}{$ptag_l3}{$level2_ID} # delete level3 } @@ -957,7 +982,7 @@ sub remove_l2_and_relatives{ my @id_list_to_remove=($level2_ID); my @list_tag_key=('all'); $cases_l2++; - print $fh_removed $feature->gff_string()."\n" if ($fh_removed); + print_feature_safe( $feature, $fh_removed ); remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level2','false', \@list_tag_key); @@ -972,7 +997,7 @@ sub remove_l2_and_relatives{ if(! $anotherL2Linked){ if( exists_keys($omniscient, ('level1', $ptag_l1, $id_l1) ) ){ $cases_l1++; - print $fh_removed $id_l1."\n" if ($fh_removed); + print_feature_safe( $omniscient->{'level1'}{$ptag_l1}{$id_l1}, $fh_removed ); delete $omniscient->{'level1'}{$ptag_l1}{$id_l1}; } } @@ -1008,7 +1033,7 @@ sub remove_l3_and_relatives{ my @id_list_to_remove=($id_l3); my @list_tag_key=('all'); $cases_l3++; - print $fh_removed $feature->gff_string()."\n" if ($fh_removed); + print_feature_safe( $feature, $fh_removed ); remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level3','false', \@list_tag_key); } } @@ -1033,7 +1058,8 @@ sub remove_l3_and_relatives{ my @id_list_to_remove=($id_l2); my @list_tag_key=('all'); $cases_l2++; - print $fh_removed $id_l2."\n" if ($fh_removed); + my $feature_l2 = get_feature_l2_from_id_l2_l1($omniscient, $id_l2, $id_l1); + print_feature_safe( $feature_l2, $fh_removed ); remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level2','false', \@list_tag_key); # Should we remove l1 too? @@ -1041,7 +1067,7 @@ sub remove_l3_and_relatives{ #The list was empty so l2 has been removed, we can now remove l1 if( exists_keys($omniscient, ('level1', $ptag_l1, $id_l1) ) ){ $cases_l1++; - print $fh_removed $id_l1."\n" if($fh_removed); + print_feature_safe( $omniscient->{'level1'}{$ptag_l1}{$id_l1}, $fh_removed ); delete $omniscient->{'level1'}{$ptag_l1}{$id_l1} } } @@ -1623,6 +1649,13 @@ sub group_l1IDs_from_omniscient { return \%group; } +# Retrieves the feature level 2 from the given level 2 and level 1 IDs. +# +# Parameters: +# - $id_l2: The ID of the level 2 feature +# - $id_l1: The ID of the parent level 1 feature +# Returns: +# - The feature at level 2 corresponding to the given ID. sub get_feature_l2_from_id_l2_l1 { my ($hash_omniscient, $id_l2, $id_l1) = @_ ; foreach my $tag_l2 (keys %{$hash_omniscient->{'level2'}}){ From 92fee69b3e5ede40076e52aeea971fa567bae7c3 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Fri, 5 Jul 2024 14:02:34 +0200 Subject: [PATCH 3/7] fix #471 + put features that cannot be checked because attribute is missing in a separte file. The user can now decide to let these particular features aside or merge them back (manually) to one or the other output file --- ...at_sp_filter_feature_by_attribute_value.pl | 148 +++++++++++------- 1 file changed, 93 insertions(+), 55 deletions(-) diff --git a/bin/agat_sp_filter_feature_by_attribute_value.pl b/bin/agat_sp_filter_feature_by_attribute_value.pl index c584a7d5..8fe9d3a0 100755 --- a/bin/agat_sp_filter_feature_by_attribute_value.pl +++ b/bin/agat_sp_filter_feature_by_attribute_value.pl @@ -59,7 +59,7 @@ ############### # Test options -if($opt_test ne "<" and $opt_test ne ">" and $opt_test ne "<=" and $opt_test ne ">=" and $opt_test ne "="){ +if($opt_test ne "<" and $opt_test ne ">" and $opt_test ne "<=" and $opt_test ne ">=" and $opt_test ne "=" and $opt_test ne "!"){ print "The test to apply is Wrong: $opt_test.\nWe want something among this list: <,>,<=,>=,! or =.";exit; } @@ -70,19 +70,22 @@ my $gffout_ok_file ; my $fhout_discarded_file ; my $ostreamReport_file; +my $fhout_semidDiscarded_file; if ($opt_output) { my ($outfile,$path,$ext) = fileparse($opt_output,qr/\.[^.]*/); # set file names $gffout_ok_file = $path.$outfile.$ext; - $fhout_discarded_file = $path.$outfile."_discarded.txt"; + $fhout_discarded_file = $path.$outfile."_discarded.gff"; $ostreamReport_file = $path.$outfile."_report.txt"; + $fhout_semidDiscarded_file = $path.$outfile."_cannot_be_tested_because_attribute_missing.gff"; } my $gffout_ok = prepare_gffout($config, $gffout_ok_file); -my $fhout_discarded = prepare_fileout($fhout_discarded_file); +my $fhout_discarded = prepare_gffout($config, $fhout_discarded_file); my $ostreamReport = prepare_fileout($ostreamReport_file); +my $fhout_semidDiscarded = prepare_gffout($config, $fhout_semidDiscarded_file); # Manage $primaryTag my @ptagList; @@ -121,17 +124,14 @@ } } -use Data::Dumper; -print Dumper($value_hash); - # start with some interesting information my $stringPrint = strftime "%m/%d/%Y at %Hh%Mm%Ss", localtime; $stringPrint .= "\nusage: $0 @copyARGV\n"; $stringPrint .= "We will discard $print_feature_string that have the attribute $opt_attribute with the value $opt_test $opt_value"; if ($opt_value_insensitive){ - $stringPrint .= "case insensitive.\n"; + $stringPrint .= " case insensitive.\n"; }else{ - $stringPrint .= "case sensitive.\n"; + $stringPrint .= " case sensitive.\n"; } if ($opt_output){ @@ -142,7 +142,9 @@ ####################### # >>>>>>>>>>>>>>>>>>>>>>>># MAIN #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< ####################### -my %all_cases = ('l1' => 0, 'l2' => 0, 'l3' => 0, 'all' => 0); +my %all_cases = ( 'left' => {'l1' => 0, 'l2' => 0, 'l3' => 0, 'all' => 0}, + 'discarded' => {'l1' => 0, 'l2' => 0, 'l3' => 0, 'all' => 0} ); + ###################### ### Parse GFF input # my ($hash_omniscient, $hash_mRNAGeneLink) = slurp_gff3_file_JD({ input => $opt_gff, @@ -167,19 +169,29 @@ $removeit = check_feature($feature_l1, 'level1'); # we can remove feature L1 now because we are looping over $hash_sortBySeq not $hash_omniscient if ($removeit){ - my $cases = remove_l1_and_relatives($hash_omniscient, $feature_l1, $fhout_discarded); - $all_cases{'l1'} += $cases->{'l1'}; - $all_cases{'l2'} += $cases->{'l2'}; - $all_cases{'l3'} += $cases->{'l3'}; - $all_cases{'all'} += $cases->{'all'}; + my $cases; + if($removeit == 2){ + $cases = remove_l1_and_relatives($hash_omniscient, $feature_l1, $fhout_semidDiscarded); + $all_cases{'left'}{'l1'} += $cases->{'l1'}; + $all_cases{'left'}{'l2'} += $cases->{'l2'}; + $all_cases{'left'}{'l3'} += $cases->{'l3'}; + $all_cases{'left'}{'all'} += $cases->{'all'}; + } else { + $cases = remove_l1_and_relatives($hash_omniscient, $feature_l1, $fhout_discarded); + $all_cases{'discarded'}{'l1'} += $cases->{'l1'}; + $all_cases{'discarded'}{'l2'} += $cases->{'l2'}; + $all_cases{'discarded'}{'l3'} += $cases->{'l3'}; + $all_cases{'discarded'}{'all'} += $cases->{'all'}; + } + next; } ################# # == LEVEL 2 == # ################# - my @list_l2_to_remove; - my @list_l3_to_remove; + my %list_l2_to_remove; + my %list_l3_to_remove; foreach my $tag_l2 (sort keys %{$hash_omniscient->{'level2'}}){ # primary_tag_key_level2 = mrna or mirna or ncrna or trna etc... if ( exists_keys( $hash_omniscient, ('level2', $tag_l2, $id_l1) ) ){ @@ -188,7 +200,11 @@ $removeit = check_feature($feature_l2,'level2'); if ($removeit){ - push @list_l2_to_remove, [$feature_l2, $tag_l1, $id_l1, $fhout_discarded]; + if($removeit == 2){ + push ( @{$list_l2_to_remove{'left'}}, [$feature_l2, $tag_l1, $id_l1, $fhout_semidDiscarded]); + } else { + push ( @{$list_l2_to_remove{'discarded'}}, [$feature_l2, $tag_l1, $id_l1, $fhout_discarded]); + } next; } ################# @@ -203,7 +219,11 @@ $removeit = check_feature($feature_l3, 'level3'); if ($removeit){ - push @list_l3_to_remove, [$feature_l3, $tag_l1, $id_l1, $tag_l2, $id_l2, $fhout_discarded]; + if($removeit == 2){ + push ( @{$list_l3_to_remove{'left'}}, [$feature_l3, $tag_l1, $id_l1, $tag_l2, $id_l2, $fhout_semidDiscarded]); + } else { + push ( @{$list_l3_to_remove{'discarded'}}, [$feature_l3, $tag_l1, $id_l1, $tag_l2, $id_l2, $fhout_discarded]); + } } } } @@ -212,22 +232,22 @@ } } # Should be removed after looping over them to avoid problems - if (@list_l2_to_remove) { - foreach my $infos (@list_l2_to_remove) { + foreach my $key ( keys %list_l2_to_remove ){ + foreach my $infos ( @{$list_l2_to_remove{$key}} ) { my $cases = remove_l2_and_relatives( $hash_omniscient, @$infos); - $all_cases{'l1'} += $cases->{'l1'}; - $all_cases{'l2'} += $cases->{'l2'}; - $all_cases{'l3'} += $cases->{'l3'}; - $all_cases{'all'} += $cases->{'all'}; + $all_cases{$key}{'l1'} += $cases->{'l1'}; + $all_cases{$key}{'l2'} += $cases->{'l2'}; + $all_cases{$key}{'l3'} += $cases->{'l3'}; + $all_cases{$key}{'all'} += $cases->{'all'}; } } - if (@list_l3_to_remove) { - foreach my $infos (@list_l3_to_remove) { + foreach my $key ( keys %list_l3_to_remove ){ + foreach my $infos ( @{$list_l3_to_remove{$key}} ) { my $cases = remove_l3_and_relatives( $hash_omniscient, @$infos); - $all_cases{'l1'} += $cases->{'l1'}; - $all_cases{'l2'} += $cases->{'l2'}; - $all_cases{'l3'} += $cases->{'l3'}; - $all_cases{'all'} += $cases->{'all'}; + $all_cases{$key}{'l1'} += $cases->{'l1'}; + $all_cases{$key}{'l2'} += $cases->{'l2'}; + $all_cases{$key}{'l3'} += $cases->{'l3'}; + $all_cases{$key}{'all'} += $cases->{'all'}; } } } @@ -236,10 +256,18 @@ print_omniscient( {omniscient => $hash_omniscient, output => $gffout_ok} ); -$stringPrint = $all_cases{'all'}." features removed:\n"; -$stringPrint .= $all_cases{'l1'}." features level1 (e.g. gene) removed\n"; -$stringPrint .= $all_cases{'l2'}." features level2 (e.g. mRNA) removed\n"; -$stringPrint .= $all_cases{'l3'}." features level3 (e.g. exon) removed\n"; +$stringPrint = "Feature discarded by applying the test (see $fhout_discarded_file file):\n"; +$stringPrint .= $all_cases{'discarded'}{'all'}." features removed:\n"; +$stringPrint .= $all_cases{'discarded'}{'l1'}." features level1 (e.g. gene) removed\n"; +$stringPrint .= $all_cases{'discarded'}{'l2'}." features level2 (e.g. mRNA) removed\n"; +$stringPrint .= $all_cases{'discarded'}{'l3'}." features level3 (e.g. exon) removed\n"; + +$stringPrint .= "Feature left out because the attribute is missing (see $fhout_semidDiscarded_file file):\n"; +$stringPrint .= $all_cases{'left'}{'all'}." features removed:\n"; +$stringPrint .= $all_cases{'left'}{'l1'}." features level1 (e.g. gene) removed\n"; +$stringPrint .= $all_cases{'left'}{'l2'}." features level2 (e.g. mRNA) removed\n"; +$stringPrint .= $all_cases{'left'}{'l3'}." features level3 (e.g. exon) removed\n"; + if ($opt_output){ print $ostreamReport $stringPrint; print $stringPrint; @@ -298,35 +326,45 @@ sub should_we_remove_feature{ } # for string values replace = by eq and ! by ne and avoid other type of test if ( ! looks_like_number ($given_value) or ! looks_like_number ($file_value)){ + print "String case\n" if $opt_verbose; if ($opt_test eq "="){ - if ($file_value eq $given_value){return 1; } + if ($file_value eq $given_value) { print "equal\n" if $opt_verbose; return 1; } + else { print "not equal\n" if $opt_verbose; } } elsif ($opt_test eq "!"){ - if ($file_value ne $given_value){return 1; } + if ($file_value ne $given_value){ print "different\n" if $opt_verbose; return 1; } + else { print "not different\n" if $opt_verbose; } + } + } + else{ + print "Number case\n" if $opt_verbose; + if ($opt_test eq "="){ + if ($file_value == $given_value){return 1; } + } + elsif ($opt_test eq "!"){ + if ($file_value != $given_value){return 1; } + } + elsif ($opt_test eq ">"){ + if ($file_value > $given_value){return 1; } + } + elsif ($opt_test eq "<"){ + if ($file_value < $given_value){return 1; } + } + elsif ($opt_test eq "<="){ + if ($file_value <= $given_value){return 1; } + } + elsif ($opt_test eq ">="){ + if ($file_value >= $given_value){return 1; } } - } - elsif ($opt_test eq "="){ - if ($file_value == $given_value){return 1; } - } - elsif ($opt_test eq "!"){ - if ($file_value != $given_value){return 1; } - } - elsif ($opt_test eq ">"){ - if ($file_value > $given_value){return 1; } - } - elsif ($opt_test eq "<"){ - if ($file_value < $given_value){return 1; } - } - elsif ($opt_test eq "<="){ - if ($file_value <= $given_value){return 1; } - } - elsif ($opt_test eq ">="){ - if ($file_value >= $given_value){return 1; } } } } + return 0; + } else { + print "Attribute not found case\n" if $opt_verbose; + print "$opt_attribute ".$feature->gff_string."\n"; + return 2; } - return 0; } __END__ From b07f054d0e8a13e96d67d48f91461d282995eb63 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Tue, 9 Jul 2024 10:43:12 +0200 Subject: [PATCH 4/7] remove useless piece of code --- bin/agat_sp_statistics.pl | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/agat_sp_statistics.pl b/bin/agat_sp_statistics.pl index 4f3bce47..6ede4d16 100755 --- a/bin/agat_sp_statistics.pl +++ b/bin/agat_sp_statistics.pl @@ -57,8 +57,6 @@ if(defined($opt_yaml)){ if( defined($opt_output)){ $opt_yaml = $opt_output.".yaml"; - }else{ - $out } } From 33f53336bb82afd860ca4e90e147519663d60b37 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Tue, 9 Jul 2024 10:46:12 +0200 Subject: [PATCH 5/7] add keep_parental parameter to remove_l3_and_relatives and remove_l2_and_relatives function to skip removing parental geature(s) if needed. Fix error of trying to remove L2 already removed in case where all CDS are removed in once because share the same ID (in that case L2 is not removed at the last round as expected) --- lib/AGAT/OmniscientTool.pm | 144 +++++++++++++++++++------------------ 1 file changed, 74 insertions(+), 70 deletions(-) diff --git a/lib/AGAT/OmniscientTool.pm b/lib/AGAT/OmniscientTool.pm index 65ba6acf..9e383cb5 100644 --- a/lib/AGAT/OmniscientTool.pm +++ b/lib/AGAT/OmniscientTool.pm @@ -868,7 +868,7 @@ sub remove_element_from_omniscient { if(@listok){ @{$hash_omniscient->{$level}{$tag_key}{$id_concern}}=@listok; } - else{ # The list is empty we could remove the key (otherwise we would have saved a emplty list) + else{ # The list is empty we could remove the key (otherwise we would have saved a empty list) delete $hash_omniscient->{$level}{$tag_key}{$id_concern}; } } @@ -952,7 +952,7 @@ sub remove_l1_and_relatives{ # @input: 5 => hash(omniscient hash), featureL2, primary tag l1, id l1, optional fh to write case removed # @output: 1 => hash (nb feature removed) sub remove_l2_and_relatives{ - my ($omniscient, $feature, $ptag_l1, $id_l1, $fh_removed)=@_; + my ($omniscient, $feature, $ptag_l1, $id_l1, $fh_removed, $keep_parental)=@_; my %cases; my $cases_l1 = 0; my $cases_l2 = 0; my $cases_l3 = 0; @@ -976,7 +976,7 @@ sub remove_l2_and_relatives{ } } } - + # delete level2 and the hash pointer if the list is empty (no isoform left) my @id_concern_list=($id_l1); my @id_list_to_remove=($level2_ID); @@ -985,23 +985,24 @@ sub remove_l2_and_relatives{ print_feature_safe( $feature, $fh_removed ); remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level2','false', \@list_tag_key); - - if( ! exists_keys($omniscient, ('level2', $ptag_l2, $id_l1) ) ){ - my $anotherL2Linked=0; - foreach my $ptag_l2 (keys %{$omniscient->{'level2'}}){ - if( exists_keys($omniscient, ('level2', $ptag_l2, $id_l1) ) ){ - $anotherL2Linked=1; - } - } - #The list was empty so l2 has been removed, we can now remove l1 - if(! $anotherL2Linked){ - if( exists_keys($omniscient, ('level1', $ptag_l1, $id_l1) ) ){ - $cases_l1++; - print_feature_safe( $omniscient->{'level1'}{$ptag_l1}{$id_l1}, $fh_removed ); - delete $omniscient->{'level1'}{$ptag_l1}{$id_l1}; - } - } - } + if (! $keep_parental){ + if( ! exists_keys($omniscient, ('level2', $ptag_l2, $id_l1) ) ){ + my $anotherL2Linked=0; + foreach my $ptag_l2 (keys %{$omniscient->{'level2'}}){ + if( exists_keys($omniscient, ('level2', $ptag_l2, $id_l1) ) ){ + $anotherL2Linked=1; + } + } + #The list was empty so l2 has been removed, we can now remove l1 + if(! $anotherL2Linked){ + if( exists_keys($omniscient, ('level1', $ptag_l1, $id_l1) ) ){ + $cases_l1++; + print_feature_safe( $omniscient->{'level1'}{$ptag_l1}{$id_l1}, $fh_removed ); + delete $omniscient->{'level1'}{$ptag_l1}{$id_l1}; + } + } + } + } } $cases{'l1'} = $cases_l1; @@ -1016,62 +1017,65 @@ sub remove_l2_and_relatives{ # @input: 7 => hash(omniscient hash), feature L3, primary tag l2, id l2, primary tag l2, id l2, optional fh to write case removed # @output: 1 => hash (nb feature removed) sub remove_l3_and_relatives{ - my ($omniscient, $feature, $ptag_l1, $id_l1, $ptag_l2, $id_l2, $fh_removed)=@_; - + my ($omniscient, $feature, $ptag_l1, $id_l1, $ptag_l2, $id_l2, $fh_removed, $keep_parental)=@_; + my %cases; my $cases_l1 = 0; my $cases_l2 = 0; my $cases_l3 = 0; - my $level3_Parent_ID = lc($feature->_tag_value('Parent')); - my $id_l3 = lc($feature->_tag_value('ID')); + my $level3_Parent_ID = lc($feature->_tag_value('Parent')); + my $id_l3 = lc($feature->_tag_value('ID')); my $ptag_l3 = lc($feature->primary_tag); - if ( exists_keys( $omniscient, ('level3', $ptag_l3, $id_l2) ) ){ # just extra security in case - foreach my $feature_l3 ( @{$omniscient->{'level3'}{$ptag_l3}{$id_l2}}) { - - if($id_l3 eq lc($feature_l3->_tag_value('ID')) ){ - #remove one feature and pointer if no more feature left in the list - my @id_concern_list=($level3_Parent_ID); - my @id_list_to_remove=($id_l3); - my @list_tag_key=('all'); - $cases_l3++; - print_feature_safe( $feature, $fh_removed ); - remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level3','false', \@list_tag_key); - } - } - } - - # List empty check if we remove l2 or other l3 linked to it - if( ! exists_keys($omniscient, ('level3', $ptag_l3, $id_l2)) ){ - foreach my $tag_l3 (keys %{$omniscient->{'level3'}}){ # primary_tag_key_level3 = cds or exon or start_codon or utr etc... - if ( exists_keys( $omniscient, ('level3', $tag_l3, $id_l2) ) ){ - $cases{'l1'} = $cases_l1; - $cases{'l2'} = $cases_l2; - $cases{'l3'} = $cases_l3; - $cases{'all'} = $cases_l1+$cases_l2+$cases_l3; - - return \%cases; - } - } + if ( exists_keys( $omniscient, ('level3', $ptag_l3, $id_l2) ) ) { # just extra security in case + foreach my $feature_l3 ( @{$omniscient->{'level3'}{$ptag_l3}{$id_l2}}) { + + if($id_l3 eq lc($feature_l3->_tag_value('ID')) ){ + #remove one feature and pointer if no more feature left in the list + my @id_concern_list=($level3_Parent_ID); + my @id_list_to_remove=($id_l3); + my @list_tag_key=('all'); + $cases_l3++; + print_feature_safe( $feature, $fh_removed ); + remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level3','false', \@list_tag_key); + } + } + } - # if we arrive here it means no more L3 feature is attached to L2 - # we remove the L2 parent properly (if isoforms they are kept) - my @id_concern_list=($id_l1); - my @id_list_to_remove=($id_l2); - my @list_tag_key=('all'); - $cases_l2++; - my $feature_l2 = get_feature_l2_from_id_l2_l1($omniscient, $id_l2, $id_l1); - print_feature_safe( $feature_l2, $fh_removed ); - remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level2','false', \@list_tag_key); + if (! $keep_parental){ + # List empty check if we remove l2 or other l3 linked to it + if( ! exists_keys($omniscient, ('level3', $ptag_l3, $id_l2)) ){ + foreach my $tag_l3 (keys %{$omniscient->{'level3'}}){ # primary_tag_key_level3 = cds or exon or start_codon or utr etc... + if ( exists_keys( $omniscient, ('level3', $tag_l3, $id_l2) ) ){ + $cases{'l1'} = $cases_l1; + $cases{'l2'} = $cases_l2; + $cases{'l3'} = $cases_l3; + $cases{'all'} = $cases_l1+$cases_l2+$cases_l3; + return \%cases; + } + } - # Should we remove l1 too? - if( ! exists_keys($omniscient, ('level2', $ptag_l2, $id_l1) ) ){ - #The list was empty so l2 has been removed, we can now remove l1 - if( exists_keys($omniscient, ('level1', $ptag_l1, $id_l1) ) ){ - $cases_l1++; - print_feature_safe( $omniscient->{'level1'}{$ptag_l1}{$id_l1}, $fh_removed ); - delete $omniscient->{'level1'}{$ptag_l1}{$id_l1} - } - } - } + # if we arrive here it means no more L3 feature is attached to L2 + # we remove the L2 parent properly (if isoforms they are kept) + if( exists_keys($omniscient, ('level2', $ptag_l2, $id_l1)) ){ + my @id_concern_list=($id_l1); + my @id_list_to_remove=($id_l2); + my @list_tag_key=('all'); + $cases_l2++; + my $feature_l2 = get_feature_l2_from_id_l2_l1($omniscient, $id_l2, $id_l1); + print_feature_safe( $feature_l2, $fh_removed ); + remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level2','false', \@list_tag_key); + + # Should we remove l1 too? + if( ! exists_keys($omniscient, ('level2', $ptag_l2, $id_l1) ) ){ + #The list was empty so l2 has been removed, we can now remove l1 + if( exists_keys($omniscient, ('level1', $ptag_l1, $id_l1) ) ){ + $cases_l1++; + print_feature_safe( $omniscient->{'level1'}{$ptag_l1}{$id_l1}, $fh_removed ); + delete $omniscient->{'level1'}{$ptag_l1}{$id_l1} + } + } + } + } + } $cases{'l1'} = $cases_l1; $cases{'l2'} = $cases_l2; From 38ca4986cc9955a1d214446d78285c5617b858fb Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Tue, 9 Jul 2024 10:47:14 +0200 Subject: [PATCH 6/7] add na_aside parameter for cases where the attribute tag is missing --- ...at_sp_filter_feature_by_attribute_value.pl | 83 +++++++++++-------- 1 file changed, 50 insertions(+), 33 deletions(-) diff --git a/bin/agat_sp_filter_feature_by_attribute_value.pl b/bin/agat_sp_filter_feature_by_attribute_value.pl index 8fe9d3a0..b8631cd8 100755 --- a/bin/agat_sp_filter_feature_by_attribute_value.pl +++ b/bin/agat_sp_filter_feature_by_attribute_value.pl @@ -15,6 +15,8 @@ my $primaryTag=undef; my $opt_output= undef; my $opt_value = undef; +my $opt_keep_parental = undef; +my $opt_na_aside = undef; my $opt_value_insensitive = undef; my $opt_attribute = undef; my $opt_test = "="; @@ -27,6 +29,8 @@ if ( !GetOptions( 'f|ref|reffile|gff=s' => \$opt_gff, 'value=s' => \$opt_value, 'value_insensitive!' => \$opt_value_insensitive, + 'keep_parental!' => \$opt_keep_parental, + 'na_aside!' => \$opt_na_aside, "p|type|l=s" => \$primaryTag, 'a|attribute=s' => \$opt_attribute, 't|test=s' => \$opt_test, @@ -70,7 +74,7 @@ my $gffout_ok_file ; my $fhout_discarded_file ; my $ostreamReport_file; -my $fhout_semidDiscarded_file; +my $fhout_semidDiscarded_file if $opt_na_aside; if ($opt_output) { my ($outfile,$path,$ext) = fileparse($opt_output,qr/\.[^.]*/); @@ -79,13 +83,13 @@ $gffout_ok_file = $path.$outfile.$ext; $fhout_discarded_file = $path.$outfile."_discarded.gff"; $ostreamReport_file = $path.$outfile."_report.txt"; - $fhout_semidDiscarded_file = $path.$outfile."_cannot_be_tested_because_attribute_missing.gff"; + $fhout_semidDiscarded_file = $path.$outfile."_na.gff"; } my $gffout_ok = prepare_gffout($config, $gffout_ok_file); my $fhout_discarded = prepare_gffout($config, $fhout_discarded_file); my $ostreamReport = prepare_fileout($ostreamReport_file); -my $fhout_semidDiscarded = prepare_gffout($config, $fhout_semidDiscarded_file); +my $fhout_semidDiscarded = prepare_gffout($config, $fhout_semidDiscarded_file) if $opt_na_aside; # Manage $primaryTag my @ptagList; @@ -170,20 +174,19 @@ # we can remove feature L1 now because we are looping over $hash_sortBySeq not $hash_omniscient if ($removeit){ my $cases; - if($removeit == 2){ - $cases = remove_l1_and_relatives($hash_omniscient, $feature_l1, $fhout_semidDiscarded); - $all_cases{'left'}{'l1'} += $cases->{'l1'}; - $all_cases{'left'}{'l2'} += $cases->{'l2'}; - $all_cases{'left'}{'l3'} += $cases->{'l3'}; - $all_cases{'left'}{'all'} += $cases->{'all'}; - } else { + if($removeit == 1){ $cases = remove_l1_and_relatives($hash_omniscient, $feature_l1, $fhout_discarded); $all_cases{'discarded'}{'l1'} += $cases->{'l1'}; $all_cases{'discarded'}{'l2'} += $cases->{'l2'}; $all_cases{'discarded'}{'l3'} += $cases->{'l3'}; $all_cases{'discarded'}{'all'} += $cases->{'all'}; + } elsif ($removeit == 2 and $opt_na_aside){ + $cases = remove_l1_and_relatives($hash_omniscient, $feature_l1, $fhout_semidDiscarded); + $all_cases{'na'}{'l1'} += $cases->{'l1'}; + $all_cases{'na'}{'l2'} += $cases->{'l2'}; + $all_cases{'na'}{'l3'} += $cases->{'l3'}; + $all_cases{'na'}{'all'} += $cases->{'all'}; } - next; } @@ -200,10 +203,10 @@ $removeit = check_feature($feature_l2,'level2'); if ($removeit){ - if($removeit == 2){ - push ( @{$list_l2_to_remove{'left'}}, [$feature_l2, $tag_l1, $id_l1, $fhout_semidDiscarded]); - } else { + if ($removeit == 1){ push ( @{$list_l2_to_remove{'discarded'}}, [$feature_l2, $tag_l1, $id_l1, $fhout_discarded]); + } elsif ($removeit == 2 and $opt_na_aside){ + push ( @{$list_l2_to_remove{'na'}}, [$feature_l2, $tag_l1, $id_l1, $fhout_semidDiscarded]); } next; } @@ -218,12 +221,10 @@ foreach my $feature_l3 ( @list_fl3 ) { $removeit = check_feature($feature_l3, 'level3'); - if ($removeit){ - if($removeit == 2){ - push ( @{$list_l3_to_remove{'left'}}, [$feature_l3, $tag_l1, $id_l1, $tag_l2, $id_l2, $fhout_semidDiscarded]); - } else { - push ( @{$list_l3_to_remove{'discarded'}}, [$feature_l3, $tag_l1, $id_l1, $tag_l2, $id_l2, $fhout_discarded]); - } + if($removeit == 1){ + push ( @{$list_l3_to_remove{'discarded'}}, [$feature_l3, $tag_l1, $id_l1, $tag_l2, $id_l2, $fhout_discarded]); + } elsif ( $removeit == 2 and $opt_na_aside ){ + push ( @{$list_l3_to_remove{'na'}}, [$feature_l3, $tag_l1, $id_l1, $tag_l2, $id_l2, $fhout_semidDiscarded]); } } } @@ -234,16 +235,16 @@ # Should be removed after looping over them to avoid problems foreach my $key ( keys %list_l2_to_remove ){ foreach my $infos ( @{$list_l2_to_remove{$key}} ) { - my $cases = remove_l2_and_relatives( $hash_omniscient, @$infos); + my $cases = remove_l2_and_relatives( $hash_omniscient, @$infos, $opt_keep_parental); $all_cases{$key}{'l1'} += $cases->{'l1'}; $all_cases{$key}{'l2'} += $cases->{'l2'}; $all_cases{$key}{'l3'} += $cases->{'l3'}; $all_cases{$key}{'all'} += $cases->{'all'}; } } - foreach my $key ( keys %list_l3_to_remove ){ + foreach my $key ( sort keys %list_l3_to_remove ){ foreach my $infos ( @{$list_l3_to_remove{$key}} ) { - my $cases = remove_l3_and_relatives( $hash_omniscient, @$infos); + my $cases = remove_l3_and_relatives( $hash_omniscient, @$infos, $opt_keep_parental); $all_cases{$key}{'l1'} += $cases->{'l1'}; $all_cases{$key}{'l2'} += $cases->{'l2'}; $all_cases{$key}{'l3'} += $cases->{'l3'}; @@ -262,11 +263,13 @@ $stringPrint .= $all_cases{'discarded'}{'l2'}." features level2 (e.g. mRNA) removed\n"; $stringPrint .= $all_cases{'discarded'}{'l3'}." features level3 (e.g. exon) removed\n"; -$stringPrint .= "Feature left out because the attribute is missing (see $fhout_semidDiscarded_file file):\n"; -$stringPrint .= $all_cases{'left'}{'all'}." features removed:\n"; -$stringPrint .= $all_cases{'left'}{'l1'}." features level1 (e.g. gene) removed\n"; -$stringPrint .= $all_cases{'left'}{'l2'}." features level2 (e.g. mRNA) removed\n"; -$stringPrint .= $all_cases{'left'}{'l3'}." features level3 (e.g. exon) removed\n"; +if($opt_na_aside){ + $stringPrint .= "Feature left out because the attribute is missing (see $fhout_semidDiscarded_file file):\n"; + $stringPrint .= $all_cases{'na'}{'all'}." features removed:\n"; + $stringPrint .= $all_cases{'na'}{'l1'}." features level1 (e.g. gene) removed\n"; + $stringPrint .= $all_cases{'na'}{'l2'}." features level2 (e.g. mRNA) removed\n"; + $stringPrint .= $all_cases{'na'}{'l3'}." features level3 (e.g. exon) removed\n"; +} if ($opt_output){ print $ostreamReport $stringPrint; @@ -362,7 +365,6 @@ sub should_we_remove_feature{ return 0; } else { print "Attribute not found case\n" if $opt_verbose; - print "$opt_attribute ".$feature->gff_string."\n"; return 2; } } @@ -376,11 +378,16 @@ =head1 NAME =head1 DESCRIPTION The script aims to filter features according to attribute value (9th column). -If the attribute tag is missing the feature will not be discarded. -If the attribute exists and the value pass the test, the feature is discarded. +- If the attribute exists and the value do not pass the test, the feature is written into . +- If the attribute exists and the value pass the test, the feature is discarded and written into _discarded.gff. +- If the attribute tag is missing (test cannot be applyed), the feature will be written into by default. If --na_aside parameter +is activated then it will be written into _na.gff. + Attribute are stored in the 9th column and have this shape: tag=value -/!\ Removing a level1 or level2 feature will automatically remove all linked subfeatures, and -removing all children of a feature will automatically remove this feature too. +/!\ Removing a level1 or level2 feature will automatically remove all linked subfeatures. +/!\ Removing all children of a feature will automatically remove this feature too (excepted if --keep_parental is activated). +/!\ If --keep_parental is not activated and --na_aside is activated, and all level3 features of a record are split between both _na.gff and _discarded.gff, +, then the parental level1 and level2 features are removed and will end up in the _na.gff file only. =head1 SYNOPSIS @@ -416,6 +423,16 @@ =head1 OPTIONS Bolean. Deactivated by default. When activated the values provided by the --value parameter are handled case insensitive. +=item B<--na_aside> + +Bolean. Deactivated by default. By default if the attribute tag on which the filter is based is missing, the feature will be written into . +When activated, such features will be written into a separate file called _na.gff + + +=item B<--keep_parental> + +Bolean. Deactivated by default. When activated even if all child features have been removed, the parental one will be kept. + =item B<-t> or B<--test> Test to apply (> < = ! >= <=). default value "=". If you use one of these two character >, <, please don't forget to quote the From daa544350b6d037470d93e2544191e455367ffa9 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Tue, 9 Jul 2024 10:55:38 +0200 Subject: [PATCH 7/7] update doc --- ...at_sp_filter_feature_by_attribute_value.pl | 8 +++---- ...at_sp_filter_feature_by_attribute_value.md | 23 ++++++++++++++----- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/bin/agat_sp_filter_feature_by_attribute_value.pl b/bin/agat_sp_filter_feature_by_attribute_value.pl index b8631cd8..0f599fdd 100755 --- a/bin/agat_sp_filter_feature_by_attribute_value.pl +++ b/bin/agat_sp_filter_feature_by_attribute_value.pl @@ -387,7 +387,7 @@ =head1 DESCRIPTION /!\ Removing a level1 or level2 feature will automatically remove all linked subfeatures. /!\ Removing all children of a feature will automatically remove this feature too (excepted if --keep_parental is activated). /!\ If --keep_parental is not activated and --na_aside is activated, and all level3 features of a record are split between both _na.gff and _discarded.gff, -, then the parental level1 and level2 features are removed and will end up in the _na.gff file only. +then the parental level1 and level2 features are removed and will end up in the _na.gff file only. =head1 SYNOPSIS @@ -426,14 +426,14 @@ =head1 OPTIONS =item B<--na_aside> Bolean. Deactivated by default. By default if the attribute tag on which the filter is based is missing, the feature will be written into . -When activated, such features will be written into a separate file called _na.gff - +When activated, such features will be written into a separate file called _na.gff. =item B<--keep_parental> Bolean. Deactivated by default. When activated even if all child features have been removed, the parental one will be kept. =item B<-t> or B<--test> + Test to apply (> < = ! >= <=). default value "=". If you use one of these two character >, <, please don't forget to quote the parameter like that "<=" otherwise your terminal will complain. @@ -441,7 +441,7 @@ =head1 OPTIONS =item B<-o> or B<--output> -Output GFF file. If no output file is specified, the output will be +Output GFF file. If no output file is specified, the output will be written to STDOUT. =item B<-v> diff --git a/docs/tools/agat_sp_filter_feature_by_attribute_value.md b/docs/tools/agat_sp_filter_feature_by_attribute_value.md index e0d37896..b9fa6e50 100644 --- a/docs/tools/agat_sp_filter_feature_by_attribute_value.md +++ b/docs/tools/agat_sp_filter_feature_by_attribute_value.md @@ -3,11 +3,14 @@ ## DESCRIPTION The script aims to filter features according to attribute value (9th column). -If the attribute tag is missing the feature will not be discarded. -If the attribute exists and the value pass the test, the feature is discarded. -Attribute are stored in the 9th column and have this shape: tag=value -/!\\ Removing a level1 or level2 feature will automatically remove all linked subfeatures, and -removing all children of a feature will automatically remove this feature too. +- If the attribute exists and the value do not pass the test, the feature is written into . +- If the attribute exists and the value pass the test, the feature is discarded and written into _discarded.gff. +- If the attribute tag is missing (test cannot be applyed), the feature will be written into by default. If --na_aside parameter is activated then it will be written into _na.gff. + +Attribute are stored in the 9th column and have this shape: tag=value. +/!\\ Removing a level1 or level2 feature will automatically remove all linked subfeatures. +/!\\ Removing all children of a feature will automatically remove this feature too (excepted if --keep_parental is activated). +/!\\ If --keep_parental is not activated and --na_aside is activated, and all level3 features of a record are split between both _na.gff and _discarded.gff, then the parental level1 and level2 features are removed and will end up in the _na.gff file only. ## SYNOPSIS @@ -43,6 +46,14 @@ agat_sp_filter_feature_by_attribute_value.pl --help Bolean. Deactivated by default. When activated the values provided by the --value parameter are handled case insensitive. +- **<--na\_aside** + + Bolean. Deactivated by default. By default if the attribute tag on which the filter is based is missing, the feature will be written into . + When activated, such features will be written into a separate file called _na.gff. + +- **<--keep\_parental>** + + Bolean. Deactivated by default. When activated even if all child features have been removed, the parental one will be kept. - **-t** or **--test** @@ -53,7 +64,7 @@ agat_sp_filter_feature_by_attribute_value.pl --help - **-o** or **--output** - Output GFF file. If no output file is specified, the output will be + Output GFF file. If no output file is specified, the output will be written to STDOUT. - **-v**