From 83641ef2d0a815a783b64f204cac3ed2cec44818 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Tue, 9 Jul 2024 15:52:53 +0200 Subject: [PATCH] Fix #471 - agat_sp_filter_feature_by_attribute_value.pl (#473) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * indentation * Update remove_l2_and_relatives and remove_l3_and_relatives to be able to print the complete features instead of the ID only. Add print_feature_safe to be able to print GFF object file handler as well as IO::File * fix #471 Fix error of trying to remove L2 already removed in case where all CDS are removed in once because share the same ID (in that case L2 is not removed at the last round as expected) * add keep_parental parameter to remove_l3_and_relatives and remove_l2_and_relatives function to skip removing parental geature(s) if needed. * add na_aside parameter for cases where the attribute tag is missing * update doc --- ...sp_filter_feature_by_attribute_presence.pl | 3 +- ...at_sp_filter_feature_by_attribute_value.pl | 181 ++++++++++------ bin/agat_sp_statistics.pl | 2 - ...at_sp_filter_feature_by_attribute_value.md | 23 ++- lib/AGAT/OmniscientTool.pm | 193 +++++++++++------- 5 files changed, 252 insertions(+), 150 deletions(-) diff --git a/bin/agat_sp_filter_feature_by_attribute_presence.pl b/bin/agat_sp_filter_feature_by_attribute_presence.pl index 431c8aa2..657adfc3 100755 --- a/bin/agat_sp_filter_feature_by_attribute_presence.pl +++ b/bin/agat_sp_filter_feature_by_attribute_presence.pl @@ -314,7 +314,8 @@ =head1 OPTIONS =item B<--attribute>, B<--att>, B<-a> -String - Attributes tag specified will be used to filter the feature type (feature type can also be specified by the option -p). List of attribute tags must be coma separated. +String - Attributes tag specified will be used to filter the feature type (feature type can also be specified by the option -p). +List of attribute tags must be coma separated. =item B<--flip> diff --git a/bin/agat_sp_filter_feature_by_attribute_value.pl b/bin/agat_sp_filter_feature_by_attribute_value.pl index c584a7d5..0f599fdd 100755 --- a/bin/agat_sp_filter_feature_by_attribute_value.pl +++ b/bin/agat_sp_filter_feature_by_attribute_value.pl @@ -15,6 +15,8 @@ my $primaryTag=undef; my $opt_output= undef; my $opt_value = undef; +my $opt_keep_parental = undef; +my $opt_na_aside = undef; my $opt_value_insensitive = undef; my $opt_attribute = undef; my $opt_test = "="; @@ -27,6 +29,8 @@ if ( !GetOptions( 'f|ref|reffile|gff=s' => \$opt_gff, 'value=s' => \$opt_value, 'value_insensitive!' => \$opt_value_insensitive, + 'keep_parental!' => \$opt_keep_parental, + 'na_aside!' => \$opt_na_aside, "p|type|l=s" => \$primaryTag, 'a|attribute=s' => \$opt_attribute, 't|test=s' => \$opt_test, @@ -59,7 +63,7 @@ ############### # Test options -if($opt_test ne "<" and $opt_test ne ">" and $opt_test ne "<=" and $opt_test ne ">=" and $opt_test ne "="){ +if($opt_test ne "<" and $opt_test ne ">" and $opt_test ne "<=" and $opt_test ne ">=" and $opt_test ne "=" and $opt_test ne "!"){ print "The test to apply is Wrong: $opt_test.\nWe want something among this list: <,>,<=,>=,! or =.";exit; } @@ -70,19 +74,22 @@ my $gffout_ok_file ; my $fhout_discarded_file ; my $ostreamReport_file; +my $fhout_semidDiscarded_file if $opt_na_aside; if ($opt_output) { my ($outfile,$path,$ext) = fileparse($opt_output,qr/\.[^.]*/); # set file names $gffout_ok_file = $path.$outfile.$ext; - $fhout_discarded_file = $path.$outfile."_discarded.txt"; + $fhout_discarded_file = $path.$outfile."_discarded.gff"; $ostreamReport_file = $path.$outfile."_report.txt"; + $fhout_semidDiscarded_file = $path.$outfile."_na.gff"; } my $gffout_ok = prepare_gffout($config, $gffout_ok_file); -my $fhout_discarded = prepare_fileout($fhout_discarded_file); +my $fhout_discarded = prepare_gffout($config, $fhout_discarded_file); my $ostreamReport = prepare_fileout($ostreamReport_file); +my $fhout_semidDiscarded = prepare_gffout($config, $fhout_semidDiscarded_file) if $opt_na_aside; # Manage $primaryTag my @ptagList; @@ -121,17 +128,14 @@ } } -use Data::Dumper; -print Dumper($value_hash); - # start with some interesting information my $stringPrint = strftime "%m/%d/%Y at %Hh%Mm%Ss", localtime; $stringPrint .= "\nusage: $0 @copyARGV\n"; $stringPrint .= "We will discard $print_feature_string that have the attribute $opt_attribute with the value $opt_test $opt_value"; if ($opt_value_insensitive){ - $stringPrint .= "case insensitive.\n"; + $stringPrint .= " case insensitive.\n"; }else{ - $stringPrint .= "case sensitive.\n"; + $stringPrint .= " case sensitive.\n"; } if ($opt_output){ @@ -142,7 +146,9 @@ ####################### # >>>>>>>>>>>>>>>>>>>>>>>># MAIN #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< ####################### -my %all_cases = ('l1' => 0, 'l2' => 0, 'l3' => 0, 'all' => 0); +my %all_cases = ( 'left' => {'l1' => 0, 'l2' => 0, 'l3' => 0, 'all' => 0}, + 'discarded' => {'l1' => 0, 'l2' => 0, 'l3' => 0, 'all' => 0} ); + ###################### ### Parse GFF input # my ($hash_omniscient, $hash_mRNAGeneLink) = slurp_gff3_file_JD({ input => $opt_gff, @@ -167,19 +173,28 @@ $removeit = check_feature($feature_l1, 'level1'); # we can remove feature L1 now because we are looping over $hash_sortBySeq not $hash_omniscient if ($removeit){ - my $cases = remove_l1_and_relatives($hash_omniscient, $feature_l1, $fhout_discarded); - $all_cases{'l1'} += $cases->{'l1'}; - $all_cases{'l2'} += $cases->{'l2'}; - $all_cases{'l3'} += $cases->{'l3'}; - $all_cases{'all'} += $cases->{'all'}; + my $cases; + if($removeit == 1){ + $cases = remove_l1_and_relatives($hash_omniscient, $feature_l1, $fhout_discarded); + $all_cases{'discarded'}{'l1'} += $cases->{'l1'}; + $all_cases{'discarded'}{'l2'} += $cases->{'l2'}; + $all_cases{'discarded'}{'l3'} += $cases->{'l3'}; + $all_cases{'discarded'}{'all'} += $cases->{'all'}; + } elsif ($removeit == 2 and $opt_na_aside){ + $cases = remove_l1_and_relatives($hash_omniscient, $feature_l1, $fhout_semidDiscarded); + $all_cases{'na'}{'l1'} += $cases->{'l1'}; + $all_cases{'na'}{'l2'} += $cases->{'l2'}; + $all_cases{'na'}{'l3'} += $cases->{'l3'}; + $all_cases{'na'}{'all'} += $cases->{'all'}; + } next; } ################# # == LEVEL 2 == # ################# - my @list_l2_to_remove; - my @list_l3_to_remove; + my %list_l2_to_remove; + my %list_l3_to_remove; foreach my $tag_l2 (sort keys %{$hash_omniscient->{'level2'}}){ # primary_tag_key_level2 = mrna or mirna or ncrna or trna etc... if ( exists_keys( $hash_omniscient, ('level2', $tag_l2, $id_l1) ) ){ @@ -188,7 +203,11 @@ $removeit = check_feature($feature_l2,'level2'); if ($removeit){ - push @list_l2_to_remove, [$feature_l2, $tag_l1, $id_l1, $fhout_discarded]; + if ($removeit == 1){ + push ( @{$list_l2_to_remove{'discarded'}}, [$feature_l2, $tag_l1, $id_l1, $fhout_discarded]); + } elsif ($removeit == 2 and $opt_na_aside){ + push ( @{$list_l2_to_remove{'na'}}, [$feature_l2, $tag_l1, $id_l1, $fhout_semidDiscarded]); + } next; } ################# @@ -202,8 +221,10 @@ foreach my $feature_l3 ( @list_fl3 ) { $removeit = check_feature($feature_l3, 'level3'); - if ($removeit){ - push @list_l3_to_remove, [$feature_l3, $tag_l1, $id_l1, $tag_l2, $id_l2, $fhout_discarded]; + if($removeit == 1){ + push ( @{$list_l3_to_remove{'discarded'}}, [$feature_l3, $tag_l1, $id_l1, $tag_l2, $id_l2, $fhout_discarded]); + } elsif ( $removeit == 2 and $opt_na_aside ){ + push ( @{$list_l3_to_remove{'na'}}, [$feature_l3, $tag_l1, $id_l1, $tag_l2, $id_l2, $fhout_semidDiscarded]); } } } @@ -212,22 +233,22 @@ } } # Should be removed after looping over them to avoid problems - if (@list_l2_to_remove) { - foreach my $infos (@list_l2_to_remove) { - my $cases = remove_l2_and_relatives( $hash_omniscient, @$infos); - $all_cases{'l1'} += $cases->{'l1'}; - $all_cases{'l2'} += $cases->{'l2'}; - $all_cases{'l3'} += $cases->{'l3'}; - $all_cases{'all'} += $cases->{'all'}; + foreach my $key ( keys %list_l2_to_remove ){ + foreach my $infos ( @{$list_l2_to_remove{$key}} ) { + my $cases = remove_l2_and_relatives( $hash_omniscient, @$infos, $opt_keep_parental); + $all_cases{$key}{'l1'} += $cases->{'l1'}; + $all_cases{$key}{'l2'} += $cases->{'l2'}; + $all_cases{$key}{'l3'} += $cases->{'l3'}; + $all_cases{$key}{'all'} += $cases->{'all'}; } } - if (@list_l3_to_remove) { - foreach my $infos (@list_l3_to_remove) { - my $cases = remove_l3_and_relatives( $hash_omniscient, @$infos); - $all_cases{'l1'} += $cases->{'l1'}; - $all_cases{'l2'} += $cases->{'l2'}; - $all_cases{'l3'} += $cases->{'l3'}; - $all_cases{'all'} += $cases->{'all'}; + foreach my $key ( sort keys %list_l3_to_remove ){ + foreach my $infos ( @{$list_l3_to_remove{$key}} ) { + my $cases = remove_l3_and_relatives( $hash_omniscient, @$infos, $opt_keep_parental); + $all_cases{$key}{'l1'} += $cases->{'l1'}; + $all_cases{$key}{'l2'} += $cases->{'l2'}; + $all_cases{$key}{'l3'} += $cases->{'l3'}; + $all_cases{$key}{'all'} += $cases->{'all'}; } } } @@ -236,10 +257,20 @@ print_omniscient( {omniscient => $hash_omniscient, output => $gffout_ok} ); -$stringPrint = $all_cases{'all'}." features removed:\n"; -$stringPrint .= $all_cases{'l1'}." features level1 (e.g. gene) removed\n"; -$stringPrint .= $all_cases{'l2'}." features level2 (e.g. mRNA) removed\n"; -$stringPrint .= $all_cases{'l3'}." features level3 (e.g. exon) removed\n"; +$stringPrint = "Feature discarded by applying the test (see $fhout_discarded_file file):\n"; +$stringPrint .= $all_cases{'discarded'}{'all'}." features removed:\n"; +$stringPrint .= $all_cases{'discarded'}{'l1'}." features level1 (e.g. gene) removed\n"; +$stringPrint .= $all_cases{'discarded'}{'l2'}." features level2 (e.g. mRNA) removed\n"; +$stringPrint .= $all_cases{'discarded'}{'l3'}." features level3 (e.g. exon) removed\n"; + +if($opt_na_aside){ + $stringPrint .= "Feature left out because the attribute is missing (see $fhout_semidDiscarded_file file):\n"; + $stringPrint .= $all_cases{'na'}{'all'}." features removed:\n"; + $stringPrint .= $all_cases{'na'}{'l1'}." features level1 (e.g. gene) removed\n"; + $stringPrint .= $all_cases{'na'}{'l2'}." features level2 (e.g. mRNA) removed\n"; + $stringPrint .= $all_cases{'na'}{'l3'}." features level3 (e.g. exon) removed\n"; +} + if ($opt_output){ print $ostreamReport $stringPrint; print $stringPrint; @@ -298,35 +329,44 @@ sub should_we_remove_feature{ } # for string values replace = by eq and ! by ne and avoid other type of test if ( ! looks_like_number ($given_value) or ! looks_like_number ($file_value)){ + print "String case\n" if $opt_verbose; if ($opt_test eq "="){ - if ($file_value eq $given_value){return 1; } + if ($file_value eq $given_value) { print "equal\n" if $opt_verbose; return 1; } + else { print "not equal\n" if $opt_verbose; } } elsif ($opt_test eq "!"){ - if ($file_value ne $given_value){return 1; } + if ($file_value ne $given_value){ print "different\n" if $opt_verbose; return 1; } + else { print "not different\n" if $opt_verbose; } + } + } + else{ + print "Number case\n" if $opt_verbose; + if ($opt_test eq "="){ + if ($file_value == $given_value){return 1; } + } + elsif ($opt_test eq "!"){ + if ($file_value != $given_value){return 1; } + } + elsif ($opt_test eq ">"){ + if ($file_value > $given_value){return 1; } + } + elsif ($opt_test eq "<"){ + if ($file_value < $given_value){return 1; } + } + elsif ($opt_test eq "<="){ + if ($file_value <= $given_value){return 1; } + } + elsif ($opt_test eq ">="){ + if ($file_value >= $given_value){return 1; } } - } - elsif ($opt_test eq "="){ - if ($file_value == $given_value){return 1; } - } - elsif ($opt_test eq "!"){ - if ($file_value != $given_value){return 1; } - } - elsif ($opt_test eq ">"){ - if ($file_value > $given_value){return 1; } - } - elsif ($opt_test eq "<"){ - if ($file_value < $given_value){return 1; } - } - elsif ($opt_test eq "<="){ - if ($file_value <= $given_value){return 1; } - } - elsif ($opt_test eq ">="){ - if ($file_value >= $given_value){return 1; } } } } + return 0; + } else { + print "Attribute not found case\n" if $opt_verbose; + return 2; } - return 0; } __END__ @@ -338,11 +378,16 @@ =head1 NAME =head1 DESCRIPTION The script aims to filter features according to attribute value (9th column). -If the attribute tag is missing the feature will not be discarded. -If the attribute exists and the value pass the test, the feature is discarded. +- If the attribute exists and the value do not pass the test, the feature is written into . +- If the attribute exists and the value pass the test, the feature is discarded and written into _discarded.gff. +- If the attribute tag is missing (test cannot be applyed), the feature will be written into by default. If --na_aside parameter +is activated then it will be written into _na.gff. + Attribute are stored in the 9th column and have this shape: tag=value -/!\ Removing a level1 or level2 feature will automatically remove all linked subfeatures, and -removing all children of a feature will automatically remove this feature too. +/!\ Removing a level1 or level2 feature will automatically remove all linked subfeatures. +/!\ Removing all children of a feature will automatically remove this feature too (excepted if --keep_parental is activated). +/!\ If --keep_parental is not activated and --na_aside is activated, and all level3 features of a record are split between both _na.gff and _discarded.gff, +then the parental level1 and level2 features are removed and will end up in the _na.gff file only. =head1 SYNOPSIS @@ -378,7 +423,17 @@ =head1 OPTIONS Bolean. Deactivated by default. When activated the values provided by the --value parameter are handled case insensitive. +=item B<--na_aside> + +Bolean. Deactivated by default. By default if the attribute tag on which the filter is based is missing, the feature will be written into . +When activated, such features will be written into a separate file called _na.gff. + +=item B<--keep_parental> + +Bolean. Deactivated by default. When activated even if all child features have been removed, the parental one will be kept. + =item B<-t> or B<--test> + Test to apply (> < = ! >= <=). default value "=". If you use one of these two character >, <, please don't forget to quote the parameter like that "<=" otherwise your terminal will complain. @@ -386,7 +441,7 @@ =head1 OPTIONS =item B<-o> or B<--output> -Output GFF file. If no output file is specified, the output will be +Output GFF file. If no output file is specified, the output will be written to STDOUT. =item B<-v> diff --git a/bin/agat_sp_statistics.pl b/bin/agat_sp_statistics.pl index 4f3bce47..6ede4d16 100755 --- a/bin/agat_sp_statistics.pl +++ b/bin/agat_sp_statistics.pl @@ -57,8 +57,6 @@ if(defined($opt_yaml)){ if( defined($opt_output)){ $opt_yaml = $opt_output.".yaml"; - }else{ - $out } } diff --git a/docs/tools/agat_sp_filter_feature_by_attribute_value.md b/docs/tools/agat_sp_filter_feature_by_attribute_value.md index e0d37896..b9fa6e50 100644 --- a/docs/tools/agat_sp_filter_feature_by_attribute_value.md +++ b/docs/tools/agat_sp_filter_feature_by_attribute_value.md @@ -3,11 +3,14 @@ ## DESCRIPTION The script aims to filter features according to attribute value (9th column). -If the attribute tag is missing the feature will not be discarded. -If the attribute exists and the value pass the test, the feature is discarded. -Attribute are stored in the 9th column and have this shape: tag=value -/!\\ Removing a level1 or level2 feature will automatically remove all linked subfeatures, and -removing all children of a feature will automatically remove this feature too. +- If the attribute exists and the value do not pass the test, the feature is written into . +- If the attribute exists and the value pass the test, the feature is discarded and written into _discarded.gff. +- If the attribute tag is missing (test cannot be applyed), the feature will be written into by default. If --na_aside parameter is activated then it will be written into _na.gff. + +Attribute are stored in the 9th column and have this shape: tag=value. +/!\\ Removing a level1 or level2 feature will automatically remove all linked subfeatures. +/!\\ Removing all children of a feature will automatically remove this feature too (excepted if --keep_parental is activated). +/!\\ If --keep_parental is not activated and --na_aside is activated, and all level3 features of a record are split between both _na.gff and _discarded.gff, then the parental level1 and level2 features are removed and will end up in the _na.gff file only. ## SYNOPSIS @@ -43,6 +46,14 @@ agat_sp_filter_feature_by_attribute_value.pl --help Bolean. Deactivated by default. When activated the values provided by the --value parameter are handled case insensitive. +- **<--na\_aside** + + Bolean. Deactivated by default. By default if the attribute tag on which the filter is based is missing, the feature will be written into . + When activated, such features will be written into a separate file called _na.gff. + +- **<--keep\_parental>** + + Bolean. Deactivated by default. When activated even if all child features have been removed, the parental one will be kept. - **-t** or **--test** @@ -53,7 +64,7 @@ agat_sp_filter_feature_by_attribute_value.pl --help - **-o** or **--output** - Output GFF file. If no output file is specified, the output will be + Output GFF file. If no output file is specified, the output will be written to STDOUT. - **-v** diff --git a/lib/AGAT/OmniscientTool.pm b/lib/AGAT/OmniscientTool.pm index 2203c795..9e383cb5 100644 --- a/lib/AGAT/OmniscientTool.pm +++ b/lib/AGAT/OmniscientTool.pm @@ -868,7 +868,7 @@ sub remove_element_from_omniscient { if(@listok){ @{$hash_omniscient->{$level}{$tag_key}{$id_concern}}=@listok; } - else{ # The list is empty we could remove the key (otherwise we would have saved a emplty list) + else{ # The list is empty we could remove the key (otherwise we would have saved a empty list) delete $hash_omniscient->{$level}{$tag_key}{$id_concern}; } } @@ -878,6 +878,31 @@ sub remove_element_from_omniscient { } } +# Print a feature safely. +# +# It ensures that the feature is printed correctly without causing any errors according the output type. +# +# Parameters: +# $feature : the feature to print +# $out : AGAT::BioperlGFF / Bio::Tools::GFF object, or IO::File object or empty value +# Returns: +# None +# +# Note: +# This function should be used when printing features to avoid any potential errors. +sub print_feature_safe{ + my ($feature, $out)=@_; + + if($out){ + if($out->isa('AGAT::BioperlGFF' ) or $out->isa('Bio::Tools::GFF')){ + $out->write_feature( $feature ); + } + else{ + print $out $feature->gff_string()."\n"; + } + } +} + # @Purpose: remove from omniscient l1 feature and all subfeatures # @input: 3 => hash(omniscient hash), feature L1, optional fh to write case removed # @output: 1 => hash (nb feature removed) @@ -899,19 +924,19 @@ sub remove_l1_and_relatives{ foreach my $ptag_l3 (keys %{$omniscient->{'level3'}}){ # primary_tag_key_level3 = cds or exon or start_codon or utr etc... if ( exists_keys( $omniscient, ('level3', $ptag_l3, $level2_ID) ) ){ foreach my $feature_l3 ( @{$omniscient->{'level3'}{$ptag_l3}{$level2_ID}}) { - $cases_l3++; - print $fh_removed $feature_l3->_tag_value('ID')."\n" if ($fh_removed); + $cases_l3++; + print_feature_safe( $feature_l3, $fh_removed ); } delete $omniscient->{'level3'}{$ptag_l3}{$level2_ID} # delete level3 } } $cases_l2++; - print $fh_removed $feature_l2->_tag_value('ID')."\n" if($fh_removed); + print_feature_safe( $feature_l2, $fh_removed ); } delete $omniscient->{'level2'}{$ptag_l2}{$id_l1} # delete level2 } } - print $fh_removed $feature->gff_string()."\n" if $fh_removed; + print_feature_safe( $feature, $fh_removed ); delete $omniscient->{'level1'}{$tag_l1}{$id_l1}; # delete level1 $cases_l1++; @@ -927,11 +952,11 @@ sub remove_l1_and_relatives{ # @input: 5 => hash(omniscient hash), featureL2, primary tag l1, id l1, optional fh to write case removed # @output: 1 => hash (nb feature removed) sub remove_l2_and_relatives{ - my ($omniscient, $feature, $ptag_l1, $id_l1, $fh_removed)=@_; + my ($omniscient, $feature, $ptag_l1, $id_l1, $fh_removed, $keep_parental)=@_; my %cases; - my $cases_l1 = 0; my $cases_l2 = 0; my $cases_l3 = 0; - my $ptag_l2 = lc($feature->primary_tag); + my $cases_l1 = 0; my $cases_l2 = 0; my $cases_l3 = 0; + my $ptag_l2 = lc($feature->primary_tag); my $level2_Parent_ID = lc($feature->_tag_value('Parent')); my $level2_ID = lc($feature->_tag_value('ID')); @@ -943,40 +968,41 @@ sub remove_l2_and_relatives{ foreach my $ptag_l3 (keys %{$omniscient->{'level3'}}){ # primary_tag_key_level3 = cds or exon or start_codon or utr etc... if ( exists_keys( $omniscient, ('level3', $ptag_l3, $level2_ID) ) ){ foreach my $feature_l3 ( @{$omniscient->{'level3'}{$ptag_l3}{$level2_ID}}) { - $cases_l3++; - print $fh_removed $feature_l3->_tag_value('ID')."\n" if ($fh_removed); + $cases_l3++; + print_feature_safe( $feature_l3, $fh_removed ); } delete $omniscient->{'level3'}{$ptag_l3}{$level2_ID} # delete level3 } } } } - + # delete level2 and the hash pointer if the list is empty (no isoform left) my @id_concern_list=($id_l1); my @id_list_to_remove=($level2_ID); my @list_tag_key=('all'); $cases_l2++; - print $fh_removed $feature->gff_string()."\n" if ($fh_removed); + print_feature_safe( $feature, $fh_removed ); remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level2','false', \@list_tag_key); - - if( ! exists_keys($omniscient, ('level2', $ptag_l2, $id_l1) ) ){ - my $anotherL2Linked=0; - foreach my $ptag_l2 (keys %{$omniscient->{'level2'}}){ - if( exists_keys($omniscient, ('level2', $ptag_l2, $id_l1) ) ){ - $anotherL2Linked=1; - } - } - #The list was empty so l2 has been removed, we can now remove l1 - if(! $anotherL2Linked){ - if( exists_keys($omniscient, ('level1', $ptag_l1, $id_l1) ) ){ - $cases_l1++; - print $fh_removed $id_l1."\n" if ($fh_removed); - delete $omniscient->{'level1'}{$ptag_l1}{$id_l1}; - } - } - } + if (! $keep_parental){ + if( ! exists_keys($omniscient, ('level2', $ptag_l2, $id_l1) ) ){ + my $anotherL2Linked=0; + foreach my $ptag_l2 (keys %{$omniscient->{'level2'}}){ + if( exists_keys($omniscient, ('level2', $ptag_l2, $id_l1) ) ){ + $anotherL2Linked=1; + } + } + #The list was empty so l2 has been removed, we can now remove l1 + if(! $anotherL2Linked){ + if( exists_keys($omniscient, ('level1', $ptag_l1, $id_l1) ) ){ + $cases_l1++; + print_feature_safe( $omniscient->{'level1'}{$ptag_l1}{$id_l1}, $fh_removed ); + delete $omniscient->{'level1'}{$ptag_l1}{$id_l1}; + } + } + } + } } $cases{'l1'} = $cases_l1; @@ -991,61 +1017,65 @@ sub remove_l2_and_relatives{ # @input: 7 => hash(omniscient hash), feature L3, primary tag l2, id l2, primary tag l2, id l2, optional fh to write case removed # @output: 1 => hash (nb feature removed) sub remove_l3_and_relatives{ - my ($omniscient, $feature, $ptag_l1, $id_l1, $ptag_l2, $id_l2, $fh_removed)=@_; - + my ($omniscient, $feature, $ptag_l1, $id_l1, $ptag_l2, $id_l2, $fh_removed, $keep_parental)=@_; + my %cases; my $cases_l1 = 0; my $cases_l2 = 0; my $cases_l3 = 0; - my $level3_Parent_ID = lc($feature->_tag_value('Parent')); - my $id_l3 = lc($feature->_tag_value('ID')); + my $level3_Parent_ID = lc($feature->_tag_value('Parent')); + my $id_l3 = lc($feature->_tag_value('ID')); my $ptag_l3 = lc($feature->primary_tag); - if ( exists_keys( $omniscient, ('level3', $ptag_l3, $id_l2) ) ){ # just extra security in case - foreach my $feature_l3 ( @{$omniscient->{'level3'}{$ptag_l3}{$id_l2}}) { - - if($id_l3 eq lc($feature_l3->_tag_value('ID')) ){ - #remove one feature and pointer if no more feature left in the list - my @id_concern_list=($level3_Parent_ID); - my @id_list_to_remove=($id_l3); - my @list_tag_key=('all'); - $cases_l3++; - print $fh_removed $feature->gff_string()."\n" if ($fh_removed); - remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level3','false', \@list_tag_key); - } - } - } - - # List empty check if we remove l2 or other l3 linked to it - if( ! exists_keys($omniscient, ('level3', $ptag_l3, $id_l2)) ){ - foreach my $tag_l3 (keys %{$omniscient->{'level3'}}){ # primary_tag_key_level3 = cds or exon or start_codon or utr etc... - if ( exists_keys( $omniscient, ('level3', $tag_l3, $id_l2) ) ){ - $cases{'l1'} = $cases_l1; - $cases{'l2'} = $cases_l2; - $cases{'l3'} = $cases_l3; - $cases{'all'} = $cases_l1+$cases_l2+$cases_l3; - - return \%cases; - } - } + if ( exists_keys( $omniscient, ('level3', $ptag_l3, $id_l2) ) ) { # just extra security in case + foreach my $feature_l3 ( @{$omniscient->{'level3'}{$ptag_l3}{$id_l2}}) { + + if($id_l3 eq lc($feature_l3->_tag_value('ID')) ){ + #remove one feature and pointer if no more feature left in the list + my @id_concern_list=($level3_Parent_ID); + my @id_list_to_remove=($id_l3); + my @list_tag_key=('all'); + $cases_l3++; + print_feature_safe( $feature, $fh_removed ); + remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level3','false', \@list_tag_key); + } + } + } - # if we arrive here it means no more L3 feature is attached to L2 - # we remove the L2 parent properly (if isoforms they are kept) - my @id_concern_list=($id_l1); - my @id_list_to_remove=($id_l2); - my @list_tag_key=('all'); - $cases_l2++; - print $fh_removed $id_l2."\n" if ($fh_removed); - remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level2','false', \@list_tag_key); + if (! $keep_parental){ + # List empty check if we remove l2 or other l3 linked to it + if( ! exists_keys($omniscient, ('level3', $ptag_l3, $id_l2)) ){ + foreach my $tag_l3 (keys %{$omniscient->{'level3'}}){ # primary_tag_key_level3 = cds or exon or start_codon or utr etc... + if ( exists_keys( $omniscient, ('level3', $tag_l3, $id_l2) ) ){ + $cases{'l1'} = $cases_l1; + $cases{'l2'} = $cases_l2; + $cases{'l3'} = $cases_l3; + $cases{'all'} = $cases_l1+$cases_l2+$cases_l3; + return \%cases; + } + } - # Should we remove l1 too? - if( ! exists_keys($omniscient, ('level2', $ptag_l2, $id_l1) ) ){ - #The list was empty so l2 has been removed, we can now remove l1 - if( exists_keys($omniscient, ('level1', $ptag_l1, $id_l1) ) ){ - $cases_l1++; - print $fh_removed $id_l1."\n" if($fh_removed); - delete $omniscient->{'level1'}{$ptag_l1}{$id_l1} - } - } - } + # if we arrive here it means no more L3 feature is attached to L2 + # we remove the L2 parent properly (if isoforms they are kept) + if( exists_keys($omniscient, ('level2', $ptag_l2, $id_l1)) ){ + my @id_concern_list=($id_l1); + my @id_list_to_remove=($id_l2); + my @list_tag_key=('all'); + $cases_l2++; + my $feature_l2 = get_feature_l2_from_id_l2_l1($omniscient, $id_l2, $id_l1); + print_feature_safe( $feature_l2, $fh_removed ); + remove_element_from_omniscient(\@id_concern_list, \@id_list_to_remove, $omniscient, 'level2','false', \@list_tag_key); + + # Should we remove l1 too? + if( ! exists_keys($omniscient, ('level2', $ptag_l2, $id_l1) ) ){ + #The list was empty so l2 has been removed, we can now remove l1 + if( exists_keys($omniscient, ('level1', $ptag_l1, $id_l1) ) ){ + $cases_l1++; + print_feature_safe( $omniscient->{'level1'}{$ptag_l1}{$id_l1}, $fh_removed ); + delete $omniscient->{'level1'}{$ptag_l1}{$id_l1} + } + } + } + } + } $cases{'l1'} = $cases_l1; $cases{'l2'} = $cases_l2; @@ -1623,6 +1653,13 @@ sub group_l1IDs_from_omniscient { return \%group; } +# Retrieves the feature level 2 from the given level 2 and level 1 IDs. +# +# Parameters: +# - $id_l2: The ID of the level 2 feature +# - $id_l1: The ID of the parent level 1 feature +# Returns: +# - The feature at level 2 corresponding to the given ID. sub get_feature_l2_from_id_l2_l1 { my ($hash_omniscient, $id_l2, $id_l1) = @_ ; foreach my $tag_l2 (keys %{$hash_omniscient->{'level2'}}){