|
232 | 232 | next if(-d "$dir/$file"); #print "$dir/$file\n";
|
233 | 233 |
|
234 | 234 | # read sequences in each cluster
|
235 |
| - my ($clusterkey,$cluster_data,$n_of_cluster_seqs) = ('','',0); |
| 235 | + my ($clusterkey,$cluster_data,$n_of_cluster_seqs,$taxon_name) = ('','',0); |
236 | 236 | my (@choppedseqs,@clusterseqs,%cluster_taxa,@gis,@neighbors,@sorted_taxa);
|
237 | 237 | my $cluster_ref = read_FASTA_file_array("$dir/$file");
|
238 | 238 |
|
239 |
| - if($taxa{$file}) # previosly read from cluster list file |
| 239 | + if($taxa{$file}) # previously read from .cluster_list file |
240 | 240 | {
|
241 | 241 | %cluster_taxa = %{$taxa{$file}};
|
242 |
| - delete($cluster_taxa{'sorted_taxa'}); # otherwise it would count as one extra taxa; it is conserved in %taxa |
| 242 | + delete($cluster_taxa{'sorted_taxa'}); # otherwise it would count as one extra taxon; the key is still kept in %taxa |
| 243 | + #Uncultured_bacterium_plasmid_pRSB205.gb 1 |
243 | 244 | }
|
244 | 245 | else # automatically extracted from headers, error-prone
|
245 | 246 | {
|
246 |
| - %cluster_taxa = find_taxa_FASTA_array_headers($cluster_ref,1); |
| 247 | + my %cluster_taxa_in_headers = find_taxa_FASTA_array_headers($cluster_ref,1); |
| 248 | + |
| 249 | + foreach $taxon (keys(%cluster_taxa_in_headers)) |
| 250 | + { |
| 251 | + $taxon_name = $taxon; |
| 252 | + $taxon_name =~ s/\[|\]//g; |
| 253 | + $cluster_taxa{$taxon_name} = $cluster_taxa_in_headers{$taxon}{'SIZE'}; |
| 254 | + } |
| 255 | + |
247 | 256 | foreach $seq (0 .. $#{$cluster_ref})
|
248 | 257 | {
|
249 |
| - foreach $taxon (keys(%cluster_taxa)) |
| 258 | + foreach $taxon (keys(%cluster_taxa_in_headers)) |
250 | 259 | {
|
251 |
| - if(grep(/^$seq$/,@{$cluster_taxa{$taxon}{'MEMBERS'}})) |
| 260 | + if(grep(/^$seq$/,@{$cluster_taxa_in_headers{$taxon}{'MEMBERS'}})) |
252 | 261 | {
|
253 |
| - $taxon =~ s/\[|\]//g; |
254 |
| - push(@{$taxa{$file}{'sorted_taxa'}},$taxon); |
| 262 | + $taxon_name = $taxon; |
| 263 | + $taxon_name =~ s/\[|\]//g; |
| 264 | + push(@{$taxa{$file}{'sorted_taxa'}},$taxon_name); |
255 | 265 | }
|
256 | 266 | }
|
257 | 267 | }
|
|
470 | 480 | my @intersection_keys;
|
471 | 481 | foreach my $key (keys(%stats))
|
472 | 482 | {
|
473 |
| - |
474 | 483 | # intersection steps
|
475 | 484 | next if($stats{$key}{'total'} != $n_of_dirs);
|
476 | 485 |
|
|
596 | 605 | my $pangenome_fasta_file = $INP_output_dir . "/pangenome_matrix$params\.fasta";
|
597 | 606 | my $pangenome_matrix_file = $INP_output_dir . "/pangenome_matrix$params\.tab";
|
598 | 607 |
|
599 |
| -# 1) ordena taxa por clustering jerarquico ,de la matriz pangenomica |
600 |
| -# codigo en python en collective intelligence para hacer clusters y pintar dendrograma |
601 |
| -# 2) ordena clusters (en horizontal) de mas frecuentes a menos, de core a pan |
602 |
| - |
| 608 | + # 1) sort clusters |
603 | 609 | my @taxon_names = keys(%pangemat);
|
604 | 610 | my (%cluster_names,$cluster_name,$file_number,%file_name);
|
605 | 611 | for($taxon=0;$taxon<scalar(@taxon_names);$taxon++)
|
|
0 commit comments