|
| 1 | +#!/usr/bin/env perl |
| 2 | +use strict; |
| 3 | +use warnings; |
| 4 | +use Getopt::Long; |
| 5 | +use File::Spec; |
| 6 | +use Cwd qw(abs_path); |
| 7 | +use Spreadsheet::Read; |
| 8 | +use Data::Dumper; |
| 9 | +use Cwd; |
| 10 | + |
| 11 | +my $id_re = '\b(\d{4}-\d{5})\b'; |
| 12 | +my $read_re = '_R?([12])(?:_\d+)?.f'; |
| 13 | +my $in = ''; |
| 14 | +my $dir = '/mnt/seq/MDU/READS'; |
| 15 | +my $out = 'samples.tab'; |
| 16 | +my $longid = 0; |
| 17 | +my $verbose = 0; |
| 18 | + |
| 19 | +sub usage { |
| 20 | + print "$0 [--verbose] [--longid] [--fastqdir $dir] [--out $out] --in jobdetails.xlsx\n"; |
| 21 | + exit; |
| 22 | +} |
| 23 | + |
| 24 | +@ARGV or usage(); |
| 25 | + |
| 26 | +GetOptions( |
| 27 | + "help" => \&usage, |
| 28 | + "verbose!" => \&verbose, |
| 29 | + "in=s" => \$in, |
| 30 | + "fastqdir=s" => \$dir, |
| 31 | + "out=s" => \$out, |
| 32 | + "longid!" => \$longid, |
| 33 | + "id_regexp=s" => \$id_re, |
| 34 | + "read_regexp=s" => \$read_re, |
| 35 | +) |
| 36 | +or usage(); |
| 37 | + |
| 38 | +if (!$in and @ARGV > 0) { |
| 39 | + $in = shift @ARGV; |
| 40 | + print STDERR "Guessing --in $in\n"; |
| 41 | +} |
| 42 | + |
| 43 | +$in or die "need ID file with --in job.xls"; |
| 44 | +-r $in or die "can't read ID file '$in'"; |
| 45 | + |
| 46 | +$dir or die "need top-level folder containing FASTQ files with --dir"; |
| 47 | +-d $dir or die "--dir '$dir' is not a directory"; |
| 48 | + |
| 49 | +$out or die "need --out file to save results to"; |
| 50 | + |
| 51 | +# compile regexps |
| 52 | +$id_re = qr"$id_re"; |
| 53 | +$read_re = qr"$read_re"; |
| 54 | + |
| 55 | +print STDERR "Scanning '$in' for MDU sample IDs...\n"; |
| 56 | + |
| 57 | +my $book = ReadData( $in, cells=>0, strip=>3, attr=>1 ); |
| 58 | +#print Dumper($book); exit; |
| 59 | +my @row = Spreadsheet::Read::rows($book->[1]); |
| 60 | +my %id; |
| 61 | +for my $row (@row) { |
| 62 | + my $line = join(' ', grep { defined $_ } @$row); |
| 63 | + if ($line =~ $id_re) { |
| 64 | + my $ID = $1; |
| 65 | + $line =~ s/\s+/--/g; |
| 66 | + $line =~ s/[_-]+$//; |
| 67 | + $line =~ s/^[_-]+//; |
| 68 | + $id{$ID} = $line; |
| 69 | + } |
| 70 | +} |
| 71 | + |
| 72 | +printf STDERR "Found %d sample IDs:\n", scalar(keys %id); |
| 73 | + |
| 74 | +if (0 == keys %id) { |
| 75 | + print STDERR "ERROR: no IDs found in '$in'\n"; |
| 76 | + exit -1; |
| 77 | +} |
| 78 | + |
| 79 | +print STDERR map { "$_\n" } sort keys %id if $verbose; |
| 80 | + |
| 81 | +print STDERR "Scanning '$dir' for read files...\n"; |
| 82 | + |
| 83 | +my %want_id = (map { ($_ => 1) } keys %id); |
| 84 | +my %sample; |
| 85 | + |
| 86 | +open DIR, "find $dir -type f -name '*.f*q.gz' |"; |
| 87 | +while (my $file = <DIR>) { |
| 88 | + chomp $file; |
| 89 | + if ($file =~ $id_re and exists $want_id{$1}) { |
| 90 | + my $id = $1; |
| 91 | + my(undef, undef, $name) = File::Spec->splitpath($file); |
| 92 | + if ($name =~ $read_re) { |
| 93 | + my $read = $1; |
| 94 | +# print STDERR "$id $read\n"; |
| 95 | + $sample{$id}{$read} = abs_path($file); |
| 96 | + print STDERR "Found $id $read : $file\n" if $verbose; |
| 97 | + } |
| 98 | + else { |
| 99 | + print STDERR "WARNING: found $id but not $read_re in $name\n"; |
| 100 | + } |
| 101 | + } |
| 102 | +} |
| 103 | + |
| 104 | +print STDERR "Creating output file: $out\n"; |
| 105 | +open OUT, '>', $out; |
| 106 | + |
| 107 | +#use Data::Dumper; |
| 108 | +#print STDERR Dumper(\%id); |
| 109 | + |
| 110 | +for my $id (sort keys %id) { |
| 111 | + if (exists $sample{$id}{1} and exists $sample{$id}{2}) { |
| 112 | + my $label = $longid ? $id{$id} : $id; |
| 113 | + print STDERR "$id - both reads found, labelling as '$label'\n"; |
| 114 | + printf OUT join("\t", $label, $sample{$id}{1}, $sample{$id}{2})."\n"; |
| 115 | + } |
| 116 | + elsif (!exists $sample{$id}{1} and !exists $sample{$id}{2}) { |
| 117 | + print STDERR "$id - NO READ FOUNDS !!!\n"; |
| 118 | + } |
| 119 | + elsif (!exists $sample{$id}{1}) { |
| 120 | + print STDERR "$id - MISSING Read 1 FILE !!!\n"; |
| 121 | + } |
| 122 | + elsif (!exists $sample{$id}{2}) { |
| 123 | + print STDERR "$id - MISSING Read 2 FILE !!!\n"; |
| 124 | + } |
| 125 | + else { |
| 126 | + print STDERR "$id - THIS LINE SHOULD NEVER BE REACHED\n"; |
| 127 | + } |
| 128 | +} |
| 129 | + |
| 130 | +print STDERR "Result in '$out'\n"; |
| 131 | +print STDERR "Done.\n"; |
| 132 | + |
| 133 | +my $name = qx(basename `pwd`); |
| 134 | +chomp $name; |
| 135 | +my $cmd = "nullarbor.pl --name $name --outdir nullarbor --input samples.tab --cpus 4 --mlst FIXME --ref FIXME"; |
| 136 | + |
| 137 | +print STDERR "Your next command is probably this:\n$cmd\n"; |
| 138 | + |
| 139 | + |
0 commit comments