Skip to content

Commit affdf5d

Browse files
committed
Add some helper scripts (only useful to MDU-PHL really)
1 parent 4a97a7e commit affdf5d

File tree

2 files changed

+192
-0
lines changed

2 files changed

+192
-0
lines changed
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
#!/usr/bin/env perl
2+
use strict;
3+
use warnings;
4+
use Getopt::Long;
5+
use File::Spec;
6+
use Cwd qw(abs_path);
7+
use Spreadsheet::Read;
8+
use Data::Dumper;
9+
use Cwd;
10+
11+
my $id_re = '\b(\d{4}-\d{5})\b';
12+
my $read_re = '_R?([12])(?:_\d+)?.f';
13+
my $in = '';
14+
my $dir = '/mnt/seq/MDU/READS';
15+
my $out = 'samples.tab';
16+
my $longid = 0;
17+
my $verbose = 0;
18+
19+
sub usage {
20+
print "$0 [--verbose] [--longid] [--fastqdir $dir] [--out $out] --in jobdetails.xlsx\n";
21+
exit;
22+
}
23+
24+
@ARGV or usage();
25+
26+
GetOptions(
27+
"help" => \&usage,
28+
"verbose!" => \&verbose,
29+
"in=s" => \$in,
30+
"fastqdir=s" => \$dir,
31+
"out=s" => \$out,
32+
"longid!" => \$longid,
33+
"id_regexp=s" => \$id_re,
34+
"read_regexp=s" => \$read_re,
35+
)
36+
or usage();
37+
38+
if (!$in and @ARGV > 0) {
39+
$in = shift @ARGV;
40+
print STDERR "Guessing --in $in\n";
41+
}
42+
43+
$in or die "need ID file with --in job.xls";
44+
-r $in or die "can't read ID file '$in'";
45+
46+
$dir or die "need top-level folder containing FASTQ files with --dir";
47+
-d $dir or die "--dir '$dir' is not a directory";
48+
49+
$out or die "need --out file to save results to";
50+
51+
# compile regexps
52+
$id_re = qr"$id_re";
53+
$read_re = qr"$read_re";
54+
55+
print STDERR "Scanning '$in' for MDU sample IDs...\n";
56+
57+
my $book = ReadData( $in, cells=>0, strip=>3, attr=>1 );
58+
#print Dumper($book); exit;
59+
my @row = Spreadsheet::Read::rows($book->[1]);
60+
my %id;
61+
for my $row (@row) {
62+
my $line = join(' ', grep { defined $_ } @$row);
63+
if ($line =~ $id_re) {
64+
my $ID = $1;
65+
$line =~ s/\s+/--/g;
66+
$line =~ s/[_-]+$//;
67+
$line =~ s/^[_-]+//;
68+
$id{$ID} = $line;
69+
}
70+
}
71+
72+
printf STDERR "Found %d sample IDs:\n", scalar(keys %id);
73+
74+
if (0 == keys %id) {
75+
print STDERR "ERROR: no IDs found in '$in'\n";
76+
exit -1;
77+
}
78+
79+
print STDERR map { "$_\n" } sort keys %id if $verbose;
80+
81+
print STDERR "Scanning '$dir' for read files...\n";
82+
83+
my %want_id = (map { ($_ => 1) } keys %id);
84+
my %sample;
85+
86+
open DIR, "find $dir -type f -name '*.f*q.gz' |";
87+
while (my $file = <DIR>) {
88+
chomp $file;
89+
if ($file =~ $id_re and exists $want_id{$1}) {
90+
my $id = $1;
91+
my(undef, undef, $name) = File::Spec->splitpath($file);
92+
if ($name =~ $read_re) {
93+
my $read = $1;
94+
# print STDERR "$id $read\n";
95+
$sample{$id}{$read} = abs_path($file);
96+
print STDERR "Found $id $read : $file\n" if $verbose;
97+
}
98+
else {
99+
print STDERR "WARNING: found $id but not $read_re in $name\n";
100+
}
101+
}
102+
}
103+
104+
print STDERR "Creating output file: $out\n";
105+
open OUT, '>', $out;
106+
107+
#use Data::Dumper;
108+
#print STDERR Dumper(\%id);
109+
110+
for my $id (sort keys %id) {
111+
if (exists $sample{$id}{1} and exists $sample{$id}{2}) {
112+
my $label = $longid ? $id{$id} : $id;
113+
print STDERR "$id - both reads found, labelling as '$label'\n";
114+
printf OUT join("\t", $label, $sample{$id}{1}, $sample{$id}{2})."\n";
115+
}
116+
elsif (!exists $sample{$id}{1} and !exists $sample{$id}{2}) {
117+
print STDERR "$id - NO READ FOUNDS !!!\n";
118+
}
119+
elsif (!exists $sample{$id}{1}) {
120+
print STDERR "$id - MISSING Read 1 FILE !!!\n";
121+
}
122+
elsif (!exists $sample{$id}{2}) {
123+
print STDERR "$id - MISSING Read 2 FILE !!!\n";
124+
}
125+
else {
126+
print STDERR "$id - THIS LINE SHOULD NEVER BE REACHED\n";
127+
}
128+
}
129+
130+
print STDERR "Result in '$out'\n";
131+
print STDERR "Done.\n";
132+
133+
my $name = qx(basename `pwd`);
134+
chomp $name;
135+
my $cmd = "nullarbor.pl --name $name --outdir nullarbor --input samples.tab --cpus 4 --mlst FIXME --ref FIXME";
136+
137+
print STDERR "Your next command is probably this:\n$cmd\n";
138+
139+

scripts/xlsx2tsv.pl

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/usr/bin/env perl
2+
use warnings;
3+
use strict;
4+
use Spreadsheet::Read;
5+
use Data::Dumper;
6+
7+
my(@Options, $verbose, $sep, $informat);
8+
setOptions();
9+
10+
my $book = ReadData( $ARGV[0], cells=>0, strip=>1 );
11+
12+
my @row = Spreadsheet::Read::rows($book->[1]);
13+
14+
for my $row (@row) {
15+
my @r = map { $_ || '' } @$row;
16+
print join($sep, @r),"\n";
17+
}
18+
19+
#----------------------------------------------------------------------
20+
# Option setting routines
21+
22+
sub setOptions {
23+
use Getopt::Long;
24+
25+
@Options = (
26+
{OPT=>"help", VAR=>\&usage, DESC=>"This help"},
27+
{OPT=>"verbose!", VAR=>\$verbose, DEFAULT=>0, DESC=>"Verbose output"},
28+
# {OPT=>"informat=s", VAR=>\$informat, DEFAULT=>"xlsx", DESC=>"Input format: xls xlsx csv ods sxc"},
29+
{OPT=>"sep=s", VAR=>\$sep, DEFAULT=>"\t", DESC=>"Output separator"},
30+
);
31+
32+
#(!@ARGV) && (usage());
33+
34+
&GetOptions(map {$_->{OPT}, $_->{VAR}} @Options) || usage();
35+
36+
# Now setup default values.
37+
foreach (@Options) {
38+
if (defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
39+
${$_->{VAR}} = $_->{DEFAULT};
40+
}
41+
}
42+
}
43+
44+
sub usage {
45+
print "Usage: $0 [options] [<] file.xlsx > file.csv\n";
46+
foreach (@Options) {
47+
printf " --%-13s %s%s.\n",$_->{OPT},$_->{DESC},
48+
defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
49+
}
50+
exit(1);
51+
}
52+
53+
#----------------------------------------------------------------------

0 commit comments

Comments
 (0)