-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmain.nf
More file actions
160 lines (124 loc) · 4.23 KB
/
main.nf
File metadata and controls
160 lines (124 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
workflow {
    // Fail fast: the R and Python quantile normalizations are mutually
    // exclusive. Validate the configuration before wiring up any stage so
    // an invalid run aborts before convert/normalize are ever invoked.
    if ( params.normalize_quantile_py == true && params.normalize_quantile_r == true ) {
        error "error: only one quantile method (R or python) should be enabled"
    }

    // Helper: build a (dataset, file) channel from a glob under the input
    // directory. size: 1 / flat: true yields flat (id, file) tuples.
    load_channel = { pattern ->
        Channel.fromFilePairs("${params.input_dir}/${pattern}", size: 1, flat: true)
    }

    // Load each class of input expression matrix plus the sample labels.
    fpkm_txt_files = load_channel(params.fpkm_txt)
    raw_txt_files  = load_channel(params.raw_txt)
    tpm_txt_files  = load_channel(params.tpm_txt)
    emx_txt_files  = load_channel(params.emx_txt)
    labels_files   = load_channel(params.labels_txt)

    // Merge all expression-matrix channels into one (dataset, file) stream;
    // every downstream stage consumes this combined channel.
    data_txt_files = Channel.empty().mix(
        fpkm_txt_files,
        raw_txt_files,
        tpm_txt_files,
        emx_txt_files
    )

    // Each stage below runs only when its corresponding flag is enabled.

    // convert plaintext matrices to binary (.npy)
    if ( params.convert_txt_npy == true ) {
        convert_txt_npy(data_txt_files)
    }

    // normalize expression matrices
    if ( params.normalize == true ) {
        normalize(data_txt_files)
    }

    // visualize matrices; requires the labels file joined by dataset key
    if ( params.visualize == true ) {
        inputs = data_txt_files.join(labels_files)
        visualize(inputs)
    }

    // partition matrices into sub-matrices
    if ( params.partition == true ) {
        partition(data_txt_files)
    }
}
/**
 * The convert process takes an expression matrix and converts it from
 * plaintext to binary (numpy) format.
 *
 * Input:  tuple of (dataset name, plaintext matrix file)
 * Output: tuple of (dataset, *.npy matrix, *.rownames.txt, *.colnames.txt)
 *         NOTE(review): the rownames/colnames sidecar files are assumed to
 *         be emitted by convert.py — confirm against that script.
 */
process convert_txt_npy {
    tag "${dataset}"
    publishDir "${params.output_dir}/${dataset}"

    input:
    tuple val(dataset), path(input_file)

    output:
    tuple val(dataset), path("*.npy"), path("*.rownames.txt"), path("*.colnames.txt")

    script:
    // Use Nextflow's baseName (file name minus its final extension) instead
    // of shelling out to `basename ... .txt`; for the .txt inputs this
    // pipeline supplies, both produce the same <name>.npy target.
    """
    convert.py ${input_file} ${input_file.baseName}.npy
    """
}
/**
 * The normalize process takes an FPKM expression matrix and applies a series
 * of transformations (log2, k-s test outlier removal, quantile normalization)
 * which produces a normalized expression matrix.
 *
 * Input:  tuple of (dataset name, expression matrix file)
 * Output: tuple of (dataset, <dataset>.emx.txt)    -- normalized matrix
 *         tuple of (dataset, <dataset>.kstest.txt) -- K-S test log
 */
process normalize {
    tag "${dataset}"
    publishDir "${params.output_dir}/${dataset}"
    input:
    tuple val(dataset), path(input_file)
    output:
    tuple val(dataset), path("${dataset}.emx.txt")
    tuple val(dataset), path("${dataset}.kstest.txt")
    // normalize.py runs under MPI (normalize_np ranks); the --log2, --kstest
    // and --quantile flags are included only when their params are truthy.
    // If normalize_quantile_r is enabled, quantile normalization is instead
    // performed by normalize_R, which appears to read FPKM.txt and write
    // GEM.txt with fixed names — hence the mv shuffle around it
    // (NOTE(review): confirm against the normalize_R script).
    // The workflow rejects enabling both quantile methods at once.
    script:
    """
    mpirun -np ${params.normalize_np} normalize.py \
    ${input_file} \
    ${dataset}.emx.txt \
    ${params.normalize_log2 ? "--log2" : ""} \
    ${params.normalize_kstest ? "--kstest" : ""} \
    --ks-log ${dataset}.kstest.txt \
    ${params.normalize_quantile_py ? "--quantile" : ""}
    if [[ ${params.normalize_quantile_r} == true ]]; then
    mv ${dataset}.emx.txt FPKM.txt
    normalize_R --quantile
    mv GEM.txt ${dataset}.emx.txt
    fi
    """
}
/**
 * The visualize process takes an expression matrix and produces a set of
 * visualizations based on the input configuration.
 *
 * Input:  tuple of (dataset name, matrix file, labels file)
 * Output: tuple of (dataset, *.png) -- whichever plots were enabled
 */
process visualize {
    tag "${dataset}"
    publishDir "${params.output_dir}/${dataset}"
    input:
    tuple val(dataset), path(data_file), path(labels_file)
    output:
    tuple val(dataset), path("*.png")
    // --density and --tsne are emitted only when their params are truthy;
    // --tsne-na / --tsne-npca are always passed (NOTE(review): presumably
    // ignored by visualize.py when --tsne is absent — confirm).
    script:
    """
    visualize.py \
    ${data_file} \
    --labels ${labels_file} \
    ${params.visualize_density ? "--density density.png" : ""} \
    ${params.visualize_tsne ? "--tsne tsne.png" : ""} \
    --tsne-na ${params.visualize_tsne_na} \
    --tsne-npca ${params.visualize_tsne_npca}
    """
}
/**
 * The partition process takes an expression matrix and produces several
 * sub-matrices based on a partitioning scheme.
 *
 * Input:  tuple of (dataset name, expression matrix file)
 * Output: tuple of (dataset, *.txt) -- partitions.txt plus any sub-matrix
 *         files written by partition.py (NOTE(review): the *.txt glob also
 *         matches inputs staged with a .txt suffix — confirm intended).
 */
process partition {
    tag "${dataset}"
    publishDir "${params.output_dir}/${dataset}"
    input:
    tuple val(dataset), path(input_file)
    output:
    tuple val(dataset), path("*.txt")
    // Partition count and method are taken straight from pipeline params.
    script:
    """
    partition.py \
    ${input_file} \
    partitions.txt \
    --n-partitions ${params.partition_npartitions} \
    --method ${params.partition_method}
    """
}