
Commit 5d5ddeb

Merge pull request #68 from seqeralabs/Parsing_move
Parsing move
2 parents dca815b + 9418321

File tree

31 files changed: 304 additions, 679 deletions


.gitpod.Dockerfile

Lines changed: 0 additions & 32 deletions
This file was deleted.

.gitpod.yml

Lines changed: 5 additions & 14 deletions
@@ -13,27 +13,18 @@ github:
     # add a "Review in Gitpod" button to pull requests (defaults to false)
     addBadge: false
 
-image:
-  file: .gitpod.Dockerfile
+# Old container: nfcore/gitpod:latest
+image: nfcore/gitpod:latest
 
 # List the start up tasks. Learn more https://www.gitpod.io/docs/config-start-tasks/
 tasks:
   - name: Start web server
     command: gp await-port 23000 && gp preview https://training.seqera.io
 
   - name: Download Nextflow Tutorial
-    init: |
-      echo 'init script' # runs during prebuild
-      echo 'start script'
-
 
     command: |
-      curl -s https://get.nextflow.io | bash
-      chmod +x nextflow
-      sudo mv nextflow /usr/local/bin/
-      docker pull nextflow/rnaseq-nf
-      sudo apt install -y tree
-      sudo apt install -y graphviz
-      unset JAVA_TOOL_OPTIONS
-      alias conda_activate=". /opt/conda/etc/profile.d/conda.sh; conda activate base"
       cd nf-training
+      conda init bash
+      unset JAVA_TOOL_OPTIONS
+      docker pull nextflow/rnaseq-nf

asciidocs/channels.adoc

Lines changed: 298 additions & 0 deletions
@@ -360,3 +360,301 @@ process fastqc {
}
----

=== Text files

The `splitText` operator allows you to split multi-line strings or text-file items emitted by a source channel into chunks of _n_ lines, which are emitted by the resulting channel. For example:

----
Channel
    .fromPath('data/meta/random.txt') // <1>
    .splitText() // <2>
    .view() // <3>
----

<1> Instructs Nextflow to make a channel from the path "data/meta/random.txt".
<2> The `splitText` operator splits each item into chunks of one line by default.
<3> View the contents of the channel.
You can define the number of lines in each chunk by using the parameter `by`, as shown in the following example:

----
Channel
    .fromPath('data/meta/random.txt')
    .splitText( by: 2 )
    .subscribe {
        print it;
        print "--- end of the chunk ---\n"
    }
----

TIP: The `subscribe` operator executes a user-defined function each time a new value is emitted by the source channel.
An optional closure can be specified in order to transform the text chunks produced by the operator. The following example shows how to split text files into chunks of 10 lines and transform them to uppercase:

----
Channel
    .fromPath('data/meta/random.txt')
    .splitText( by: 10 ) { it.toUpperCase() }
    .view()
----

You can also keep a count of the emitted lines:

----
count = 0

Channel
    .fromPath('data/meta/random.txt')
    .splitText()
    .view { "${count++}: ${it.toUpperCase().trim()}" }
----
Finally, you can also use the operator on plain files (outside of the channel context), as follows:

----
def f = file('data/meta/random.txt')
def lines = f.splitText()
def count = 0
for( String row : lines ) {
    log.info "${count++} ${row.toUpperCase()}"
}
----
=== Comma-separated values (.csv)

The `splitCsv` operator allows you to parse text items emitted by a channel that are formatted using the CSV format.

It then splits them into records or groups them into a list of records with a specified length.

In the simplest case, just apply the `splitCsv` operator to a channel emitting CSV-formatted text files or text entries. The following example shows how to view only the first and fourth columns of each row:

----
Channel
    .fromPath("data/meta/patients_1.csv")
    .splitCsv()
    // row is a list object
    .view { row -> "${row[0]},${row[3]}" }
----

When the CSV begins with a header line defining the column names, you can specify the parameter `header: true`, which allows you to reference each value by its column name, as shown in the following example:

----
Channel
    .fromPath("data/meta/patients_1.csv")
    .splitCsv(header: true)
    // row is a map keyed by the column names
    .view { row -> "${row.patient_id},${row.num_samples}" }
----

Alternatively, you can provide custom header names by specifying a list of strings in the `header` parameter, as shown below:

----
Channel
    .fromPath("data/meta/patients_1.csv")
    .splitCsv(header: ['col1', 'col2', 'col3', 'col4', 'col5'] )
    // row is a map keyed by the custom header names
    .view { row -> "${row.col1},${row.col4}" }
----

You can also process multiple CSV files at the same time:

----
Channel
    .fromPath("data/meta/patients_*.csv") // <-- just use a pattern
    .splitCsv(header: true)
    .view { row -> "${row.patient_id}\t${row.num_samples}" }
----

TIP: Notice that you can change the output format simply by using a different delimiter in the `view` closure.
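For instance, a minimal variation of the example above (a sketch, not part of the training scripts) prints the same fields separated by a semicolon instead of a tab:

----
Channel
    .fromPath("data/meta/patients_*.csv")
    .splitCsv(header: true)
    // same records as before, only the output delimiter changes
    .view { row -> "${row.patient_id};${row.num_samples}" }
----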
Finally, you can also operate on CSV files outside the channel context, as follows:

----
def f = file('data/meta/patients_1.csv')
def lines = f.splitCsv()
for( List row : lines ) {
    log.info "${row[0]} -- ${row[2]}"
}
----
[discrete]
=== Exercise

Try inputting FASTQ reads into the RNA-Seq workflow from earlier using `.splitCsv`.

.Click here for the answer:
[%collapsible]
====
Add a CSV text file named "fastq.csv" containing the following example input:

[source,nextflow,linenums]
----
gut,/workspace/nf-training-public/nf-training/data/ggal/gut_1.fq,/workspace/nf-training-public/nf-training/data/ggal/gut_2.fq
----

Then replace the input channel for the reads in `script7.nf`, changing the following lines:

[source,nextflow,linenums]
----
Channel
    .fromFilePairs( params.reads, checkIfExists: true )
    .into { read_pairs_ch; read_pairs2_ch }
----

To an input channel built with `splitCsv`:

[source,nextflow,linenums]
----
Channel
    .fromPath("fastq.csv")
    .splitCsv()
    .view { row -> "${row[0]},${row[1]},${row[2]}" }
    .into { read_pairs_ch; read_pairs2_ch }
----

Finally, change the cardinality of the processes that use the input data. For example, for the quantification process we change it from:

[source,nextflow,linenums]
----
process quantification {
    tag "$sample_id"

    input:
    path salmon_index from index_ch
    tuple val(sample_id), path(reads) from read_pairs_ch

    output:
    path sample_id into quant_ch

    script:
    """
    salmon quant --threads $task.cpus --libType=U -i $salmon_index -1 ${reads[0]} -2 ${reads[1]} -o $sample_id
    """
}
----

To:

[source,nextflow,linenums]
----
process quantification {
    tag "$sample_id"

    input:
    path salmon_index from index_ch
    tuple val(sample_id), path(reads1), path(reads2) from read_pairs_ch

    output:
    path sample_id into quant_ch

    script:
    """
    salmon quant --threads $task.cpus --libType=U -i $salmon_index -1 ${reads1} -2 ${reads2} -o $sample_id
    """
}
----

Repeat the same change for the fastqc step. The workflow should now run from a CSV file.
====
=== Tab-separated values (.tsv)

Parsing TSV files works in a similar way; just add the `sep: '\t'` option in the `splitCsv` context:

----
Channel
    .fromPath("data/meta/regions.tsv", checkIfExists: true)
    // use `sep` option to parse TAB separated files
    .splitCsv(sep: '\t')
    // row is a list object
    .view()
----

[discrete]
=== Exercise

Try using the tab-separation technique on the file "data/meta/regions.tsv", but print just the first column, and remove the header.

.Answer:
[%collapsible]
====
----
Channel
    .fromPath("data/meta/regions.tsv", checkIfExists: true)
    // use `sep` option to parse TAB separated files
    .splitCsv(sep: '\t', header: true)
    // row is a map keyed by the column names
    .view { row -> "${row.patient_id}" }
----
====
== More complex file formats

=== JSON

We can also easily parse the JSON file format using the following Groovy snippet:

----
import groovy.json.JsonSlurper

def f = file('data/meta/regions.json')
def records = new JsonSlurper().parse(f)

for( def entry : records ) {
    log.info "$entry.patient_id -- $entry.feature"
}
----

IMPORTANT: When using an older Groovy version, you may need to replace `parse(f)` with `parseText(f.text)`.
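As a sketch of that older form (same file as above, only the parsing call changes):

----
import groovy.json.JsonSlurper

def f = file('data/meta/regions.json')
// read the file contents as a string and parse the text instead
def records = new JsonSlurper().parseText(f.text)
----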
=== YAML

In a similar way, YAML files can be parsed like this:

----
import org.yaml.snakeyaml.Yaml

def f = file('data/meta/regions.json')
def records = new Yaml().load(f.text)

for( def entry : records ) {
    log.info "$entry.patient_id -- $entry.feature"
}
----
=== Storage of parsers into modules

The best way to store parser scripts is to keep them in a Nextflow module file.

This follows the DSL2 way of working.

See the following Nextflow script:

----
nextflow.preview.dsl=2

include { parseJsonFile } from './modules/parsers.nf'

process foo {
    input:
    tuple val(meta), path(data_file)

    """
    echo your_command $meta.region_id $data_file
    """
}

workflow {
    Channel.fromPath('data/meta/regions*.json') \
        | flatMap { parseJsonFile(it) } \
        | map { entry -> tuple(entry, "/some/data/${entry.patient_id}.txt") } \
        | foo
}
----

To get this script to work, we first need to create a file called `parsers.nf` and store it in the `modules` folder in the current directory.

The file should contain the `parseJsonFile` function; Nextflow will then use it as a custom function within the workflow scope.
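As a minimal sketch of what `parsers.nf` could contain, reusing the `JsonSlurper` approach shown earlier (the exact contents of the training material's module may differ):

----
import groovy.json.JsonSlurper

def parseJsonFile(json_file) {
    // parse the JSON file and return its records, so that the caller
    // can flatMap over the individual entries
    def records = new JsonSlurper().parse(json_file)
    return records
}
----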

asciidocs/containers.adoc

Lines changed: 1 addition & 6 deletions
@@ -381,12 +381,7 @@ Conda is a popular package and environment manager. The built-in support for Conda
 allows Nextflow pipelines to automatically create and activate the Conda
 environment(s), given the dependencies specified by each process.
 
-For this Gitpod tutorial you need to activate conda by typing:
-
-
-```bash
-conda_activate
-```
+For this Gitpod tutorial you need to open a new terminal to ensure that conda is activated (see the + button on the terminal).
 
 You should now see that the beginning of your command prompt has `(base)` written. If you wish to deactivate conda at any point, simply enter `conda deactivate`.

asciidocs/index.adoc

Lines changed: 0 additions & 1 deletion
@@ -30,5 +30,4 @@ include::config.adoc[]
 include::executors.adoc[]
 include::cache_and_resume.adoc[]
 include::debugging.adoc[]
-include::parsing.adoc[]
 :leveloffset: -1
