Skip to content

Commit

Permalink
feat: Improve datavzrd tables (#93)
Browse files Browse the repository at this point in the history
Co-authored-by: Addimator <adpri100hhu.de>
Co-authored-by: David Laehnemann <[email protected]>
Co-authored-by: Johannes Köster <[email protected]>
Co-authored-by: Johannes Koester <[email protected]>
  • Loading branch information
4 people authored Aug 15, 2024
1 parent bae88d0 commit 93512b8
Show file tree
Hide file tree
Showing 37 changed files with 1,733 additions and 127 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
!config/units.tsv
!LICENSE
!README.md
local/*
resources
resources/*
results
Expand Down
25 changes: 24 additions & 1 deletion .test/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,25 @@ resources:
# ensembl species name
species: homo_sapiens
# ensembl release version
release: "104"
release: "112"
# genome build
build: GRCh38
# pfam release to use for annotation of domains in differential splicing analysis
pfam: "33.0"
# Choose strategy for selecting representative transcripts for each gene.
# Possible values:
# - canonical (use the canonical transcript from ensembl, only works for human at the moment)
# - mostsignificant (use the most significant transcript)
# - path/to/any/file.txt (a path to a file with ensembl transcript IDs to use;
# the user has to ensure that there is only one ID per gene given)
representative_transcripts: canonical
ontology:
# gene ontology to download, used e.g. in goatools
gene_ontology: "http://current.geneontology.org/ontology/go-basic.obo"

pca:
# If set to true, samples with NA values in the specified covariate column will be removed for PCA computation.
exclude_nas: false
labels:
# columns of sample sheet to use for PCA
- condition
Expand Down Expand Up @@ -96,6 +104,21 @@ enrichment:
# the species specified by resources -> ref -> species above
pathway_database: "panther"

meta_comparisons:
# comparison is only run if set to `true`
activate: false
# Define the comparisons of interest here
comparisons:
# Choose any name for the comparison. You can add as many comparisons as you want.
model_X_vs_model_Y:
items:
# Define the two underlying models for the comparison. The models must be defined under diffexp -> models in the config.
# items must be of form <arbitrary label>: <existing diffexp model from config>
X: model_X
Y: model_Y
# Define label for datavzrd report
label: model X vs. model Y

report:
# make this `true`, to get excel files for download in the snakemake
# report, BUT: this can drastically increase the runtime of datavzrd report
Expand Down
25 changes: 23 additions & 2 deletions .test/three_prime/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ experiment:
vendor: lexogen
plot-qc: all



resources:
ref:
# ensembl species name
Expand All @@ -23,12 +21,20 @@ resources:
build: GRCh38
# pfam release to use for annotation of domains in differential splicing analysis
pfam: "33.0"
# Choose strategy for selecting representative transcripts for each gene.
# Possible values:
# - canonical (use the canonical transcript from ensembl, only works for human at the moment)
# - mostsignificant (use the most significant transcript)
# - path/to/any/file.txt (a path to a file with ensembl transcript IDs to use;
# the user has to ensure that there is only one ID per gene given)
representative_transcripts: canonical
ontology:
# gene ontology to download, used e.g. in goatools
gene_ontology: "http://current.geneontology.org/ontology/go-basic.obo"

pca:
# If set to true, samples with NA values in the specified covariate column will be removed for the PCA computation.
exclude_nas: false
labels:
# columns of sample sheet to use for PCA
- condition
Expand Down Expand Up @@ -98,6 +104,21 @@ enrichment:
# the species specified by resources -> ref -> species above
pathway_database: "panther"

meta_comparisons:
# comparison is only run if set to `true`
activate: false
# Define the comparisons of interest here
comparisons:
# Choose any name for the comparison. You can add as many comparisons as you want.
model_X_vs_model_Y:
items:
# Define the two underlying models for the comparison. The models must be defined under diffexp -> models in the config.
# items must be of form <arbitrary label>: <existing diffexp model from config>
X: model_X
Y: model_Y
# Define label for datavzrd report
label: model X vs. model Y

bootstrap_plots:
# desired false discovery rate for bootstrap plots, i.e. a lower FDR will result in fewer boxplots generated
FDR: 0.01
Expand Down
6 changes: 6 additions & 0 deletions config/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,9 @@ Changes to the recommendations are motivated as follows:
* `-a "r1adapter=A{18}AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=3;max_error_rate=0.100000"`: We remove A{18}, as this is handled by `--poly-a`. We increase `min_overlap` to 7 and set the `max_error_rate` to the Illumina error rate of about 0.005, both to avoid spurious adapter matches being removed.
* `-g "r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=20"`: This is not needed any more, as `-a` option will lead to complete removal of read sequence if adapter is found at the start of the read, see: https://cutadapt.readthedocs.io/en/stable/guide.html#rightmost
* `--discard-trimmed`: We omit this, as the `-a` with the adapter sequence will lead to complete read sequence removal if adapter is found at start, and the `--minimum-length` will then discard such empty reads.

### meta comparisons
Meta comparisons allow for comparing two full models against each other.
The axes represent the log2-fold changes (beta-scores) for the two models, with each point representing a gene.
Points on the diagonal indicate no difference between the comparisons, while deviations from the diagonal suggest differences in gene expression between the treatments.
For more details see the comments in the `config.yaml`.
25 changes: 23 additions & 2 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ experiment:
# this allows plotting QC of aligned read position for specific transcripts (or 'all' transcripts)
plot-qc: all



resources:
ref:
# ensembl species name
Expand All @@ -35,6 +33,8 @@ resources:
gene_ontology: "http://current.geneontology.org/ontology/go-basic.obo"

pca:
# If set to true, samples with NA values in the specified covariate column will be removed for PCA computation.
exclude_nas: false
labels:
# columns of sample sheet to use for PCA
- condition
Expand Down Expand Up @@ -105,6 +105,27 @@ enrichment:
# the species specified by resources -> ref -> species above
pathway_database: "reactome"

meta_comparisons:
# comparison is only run if set to `true`
activate: false
# Define the comparisons of interest here
comparisons:
# Choose any name for the comparison. You can add as many comparisons as you want.
model_X_vs_model_Y:
items:
# Define the two underlying models for the comparison. The models must be defined under diffexp -> models in the config.
# items must be of form <arbitrary label for plot-axis>: <existing diffexp model from config>
X: model_X
Y: model_Y
# Define label for datavzrd report
label: model X vs. model Y

report:
# make this `true`, to get excel files for download in the snakemake
# report, BUT: this can drastically increase the runtime of datavzrd report
# generation, especially on larger cohorts
offer_excel: false

bootstrap_plots:
# desired false discovery rate for bootstrap plots, i.e. a lower FDR will result in fewer boxplots generated
FDR: 0.01
Expand Down
2 changes: 1 addition & 1 deletion config/samples.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ sample condition batch_effect
A treated batch1
B untreated batch1
C treated batch2
D untreated batch2
D untreated batch2
2 changes: 1 addition & 1 deletion config/units.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ A 1 raw/a.chr21.1.fq raw/a.chr21.2.fq
B 1 raw/b.chr21.1.fq raw/b.chr21.2.fq
B 2 300 14 raw/b.chr21.1.fq
C 1 raw/a.chr21.1.fq raw/a.chr21.2.fq
D 1 raw/b.chr21.1.fq raw/b.chr21.2.fq
D 1 raw/b.chr21.1.fq raw/b.chr21.2.fq
1 change: 1 addition & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ include: "rules/diffsplice.smk"
include: "rules/enrichment.smk"
include: "rules/datavzrd.smk"
include: "rules/bam.smk"
include: "rules/meta_comparisons.smk"


rule all:
Expand Down
4 changes: 4 additions & 0 deletions workflow/envs/pandas.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Conda environment pinning pandas 2.2.1.
channels:
  - conda-forge
  # Keep the implicit `defaults` channel out of the solve so that all
  # packages are resolved from conda-forge only.
  - nodefaults
dependencies:
  - pandas =2.2.1
4 changes: 4 additions & 0 deletions workflow/envs/polars.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Conda environment pinning polars 1.2.1.
channels:
  - conda-forge
  # Keep the implicit `defaults` channel out of the solve so that all
  # packages are resolved from conda-forge only.
  - nodefaults
dependencies:
  - polars =1.2.1
8 changes: 8 additions & 0 deletions workflow/envs/pystats.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Conda environment bundling polars, pyreadr, altair and pyarrow.
channels:
  - conda-forge
  # `nodefaults` excludes the implicit `defaults` channel from the solve.
  - nodefaults
dependencies:
  - polars =0.20.28
  - pyreadr =0.5
  - altair =5.2
  - pyarrow =16.1
6 changes: 6 additions & 0 deletions workflow/report/meta_compare.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Meta comparisons for {{ snakemake.wildcards.meta_comp }}.
The axes represent the log2-fold changes (beta-scores) for the two models, with each point representing a gene.
Points on the diagonal indicate no difference between the comparisons, while deviations from the diagonal suggest differences in gene expression between the treatments.
The color encodes the corresponding q-value.
Clicking on a point displays its label.
Holding the Shift key allows selecting or deselecting labels for multiple genes.
2 changes: 2 additions & 0 deletions workflow/report/units.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Unit sheet containing all considered units, which can be multiple units for a single sample (for example, when the same biological sample was sequenced across multiple lanes and demultiplexed into separate lane-specific fastq files).
The annotations in this file determine how the workflow internally handles units.
2 changes: 1 addition & 1 deletion workflow/report/workflow.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
After adapter removal with `Cutadapt <http://cutadapt.readthedocs.io>`_, transcripts were quantified with `Kallisto <https://pachterlab.github.io/kallisto/>`_.
Integrated normalization and differential expression analysis was conducted with `Sleuth <https://pachterlab.github.io/sleuth>`_ following standard procedure as outlined in the manual.
For sample metadata, see {{ snakemake.config["samples"] }}_.
For sample metadata, see the samples file in `Inputs`_.
99 changes: 99 additions & 0 deletions workflow/resources/custom_vega_plots/circle_diagram_genes.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "width": 35,
  "height": 35,
  "data": {"values": []},
  "layer": [
    {
      "mark": "arc",
      "encoding": {
        "theta": {"field": "amount", "type": "quantitative"},
        "color": {
          "field": "category",
          "type": "nominal",
          "scale": {
            "domain": ["DE_genes", "genes"],
            "range": ["#f2e34c", "#31a354"]
          },
          "legend": null
        },
        "tooltip": [
          {"field": "category", "type": "nominal"},
          {"field": "amount", "type": "quantitative"}
        ]
      }
    },
    {
      "mark": {
        "type": "text",
        "baseline": "middle",
        "align": "center",
        "dx": 2,
        "fontSize": 9,
        "color": "white"
      },
      "encoding": {
        "text": {
          "field": "percentage",
          "type": "quantitative",
          "format": "0.2%"
        }
      }
    },
    {
      "transform": [
        {
          "pivot": "category",
          "value": "amount",
          "groupby": ["percentage"]
        }
      ],
      "mark": "rule",
      "encoding": {
        "tooltip": [
          {"field": "genes", "type": "nominal"},
          {"field": "DE_genes", "type": "quantitative"}
        ]
      },
      "params": [
        {
          "name": "hover",
          "select": {
            "type": "point",
            "fields": ["percentage"],
            "nearest": true,
            "on": "mouseover",
            "clear": "mouseout"
          }
        }
      ]
    }
  ]
}
72 changes: 72 additions & 0 deletions workflow/resources/custom_vega_plots/horizontal_bars.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "width": 35,
  "height": 35,
  "data": {
    "values": []
  },
  "transform": [
    {
      "calculate": "datum.category === 'ratio_differential' ? 'differential' : 'random'",
      "as": "tooltip_text"
    }
  ],
  "encoding": {
    "y": {
      "field": "category",
      "type": "nominal",
      "axis": null,
      "sort": [
        "ratio_differential",
        "ratio_size"
      ]
    },
    "x": {
      "field": "amount",
      "type": "quantitative",
      "axis": null
    },
    "tooltip": {
      "field": "tooltip_text",
      "type": "nominal"
    }
  },
  "layer": [
    {
      "mark": {
        "type": "bar",
        "tooltip": true
      }
    },
    {
      "mark": {
        "type": "text",
        "align": "left",
        "dx": 2,
        "dy": -2,
        "color": "black",
        "fontSize": 10
      },
      "encoding": {
        "text": {
          "field": "amount",
          "type": "quantitative",
          "format": ".3f"
        }
      }
    }
  ],
  "config": {
    "style": {
      "cell": {
        "stroke": "transparent"
      }
    }
  }
}
Loading

0 comments on commit 93512b8

Please sign in to comment.