genialis
diff --git a/‎.github/dependabot.yml
Lines changed: 1 addition & 1 deletion b/‎.github/dependabot.yml
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/build.yml
Lines changed: 4 additions & 2 deletions b/‎.github/workflows/build.yml
Lines changed: 4 additions & 2 deletions
diff --git a/‎MANIFEST.in
Lines changed: 2 additions & 2 deletions b/‎MANIFEST.in
Lines changed: 2 additions & 2 deletions
diff --git a/‎README.rst
Lines changed: 75 additions & 23 deletions b/‎README.rst
Lines changed: 75 additions & 23 deletions
diff --git a/‎docs/changelog.rst
Lines changed: 16 additions & 25 deletions b/‎docs/changelog.rst
Lines changed: 16 additions & 25 deletions
diff --git a/‎docs/conf.py
Lines changed: 1 addition & 1 deletion b/‎docs/conf.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/contributing.rst
Lines changed: 1 addition & 1 deletion b/‎docs/contributing.rst
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/guide.rst
Lines changed: 0 additions & 5 deletions b/‎docs/guide.rst
Lines changed: 0 additions & 5 deletions
diff --git a/‎docs/index.rst
Lines changed: 2 additions & 3 deletions b/‎docs/index.rst
Lines changed: 2 additions & 3 deletions
diff --git a/‎docs/ref.rst
Lines changed: 1 addition & 2 deletions b/‎docs/ref.rst
Lines changed: 1 addition & 2 deletions
diff --git a/‎pyproject.toml
Lines changed: 7 additions & 3 deletions b/‎pyproject.toml
Lines changed: 7 additions & 3 deletions
diff --git a/‎src/rnanorm/annotation.py
Lines changed: 1 addition & 1 deletion b/‎src/rnanorm/annotation.py
Lines changed: 1 addition & 1 deletion
@@ -7,4 +7,4 @@ updates:
     directory: "/"
     schedule:
       # Check for updates to GitHub Actions every week
-      interval: "weekly"
+      interval: "weekly"
@@ -1,4 +1,4 @@
-name: RNAseq normalization CI
+name: RNA-seq normalization CI
 on:
   push:
     branches:
@@ -13,14 +13,16 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        tox-env: [py38, py39, py310, docs, linters, package]
+        tox-env: [py38, py39, py310, py311, docs, linters, package]
         include:
           - tox-env: py38
             python-version: 3.8
           - tox-env: py39
             python-version: 3.9
           - tox-env: py310
             python-version: "3.10"
+          - tox-env: py311
+            python-version: "3.11"
           - tox-env: docs
             python-version: "3.10"
           - tox-env: linters
 
@@ -2,5 +2,5 @@ include tox.ini
 include .readthedocs.yaml
 recursive-include docs *.py *.rst
 recursive-include tests *.py
-recursive-include src/rnanorm/files *.csv.gz *.gtf.gz
-recursive-include tests/files *.tsv
+recursive-include src/rnanorm/files *.csv.gz *.gtf.gz *.csv *.gtf
+recursive-include tests/files *.tsv
@@ -1,6 +1,6 @@
-====================
-RNAseq normalization
-====================
+=====================
+RNA-seq normalization
+=====================
 
 |build| |black| |docs| |pypi_version| |pypi_pyversions| |pypi_downloads|
 
@@ -29,7 +29,7 @@ RNAseq normalization
     :alt: Number of downloads from PyPI
 
 
-Python implementation of common RNAseq normalization methods:
+Python implementation of common RNA-seq normalization methods:
 
 - CPM (Counts per million)
 - FPKM_ (Fragments per kilobase million)
@@ -39,22 +39,25 @@ Python implementation of common RNAseq normalization methods:
 - TMM_ (Trimmed mean of M-values)
 - CTF_ (Counts adjusted with TMM factors)
 
+For an in-depth description of the methods, see the documentation_.
 
 .. _FPKM: https://www.nature.com/articles/nmeth.1226
 .. _TPM: https://link.springer.com/article/10.1007/s12064-012-0162-3
 .. _UQ: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-94
 .. _CUF: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9/
 .. _TMM: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25
 .. _CTF: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9/
+.. _documentation: https://rnanorm.readthedocs.io/
+
 
 Features
 ========
 
 - Pure Python implementation (no need for R, etc.)
-- Scikit-learn_ compatible
+- Compatible with Scikit-learn_
 - Command line interface
-- Verbose documentation_ (at least we hope so...)
-- Tested methods
+- Verbose documentation_
+- Validated method implementation
 
 
 .. _Scikit-learn: https://scikit-learn.org/
@@ -72,37 +75,86 @@ We recommend installing RNAnorm with pip::
 Quick start
 ===========
 
-Implemented methods can be used from Python or from the command line.
+The implemented methods can be executed from Python or from the command line.
 
 Normalize from Python
 ---------------------
 
-Most commonly normalization methods are run from Python. E.g.::
+The most common use case is to run normalization from Python::
 
-   >>> from rnanorm.datasets import load_rnaseq_toy
+   >>> from rnanorm.datasets import load_toy_data
    >>> from rnanorm import FPKM
-   >>> dataset = load_rnaseq_toy()
+   >>> dataset = load_toy_data()
+   >>> # Expressions need to have genes in columns and samples in rows
    >>> dataset.exp
-          G1     G2      G3      G4       G5
-   S1  200.0  300.0   500.0  2000.0   7000.0
-   S2  400.0  600.0  1000.0  4000.0  14000.0
-   S3  200.0  300.0   500.0  2000.0  17000.0
-   S4  200.0  300.0   500.0  2000.0   2000.0
+             Gene_1  Gene_2  Gene_3  Gene_4  Gene_5
+   Sample_1     200     300     500    2000    7000
+   Sample_2     400     600    1000    4000   14000
+   Sample_3     200     300     500    2000   17000
+   Sample_4     200     300     500    2000    2000
    >>> fpkm = FPKM(dataset.gtf_path).set_output(transform="pandas")
    >>> fpkm.fit_transform(dataset.exp)
-             G1        G2        G3        G4        G5
-   S1  100000.0  100000.0  100000.0  200000.0  700000.0
-   S2  100000.0  100000.0  100000.0  200000.0  700000.0
-   S3   50000.0   50000.0   50000.0  100000.0  850000.0
-   S4  200000.0  200000.0  200000.0  400000.0  400000.0
+                Gene_1    Gene_2    Gene_3    Gene_4    Gene_5
+   Sample_1   100000.0  100000.0  100000.0  200000.0  700000.0
+   Sample_2   100000.0  100000.0  100000.0  200000.0  700000.0
+   Sample_3    50000.0   50000.0   50000.0  100000.0  850000.0
+   Sample_4   200000.0  200000.0  200000.0  400000.0  400000.0
 
 
 Normalize from command line
 ---------------------------
 
-Often it is handy to do normalization from the command line::
+Normalization from the command line is also supported. To list available
+methods and general help::
+
+    rnanorm --help
+
+Get info about a particular method, e.g., CPM::
+
+    rnanorm cpm --help
+
+To normalize with CPM::
+
+   rnanorm cpm exp.csv --out exp_cpm.csv
+
+File ``exp.csv`` needs to be a comma-separated file with genes in columns and
+samples in rows. Values should be raw counts. The output is saved to
+``exp_cpm.csv``. Example of input file::
+
+    cat exp.csv
+    ,Gene_1,Gene_2,Gene_3,Gene_4,Gene_5
+    Sample_1,200,300,500,2000,7000
+    Sample_2,400,600,1000,4000,14000
+    Sample_3,200,300,500,2000,17000
+    Sample_4,200,300,500,2000,2000
+
+One can also provide input through standard input::
+
+   cat exp.csv | rnanorm cpm --out exp_cpm.csv
+
+If the file specified with ``--out`` already exists, the command will fail. If
+you are sure that you wish to overwrite it, use the ``--force`` flag::
+
+   cat exp.csv | rnanorm cpm --force --out exp_cpm.csv
+
+If no file is specified with the ``--out`` parameter, the output is printed to
+standard output::
+
+   cat exp.csv | rnanorm cpm > exp_cpm.csv
+
+Methods TPM and FPKM require gene lengths. These can be provided either with a
+GTF_ file or with a "gene lengths" file. The latter is a two-column file. The
+first column should include the genes in the header of ``exp.csv`` and the
+second column should contain gene lengths computed by the union exon model::
+
+    # Use GTF file
+    rnanorm tpm exp.csv --gtf annotations.gtf > exp_out.csv
+    # Use gene lengths file
+    rnanorm tpm exp.csv --gene-lengths lengths.csv > exp_out.csv
+
+
 
-   rnanorm fpkm exp.csv --gtf annotation.gtf --out exp_fpkm.csv
+.. _GTF: https://www.ensembl.org/info/website/upload/gff.html
 
 
 Contribute
 
@@ -4,34 +4,25 @@ Change Log
 
 All notable changes to this project are documented in this file.
 
-==========
-Unreleased
-==========
-
-Added
------
--
-
-Fixed
------
--
-
-Changed
--------
--
 
 ==================
-0.0.1 - 2022-07-18
+2.0.0 - 2023-06-21
 ==================
 
 Added
 -----
--
-
-Fixed
------
--
-
-Changed
--------
--
+- Implementation of the following methods:
+
+    - CPM
+    - FPKM
+    - TPM
+    - UQ
+    - CUF
+    - TMM
+    - CTF
+
+- Add a "toy" and GTEx dataset
+- Add command line interface for all of the above methods
+- Add tests
+- Support calculation of gene lengths from GTF or gene lengths file in TPM /
+  FPKM
@@ -11,7 +11,7 @@
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
-project = "RNAseq normalization"
+project = "RNA-seq normalization"
 author = meta["Author"]
 release = meta["Version"]
 copyright = "2023, " + author
 
@@ -61,7 +61,7 @@ Preparing release
 
 
 Describe the new features in ``changelog.rst``. Replace the Unreleased heading
-with the new version, followed by the release date (e.g.
+with the new version, followed by the release date (e.g.,
 ``13.2.0 - 2018-10-23``).
 
 Add the new dependencies to ``pyproject.toml`` and update the package version.
 
@@ -1,11 +1,10 @@
-Welcome to RNAseq normalization's documentation!
-================================================
+Welcome to RNA-seq normalization's documentation!
+=================================================
 
 .. toctree::
    :maxdepth: 2
    :caption: Contents:
 
-   guide
    ref
    changelog
    contributing
 
@@ -13,7 +13,6 @@ Normalization methods
    :nosignatures:
    :toctree: generated/
 
-   LibrarySize
    CPM
    FPKM
    TPM
@@ -32,5 +31,5 @@ Datasets
    :nosignatures:
    :toctree: generated/
 
-   datasets.load_rnaseq_toy
+   datasets.load_toy_data
    datasets.load_gtex
@@ -7,15 +7,15 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "rnanorm"
-description = "Common RNAseq normalization methods"
+description = "Common RNA-seq normalization methods"
 authors = [
     {name = "Genialis, Inc."},
     {email = "[email protected]"},
 ]
 dynamic = ["version"]
 readme = "README.rst"
 license = {text = "Proprietary"}
-requires-python = ">=3.8, <3.11"
+requires-python = ">=3.8, <3.12"
 keywords = [
     "bio",
     "bioinformatics",
@@ -24,18 +24,22 @@ keywords = [
     "artificial intelligence",
     "python",
     "genialis",
+    "rnaseq",
+    "normalization",
 ]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Environment :: Console",
     "Intended Audience :: Developers",
     "Topic :: Software Development :: Libraries :: Python Modules",
+    "License :: OSI Approved :: Apache Software License",
     "Operating System :: OS Independent",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
 ]
 dependencies = [
     "click",
@@ -74,7 +78,7 @@ rnanorm = "rnanorm.cli:main"
 [tool.setuptools_scm]
 
 [tool.black]
-target-version = ["py38", "py39", "py310"]
+target-version = ["py38", "py39", "py310", "py311"]
 line-length = 99
 
 [tool.isort]
 
@@ -70,7 +70,7 @@ def _gene_length(self, gtf_df: pd.DataFrame, gene_id_attr: str = "gene_id") -> p
         Group exon start & end coordinates by gene ID & chromosome &
         strand. Then perform merge and length calculation for each
         group separately. The latter is needed since ``gene_id_attr``
-        is not unique in some annotations (e.g. RefSeq).
+        is not unique in some annotations (e.g., RefSeq).
         """
         gtf_df = gtf_df[gtf_df["feature_type"] == "exon"]