Skip to content

Commit

Permalink
Feature/gzip mtx output (#112)
Browse files Browse the repository at this point in the history
* Use pandas to compress mtx output

* Add test for mtx export

* Blackify

* Revert "Blackify"

This reverts commit 772f5a2.

* Fix mtx export test

* Run unit tests before integration tests

* Fix mtx export test directory sort

* add a round-trip check for matrix writing

* poke ci
  • Loading branch information
pinin4fjords authored Mar 1, 2022
1 parent ac6a7e0 commit 4d610f6
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 12 deletions.
7 changes: 3 additions & 4 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,9 @@ jobs:
pip install $(pwd)/scanpy-scripts
python -m pip install $(pwd)/scanpy --no-deps --ignore-installed -vv
- name: Run unit tests
run: pytest --doctest-modules -v ./scanpy-scripts

- name: Test with bats
run: |
./scanpy-scripts/scanpy-scripts-tests.bats
- name: Run unit tests
run: pytest --doctest-modules -v ./scanpy-scripts

66 changes: 58 additions & 8 deletions scanpy_scripts/cmd_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def _write_obj(
click.echo(adata, err=show_obj == 'stderr')
return 0

def write_mtx(adata, fname_prefix='', var=None, obs=None, use_raw=False, use_layer=None):
def write_mtx(adata, fname_prefix='', var=None, obs=None, use_raw=False, use_layer=None, compression = None):
"""Export AnnData object to mtx format
* Parameters
+ adata : AnnData
Expand All @@ -133,6 +133,32 @@ def write_mtx(adata, fname_prefix='', var=None, obs=None, use_raw=False, use_lay
A list of column names to be exported to gene table
+ obs : list
A list of column names to be exported to barcode/cell table
+ use_raw : bool
Take the data matrix from .raw.X?
+ use_layer: str
Specify a layer to use instead of .X (non-raw only)
+ compression: None, str or dict
Compression parameter passed to pandas' to_csv(). To enable compression,
pass a dict with a 'method' key, e.g. {'method': 'gzip',
'compresslevel': 1, 'mtime': 1}. Any value that is not a dict with a
'method' key is treated as no compression.
>>> import os
>>> from pathlib import Path
>>> adata = sc.datasets.pbmc3k()
>>> # Test uncompressed write
>>> Path("uncompressed").mkdir(parents=True, exist_ok=True)
>>> write_mtx(adata, fname_prefix = 'uncompressed/', use_raw = False, use_layer = None, var = ['gene_name'])
>>> sorted(os.listdir('uncompressed'))
['barcodes.tsv', 'genes.tsv', 'matrix.mtx']
>>> # Test that the matrix is the same when we read it back
>>> test_readable = sc.read_10x_mtx('uncompressed')
>>> if any(test_readable.obs_names != adata.obs_names) or any(test_readable.var_names != adata.var_names) or (test_readable.X[1].sum() - adata.X[1].sum()) > 1e-5:
... print("Re-read matrix is different to the one we stored, something is wrong with the writing")
>>> # Test compressed write
>>> Path("compressed").mkdir(parents=True, exist_ok=True)
>>> write_mtx(adata, fname_prefix = 'compressed/', use_raw = False, use_layer = None, var = ['gene_name'], compression = {'method': 'gzip'})
>>> sorted(os.listdir('compressed'))
['barcodes.tsv.gz', 'genes.tsv.gz', 'matrix.mtx.gz']
"""
if fname_prefix and not (fname_prefix.endswith('/') or fname_prefix.endswith('_')):
fname_prefix = fname_prefix + '_'
Expand All @@ -157,22 +183,46 @@ def write_mtx(adata, fname_prefix='', var=None, obs=None, use_raw=False, use_lay

n_obs, n_var = mat.shape
n_entry = len(mat.data)
header = '%%MatrixMarket matrix coordinate real general\n%\n{} {} {}\n'.format(
n_var, n_obs, n_entry)

# Define the header lines as a Pandas DataFrame so we can use the same compression
header = pd.DataFrame(['%%MatrixMarket matrix coordinate real general', f"{n_var} {n_obs} {n_entry}"])
df = pd.DataFrame({'col': mat.col + 1, 'row': mat.row + 1, 'data': mat.data})

# Define outputs
mtx_fname = fname_prefix + 'matrix.mtx'
gene_fname = fname_prefix + 'genes.tsv'
barcode_fname = fname_prefix + 'barcodes.tsv'
with open(mtx_fname, 'a') as fh:
fh.write(header)
df.to_csv(fh, sep=' ', header=False, index=False)

# Write matrix with Pandas CSV and use its compression where requested
if compression is not None and type(compression) is dict and 'method' in compression:
compressed_exts = {
'zip': 'zip',
'gzip': 'gz',
'bz2': 'bz2',
'zstd': 'zst'
}
ext = compressed_exts.get(compression['method'], 'None')

if ext is None:
errmsg = "Invalid compression method"
raise Exception(errmsg)

mtx_fname += f".{ext}"
gene_fname += f".{ext}"
barcode_fname += f".{ext}"
else:
compression = None

header.to_csv(mtx_fname, header = False, index = False, compression = compression)
df.to_csv(mtx_fname, sep=' ', header=False, index=False, compression = compression, mode = 'a')

# Now write the obs and var, also with compression if appropriate
obs_df = adata.obs[obs].reset_index(level=0)
obs_df.to_csv(barcode_fname, sep='\t', header=False, index=False)
obs_df.to_csv(barcode_fname, sep='\t', header=False, index=False, compression = compression)
var_df = var_source[var].reset_index(level=0)
if not var:
var_df['gene'] = var_df['index']
var_df.to_csv(gene_fname, sep='\t', header=False, index=False)
var_df.to_csv(gene_fname, sep='\t', header=False, index=False, compression = compression)


def make_plot_function(func_name, kind=None):
Expand Down

0 comments on commit 4d610f6

Please sign in to comment.