Skip to content

Commit

Permalink
Merge pull request #371 from broadinstitute/development
Browse files Browse the repository at this point in the history
Release 1.37.0
  • Loading branch information
eweitz authored Oct 30, 2024
2 parents 5fb00bb + 00109a5 commit 663af62
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 4 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/minify_ontologies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ name: Minify ontologies
on:
pull_request:
types: [opened] # Only trigger on PR "opened" event
push: # Uncomment, update branches to develop / debug
branches:
jb-metadata-boolean
# push: # Uncomment, update branches to develop / debug
# branches:
# jb-metadata-boolean

jobs:
build:
Expand Down
11 changes: 10 additions & 1 deletion ingest/anndata_.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def generate_processed_matrix(adata):
h5ad_frag.features.processed.tsv
Gzip files for faster delocalization
"""
if adata.var.index.name == 'gene_ids':
if AnnDataIngestor.check_ensembl_index(adata):
# CELLxGENE indexes by Ensembl gene ID, not gene name (i.e. symbol).
# Gene name is encoded in feature_name, which is needed for gene search.
feature_frame = adata.var.feature_name
Expand Down Expand Up @@ -216,6 +216,15 @@ def generate_processed_matrix(adata):
)
AnnDataIngestor.compress_file(mtx_filename)

@staticmethod
def check_ensembl_index(adata):
    """Report whether an AnnData object is indexed by Ensembl gene ID.

    Ensembl gene IDs look like ENSG00000243485, as opposed to gene symbols.

    Returns True when ``adata.var.index.name`` is ``'gene_ids'``.  Otherwise
    the entries of ``adata.var_names`` are inspected: the result is True only
    if there is at least one entry and the first three characters of every
    entry are ``'ENS'``.
    """
    # An index explicitly named 'gene_ids' is accepted without inspection.
    if adata.var.index.name == 'gene_ids':
        return True

    # Collapse the leading three characters of every entry into a set; the
    # index qualifies only when 'ENS' is the one and only distinct prefix
    # (an empty index yields an empty set, and therefore False).
    leading_chars = {name[:3] for name in adata.var_names}
    return leading_chars == {'ENS'}

@staticmethod
def delocalize_extracted_files(
file_path, study_file_id, accession, files_to_delocalize
Expand Down
Binary file not shown.
21 changes: 21 additions & 0 deletions tests/test_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,27 @@ def test_gene_id_indexed_generate_processed_matrix(self):
first_line, expected_first_line, 'Expected Ensembl ID and gene name'
)

def test_check_if_indexed_by_gene_id(self):
    """Exercise check_ensembl_index against three AnnData fixtures.

    Two fixtures are expected to be reported as Ensembl-indexed (one via
    the var.index.name check, one via inspection of the data), and one is
    expected not to be.
    """
    # Case 1: detection through var.index.name
    named_index_ingestor = AnnDataIngestor(
        "../tests/data/anndata/indexed_by_gene_id.h5ad",
        self.study_id,
        self.study_file_id,
    )
    named_index_adata = named_index_ingestor.obtain_adata()
    self.assertTrue(named_index_ingestor.check_ensembl_index(named_index_adata))

    # Case 2: detection through inspection of the data itself
    liver_ingestor = AnnDataIngestor(
        "../tests/data/anndata/cellxgene.human_liver_b_cells.h5ad",
        self.study_id,
        self.study_file_id,
    )
    liver_adata = liver_ingestor.obtain_adata()
    self.assertTrue(liver_ingestor.check_ensembl_index(liver_adata))

    # Case 3: negative test
    symbol_ingestor = AnnDataIngestor(
        "../tests/data/anndata/anndata_test.h5ad",
        self.study_id,
        self.study_file_id,
    )
    symbol_adata = symbol_ingestor.obtain_adata()
    self.assertFalse(symbol_ingestor.check_ensembl_index(symbol_adata))

def test_get_files_to_delocalize(self):
files = AnnDataIngestor.clusterings_to_delocalize(self.valid_kwargs)
Expand Down

0 comments on commit 663af62

Please sign in to comment.