Skip to content

Commit 27eb07f

Browse files
Merge pull request #729 from Starlitnightly/master
Add SUDE dimension reduction method and update related documentation
2 parents c1ed873 + 5897f70 commit 27eb07f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

93 files changed

+16247
-75
lines changed

.flake8

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
[flake8]
2+
exclude =
3+
.git,
4+
__pycache__,
5+
.venv,
6+
venv,
7+
build,
8+
dist,
9+
*.egg-info,
10+
dynamo/external,
11+
external
12+
select = E9,F63,F7,F82
13+
count = True
14+
show-source = True
15+
statistics = True

.github/workflows/python-package.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@ jobs:
3535
- name: Lint with flake8
3636
run: |
3737
# stop the build if there are Python syntax errors or undefined names
38-
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
38+
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude=dynamo/external,external
3939
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
40-
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
40+
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude=dynamo/external,external
4141
# - name: Test with pytest
4242
# run: |
4343
# pytest

.github/workflows/python-plain-run-test.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ jobs:
3434
- name: Lint with flake8
3535
run: |
3636
# stop the build if there are Python syntax errors or undefined names
37-
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
37+
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude=dynamo/external,external
3838
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
39-
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
39+
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude=dynamo/external,external
4040
# - name: Test with pytest
4141
# run: |
4242
# pytest

dynamo/external/celldancer/.Rapp.history

Whitespace-only changes.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from . import *
2+
from .velocity_estimation import velocity
3+
from .pseudo_time import pseudo_time
4+
from .compute_cell_velocity import compute_cell_velocity
5+
from .embedding_kinetic_para import embedding_kinetic_para
6+
from .utilities import adata_to_df_with_embed
7+
from .utilities import to_dynamo
8+
from .utilities import export_velocity_to_dynamo
9+
from .simulation import simulate
10+
from . import cdplt
11+
12+
__all__ = [
13+
"cdplt",
14+
"velocity_estimation",
15+
"pseudo_time",
16+
"diffusion",
17+
"compute_cell_velocity",
18+
"simulation",
19+
"embedding_kinetic_para",
20+
"sampling",
21+
"utilities",
22+
"simulation"
23+
]
24+
25+
26+
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .plotting import *
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
import os
2+
import sys
3+
import random
4+
import pandas as pd
5+
import numpy as np
6+
from sklearn.neighbors import NearestNeighbors
7+
import matplotlib.pyplot as plt
8+
9+
10+
if __name__ == "__main__":
11+
sys.path.append('.')
12+
from sampling import *
13+
else:
14+
try:
15+
from .sampling import *
16+
except ImportError:
17+
from sampling import *
18+
19+
20+
def compute_cell_velocity(
21+
cellDancer_df,
22+
gene_list=None,
23+
speed_up=(60,60),
24+
expression_scale=None,
25+
projection_neighbor_size=200,
26+
projection_neighbor_choice='embedding'):
27+
28+
"""Project the RNA velocity onto the embedding space.
29+
30+
Arguments
31+
---------
32+
cellDancer_df: `pandas.DataFrame`
33+
Dataframe of velocity estimation results. Columns=['cellIndex', 'gene_name', unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID, 'clusters', 'embedding1', 'embedding2']
34+
gene_list: optional, `list` (default: None)
35+
Genes selected to calculate the cell velocity. `None` if all genes in the cellDancer_df are to be used.
36+
speed_up: optional, `tuple` (default: (60,60))
37+
Speed up by giving the sampling grid to downsample cells.
38+
`None` if all cells are used to compute cell velocity.
39+
expression_scale: optional, `str` (default: None)
40+
`None` if no expression scale is to be used.
41+
`'power10'` if the 10th power is been used to scale spliced and unspliced reads.
42+
projection_neighbor_size: optional, `int` (default: '200')
43+
The number of neighboring cells used for the transition probability matrix for one cell.
44+
projection_neighbor_choice: optional, `str` (default: 'embedding')
45+
`'embedding'` if using the embedding space to obtain the neighbors.
46+
`'gene'` if using the spliced reads of all genes to obtain the neighbors.
47+
48+
Returns
49+
-------
50+
cellDancer_df: `pandas.DataFrame`
51+
The updated cellDancer_df with additional columns ['velocity1', 'velocity2'].
52+
"""
53+
54+
def velocity_correlation(cell_matrix, velocity_matrix):
55+
"""Calculate the correlation between the predict velocity (velocity_matrix[:,i])
56+
and the difference between a cell and every other (cell_matrix - cell_matrix[:, i])
57+
58+
Arguments
59+
---------
60+
cell_matrix: np.ndarray (ngenes, ncells)
61+
gene expression matrix
62+
velocity_matrix: np.ndarray (ngenes, ncells)
63+
Return
64+
---------
65+
c_matrix: np.ndarray (ncells, ncells)
66+
"""
67+
c_matrix = np.zeros((cell_matrix.shape[1], velocity_matrix.shape[1]))
68+
for i in range(cell_matrix.shape[1]):
69+
c_matrix[i, :] = corr_coeff(cell_matrix, velocity_matrix, i)[0, :]
70+
np.fill_diagonal(c_matrix, 0)
71+
return c_matrix
72+
73+
74+
def velocity_projection(cell_matrix, velocity_matrix, embedding, knn_embedding):
75+
'''
76+
cell_matrix: np.ndarray (ngenes, ncells)
77+
gene expression matrix
78+
velocity_matrix: np.ndarray (ngenes, ncells)
79+
'''
80+
# cell_matrix = np_splice[:,sampling_ixs]
81+
# velocity_matrix = np_dMatrix[:,sampling_ixs]
82+
sigma_corr = 0.05
83+
cell_matrix[np.isnan(cell_matrix)] = 0
84+
velocity_matrix[np.isnan(velocity_matrix)] = 0
85+
corrcoef = velocity_correlation(cell_matrix, velocity_matrix)
86+
probability_matrix = np.exp(corrcoef / sigma_corr)*knn_embedding.A
87+
probability_matrix /= probability_matrix.sum(1)[:, None]
88+
unitary_vectors = embedding.T[:, None, :] - embedding.T[:, :, None]
89+
with np.errstate(divide='ignore', invalid='ignore'):
90+
unitary_vectors /= np.linalg.norm(unitary_vectors, ord=2, axis=0)
91+
np.fill_diagonal(unitary_vectors[0, ...], 0)
92+
np.fill_diagonal(unitary_vectors[1, ...], 0)
93+
velocity_embedding = (probability_matrix * unitary_vectors).sum(2)
94+
velocity_embedding -= (knn_embedding.A * unitary_vectors).sum(2) / \
95+
knn_embedding.sum(1).A.T # embedding_knn.A *
96+
velocity_embedding = velocity_embedding.T
97+
return velocity_embedding
98+
99+
# remove invalid prediction
100+
is_NaN = cellDancer_df[['alpha','beta']].isnull()
101+
row_has_NaN = is_NaN. any(axis=1)
102+
cellDancer_df = cellDancer_df[~row_has_NaN].reset_index(drop=True)
103+
104+
if 'velocity1' in cellDancer_df.columns:
105+
del cellDancer_df['velocity1']
106+
if 'velocity2' in cellDancer_df.columns:
107+
del cellDancer_df['velocity2']
108+
109+
if gene_list is None:
110+
gene_list=cellDancer_df.gene_name.drop_duplicates()
111+
112+
113+
# This creates a new dataframe
114+
cellDancer_df_input = cellDancer_df[cellDancer_df.gene_name.isin(gene_list)].reset_index(drop=True)
115+
np_splice_all, np_dMatrix_all= data_reshape(cellDancer_df_input)
116+
# print("(genes, cells): ", end="")
117+
# print(np_splice_all.shape)
118+
n_genes, n_cells = np_splice_all.shape
119+
120+
# This creates a new dataframe
121+
data_df = cellDancer_df_input.loc[:,
122+
['gene_name', 'unsplice', 'splice', 'cellID','embedding1', 'embedding2']]
123+
# random.seed(10)
124+
embedding_downsampling, sampling_ixs, knn_embedding = downsampling_embedding(data_df,
125+
para='neighbors',
126+
target_amount=0,
127+
step=speed_up,
128+
n_neighbors=projection_neighbor_size,
129+
projection_neighbor_choice=projection_neighbor_choice,
130+
expression_scale=expression_scale,
131+
pca_n_components=None,
132+
umap_n=None,
133+
umap_n_components=None)
134+
135+
136+
# projection_neighbor_choice only provides neighborlist, use embedding(from raw data) to compute cell velocity
137+
embedding = cellDancer_df_input[cellDancer_df_input.gene_name ==
138+
gene_list[0]][['embedding1', 'embedding2']]
139+
embedding = embedding.to_numpy()
140+
velocity_embedding = velocity_projection(
141+
np_splice_all[:, sampling_ixs],
142+
np_dMatrix_all[:, sampling_ixs],
143+
embedding[sampling_ixs, :],
144+
knn_embedding)
145+
146+
if set(['velocity1','velocity2']).issubset(cellDancer_df.columns):
147+
print("Caution! Overwriting the \'velocity\' columns.")
148+
cellDancer_df.drop(['velocity1','velocity2'], axis=1, inplace=True)
149+
150+
sampling_ixs_all_genes = cellDancer_df_input[cellDancer_df_input.cellIndex.isin(sampling_ixs)].index
151+
cellDancer_df_input.loc[sampling_ixs_all_genes,'velocity1'] = np.tile(velocity_embedding[:,0], n_genes)
152+
cellDancer_df_input.loc[sampling_ixs_all_genes,'velocity2'] = np.tile(velocity_embedding[:,1], n_genes)
153+
# print("After downsampling, there are ", len(sampling_ixs), "cells.")
154+
return(cellDancer_df_input)
155+
156+
def corr_coeff(ematrix, vmatrix, i):
157+
'''
158+
Calculate the correlation between the predict velocity (velocity_matrix[:,i])
159+
and the displacement between a cell and every other (cell_matrix - cell_matrix[:, i])
160+
ematrix = cell_matrix
161+
vmatrix = velocity_matrix
162+
'''
163+
ematrix = ematrix.T
164+
vmatrix = vmatrix.T
165+
ematrix = ematrix - ematrix[i, :]
166+
vmatrix = vmatrix[i, :][None, :]
167+
ematrix_m = ematrix - ematrix.mean(1)[:, None]
168+
vmatrix_m = vmatrix - vmatrix.mean(1)[:, None]
169+
170+
# Sum of squares across rows
171+
ematrix_ss = (ematrix_m**2).sum(1)
172+
vmatrix_ss = (vmatrix_m**2).sum(1)
173+
cor = np.dot(ematrix_m, vmatrix_m.T)
174+
N = np.sqrt(np.dot(ematrix_ss[:, None], vmatrix_ss[None]))
175+
cor=np.divide(cor, N, where=N!=0)
176+
return cor.T
177+
178+
179+
def data_reshape(cellDancer_df): # pengzhi version
180+
'''
181+
load detail file
182+
return expression matrix and velocity (ngenes, ncells)
183+
'''
184+
psc = 1
185+
gene_names = cellDancer_df['gene_name'].drop_duplicates().to_list()
186+
# PZ uncommented this.
187+
cell_number = cellDancer_df[cellDancer_df['gene_name']==gene_names[0]].shape[0]
188+
cellDancer_df['index'] = np.tile(range(cell_number),len(gene_names))
189+
190+
splice_reshape = cellDancer_df.pivot(
191+
index='gene_name', values='splice', columns='index')
192+
splice_predict_reshape = cellDancer_df.pivot(
193+
index='gene_name', values='splice_predict', columns='index')
194+
dMatrix = splice_predict_reshape-splice_reshape
195+
np_splice_reshape = np.array(splice_reshape)
196+
np_dMatrix = np.array(dMatrix)
197+
np_dMatrix2 = np.sqrt(np.abs(np_dMatrix) + psc) * \
198+
np.sign(np_dMatrix)
199+
return(np_splice_reshape, np_dMatrix2)
200+

0 commit comments

Comments
 (0)