|
| 1 | +import os |
| 2 | +import sys |
| 3 | +import random |
| 4 | +import pandas as pd |
| 5 | +import numpy as np |
| 6 | +from sklearn.neighbors import NearestNeighbors |
| 7 | +import matplotlib.pyplot as plt |
| 8 | + |
| 9 | + |
| 10 | +if __name__ == "__main__": |
| 11 | + sys.path.append('.') |
| 12 | + from sampling import * |
| 13 | +else: |
| 14 | + try: |
| 15 | + from .sampling import * |
| 16 | + except ImportError: |
| 17 | + from sampling import * |
| 18 | + |
| 19 | + |
| 20 | +def compute_cell_velocity( |
| 21 | + cellDancer_df, |
| 22 | + gene_list=None, |
| 23 | + speed_up=(60,60), |
| 24 | + expression_scale=None, |
| 25 | + projection_neighbor_size=200, |
| 26 | + projection_neighbor_choice='embedding'): |
| 27 | + |
| 28 | + """Project the RNA velocity onto the embedding space. |
| 29 | + |
| 30 | + Arguments |
| 31 | + --------- |
| 32 | + cellDancer_df: `pandas.DataFrame` |
| 33 | + Dataframe of velocity estimation results. Columns=['cellIndex', 'gene_name', unsplice', 'splice', 'unsplice_predict', 'splice_predict', 'alpha', 'beta', 'gamma', 'loss', 'cellID, 'clusters', 'embedding1', 'embedding2'] |
| 34 | + gene_list: optional, `list` (default: None) |
| 35 | + Genes selected to calculate the cell velocity. `None` if all genes in the cellDancer_df are to be used. |
| 36 | + speed_up: optional, `tuple` (default: (60,60)) |
| 37 | + Speed up by giving the sampling grid to downsample cells. |
| 38 | + `None` if all cells are used to compute cell velocity. |
| 39 | + expression_scale: optional, `str` (default: None) |
| 40 | + `None` if no expression scale is to be used. |
| 41 | + `'power10'` if the 10th power is been used to scale spliced and unspliced reads. |
| 42 | + projection_neighbor_size: optional, `int` (default: '200') |
| 43 | + The number of neighboring cells used for the transition probability matrix for one cell. |
| 44 | + projection_neighbor_choice: optional, `str` (default: 'embedding') |
| 45 | + `'embedding'` if using the embedding space to obtain the neighbors. |
| 46 | + `'gene'` if using the spliced reads of all genes to obtain the neighbors. |
| 47 | +
|
| 48 | + Returns |
| 49 | + ------- |
| 50 | + cellDancer_df: `pandas.DataFrame` |
| 51 | + The updated cellDancer_df with additional columns ['velocity1', 'velocity2']. |
| 52 | + """ |
| 53 | + |
| 54 | + def velocity_correlation(cell_matrix, velocity_matrix): |
| 55 | + """Calculate the correlation between the predict velocity (velocity_matrix[:,i]) |
| 56 | + and the difference between a cell and every other (cell_matrix - cell_matrix[:, i]) |
| 57 | +
|
| 58 | + Arguments |
| 59 | + --------- |
| 60 | + cell_matrix: np.ndarray (ngenes, ncells) |
| 61 | + gene expression matrix |
| 62 | + velocity_matrix: np.ndarray (ngenes, ncells) |
| 63 | + Return |
| 64 | + --------- |
| 65 | + c_matrix: np.ndarray (ncells, ncells) |
| 66 | + """ |
| 67 | + c_matrix = np.zeros((cell_matrix.shape[1], velocity_matrix.shape[1])) |
| 68 | + for i in range(cell_matrix.shape[1]): |
| 69 | + c_matrix[i, :] = corr_coeff(cell_matrix, velocity_matrix, i)[0, :] |
| 70 | + np.fill_diagonal(c_matrix, 0) |
| 71 | + return c_matrix |
| 72 | + |
| 73 | + |
| 74 | + def velocity_projection(cell_matrix, velocity_matrix, embedding, knn_embedding): |
| 75 | + ''' |
| 76 | + cell_matrix: np.ndarray (ngenes, ncells) |
| 77 | + gene expression matrix |
| 78 | + velocity_matrix: np.ndarray (ngenes, ncells) |
| 79 | + ''' |
| 80 | + # cell_matrix = np_splice[:,sampling_ixs] |
| 81 | + # velocity_matrix = np_dMatrix[:,sampling_ixs] |
| 82 | + sigma_corr = 0.05 |
| 83 | + cell_matrix[np.isnan(cell_matrix)] = 0 |
| 84 | + velocity_matrix[np.isnan(velocity_matrix)] = 0 |
| 85 | + corrcoef = velocity_correlation(cell_matrix, velocity_matrix) |
| 86 | + probability_matrix = np.exp(corrcoef / sigma_corr)*knn_embedding.A |
| 87 | + probability_matrix /= probability_matrix.sum(1)[:, None] |
| 88 | + unitary_vectors = embedding.T[:, None, :] - embedding.T[:, :, None] |
| 89 | + with np.errstate(divide='ignore', invalid='ignore'): |
| 90 | + unitary_vectors /= np.linalg.norm(unitary_vectors, ord=2, axis=0) |
| 91 | + np.fill_diagonal(unitary_vectors[0, ...], 0) |
| 92 | + np.fill_diagonal(unitary_vectors[1, ...], 0) |
| 93 | + velocity_embedding = (probability_matrix * unitary_vectors).sum(2) |
| 94 | + velocity_embedding -= (knn_embedding.A * unitary_vectors).sum(2) / \ |
| 95 | + knn_embedding.sum(1).A.T # embedding_knn.A * |
| 96 | + velocity_embedding = velocity_embedding.T |
| 97 | + return velocity_embedding |
| 98 | + |
| 99 | + # remove invalid prediction |
| 100 | + is_NaN = cellDancer_df[['alpha','beta']].isnull() |
| 101 | + row_has_NaN = is_NaN. any(axis=1) |
| 102 | + cellDancer_df = cellDancer_df[~row_has_NaN].reset_index(drop=True) |
| 103 | + |
| 104 | + if 'velocity1' in cellDancer_df.columns: |
| 105 | + del cellDancer_df['velocity1'] |
| 106 | + if 'velocity2' in cellDancer_df.columns: |
| 107 | + del cellDancer_df['velocity2'] |
| 108 | + |
| 109 | + if gene_list is None: |
| 110 | + gene_list=cellDancer_df.gene_name.drop_duplicates() |
| 111 | + |
| 112 | + |
| 113 | + # This creates a new dataframe |
| 114 | + cellDancer_df_input = cellDancer_df[cellDancer_df.gene_name.isin(gene_list)].reset_index(drop=True) |
| 115 | + np_splice_all, np_dMatrix_all= data_reshape(cellDancer_df_input) |
| 116 | + # print("(genes, cells): ", end="") |
| 117 | + # print(np_splice_all.shape) |
| 118 | + n_genes, n_cells = np_splice_all.shape |
| 119 | + |
| 120 | + # This creates a new dataframe |
| 121 | + data_df = cellDancer_df_input.loc[:, |
| 122 | + ['gene_name', 'unsplice', 'splice', 'cellID','embedding1', 'embedding2']] |
| 123 | + # random.seed(10) |
| 124 | + embedding_downsampling, sampling_ixs, knn_embedding = downsampling_embedding(data_df, |
| 125 | + para='neighbors', |
| 126 | + target_amount=0, |
| 127 | + step=speed_up, |
| 128 | + n_neighbors=projection_neighbor_size, |
| 129 | + projection_neighbor_choice=projection_neighbor_choice, |
| 130 | + expression_scale=expression_scale, |
| 131 | + pca_n_components=None, |
| 132 | + umap_n=None, |
| 133 | + umap_n_components=None) |
| 134 | + |
| 135 | + |
| 136 | + # projection_neighbor_choice only provides neighborlist, use embedding(from raw data) to compute cell velocity |
| 137 | + embedding = cellDancer_df_input[cellDancer_df_input.gene_name == |
| 138 | + gene_list[0]][['embedding1', 'embedding2']] |
| 139 | + embedding = embedding.to_numpy() |
| 140 | + velocity_embedding = velocity_projection( |
| 141 | + np_splice_all[:, sampling_ixs], |
| 142 | + np_dMatrix_all[:, sampling_ixs], |
| 143 | + embedding[sampling_ixs, :], |
| 144 | + knn_embedding) |
| 145 | + |
| 146 | + if set(['velocity1','velocity2']).issubset(cellDancer_df.columns): |
| 147 | + print("Caution! Overwriting the \'velocity\' columns.") |
| 148 | + cellDancer_df.drop(['velocity1','velocity2'], axis=1, inplace=True) |
| 149 | + |
| 150 | + sampling_ixs_all_genes = cellDancer_df_input[cellDancer_df_input.cellIndex.isin(sampling_ixs)].index |
| 151 | + cellDancer_df_input.loc[sampling_ixs_all_genes,'velocity1'] = np.tile(velocity_embedding[:,0], n_genes) |
| 152 | + cellDancer_df_input.loc[sampling_ixs_all_genes,'velocity2'] = np.tile(velocity_embedding[:,1], n_genes) |
| 153 | + # print("After downsampling, there are ", len(sampling_ixs), "cells.") |
| 154 | + return(cellDancer_df_input) |
| 155 | + |
| 156 | +def corr_coeff(ematrix, vmatrix, i): |
| 157 | + ''' |
| 158 | + Calculate the correlation between the predict velocity (velocity_matrix[:,i]) |
| 159 | + and the displacement between a cell and every other (cell_matrix - cell_matrix[:, i]) |
| 160 | + ematrix = cell_matrix |
| 161 | + vmatrix = velocity_matrix |
| 162 | + ''' |
| 163 | + ematrix = ematrix.T |
| 164 | + vmatrix = vmatrix.T |
| 165 | + ematrix = ematrix - ematrix[i, :] |
| 166 | + vmatrix = vmatrix[i, :][None, :] |
| 167 | + ematrix_m = ematrix - ematrix.mean(1)[:, None] |
| 168 | + vmatrix_m = vmatrix - vmatrix.mean(1)[:, None] |
| 169 | + |
| 170 | + # Sum of squares across rows |
| 171 | + ematrix_ss = (ematrix_m**2).sum(1) |
| 172 | + vmatrix_ss = (vmatrix_m**2).sum(1) |
| 173 | + cor = np.dot(ematrix_m, vmatrix_m.T) |
| 174 | + N = np.sqrt(np.dot(ematrix_ss[:, None], vmatrix_ss[None])) |
| 175 | + cor=np.divide(cor, N, where=N!=0) |
| 176 | + return cor.T |
| 177 | + |
| 178 | + |
| 179 | +def data_reshape(cellDancer_df): # pengzhi version |
| 180 | + ''' |
| 181 | + load detail file |
| 182 | + return expression matrix and velocity (ngenes, ncells) |
| 183 | + ''' |
| 184 | + psc = 1 |
| 185 | + gene_names = cellDancer_df['gene_name'].drop_duplicates().to_list() |
| 186 | + # PZ uncommented this. |
| 187 | + cell_number = cellDancer_df[cellDancer_df['gene_name']==gene_names[0]].shape[0] |
| 188 | + cellDancer_df['index'] = np.tile(range(cell_number),len(gene_names)) |
| 189 | + |
| 190 | + splice_reshape = cellDancer_df.pivot( |
| 191 | + index='gene_name', values='splice', columns='index') |
| 192 | + splice_predict_reshape = cellDancer_df.pivot( |
| 193 | + index='gene_name', values='splice_predict', columns='index') |
| 194 | + dMatrix = splice_predict_reshape-splice_reshape |
| 195 | + np_splice_reshape = np.array(splice_reshape) |
| 196 | + np_dMatrix = np.array(dMatrix) |
| 197 | + np_dMatrix2 = np.sqrt(np.abs(np_dMatrix) + psc) * \ |
| 198 | + np.sign(np_dMatrix) |
| 199 | + return(np_splice_reshape, np_dMatrix2) |
| 200 | + |
0 commit comments