diff --git a/minerva_analysis/server/models/data_model.py b/minerva_analysis/server/models/data_model.py index 3eca629dd..d9fc5df69 100644 --- a/minerva_analysis/server/models/data_model.py +++ b/minerva_analysis/server/models/data_model.py @@ -9,6 +9,8 @@ from ome_types import from_xml from minerva_analysis import config_json_path, data_path, cwd_path from minerva_analysis.server.utils import pyramid_assemble +from minerva_analysis.server.utils import smallestenclosingcircle +import matplotlib.path as mpltPath from minerva_analysis.server.models import database_model import dateutil.parser import time @@ -47,6 +49,14 @@ def load_datasource(datasource_name, reload=False): csvPath = Path(config[datasource_name]['featureData'][0]['src']) print("Loading csv data.. (this can take some time)") datasource = pd.read_csv(csvPath) + if 'cellType' not in datasource.columns: + embedding_data_path = Path(config[datasource_name]['featureData'][0]['embeddingData']) + scatter_df = pd.read_csv(embedding_data_path) + scatter_np = scatter_df.to_numpy() + if scatter_np.shape[1] > 3: + datasource['cellType'] = scatter_np[:, 3].astype('int').tolist() + else: + datasource['cellType'] = 0 datasource['id'] = datasource.index datasource = datasource.replace(-np.Inf, 0) source = datasource_name @@ -110,11 +120,11 @@ def load_ball_tree(datasource_name_name, reload=False): # Path( # os.path.join(os.getcwd())) / data_path / datasource_name_name / "ball_tree.pickle") - #using pathlib now: + # using pathlib now: pickled_kd_tree_path = str( PurePath(cwd_path, data_path, datasource_name_name, "ball_tree.pickle")) - #old os.path way: if os.path.isfile(pickled_kd_tree_path) and reload is False: + # old os.path way: if os.path.isfile(pickled_kd_tree_path) and reload is False: if Path(pickled_kd_tree_path).is_file() and reload is False: print("Pickled KD Tree Exists, Loading") @@ -198,12 +208,21 @@ def get_channel_cells(datasource_name, channels): return query +def get_celltype_column_name(datasource): + try: + return config[datasource]['featureData'][0]['celltype'] + except KeyError: + return 'cellType' + except TypeError: + return 'cellType' + + def get_phenotype_description(datasource): try: data = '' csvPath = config[datasource]['featureData'][0]['celltypeData'] if Path(csvPath).is_file(): - #old os.path usage: if os.path.isfile(csvPath): + # old os.path usage: if os.path.isfile(csvPath): data = pd.read_csv(csvPath) data = data.to_numpy().tolist() # data = data.to_json(orient='records', lines=True) @@ -223,6 +242,48 @@ def get_phenotype_column_name(datasource): return '' +def get_cell_groups(datasource_name): + global datasource + global source + global config + try: + if 'celltypeData' in config[datasource_name]['featureData'][0]: + celltype_data = Path(config[datasource_name]['featureData'][0]['celltypeData']) + celltype_df = pd.read_csv(celltype_data) + obj = celltype_df.to_numpy()[:, 1].tolist() + else: + + celltype_data = sorted(datasource['cellType'].unique()) + obj = [str(i) for i in celltype_data] + # Test + return obj + except: + return [0] + + +def get_cells_by_cell_group(datasource_name, cell_group): + global datasource + global source + global config + try: + if 'celltypeData' in config[datasource_name]['featureData'][0]: + celltype_data = Path(config[datasource_name]['featureData'][0]['celltypeData']) + celltype_df = pd.read_csv(celltype_data) + group_id = celltype_df[celltype_df.name == cell_group].values[0][0] + else: + group_id = int(cell_group) + fields = 
[config[datasource_name]['featureData'][0]['xCoordinate'], + config[datasource_name]['featureData'][0]['yCoordinate'], + config[datasource_name]['featureData'][0]['celltype'], 'id', + config[datasource_name]['featureData'][0]['idField']] + obj = datasource[ + datasource[config[datasource_name]['featureData'][0]['celltype']] == group_id][fields].to_dict( + orient='records') + return obj + except: + return [] + + def get_cells_phenotype(datasource_name): global datasource global source @@ -299,63 +360,53 @@ def get_number_of_cells_in_circle(x, y, datasource_name, r): return 0 -def get_color_scheme(datasource_name, refresh, label_field='celltype'): - - # old os.path way: - # color_scheme_path = str( - # Path(os.path.join(os.getcwd())) / data_path / datasource_name / str( - # label_field + "_color_scheme.pickle")) - - color_scheme_path = str(PurePath(cwd_path, data_path, datasource_name, str( - label_field + "_color_scheme.pickle")) ) - - if refresh == False: - #old os.path way: if os.path.isfile(color_scheme_path): - if Path(color_scheme_path).is_file(): - print("Color Scheme Exists, Loading") - color_scheme = pickle.load(open(color_scheme_path, "rb")) - return color_scheme - if label_field == 'celltype': - labels = get_phenotypes(datasource_name) - print(labels) - labels.append('SelectedCluster') +def get_color_scheme(datasource_name): + labels = get_cell_groups(datasource_name) color_scheme = {} - colors = ["#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#a65628", "#f781bf", "#808080", "#7A4900", - "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87", "#5A0007", "#809693", "#FEFFE6", - "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80", "#61615A", "#BA0900", "#6B7900", "#00C2A0", - "#FFAA92", "#FF90C9", "#B903AA", "#D16100", "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", - "#0AA6D8", "#013349", "#00846F", "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", - "#C2FF99", "#001E09", "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", - "#788D66", "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C", - "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81", "#575329", - "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00", "#7900D7", "#A77500", - "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700", "#549E79", "#FFF69F", "#201625", - "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329", "#5B4534", "#FDE8DC", "#404E55", "#0089A3", - "#CB7E98", "#A4E804", "#324E72", "#6A3A4C", "#83AB58", "#001C1E", "#D1F7CE", "#004B28", "#C8D0F6", - "#A3A489", "#806C66", "#222800", "#BF5650", "#E83000", "#66796D", "#DA007C", "#FF1A59", "#8ADBB4", - "#1E0200", "#5B4E51", "#C895C5", "#320033", "#FF6832", "#66E1D3", "#CFCDAC", "#D0AC94", "#7ED379", - "#012C58", "#7A7BFF", "#D68E01", "#353339", "#78AFA1", "#FEB2C6", "#75797C", "#837393", "#943A4D", - "#B5F4FF", "#D2DCD5", "#9556BD", "#6A714A", "#001325", "#02525F", "#0AA3F7", "#E98176", "#DBD5DD", - "#5EBCD1", "#3D4F44", "#7E6405", "#02684E", "#962B75", "#8D8546", "#9695C5", "#E773CE", "#D86A78", - "#3E89BE", "#CA834E", "#518A87", "#5B113C", "#55813B", "#E704C4", "#00005F", "#A97399", "#4B8160", - "#59738A", "#FF5DA7", "#F7C9BF", "#643127", "#513A01", "#6B94AA", "#51A058", "#A45B02", "#1D1702", - "#E20027", "#E7AB63", "#4C6001", "#9C6966", "#64547B", "#97979E", "#006A66", "#391406", "#F4D749", - "#0045D2", "#006C31", "#DDB6D0", "#7C6571", "#9FB2A4", "#00D891", "#15A08A", "#BC65E9", "#FFFFFE", - 
"#C6DC99", "#203B3C", "#671190", "#6B3A64", "#F5E1FF", "#FFA0F2", "#CCAA35", "#374527", "#8BB400", - "#797868", "#C6005A", "#3B000A", "#C86240", "#29607C", "#402334", "#7D5A44", "#CCB87C", "#B88183", - "#AA5199", "#B5D6C3", "#A38469", "#9F94F0", "#A74571", "#B894A6", "#71BB8C", "#00B433", "#789EC9", - "#6D80BA", "#953F00", "#5EFF03", "#E4FFFC", "#1BE177", "#BCB1E5", "#76912F", "#003109", "#0060CD", - "#D20096", "#895563", "#29201D", "#5B3213", "#A76F42", "#89412E", "#1A3A2A", "#494B5A", "#A88C85", - "#F4ABAA", "#A3F3AB", "#00C6C8", "#EA8B66", "#958A9F", "#BDC9D2", "#9FA064", "#BE4700", "#658188", - "#83A485", "#453C23", "#47675D", "#3A3F00", "#061203", "#DFFB71", "#868E7E", "#98D058", "#6C8F7D", - "#D7BFC2", "#3C3E6E", "#D83D66", "#2F5D9B", "#6C5E46", "#D25B88", "#5B656C", "#00B57F", "#545C46", - "#866097", "#365D25", "#252F99", "#00CCFF", "#674E60", "#FC009C", "#92896B"] + # http://godsnotwheregodsnot.blogspot.com/2013/11/kmeans-color-quantization-seeding.html + + colors = ["#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6", "#A30059", + "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87", + "#5A0007", "#809693", "#FEFFE6", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80", + "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100", + "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F", + "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09", + "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66", + "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C", + "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81", + "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00", + "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700", + "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329", + "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72", "#6A3A4C", + "#83AB58", "#001C1E", "#D1F7CE", "#004B28", "#C8D0F6", "#A3A489", "#806C66", "#222800", + "#BF5650", "#E83000", "#66796D", "#DA007C", "#FF1A59", "#8ADBB4", "#1E0200", "#5B4E51", + "#C895C5", "#320033", "#FF6832", "#66E1D3", "#CFCDAC", "#D0AC94", "#7ED379", "#012C58", + "#7A7BFF", "#D68E01", "#353339", "#78AFA1", "#FEB2C6", "#75797C", "#837393", "#943A4D", + "#B5F4FF", "#D2DCD5", "#9556BD", "#6A714A", "#001325", "#02525F", "#0AA3F7", "#E98176", + "#DBD5DD", "#5EBCD1", "#3D4F44", "#7E6405", "#02684E", "#962B75", "#8D8546", "#9695C5", + "#E773CE", "#D86A78", "#3E89BE", "#CA834E", "#518A87", "#5B113C", "#55813B", "#E704C4", + "#00005F", "#A97399", "#4B8160", "#59738A", "#FF5DA7", "#F7C9BF", "#643127", "#513A01", + "#6B94AA", "#51A058", "#A45B02", "#1D1702", "#E20027", "#E7AB63", "#4C6001", "#9C6966", + "#64547B", "#97979E", "#006A66", "#391406", "#F4D749", "#0045D2", "#006C31", "#DDB6D0", + "#7C6571", "#9FB2A4", "#00D891", "#15A08A", "#BC65E9", "#FFFFFE", "#C6DC99", "#203B3C", + + "#671190", "#6B3A64", "#F5E1FF", "#FFA0F2", "#CCAA35", "#374527", "#8BB400", "#797868", + "#C6005A", "#3B000A", "#C86240", "#29607C", "#402334", "#7D5A44", "#CCB87C", "#B88183", + "#AA5199", "#B5D6C3", "#A38469", "#9F94F0", "#A74571", "#B894A6", "#71BB8C", "#00B433", + "#789EC9", "#6D80BA", "#953F00", "#5EFF03", "#E4FFFC", "#1BE177", "#BCB1E5", "#76912F", + "#003109", "#0060CD", "#D20096", "#895563", "#29201D", 
"#5B3213", "#A76F42", "#89412E", + "#1A3A2A", "#494B5A", "#A88C85", "#F4ABAA", "#A3F3AB", "#00C6C8", "#EA8B66", "#958A9F", + "#BDC9D2", "#9FA064", "#BE4700", "#658188", "#83A485", "#453C23", "#47675D", "#3A3F00", + "#061203", "#DFFB71", "#868E7E", "#98D058", "#6C8F7D", "#D7BFC2", "#3C3E6E", "#D83D66", + "#2F5D9B", "#6C5E46", "#D25B88", "#5B656C", "#00B57F", "#545C46", "#866097", "#365D25", + "#252F99", "#00CCFF", "#674E60", "#FC009C", "#92896B"] for i in range(len(labels)): color_scheme[str(labels[i])] = {} color_scheme[str(labels[i])]['rgb'] = list(ImageColor.getcolor(colors[i], "RGB")) color_scheme[str(labels[i])]['hex'] = colors[i] - - pickle.dump(color_scheme, open(color_scheme_path, 'wb')) + color_scheme[str(i)] = {} + color_scheme[str(i)]['rgb'] = list(ImageColor.getcolor(colors[i], "RGB")) + color_scheme[str(i)]['hex'] = colors[i] return color_scheme @@ -488,9 +539,31 @@ def get_datasource_description(datasource_name): return description +def get_scatterplot_data(datasource_name): + global config + global datasource -def spatial_corr (adata, raw=False, log=False, threshold=None, x_coordinate='X_centroid',y_coordinate='Y_centroid', - marker=None, k=500, label='spatial_corr'): + embedding_data_path = Path(config[datasource_name]['featureData'][0]['embeddingData']) + scatter_df = pd.read_csv(embedding_data_path) + scatter_np = scatter_df.to_numpy() + # scatter_np[:, 1:3] = datasource[['X_centroid', 'Y_centroid']].to_numpy() + scatter_np[:, 1:3] = (scatter_np[:, 1:3] - np.min(scatter_np[:, 1:3])) / ( + np.max(scatter_np[:, 1:3]) - np.min(scatter_np[:, 1:3])) * 2 - 1 + try: + clusters = datasource[get_celltype_column_name(datasource_name)].astype('uint32').values.tolist() + except: + clusters = np.zeros((datasource.shape[0],), dtype='int').tolist() + scatter_np = np.append(scatter_np, np.expand_dims(clusters, 1), 1) + list_of_obs = [[elem[1], elem[2], int(elem[0]), int(elem[3])] for elem in scatter_np] + visData = { + 'data': list_of_obs, + 'clusters': clusters + } + return visData + + +def spatial_corr(adata, raw=False, log=False, threshold=None, x_coordinate='X_centroid', y_coordinate='Y_centroid', + marker=None, k=500, label='spatial_corr'): """ Parameters ---------- @@ -526,9 +599,9 @@ def spatial_corr (adata, raw=False, log=False, threshold=None, x_coordinate='X_c data = pd.DataFrame({'x': bdata.obs[x_coordinate], 'y': bdata.obs[y_coordinate]}) # user defined expression matrix if raw is True: - exp = pd.DataFrame(bdata.raw.X, index= bdata.obs.index, columns=bdata.var.index) + exp = pd.DataFrame(bdata.raw.X, index=bdata.obs.index, columns=bdata.var.index) else: - exp = pd.DataFrame(bdata.X, index= bdata.obs.index, columns=bdata.var.index) + exp = pd.DataFrame(bdata.X, index=bdata.obs.index, columns=bdata.var.index) # log the data if needed if log is True: exp = np.log1p(exp) @@ -542,19 +615,20 @@ def spatial_corr (adata, raw=False, log=False, threshold=None, x_coordinate='X_c marker = [marker] exp = exp[marker] # find the nearest neighbours - tree = BallTree(data, leaf_size= 2) - dist, ind = tree.query(data, k=k, return_distance= True) - neighbours = pd.DataFrame(ind, index = bdata.obs.index) + tree = BallTree(data, leaf_size=2) + dist, ind = tree.query(data, k=k, return_distance=True) + neighbours = pd.DataFrame(ind, index=bdata.obs.index) # find the mean dist rad_approx = np.mean(dist, axis=0) # Calculate the correlation mean = np.mean(exp).values std = np.std(exp).values A = (exp - mean) / std - def corrfunc (marker, A, neighbours, ind): + + def corrfunc(marker, A, neighbours, 
ind): print('Processing ' + str(marker)) # Map phenotype - ind_values = dict(zip(list(range(len(ind))), A[marker])) # Used for mapping + ind_values = dict(zip(list(range(len(ind))), A[marker])) # Used for mapping # Loop through (all functionized methods were very slow) neigh = neighbours.copy() for i in neigh.columns: @@ -564,9 +638,10 @@ def corrfunc (marker, A, neighbours, ind): corrfunc = np.mean(Y, axis=1) # return return corrfunc + # apply function to all markers # Create lamda function - r_corrfunc = lambda x: corrfunc(marker=x,A=A, neighbours=neighbours, ind=ind) - all_data = list(map(r_corrfunc, exp.columns)) # Apply function + r_corrfunc = lambda x: corrfunc(marker=x, A=A, neighbours=neighbours, ind=ind) + all_data = list(map(r_corrfunc, exp.columns)) # Apply function # Merge all the results into a single dataframe df = pd.concat(all_data, axis=1) df.columns = exp.columns @@ -660,6 +735,46 @@ def convertOmeTiff(filePath, channelFilePath=None, dataDirectory=None, isLabelIm return {'segmentation': str(directory)} +def get_cells_in_polygon(datasource_name, points, similar_neighborhood=False): + global config + global ball_tree + point_tuples = [(e['imagePoints']['x'], e['imagePoints']['y']) for e in points] + (x, y, r) = smallestenclosingcircle.make_circle(point_tuples) + fields = [config[datasource_name]['featureData'][0]['xCoordinate'], + config[datasource_name]['featureData'][0]['yCoordinate'], + config[datasource_name]['featureData'][0]['celltype'], 'id', + config[datasource_name]['featureData'][0]['idField']] + index = ball_tree.query_radius([[x, y]], r=r) + cells = index[0] + circle_cells = datasource.iloc[cells][fields].values + path = mpltPath.Path(point_tuples) + inside = path.contains_points(circle_cells[:, [0, 1]].astype('float')) + neighbor_ids = circle_cells[np.where(inside == True), 3].flatten().tolist() + # obj = get_neighborhood_stats(datasource_name, neighbor_ids, fields=fields) + # try: + if fields and len(fields) > 0: + if len(fields) > 1: + poly_cells = datasource.iloc[neighbor_ids][fields].to_dict(orient='records') + else: + poly_cells = datasource.iloc[neighbor_ids][fields].to_dict() + else: + poly_cells = datasource.iloc[neighbor_ids].to_dict(orient='records') + return poly_cells + + +def get_cells(elem, datasource_name): + global datasource + global source + global config + fields = [config[datasource_name]['featureData'][0]['xCoordinate'], + config[datasource_name]['featureData'][0]['yCoordinate'], + config[datasource_name]['featureData'][0]['celltype'], 'id', + config[datasource_name]['featureData'][0]['idField']] + ids = np.array(elem['ids']) + obj = datasource.iloc[ids][fields].to_dict(orient='records') + return obj + + def save_dot(datasource_name, dot): database_model.create_or_update(database_model.Dot, id=dot['id'], datasource=datasource_name, group=dot['group'], name=dot['name'], description=dot['description'], shape_type=dot['shape_type'], diff --git a/minerva_analysis/server/routes/import_routes.py b/minerva_analysis/server/routes/import_routes.py index 6c99422f5..e257719a5 100644 --- a/minerva_analysis/server/routes/import_routes.py +++ b/minerva_analysis/server/routes/import_routes.py @@ -405,7 +405,8 @@ def save_config(): if 'celltypeData' in originalData: configData[datasetName]['featureData'][0]['celltypeData'] = str(data_path / datasetName / celltypeName) configData[datasetName]['featureData'][0]['celltype'] = headerList[3][1]['value'] - + else: + configData[datasetName]['featureData'][0]['celltype'] = 'cellType' if 'embeddingData' in 
originalData: configData[datasetName]['featureData'][0]['embeddingData'] = str( data_path / datasetName / embeddingName)
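
Below is a minimal, self-contained sketch of the polygon-selection technique that get_cells_in_polygon introduces above (a BallTree pre-filter with an enclosing circle, followed by an exact matplotlib Path test). It is illustrative only: the column names 'X_centroid'/'Y_centroid' and the helper name cells_in_polygon are assumptions, and it uses a coarse centroid-based covering circle instead of the smallestenclosingcircle utility the patch calls.

import numpy as np
import pandas as pd
import matplotlib.path as mpltPath
from sklearn.neighbors import BallTree

def cells_in_polygon(cells: pd.DataFrame, polygon_xy: np.ndarray) -> pd.DataFrame:
    # Spatial index over the cell centroids (assumed column names).
    coords = cells[['X_centroid', 'Y_centroid']].to_numpy(dtype=float)
    tree = BallTree(coords)
    # A circle centered on the polygon's vertex centroid with radius equal to the
    # farthest vertex always covers the polygon, so no inside cell is missed.
    center = polygon_xy.mean(axis=0)
    radius = np.linalg.norm(polygon_xy - center, axis=1).max()
    candidate_idx = tree.query_radius([center], r=radius)[0]
    # Exact point-in-polygon test, applied only to the pre-filtered candidates.
    path = mpltPath.Path(polygon_xy)
    inside = path.contains_points(coords[candidate_idx])
    return cells.iloc[candidate_idx[inside]]

# Example: a triangular selection that contains only the first of three cells.
cells = pd.DataFrame({'X_centroid': [1.0, 5.0, 9.0],
                      'Y_centroid': [1.0, 5.0, 9.0],
                      'id': [0, 1, 2]})
triangle = np.array([[0.0, 0.0], [3.0, 0.0], [0.0, 3.0]])
print(cells_in_polygon(cells, triangle)['id'].tolist())  # -> [0]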