11
11
from scipy .stats import ks_2samp
12
12
from sklearn .mixture import GaussianMixture
13
13
14
- # To make Travis happy. Attempt absolute path first and then from raw Github file
15
- try :
16
- spectrum_data = pd .read_csv ('flowsym/data/FPbase_Spectra_updated.csv' ).fillna (value = 0 )
17
- except :
18
- spectrum_data = pd .read_csv ('https://raw.githubusercontent.com/harmslab/flowsym/master/flowsym/data/FPbase_Spectra_updated.csv' ).fillna (value = 0 )
14
+ # Import from raw Github file
15
+ spectrum_data = pd .read_csv ('https://raw.githubusercontent.com/harmslab/flowsym/master/flowsym/data'
16
+ '/FPbase_Spectra_updated.csv' ).fillna (value = 0 )
19
17
20
18
21
- def create_controls (size , colors = ( 'blue' , 'cyan' , 'green' , 'yellow' , 'orange' , 'red' , 'far_red' , 'nir' , 'ir' ) ):
19
+ def create_controls (size , colors = [ 'blue' , 'cyan' , 'green' , 'yellow' , 'orange' , 'red' , 'far_red' , 'nir' , 'ir' ] ):
22
20
"""
23
21
This is a function that takes a DataFrame size (i.e. number of controls) and
24
22
a list of colors the user wants to run controls for.
@@ -134,9 +132,10 @@ def create_controls(size, colors=('blue', 'cyan', 'green', 'yellow', 'orange', '
134
132
def create_sample (size , colors = ['blue' , 'cyan' , 'green' , 'yellow' , 'orange' , 'red' , 'far_red' , 'nir' , 'ir' ],
135
133
weights = []):
136
134
"""
137
- This is a function that takes a defined dataframe length for number of samples (int)
138
- and excitation and emission wavelengths (list,list). Assumes equal probability of each
139
- color unless specified by the user.
135
+ This is a function that takes a defined dataframe length for
136
+ number of samples (int) and excitation and emission wavelengths
137
+ (list,list). Assumes equal probability of each color unless
138
+ specified by the user.
140
139
141
140
Parameters
142
141
----------
@@ -239,22 +238,25 @@ def create_sample(size, colors=['blue', 'cyan', 'green', 'yellow', 'orange', 're
239
238
240
239
# Bandwidth on lasers is +-5 nm. channels are [450+-25, 525+-25, 600+-30, 665+-15, 720+-30, 785+-30] for filter set 2
241
240
def measure (dataframe , lasers = [405 , 488 , 561 , 638 ], channels = [1 , 2 , 3 , 4 , 5 , 6 ],
242
- create_fcs = True , outfile_name = 'data/ sample_output.fcs' ):
241
+ create_fcs = False , outfile_name = 'sample_output.fcs' ):
243
242
"""
244
- This is a function that will measure fluorescence intensity for any given sample
245
- DataFrame and laser/channel parameters. Output will be an fcs file (default) that is
246
- the same size as the sample you ran in the function. Alternatively, you can return
247
- just a pandas DataFrame object by setting return_fcs=False. The user can set the output
248
- file name manually to simulate creating multiple samples and measurements.
243
+ This is a function that will measure fluorescence
244
+ intensity for any given sample DataFrame and laser/channel
245
+ parameters. Output will be just a pandas DataFrame object
246
+ because create_fcs=False by default.
247
+
248
+ Alternatively, you can create an fcs file by setting create_fcs = True.
249
+ The user can set the output file name manually to simulate
250
+ creating multiple samples and measurements.
249
251
250
252
Parameters
251
253
----------
252
- dataframe : the Dataframe of sample data that will be used to generate the simulated
253
- fluorescence intensity
254
+ dataframe : the Dataframe of sample data that will be used
255
+ to generate the simulated fluorescence intensity
254
256
lasers : laser channel parameters, default are [405, 488, 561, 638] nm
255
257
channels: return output for select channels, options are [1,2,3,4,5,6]
256
- create_fcs : create a .fcs file from generated Pandas Dataframe using 'fcsy' module.
257
- Default = True.
258
+ create_fcs : create a .fcs file from generated Pandas Dataframe
259
+ using 'fcsy' module. Default = False.
258
260
outfile_name : name of the .fcs file created
259
261
260
262
Returns
@@ -359,36 +361,49 @@ def measure(dataframe, lasers=[405, 488, 561, 638], channels=[1, 2, 3, 4, 5, 6],
359
361
return output
360
362
361
363
362
- def cluster (measured_data , min_cluster_size = 50 , savefig = True ):
364
+ def cluster (measured_data , min_cluster_size = 50 , savefig = False ):
363
365
"""
364
- This is a function to cluster flow cytometry data that has been measured in fluorescence channels using
365
- density-based spatial clustering of applications with noise (DBSCAN), which clusters based on density of points
366
- in an unsupervised method. The number of clusters does not need to be explicitly stated by the users. The only
367
- parameter that needs to be optimized is min_cluster_size, which is set to 50 here. But I recommend 1% of the len(
368
- data) Resulting plots are a bar chart showing the number of cells in each cluster and a heatmap of the median
369
- fluorescence intensity in each channel for each cluster.
370
-
371
- Note: clusters that are labeled '0' are cells that the DBSCAN could not cluster.
372
-
373
- Returns a tuple of two dictionaries. The first dictionary is the median fluorescence represented in the heatmap
374
- while the second dictionary holds all the fluorescence vectors for each cluster. Both of these are needed
375
- for a dip test and re-clustering.
366
+ This is a function to cluster flow cytometry data that
367
+ has been measured in fluorescence channels using density-based
368
+ spatial clustering of applications with noise (DBSCAN), which
369
+ clusters based on density of points in an unsupervised method.
370
+ The number of clusters does not need to be explicitly stated by
371
+ the users. The only parameter that needs to be optimized is
372
+ min_cluster_size, which is set to 50 here. But I recommend 1% of
373
+ the len(data). Resulting plots are a bar chart showing the number
374
+ of cells in each cluster and a heatmap of the median fluorescence
375
+ intensity in each channel for each cluster.
376
+
377
+ Note: clusters that are labeled '0' are cells that the
378
+ DBSCAN could not cluster.
379
+
380
+ Returns a tuple of two dictionaries. The first dictionary is the
381
+ median fluorescence represented in the heatmap while the second
382
+ dictionary holds all the fluorescence vectors for each cluster.
383
+ Both of these are needed for a dip test and re-clustering.
376
384
377
385
Parameters
378
386
----------
379
- measured_data : simulated or experimental flow cytometry data that has been measured in
380
- fluorescence channels
381
- min_cluster_size : default = 50, needs to be optimized by user. Typically needs to be
382
- 1% of len(data).
383
- savefig: Save generated bar chart showing the number of cells in each cluster and a heat map
384
- of the median fluorescence intensity in each channel for each cluster.
387
+ measured_data : simulated or experimental flow cytometry data
388
+ that has been measured in fluorescence channels.
389
+ min_cluster_size : default = 50, needs to be optimized by user.
390
+ Typically needs to be 1% of len(data).
391
+ savefig: Save generated bar chart showing the number of cells in
392
+ each cluster and a heat map of the median fluorescence
393
+ intensity in each channel for each cluster.
385
394
Figure is saved using 'matplotlib' module.
386
395
387
396
Returns
388
397
-------
389
- output : a tuple of two dictionaries. The first dictionary is the median fluorescence represented
390
- in the heatmap while the second dictionary holds all the fluorescence vectors for each
391
- cluster. Both of these are needed for a dip test and re-clustering.
398
+ (final_dictionary, cluster_dict) : a tuple of two dictionaries.
399
+ The first dictionary is the
400
+ median fluorescence represented
401
+ in the heatmap while the second
402
+ dictionary holds all the
403
+ fluorescence vectors for each
404
+ cluster.
405
+ Both of these are needed for a
406
+ dip test and re-clustering.
392
407
393
408
See Also
394
409
--------
@@ -473,32 +488,41 @@ def cluster(measured_data, min_cluster_size=50, savefig=True):
473
488
return (final_dictionary , cluster_dict )
474
489
475
490
476
- def dip_test (median_FL_data , total_data , alpha = 0.05 , save_figure = True ):
491
+ def dip_test (median_FL_data , total_data , alpha = 0.05 , save_figure = False ):
477
492
"""
478
- Perform a Hartigan's dip test to check for unimodality in clusters and splits clusters if bimodality is found.
479
- This function will take the highest intensity channel for each cluster and
480
- check for bimodality to correct for errors in clustering similar fluorescence profiles.
481
- Changing alpha will alter how stringent the dip test is. A higher alpha will result in higher detection
482
- of bimodality, but runs a greater risk of false identification. It is important to note
483
- that this dip test is relatively coarse grained and will not identify very slight populations
484
- of mixed cells (e.g. 10 orange cells clustered with 1000 red cells)
485
-
486
- Returns an updated clustering of the primary clustering after performing a dip test
493
+ Perform a Hartigan's dip test to check for unimodality
494
+ in clusters and splits clusters if bimodality is found.
495
+ This function will take the highest intensity channel
496
+ for each cluster and check for bimodality to correct for
497
+ errors in clustering similar fluorescence profiles.
498
+
499
+ Changing alpha will alter how stringent the dip test is.
500
+ A higher alpha will result in higher detection of bimodality,
501
+ but runs a greater risk of false identification. It is
502
+ important to note that this dip test is relatively coarse
503
+ grained and will not identify very slight populations of mixed
504
+ cells (e.g. 10 orange cells clustered with 1000 red cells).
505
+
506
+ Returns an updated clustering of the primary clustering
507
+ after performing a dip test.
487
508
488
509
Parameters
489
510
----------
490
- median_FL_data : dict, clustering data generated by 'flowsym.cluster' function
491
- total_data : other fluorescence profiles for which errors will be corrected
492
- alpha: how stringent the dip test is
493
- save_figure : Save generated bar chart showing the number of cells in each cluster and a heat map
494
- of the median fluorescence intensity in each channel for each cluster.
495
- Figure is saved using 'matplotlib' module.
511
+ median_FL_data : dict, clustering data generated by
512
+ 'flowsym.cluster' function
513
+ total_data : other fluorescence profiles for which errors
514
+ will be corrected
515
+ alpha : how stringent the dip test is
516
+ save_figure : Save generated bar chart showing the number of
517
+ cells in each cluster and a heat map of the median
518
+ fluorescence intensity in each channel for each
519
+ cluster. Figure is saved using 'matplotlib' module.
496
520
497
521
Returns
498
522
-------
499
- output : a tuple of two dictionaries. The first dictionary is the median fluorescence represented
500
- in the heatmap while the second dictionary holds all the fluorescence vectors for each
501
- cluster. Both of these are needed for a dip test and re-clustering .
523
+ change_dict : a dictionary containing the correction that must be
524
+ applied to similar fluorescence profiles if
525
+ bimodality is found .
502
526
503
527
See Also
504
528
--------
@@ -627,7 +651,7 @@ def dip_test(median_FL_data, total_data, alpha=0.05, save_figure=True):
627
651
return change_dict
628
652
629
653
630
- def gaus_recluster (median_FL_data , total_data , tolerance = .25 , savefig = True ):
654
+ def gaus_recluster (median_FL_data , total_data , tolerance = .25 , save_figure = False ):
631
655
"""
632
656
Applies a gaussian mixture model with n_components=2
633
657
to try and separate rare populations of cells from
@@ -649,11 +673,11 @@ def gaus_recluster(median_FL_data, total_data, tolerance=.25, savefig=True):
649
673
----------
650
674
median_FL_data : data with median FL for each cluster
651
675
total_data : data with all measured FL for each cluster
652
- tolerance : how different do the sizes of clusters have to be before they
653
- are considered actually distinct spectrally ?
676
+ tolerance : how different do the sizes of clusters have
677
+ to be before they are considered actually distinct?
654
678
Increase this to be more stringent in splitting clusters.
655
- Decrease the value to allow more re-clustering at the cost of
656
- false positives.
679
+ Decrease the value to allow more re-clustering at
680
+ the cost of false positives.
657
681
save_figure : Save figure using 'matplotlib' module.
658
682
659
683
Returns
@@ -712,11 +736,12 @@ def gaus_recluster(median_FL_data, total_data, tolerance=.25, savefig=True):
712
736
# Do a ks 2 test to see if clusters are different
713
737
result = ks_2samp (clust1 [max_channel ], clust2 [max_channel ])
714
738
715
- # Test how different our cluster populations are. If the difference between the sizes is more than <tolerance>, of the
716
- # total, then we'll say we actually found a bimodal population to split
739
+ # Test how different our cluster populations are. If the difference between the sizes is more than
740
+ # <tolerance>, of the total, then we'll say we actually found a bimodal population to split
717
741
clust_split = abs (len (clust1 ) - len (clust2 )) / (len (clust1 ) + len (clust2 ))
718
742
719
- # Keep the split clusters if they meet our splitting criteria, otherwise retain original clusters from DB scan
743
+ # Keep the split clusters if they meet our splitting criteria, otherwise retain original clusters from
744
+ # DBSCAN
720
745
if clust_split > tolerance :
721
746
if result [1 ] < 1e-10 :
722
747
new_val = clust1 .values .tolist ()
@@ -736,7 +761,7 @@ def gaus_recluster(median_FL_data, total_data, tolerance=.25, savefig=True):
736
761
737
762
plt .tight_layout ()
738
763
739
- if savefig :
764
+ if save_figure :
740
765
plt .savefig ('gaus_mix_cluster_split' )
741
766
742
767
final_reclustered = {}
@@ -788,7 +813,7 @@ def gaus_recluster(median_FL_data, total_data, tolerance=.25, savefig=True):
788
813
plt .yticks (rotation = 0 )
789
814
plt .tight_layout ()
790
815
791
- if savefig :
816
+ if save_figure :
792
817
plt .savefig ('reclustered_after_gaus_mix_ks2' )
793
818
794
819
return reclustered
0 commit comments