@@ -794,7 +794,7 @@ def estimate_phase_beagle(
794
794
VCF's contig names.
795
795
panel : str, optional
796
796
VCF file corresponding to a reference haplotype panel (compressed or
797
- uncompressed). By default, the 1KGP panel in the ``~/ pypgx-bundle``
797
+ uncompressed). By default, the 1KGP panel in the ``pypgx-bundle``
798
798
directory will be used.
799
799
impute : bool, default: False
800
800
If True, perform imputation of missing genotypes.
@@ -819,8 +819,7 @@ def estimate_phase_beagle(
819
819
metadata ['Program' ] = 'Beagle'
820
820
821
821
if panel is None :
822
- home = os .path .expanduser ('~' )
823
- panel = f'{ home } /pypgx-bundle/1kgp/{ assembly } /{ gene } .vcf.gz'
822
+ panel = f'{ sdk .get_bundle_path ()} /1kgp/{ assembly } /{ gene } .vcf.gz'
824
823
825
824
has_chr_prefix = pyvcf .has_chr_prefix (panel )
826
825
@@ -839,6 +838,31 @@ def estimate_phase_beagle(
839
838
if metadata ['Platform' ] == 'Chip' :
840
839
vf1 = vf1 .filter_gsa ()
841
840
841
+ def run_beagle (vf1 , em ):
842
+ with tempfile .TemporaryDirectory () as t :
843
+ vf1 .to_file (f'{ t } /input.vcf' )
844
+ command = [
845
+ 'java' , '-Xmx2g' , '-jar' , beagle ,
846
+ f'gt={ t } /input.vcf' ,
847
+ f'chrom={ region } ' ,
848
+ f'ref={ panel } ' ,
849
+ f'out={ t } /output' ,
850
+ f'impute={ str (impute ).lower ()} ' ,
851
+ f'em={ em } '
852
+ ]
853
+ subprocess .run (
854
+ command ,
855
+ check = True ,
856
+ stdout = subprocess .DEVNULL ,
857
+ stderr = subprocess .PIPE
858
+ )
859
+ vf3 = pyvcf .VcfFrame .from_file (f'{ t } /output.vcf.gz' )
860
+ if common_samples :
861
+ vf = vf3 .rename ({f'{ x } _TEMP' : x for x in common_samples })
862
+ if has_chr_prefix :
863
+ vf = vf3 .update_chr_prefix ('remove' )
864
+ return vf3
865
+
842
866
# Beagle will throw an error if there is only one marker overlapping with
843
867
# the reference panel in a given window. This typically occurs when the
844
868
# input VCF has very few markers or only one marker. Therefore, these
@@ -863,39 +887,26 @@ def estimate_phase_beagle(
863
887
common_samples = list (set (vf1 .samples ) & set (vf2 .samples ))
864
888
if common_samples :
865
889
vf1 = vf1 .rename ({x : f'{ x } _TEMP' for x in common_samples })
866
- with tempfile .TemporaryDirectory () as t :
867
- vf1 .to_file (f'{ t } /input.vcf' )
868
- command = [
869
- 'java' , '-Xmx2g' , '-jar' , beagle ,
870
- f'gt={ t } /input.vcf' ,
871
- f'chrom={ region } ' ,
872
- f'ref={ panel } ' ,
873
- f'out={ t } /output' ,
874
- f'impute={ str (impute ).lower ()} '
875
- ]
876
- try :
877
- subprocess .run (
878
- command ,
879
- check = True ,
880
- stdout = subprocess .DEVNULL ,
881
- stderr = subprocess .PIPE
882
- )
883
- vf3 = pyvcf .VcfFrame .from_file (f'{ t } /output.vcf.gz' )
884
- if common_samples :
885
- vf3 = vf3 .rename ({f'{ x } _TEMP' : x for x in common_samples })
886
- if has_chr_prefix :
887
- vf3 = vf3 .update_chr_prefix ('remove' )
890
+
891
+ try :
892
+ vf3 = run_beagle (vf1 , em = 'true' )
893
+ except subprocess .CalledProcessError as e :
894
+ message = e .stderr .decode ()
888
895
# Beagle may throw an error even when multiple overlapping markers
889
896
# exist because they are too distant from each other -- that is,
890
897
# located in separate haplotype windows.
891
- except subprocess .CalledProcessError as e :
892
- message = e .stderr .decode ()
893
- if "Window has only one position" in message :
894
- warnings .warn ("Beagle: Window has only one position" )
895
- vf3 = pyvcf .VcfFrame ([], vf1 .df [0 :0 ])
896
- else :
897
- print (message )
898
- raise e
898
+ if "Window has only one position" in message :
899
+ warnings .warn ("Beagle: Window has only one position" )
900
+ vf3 = pyvcf .VcfFrame ([], vf1 .df [0 :0 ])
901
+ # Beagle will throw an error if the expectation-maximization
902
+ # algorithm estimates a parameter value outside the permitted
903
+ # range. When this happens, we skip the expectation-maximization.
904
+ elif "IllegalArgumentException: 1.0" in message :
905
+ warnings .warn ("Beagle: Expectation-maximization skipped" )
906
+ vf3 = run_beagle (vf1 , em = 'false' )
907
+ else :
908
+ print (message )
909
+ raise e
899
910
900
911
return sdk .Archive (metadata , vf3 )
901
912
@@ -1203,7 +1214,7 @@ def predict_cnv(copy_number, cnv_caller=None):
1203
1214
Archive file or object with the semantic type CovFrame[CopyNumber].
1204
1215
cnv_caller : str or pypgx.Archive, optional
1205
1216
Archive file or object with the semantic type Model[CNV]. By default,
1206
- a pre-trained CNV caller in the ``~/ pypgx-bundle`` directory will be
1217
+ a pre-trained CNV caller in the ``pypgx-bundle`` directory will be
1207
1218
used.
1208
1219
1209
1220
Returns
@@ -1218,8 +1229,7 @@ def predict_cnv(copy_number, cnv_caller=None):
1218
1229
1219
1230
gene = copy_number .metadata ['Gene' ]
1220
1231
assembly = copy_number .metadata ['Assembly' ]
1221
- home = os .path .expanduser ('~' )
1222
- model_file = f'{ home } /pypgx-bundle/cnv/{ assembly } /{ gene } .zip'
1232
+ model_file = f'{ sdk .get_bundle_path ()} /cnv/{ assembly } /{ gene } .zip'
1223
1233
1224
1234
if cnv_caller is None :
1225
1235
cnv_caller = sdk .Archive .from_file (model_file )
0 commit comments