Skip to content

Commit 5979759

Browse files
authored
Merge pull request #39 from USDA-ARS-GBRU/issue38
fixed an issue where N's after the PAM site caused an error
2 parents a699c6c + b6d55e4 commit 5979759

File tree

4 files changed

+19
-9
lines changed

4 files changed

+19
-9
lines changed

CHANGELOG.md

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# v0.4.2
2+
3+
Fixed a bug where calculating Doench efficiency scores raised an error if there was an 'N' in the first three nucleotides past the PAM in the flanking genomic sequence. Guidemaker now removes those guides from consideration and reports it as a warning if the flag `--doench_efficiency_score` is used.
4+
15
# v0.4.1
26

37
* Changed how Guidemaker handles DNA sequences that are soft-masked with lowercase letters. The new behavior unmasks all
@@ -14,4 +18,5 @@
1418
* replaced append methods with concat methods for Pandas 2.1.1
1519
* output data is now gzipped
1620
* updated Dockerfile to use Minimamba base image
17-
* Updates to Python dependencies
21+
* Updates to Python dependencies
22+

guidemaker/core.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -1151,11 +1151,16 @@ def get_max_cfd(cfdlist):
11511151

11521152
def get_doench_efficiency_score(df, pam_orientation, num_threads=1):
11531153
checkset={'AGG','CGG','TGG','GGG'}
1154-
if pam_orientation == "3prime" and set(df.PAM)==checkset:
1155-
1156-
doenchscore = doench_predict.predict(np.array([x.upper() for x in df.target_seq30]), num_threads=num_threads)
1157-
df["Efficiency"] = doenchscore
1154+
# filter out lines with N's after the PAM; these cannot be scored
1155+
df2 = df[-df.target_seq30.str.contains('N')]
1156+
if len(df) != len(df2):
1157+
n_removed = len(df) - len(df2)
1158+
logger.warning("{} guides were removed from consideration becasue there were N's in the region flanking the PAM site. These cannot be scored.".format(n_removed) )
1159+
if pam_orientation == "3prime" and set(df2.PAM)==checkset:
1160+
1161+
doenchscore = doench_predict.predict(np.array([x.upper() for x in df2.target_seq30]), num_threads=num_threads)
1162+
df2["Efficiency"] = doenchscore
11581163
else:
11591164
logger.warning("NOTE: doench_efficiency_score based on Doench et al. 2016 - can only be used for NGG PAM).Check PAM sequence and PAM orientation")
1160-
df["Efficiency"] = "Not Available"
1161-
return df.drop('target_seq30', axis=1)
1165+
df2["Efficiency"] = "Not Available"
1166+
return df2.drop('target_seq30', axis=1)

guidemaker/doench_predict.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def predict(
8888
length_audit: bool = False,
8989
num_threads: int = 1
9090
) -> np.array:
91-
"""Pedicts regressions scored from sequences.
91+
"""Predicts regression scores from sequences.
9292
9393
Args:
9494
seq (numpy.ndarray) numpy array of 30 nt sequences with 25 nt of guide, NGG pam in 25:27 and the following 2 nts.

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ tornado==6.3.3
7070
typing_extensions==4.7.1
7171
tzdata==2023c
7272
tzlocal==4.3.1
73-
urllib3==2.0.6
73+
urllib3==2.0.7
7474
validators==0.22.0
7575
watchdog==3.0.0
7676
zipp==3.16.2

0 commit comments

Comments
 (0)