You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: CHANGELOG.md
+1-1
Original file line number
Diff line number
Diff line change
@@ -5,7 +5,7 @@
5
5
* Improves support for datamodules with multiple test sets. Generalises this to support GO and FOLD. Also adds multiple sequence-identity-based splits for GO. [#72](https://github.com/a-r-j/ProteinWorkshop/pull/72)
6
6
* Add redownload checks for already downloaded datasets and harmonise pdb download interface [#86](https://github.com/a-r-j/ProteinWorkshop/pull/86)
7
7
* Remove remaining errors from PDB dataset change
8
-
* Add option to create pdb datasets with sequence-based splits [#88](https://github.com/a-r-j/ProteinWorkshop/pull/88)
8
+
* Add option to create pdb datasets with sequence-based splits [#88](https://github.com/a-r-j/ProteinWorkshop/pull/88) as well as time-based splits [#89](https://github.com/a-r-j/ProteinWorkshop/pull/89)
molecule_type: "protein"# Type of molecule for which to select
15
15
experiment_types: ["diffraction", "NMR", "EM", "other"] # All experiment types
16
-
max_length: 1000# Exclude polypeptides greater than length 1000
16
+
max_length: 150 # Exclude polypeptides greater than length 150
17
17
min_length: 10 # Exclude peptides shorter than the minimum length of 10
18
18
oligomeric_min: 1 # Minimum oligomeric state to include (1 = monomeric proteins and larger)
19
19
oligomeric_max: 5# Include up to 5-meric proteins
@@ -24,6 +24,9 @@ datamodule:
24
24
remove_non_standard_residues: True # Include only proteins containing standard amino acid residues
25
25
remove_pdb_unavailable: True # Include only proteins that are available to download
26
26
train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits
27
-
split_type: "sequence_similarity"# Split sequences by sequence similarity clustering, other option is "random"
28
-
split_sequence_similiarity: 0.3# Clustering at 30% sequence similarity (argument is ignored if split_type="random")
27
+
split_type: "sequence_similarity"# Split sequences by sequence similarity clustering, other options are "random" and "time_cutoff"
28
+
split_sequence_similiarity: 0.3# Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity")
29
29
overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten
30
+
split_time_frames: null # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") - e.g., ["2020-01-01", "2021-01-01", "2023-03-01"]
0 commit comments