Commit 65026fa

Add scripts for new data sources
1 parent 2185b38 commit 65026fa

6 files changed: +493 -75 lines changed

README.md

+41 -6
@@ -49,7 +49,13 @@ You can use the provided training set for the `training_data` and `holdout_data`
 
 ## How do I create data for these scripts?
 
-You can use the scripts in this repository to convert the [CODE-15% dataset](https://zenodo.org/records/4916206) to [WFDB](https://wfdb.io/) format. These instructions use `code15_hdf5` as the path for the input data files and `code15_wfdb` for the output data files, but you can replace them with the absolute or relative paths for the files on your machine.
+You can use the scripts in this repository to convert the [CODE-15% dataset](https://zenodo.org/records/4916206), the [SaMi-Trop dataset](https://zenodo.org/records/4905618), and the [PTB-XL dataset](https://physionet.org/content/ptb-xl/) to [WFDB](https://wfdb.io/) format.
+
+Please see the [data](https://physionetchallenges.org/2025/#data) section of the website for more information about the Challenge data.
+
+#### CODE-15% dataset
+
+These instructions use `code15_input` as the path for the input data files and `code15_output` for the output data files, but you can replace them with the absolute or relative paths for the files on your machine.
 
 1. Download and unzip one or more of the `exam_part` files and the `exams.csv` file in the [CODE-15% dataset](https://zenodo.org/records/4916206).
 
@@ -58,13 +64,42 @@ You can use the scripts in this repository to convert the [CODE-15% dataset](htt
 3. Convert the CODE-15% dataset to WFDB format, with the available demographics information and Chagas labels in the WFDB header file, by running
 
         python prepare_code15_data.py \
-            -i code15_hdf5/exams_part0.hdf5 code15_hdf5/exams_part1.hdf5 \
-            -d code15_hdf5/exams.csv \
-            -l code15_hdf5/code15_chagas_labels.csv \
-            -o code15_wfdb
+            -i code15_input/exams_part0.hdf5 code15_input/exams_part1.hdf5 \
+            -d code15_input/exams.csv \
+            -l code15_input/code15_chagas_labels.csv \
+            -o code15_output
 
 Each `exam_part` file in the [CODE-15% dataset](https://zenodo.org/records/4916206) contains approximately 20,000 ECG recordings. You can include more or fewer of these files to increase or decrease the number of ECG recordings, respectively. You may want to start with fewer ECG recordings to debug your code.
 
+#### SaMi-Trop dataset
+
+These instructions use `samitrop_input` as the path for the input data files and `samitrop_output` for the output data files, but you can replace them with the absolute or relative paths for the files on your machine.
+
+1. Download and unzip the `exams.zip` file and the `exams.csv` file in the [SaMi-Trop dataset](https://zenodo.org/records/4905618).
+
+2. Download and unzip the Chagas labels, i.e., the [`samitrop_chagas_labels.csv`](https://physionetchallenges.org/2025/data/samitrop_chagas_labels.zip) file.
+
+3. Convert the SaMi-Trop dataset to WFDB format, with the available demographics information and Chagas labels in the WFDB header file, by running
+
+        python prepare_samitrop_data.py \
+            -i samitrop_input/exams.hdf5 \
+            -d samitrop_input/exams.csv \
+            -l samitrop_input/samitrop_chagas_labels.csv \
+            -o samitrop_output
+
+#### PTB-XL dataset
+
+These instructions use `ptbxl_input` as the path for the input data files and `ptbxl_output` for the output data files, but you can replace them with the absolute or relative paths for the files on your machine. We are using the `records500` folder, which has a 500Hz sampling frequency, but you can also try the `records100` folder, which has a 100Hz sampling frequency.
+
+1. Download and, if necessary, unzip the [PTB-XL dataset](https://physionet.org/content/ptb-xl/).
+
+2. Update the WFDB files with the available demographics information and Chagas labels by running
+
+        python prepare_ptbxl_data.py \
+            -i ptbxl_input/records500/ \
+            -d ptbxl_input/ptbxl_database.csv \
+            -o ptbxl_output
+
 ## Which scripts can I edit?
 
 Please edit the following script to add your code:
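
After converting any of the three datasets, it is worth spot-checking one output record before moving on. The following is a minimal sketch, not part of this commit, that reads a converted record back with the WFDB Python package; the directory `code15_output` and the record name `1234567` are placeholders for whatever the conversion script actually wrote.

    import wfdb

    # Placeholder path: any record (.hea/.dat pair) written by the conversion scripts above.
    record = wfdb.rdrecord('code15_output/1234567')

    print(record.fs)              # sampling frequency, e.g. 400 for CODE-15% or 500 for PTB-XL records500
    print(record.p_signal.shape)  # (samples, leads), in physical units (mV)
    print(record.sig_name)        # lead names
    print(record.comments)        # header comments, e.g. ['Age: ...', 'Sex: ...', 'Chagas label: ...']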
@@ -122,7 +157,7 @@ If you have trouble running your code, then please try the follow steps to run t
     user@computer:~/example/python-example-2025$ docker run -it -v ~/example/model:/challenge/model -v ~/example/holdout_data:/challenge/holdout_data -v ~/example/holdout_outputs:/challenge/holdout_outputs -v ~/example/training_data:/challenge/training_data image bash
 
     root@[...]:/challenge# ls
-    Dockerfile             holdout_outputs        run_mode.py
+    Dockerfile             holdout_outputs        run_model.py
     evaluate_model.py      LICENSE                training_data
     helper_code.py         README.md
     holdout_data           requirements.txt

helper_code.py

-1

@@ -481,4 +481,3 @@ def sanitize_boolean_value(x):
         return 1
     else:
         return float('nan')
-

prepare_code15_data.py

+48 -65
@@ -6,19 +6,20 @@
 import numpy as np
 import os
 import os.path
+import pandas as pd
 import sys
 import wfdb
 
 from helper_code import is_integer, is_boolean, sanitize_boolean_value
 
 # Parse arguments.
 def get_parser():
-    description = 'Prepare the CODE-15 database.'
+    description = 'Prepare the CODE-15% dataset for the Challenge.'
     parser = argparse.ArgumentParser(description=description)
-    parser.add_argument('-i', '--signal_files', type=str, required=True, nargs='*')
-    parser.add_argument('-f', '--signal_format', type=str, required=False, default='dat', choices=['dat', 'mat'])
-    parser.add_argument('-d', '--demographics_file', type=str, required=True)
-    parser.add_argument('-l', '--label_file', type=str, required=True)
+    parser.add_argument('-i', '--signal_files', type=str, required=True, nargs='*') # exams_part0.hdf5, exams_part1.hdf5, ...
+    parser.add_argument('-d', '--demographics_file', type=str, required=True) # exams.csv
+    parser.add_argument('-l', '--labels_file', type=str, required=True) # code15_chagas_labels.csv
+    parser.add_argument('-f', '--signal_format', type=str, required=False, default='dat', choices=['dat', 'mat'])
     parser.add_argument('-o', '--output_path', type=str, required=True)
     return parser
 
@@ -29,7 +30,7 @@ def suppress_stdout():
     with open(os.devnull, 'w') as devnull:
         stdout = sys.stdout
         sys.stdout = devnull
-        try:
+        try:
             yield
         finally:
             sys.stdout = stdout
@@ -39,8 +40,8 @@ def convert_dat_to_mat(record, write_dir=None):
     import wfdb.io.convert
 
     # Change the current working directory; wfdb.io.convert.matlab.wfdb_to_matlab places files in the current working directory.
-    cwd = os.getcwd()
     if write_dir:
+        cwd = os.getcwd()
         os.chdir(write_dir)
 
     # Convert the .dat file to a .mat file.
@@ -75,7 +76,7 @@ def convert_dat_to_mat(record, write_dir=None):
 # Fix the checksums from the Python WFDB library.
 def fix_checksums(record, checksums=None):
     if checksums is None:
-        x = wfdb.rdrecord(record, physical=False)
+        x = wfdb.rdrecord(record, physical=False)
         signals = np.asarray(x.d_signal)
         checksums = np.sum(signals, axis=0, dtype=np.int16)
 
@@ -98,56 +99,39 @@ def run(args):
 # Run script.
 def run(args):
     # Load the patient demographic data.
-    exam_id_to_patient_id = dict()
     exam_id_to_age = dict()
     exam_id_to_sex = dict()
 
-    with open(args.demographics_file, 'r') as f:
-        for i, l in enumerate(f):
-            arrs = [arr.strip() for arr in l.split(',')]
-            if i == 0:
-                idx_exam_id = arrs.index('exam_id')
-                idx_patient_id = arrs.index('patient_id')
-                idx_age = arrs.index('age')
-                idx_is_male = arrs.index('is_male')
-            else:
-                exam_id = arrs[idx_exam_id]
-                assert(is_integer(exam_id))
-                exam_id = int(exam_id)
-
-                patient_id = arrs[idx_patient_id]
-                assert(is_integer(patient_id))
-                patient_id = int(patient_id)
-                exam_id_to_patient_id[exam_id] = patient_id
-
-                age = arrs[idx_age]
-                assert(is_integer(age))
-                age = int(age)
-                exam_id_to_age[exam_id] = age
-
-                is_male = arrs[idx_is_male]
-                assert(is_boolean(is_male))
-                is_male = sanitize_boolean_value(is_male)
-                sex = 'Male' if is_male else 'Female' # This variable was encoding as a binary value.
-                exam_id_to_sex[exam_id] = sex
-
-    # Load the Chagas labels.
-    exam_id_to_chagas = dict()
-
-    with open(args.label_file, 'r') as f:
-        for i, l in enumerate(f):
-            arrs = [arr.strip() for arr in l.split(',')]
-            if i == 0:
-                idx_exam_id = arrs.index('exam_id')
-                idx_chagas = arrs.index('chagas')
-            else:
-                exam_id = arrs[idx_exam_id]
-                assert(is_integer(exam_id))
-                exam_id = int(exam_id)
-
-                chagas = arrs[idx_chagas]
-                chagas = sanitize_boolean_value(chagas)
-                exam_id_to_chagas[exam_id] = bool(chagas)
+    df = pd.read_csv(args.demographics_file)
+    for idx, row in df.iterrows():
+        exam_id = row['exam_id']
+        assert(is_integer(exam_id))
+        exam_id = int(exam_id)
+
+        age = row['age']
+        assert(is_integer(age))
+        age = int(age)
+        exam_id_to_age[exam_id] = age
+
+        is_male = row['is_male']
+        assert(is_boolean(is_male))
+        is_male = sanitize_boolean_value(is_male)
+        sex = 'Male' if is_male else 'Female' # This variable was encoded as a binary value.
+        exam_id_to_sex[exam_id] = sex
+
+    # Load the Chagas labels.
+    exam_id_to_chagas = dict()
+
+    df = pd.read_csv(args.labels_file)
+    for idx, row in df.iterrows():
+        exam_id = row['exam_id']
+        assert(is_integer(exam_id))
+        exam_id = int(exam_id)
+
+        chagas = row['chagas']
+        assert(is_boolean(chagas))
+        chagas = sanitize_boolean_value(chagas)
+        exam_id_to_chagas[exam_id] = bool(chagas)
 
     # Load and convert the signal data.
 
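
The hunk above replaces the hand-rolled CSV parsing with pandas. `DataFrame.iterrows` keeps the code close to the original loop; if it ever becomes a bottleneck on a large `exams.csv`, the same lookups can be built column-wise. A minimal sketch, not part of this commit, reusing the repository's `sanitize_boolean_value` helper and the example input path from the README above:

    import pandas as pd

    from helper_code import sanitize_boolean_value

    df = pd.read_csv('code15_input/exams.csv')  # example path from the README above

    exam_ids = df['exam_id'].astype(int)
    exam_id_to_age = dict(zip(exam_ids, df['age'].astype(int)))

    is_male = df['is_male'].map(sanitize_boolean_value)  # 1, 0, or NaN, as in the loop above
    exam_id_to_sex = dict(zip(exam_ids, is_male.map({1: 'Male', 0: 'Female'})))
    # The Chagas labels can be mapped the same way from code15_chagas_labels.csv.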
@@ -156,7 +140,7 @@ def run(args):
     sampling_frequency = 400
     units = 'mV'
 
-    # Define the paramters for the WFDB files.
+    # Define the parameters for the WFDB files.
     gain = 1000
     baseline = 0
     num_bits = 16
@@ -179,7 +163,7 @@ def run(args):
                    continue
                else:
                    pass
-
+
                physical_signals = np.array(f['tracings'][i], dtype=np.float32)
 
                # Perform basic error checking on the signal.
@@ -207,25 +191,24 @@ def run(args):
                digital_signals[~np.isfinite(digital_signals)] = -2**(num_bits-1)
                digital_signals = np.asarray(digital_signals, dtype=np.int32) # We need to promote from 16-bit integers due to an error in the Python WFDB library.
 
-               # Add the exam ID, the patient ID, age, sex, and the Chagas label.
-               patient_id = exam_id_to_patient_id[exam_id]
+               # Add the exam ID, age, sex, and the Chagas label.
                age = exam_id_to_age[exam_id]
                sex = exam_id_to_sex[exam_id]
                chagas = exam_id_to_chagas[exam_id]
-               comments = [f'Exam ID: {exam_id}', f'Patient ID: {patient_id}', f'Age: {age}', f'Sex: {sex}', f'Chagas label: {chagas}']
-
+               comments = [f'Age: {age}', f'Sex: {sex}', f'Chagas label: {chagas}']
+
                # Save the signal.
                record = str(exam_id)
-               wfdb.wrsamp(record, fs=sampling_frequency, units=[units]*num_leads, sig_name=lead_names,
+               wfdb.wrsamp(record, fs=sampling_frequency, units=[units]*num_leads, sig_name=lead_names,
                    d_signal=digital_signals, fmt=[fmt]*num_leads, adc_gain=[gain]*num_leads, baseline=[baseline]*num_leads,
                    write_dir=args.output_path, comments=comments)
 
-               if args.signal_format == 'mat':
+               if args.signal_format in ('mat', '.mat'):
                    convert_dat_to_mat(record, write_dir=args.output_path)
 
-               # Recompute the checksums for the checksum due to an error in the Python WFDB library.
+               # Recompute the checksums as needed.
                checksums = np.sum(digital_signals, axis=0, dtype=np.int16)
                fix_checksums(os.path.join(args.output_path, record), checksums)
 
 if __name__=='__main__':
-    run(get_parser().parse_args(sys.argv[1:]))
+    run(get_parser().parse_args(sys.argv[1:]))
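
The script recomputes the header checksums because, as the comments above note, the checksums written by the Python WFDB library can be wrong. A minimal sketch, not part of this commit, for confirming that a written record is self-consistent; the record path is a placeholder, and the record is assumed to have been produced by the script above.

    import numpy as np
    import wfdb

    # Placeholder: any record written by prepare_code15_data.py.
    record = wfdb.rdrecord('code15_output/1234567', physical=False)

    # A WFDB checksum is the 16-bit sum of a signal's digital samples.
    expected = np.sum(np.asarray(record.d_signal), axis=0, dtype=np.int16)
    assert all(int(a) == int(b) for a, b in zip(record.checksum, expected))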
