Skip to content

Commit

Permalink
add/update scripts for extract face jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
trautmane committed Jun 4, 2024
1 parent cb34355 commit 1683914
Show file tree
Hide file tree
Showing 11 changed files with 143 additions and 6 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/bash

# Launch a Spark job that extracts face (surface) scale-space volumes for the
# slab datasets listed in a batch CSV file (produced by the companion
# gen-face-dataset-csv script).
#
# Usage: <script> <dataset_csv>
# Requires: 00_config.sh in the same directory (provides N5_SAMPLE_PATH and
#           HOT_KNIFE_JAR, among others).

set -e
# Without pipefail the final "| tee" pipeline would report tee's status and a
# failed flintstone launch would go unnoticed under set -e.
set -o pipefail

ABSOLUTE_SCRIPT=$(readlink -m "$0")
SCRIPT_DIR=$(dirname "${ABSOLUTE_SCRIPT}")

# "NA" = no specific slab; only the shared environment is needed here.
source "${SCRIPT_DIR}"/00_config.sh "NA"

umask 0002

if (( $# < 1 )); then
  echo "USAGE: $0 <dataset_csv>"
  exit 1
fi

DATASET_CSV="$1"
N_NODES="20"
export RUNTIME="233:59" # using 20 11-core nodes, batches with 31 slabs took between ? and ? hours to complete

if [[ ! -f ${DATASET_CSV} ]]; then
  echo "ERROR: csv file ${DATASET_CSV} not found"
  exit 1
fi

#-----------------------------------------------------------
# Spark executor setup with 11 cores per worker ...

export N_EXECUTORS_PER_NODE=2
export N_CORES_PER_EXECUTOR=5
# To distribute work evenly, recommended number of tasks/partitions is 3 times the number of cores.
#N_TASKS_PER_EXECUTOR_CORE=3
export N_OVERHEAD_CORES_PER_WORKER=1
#N_CORES_PER_WORKER=$(( (N_EXECUTORS_PER_NODE * N_CORES_PER_EXECUTOR) + N_OVERHEAD_CORES_PER_WORKER ))
export N_CORES_DRIVER=1

#-----------------------------------------------------------
RUN_TIME=$(date +"%Y%m%d_%H%M%S")
CLASS="org.janelia.saalfeldlab.hotknife.SparkGenerateFaceScaleSpaceMultiSEMBatch"

ARGV="\
--n5Path=${N5_SAMPLE_PATH} \
--datasetCsv=${DATASET_CSV} \
--blockSize=1024,1024 \
--invert"

LOG_DIR="logs/72_face"
LOG_FILE="${LOG_DIR}/extract_face.${RUN_TIME}.out"
mkdir -p "${LOG_DIR}"

# use shell group to tee all output to log file
{

echo "Running with arguments:
${ARGV}
"
# NOTE: ARGV is intentionally unquoted so it word-splits into separate
# program arguments; the scalar parameters are quoted.
/groups/flyTEM/flyTEM/render/spark/spark-janelia/flintstone.sh "$N_NODES" "$HOT_KNIFE_JAR" "$CLASS" $ARGV
} 2>&1 | tee -a "${LOG_FILE}"

Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,18 @@ source "${SCRIPT_DIR}"/../00_config.sh "NA"

SLABS_PER_FILE=35
COUNT=0
BATCH_COUNT=0
CSV_PREFIX="64_masked_clahe_dataset"

for SLAB in ${ALL_SLABS}; do

source "${SCRIPT_DIR}"/../00_config.sh "${SLAB}"

if ! (( COUNT % SLABS_PER_FILE )); then
C_VAL=$(printf '%05d' ${COUNT})
CSV_FILE="masked_clahe_dataset.${C_VAL}.csv"
BC_VAL=$(printf '%03d' ${BATCH_COUNT})
CSV_FILE="${CSV_PREFIX}.batch_${BC_VAL}.csv"
echo -n "" > "${CSV_FILE}"
BATCH_COUNT=$((BATCH_COUNT+=1))
fi

DATASET_INPUT="${N5_ALIGNED_SLAB_DATASET}_norm-layer/s0"
Expand All @@ -44,4 +47,4 @@ for SLAB in ${ALL_SLABS}; do

done

ls -alh masked_clahe_dataset.*.csv
ls -alh ${CSV_PREFIX}.batch_*.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,18 @@ source "${SCRIPT_DIR}"/../00_config.sh "NA"

SLABS_PER_FILE=31 # 31 slabs should take about 6 hours to complete, 402 / 31 = 13 files
COUNT=0
BATCH_COUNT=0
CSV_PREFIX="71_flat_dataset"

for SLAB in ${ALL_SLABS}; do

source "${SCRIPT_DIR}"/../00_config.sh "${SLAB}"

if ! (( COUNT % SLABS_PER_FILE )); then
C_VAL=$(printf '%05d' ${COUNT})
CSV_FILE="flat_dataset.${C_VAL}.csv"
BC_VAL=$(printf '%03d' ${BATCH_COUNT})
CSV_FILE="${CSV_PREFIX}.batch_${BC_VAL}.csv"
echo -n "" > "${CSV_FILE}"
BATCH_COUNT=$((BATCH_COUNT+=1))
fi

# /render/slab_000_to_009/s002_m395_align_no35_horiz_avgshd_ic___20240504_084955_norm-layer-clahe/s0
Expand Down Expand Up @@ -46,4 +49,4 @@ for SLAB in ${ALL_SLABS}; do

done

ls -alh flat_dataset.*.csv
ls -alh ${CSV_PREFIX}.batch_*.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/bin/bash

# Generate batch CSV files that drive the face (surface) extraction Spark jobs.
# Each CSV row maps a flattened raw dataset to a top or bottom face dataset
# along with the surface depth/size to extract.  Rows are grouped into files
# of SLABS_PER_FILE slabs so each Spark batch finishes in a bounded time.
#
# Requires: ../00_config.sh (provides ALL_SLABS, N5_SAMPLE_PATH,
#           N5_FLAT_DATASET_ROOT, RAW_SLAB, getSlabProjectName).

set -e

ABSOLUTE_SCRIPT=$(readlink -m "$0")
SCRIPT_DIR=$(dirname "${ABSOLUTE_SCRIPT}")

source "${SCRIPT_DIR}"/../00_config.sh "NA"

SLABS_PER_FILE=31 # ? slabs should take about ? hours to complete, 402 / 31 = 13 files
SURFACE_DEPTH=4
COLOR="i" # i (invert), n (normalize), in (invert+normalize), or nothing
FACE_SUFFIX="${SURFACE_DEPTH}${COLOR}"

COUNT=0
BATCH_COUNT=0
CSV_PREFIX="72_face_dataset"

for SLAB in ${ALL_SLABS}; do

  # Re-source the config for each slab to refresh slab-specific variables
  # (N5_FLAT_DATASET_ROOT, RAW_SLAB, ...).
  source "${SCRIPT_DIR}"/../00_config.sh "${SLAB}"

  # Start a new batch file every SLABS_PER_FILE slabs.
  if ! (( COUNT % SLABS_PER_FILE )); then
    BC_VAL=$(printf '%03d' ${BATCH_COUNT})
    CSV_FILE="${CSV_PREFIX}.batch_${BC_VAL}.csv"
    echo -n "" > "${CSV_FILE}"
    BATCH_COUNT=$((BATCH_COUNT + 1))
  fi

  # /flat_clahe/s036_m252/raw/s0
  RAW_DATASET="${N5_FLAT_DATASET_ROOT}/raw/s0"
  # FIX: original checked ${OUT_DATASET}, a variable never set in this script;
  # the intent is clearly to verify the raw source dataset exists.
  if [[ ! -d ${N5_SAMPLE_PATH}${RAW_DATASET} ]]; then
    echo "ERROR: ${N5_SAMPLE_PATH}${RAW_DATASET} does not exist"
    exit 1
  fi

  # /flat_clahe/s036_m252/top4i
  TOP_DATASET="${N5_FLAT_DATASET_ROOT}/top${FACE_SUFFIX}"
  if [[ -d ${N5_SAMPLE_PATH}${TOP_DATASET} ]]; then
    echo "ERROR: ${N5_SAMPLE_PATH}${TOP_DATASET} already exists"
    exit 1
  fi

  # /flat_clahe/s036_m252/bot4i
  BOT_DATASET="${N5_FLAT_DATASET_ROOT}/bot${FACE_SUFFIX}"
  if [[ -d ${N5_SAMPLE_PATH}${BOT_DATASET} ]]; then
    echo "ERROR: ${N5_SAMPLE_PATH}${BOT_DATASET} already exists"
    exit 1
  fi

  SLAB_PROJECT=$(getSlabProjectName "${SLAB}")

  # /nrs/hess/data/hess_wafer_53/export/hess_wafer_53_center7.n5/heightfields_fix/slab_070_to_079/s070_m104/max/attributes.json
  HF_FIX_MAX_ATTR_FILE="${N5_SAMPLE_PATH}/heightfields_fix/${SLAB_PROJECT}/${RAW_SLAB}/max/attributes.json"
  if [[ ! -f ${HF_FIX_MAX_ATTR_FILE} ]]; then
    echo "ERROR: missing file ${HF_FIX_MAX_ATTR_FILE}"
    exit 1
  fi

  # Example attributes.json content read below:
  #{
  #  "dataType": "float32", "compression": { "type": "gzip", "useZlib": false, "level": -1 },
  #  "blockSize": [ 1024, 1024 ],
  #  "dimensions": [ 26497, 26072 ],
  #  "avg": 33.3189829188907,
  #  "downsamplingFactors": [ 2, 2, 1 ]
  #}
  # Floor of the average heightfield value; surface size is pulled in by 2
  # to stay safely inside the slab.
  AVG_SIZE=$(/groups/flyem/data/render/bin/jq '. .avg | tonumber | floor' "${HF_FIX_MAX_ATTR_FILE}")
  SURFACE_SIZE=$((AVG_SIZE - 2))

  # One row each for the top face (positive depth) and bottom face (negative).
  echo "${RAW_DATASET},${TOP_DATASET},${SURFACE_DEPTH},${SURFACE_SIZE}" >> "${CSV_FILE}"
  echo "${RAW_DATASET},${BOT_DATASET},-${SURFACE_DEPTH},-${SURFACE_SIZE}" >> "${CSV_FILE}"

  COUNT=$((COUNT + 1))

done

ls -alh "${CSV_PREFIX}".batch_*.csv

0 comments on commit 1683914

Please sign in to comment.