Skip to content

Commit

Permalink
Baseline MLPF model for CMS, PF group status 24.09.21 [TF] (#81)
Browse files Browse the repository at this point in the history
* move gen jobs

* add additional samples

* up

* add timing

* update timing, add eta eff-fake

* baseline training

* tagged version of heptfds

* uncomment in gen scripts
  • Loading branch information
jpata authored Sep 22, 2021
1 parent 2629785 commit 4cf744f
Show file tree
Hide file tree
Showing 12 changed files with 524 additions and 535 deletions.
2 changes: 1 addition & 1 deletion hep_tfds
19 changes: 9 additions & 10 deletions mlpf/data/prepare_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
outdir = "/hdfs/local/joosep/mlpf/gen"
samples = [
"SinglePiFlatPt0p7To10_cfi",
#"SingleTauFlatPt2To150_cfi",
#"SingleMuFlatPt0p7To10_cfi",
#"SingleElectronFlatPt1To100_pythia8_cfi",
#"SingleGammaFlatPt10To100_pythia8_cfi",
#"SinglePi0E10_pythia8_cfi",
"SingleTauFlatPt2To150_cfi",
"SingleMuFlatPt0p7To10_cfi",
"SingleElectronFlatPt1To100_pythia8_cfi",
"SingleGammaFlatPt10To100_pythia8_cfi",
"SinglePi0E10_pythia8_cfi",
]

samples_pu = [
#"TTbar_14TeV_TuneCUETP8M1_cfi",
"TTbar_14TeV_TuneCUETP8M1_cfi",
]

if __name__ == "__main__":
Expand All @@ -24,11 +24,10 @@
os.makedirs(outdir + "/" + s + "/raw", exist_ok=True)
os.makedirs(outdir + "/" + s + "/root", exist_ok=True)

#for iseed in range(1,51):
for iseed in range(51,2001):
for iseed in range(1,2001):
if not os.path.isfile(outdir+"/"+s+"/raw/pfntuple_{}.pkl".format(iseed)):
if is_pu:
print("sbatch genjob_tallinn_pu.sh {} {}".format(s, iseed))
print("sbatch mlpf/tallinn/genjob_pu.sh {} {}".format(s, iseed))
else:
print("sbatch genjob_tallinn.sh {} {}".format(s, iseed))
print("sbatch mlpf/tallinn/genjob.sh {} {}".format(s, iseed))

6 changes: 6 additions & 0 deletions mlpf/data/pu_files.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
file:/scratch-persistent/joosep/store/relval/CMSSW_11_2_0_pre8/RelValMinBias_14TeV/GEN-SIM/112X_mcRun3_2021_realistic_v10-v1/00000/932b9bde-6ae4-44cb-b1db-66dab83ab7d1.root
file:/scratch-persistent/joosep/store/relval/CMSSW_11_2_0_pre8/RelValMinBias_14TeV/GEN-SIM/112X_mcRun3_2021_realistic_v10-v1/00000/33b2fbd9-2544-44af-8652-c9a19edfa400.root
file:/scratch-persistent/joosep/store/relval/CMSSW_11_2_0_pre8/RelValMinBias_14TeV/GEN-SIM/112X_mcRun3_2021_realistic_v10-v1/00000/936b7b11-5eec-4f97-83ff-48be106b100d.root
file:/scratch-persistent/joosep/store/relval/CMSSW_11_2_0_pre8/RelValMinBias_14TeV/GEN-SIM/112X_mcRun3_2021_realistic_v10-v1/00000/f052be2f-f604-48e6-b484-1e412f0391f6.root
file:/scratch-persistent/joosep/store/relval/CMSSW_11_2_0_pre8/RelValMinBias_14TeV/GEN-SIM/112X_mcRun3_2021_realistic_v10-v1/00000/5ac1ea44-a5e0-4454-8945-45fd29c2947f.root
file:/scratch-persistent/joosep/store/relval/CMSSW_11_2_0_pre8/RelValMinBias_14TeV/GEN-SIM/112X_mcRun3_2021_realistic_v10-v1/00000/ae4379b2-d002-4adc-a168-e782ee296f91.root
21 changes: 21 additions & 0 deletions mlpf/tallinn/genjob.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
#SBATCH -p main
#SBATCH --mem-per-cpu=4G
#SBATCH --cpus-per-task=1

# Slurm wrapper around mlpf/data/genjob.sh.
#
# Usage: sbatch mlpf/tallinn/genjob.sh SAMPLE SEED
#
# Runs the generation script inside a per-job scratch directory, copies the
# resulting pfntuple_*.root / pfntuple_*.pkl files to
# /hdfs/local/joosep/mlpf/gen/SAMPLE/{root,raw}/ and removes the scratch
# directory afterwards.

# Dump the job environment and disk usage into the job log for debugging.
env
df -h

# Abort early when an argument is missing instead of building broken paths.
SAMPLE=${1:?usage: genjob.sh SAMPLE SEED}
SEED=${2:?usage: genjob.sh SAMPLE SEED}

# Without a job id WORKDIR would collapse to /scratch/$USER/ and the final
# rm -Rf would delete the whole per-user scratch area, so refuse to continue.
WORKDIR=/scratch/$USER/${SLURM_JOB_ID:?SLURM_JOB_ID is not set}

# If the scratch directory cannot be entered, stop rather than running the
# generation in whatever directory the job happened to start in.
mkdir -p "$WORKDIR"
cd "$WORKDIR" || exit 1

/home/joosep/particleflow/mlpf/data/genjob.sh "$SAMPLE" "$SEED"

# The globs stay outside the quotes so they still expand.
cp "$WORKDIR/$SAMPLE/$SEED"/pfntuple_*.root "/hdfs/local/joosep/mlpf/gen/$SAMPLE/root/"
cp "$WORKDIR/$SAMPLE/$SEED"/pfntuple_*.pkl "/hdfs/local/joosep/mlpf/gen/$SAMPLE/raw/"

rm -Rf "$WORKDIR"
21 changes: 21 additions & 0 deletions mlpf/tallinn/genjob_pu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
#SBATCH -p main
#SBATCH --mem-per-cpu=4G
#SBATCH --cpus-per-task=1

# Slurm wrapper around mlpf/data/genjob_pu.sh.
#
# Usage: sbatch mlpf/tallinn/genjob_pu.sh SAMPLE SEED
#
# Runs the generation script inside a per-job scratch directory, copies the
# resulting pfntuple_*.root / pfntuple_*.pkl files to
# /hdfs/local/joosep/mlpf/gen/SAMPLE/{root,raw}/ and removes the scratch
# directory afterwards.

# Dump the job environment and disk usage into the job log for debugging.
env
df -h

# Abort early when an argument is missing instead of building broken paths.
SAMPLE=${1:?usage: genjob_pu.sh SAMPLE SEED}
SEED=${2:?usage: genjob_pu.sh SAMPLE SEED}

# Without a job id WORKDIR would collapse to /scratch/$USER/ and the final
# rm -Rf would delete the whole per-user scratch area, so refuse to continue.
WORKDIR=/scratch/$USER/${SLURM_JOB_ID:?SLURM_JOB_ID is not set}

# If the scratch directory cannot be entered, stop rather than running the
# generation in whatever directory the job happened to start in.
mkdir -p "$WORKDIR"
cd "$WORKDIR" || exit 1

/home/joosep/particleflow/mlpf/data/genjob_pu.sh "$SAMPLE" "$SEED"

# The globs stay outside the quotes so they still expand.
cp "$WORKDIR/$SAMPLE/$SEED"/pfntuple_*.root "/hdfs/local/joosep/mlpf/gen/$SAMPLE/root/"
cp "$WORKDIR/$SAMPLE/$SEED"/pfntuple_*.pkl "/hdfs/local/joosep/mlpf/gen/$SAMPLE/raw/"

rm -Rf "$WORKDIR"
3 changes: 2 additions & 1 deletion mlpf/tfmodel/model_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ def plot_eff_and_fake_rate(
plt.xlabel(xlabel)
plt.ylabel("Fraction of particles / bin")

image_path = str(cp_dir / "eff_fake_cls{}.png".format(icls))
image_path = str(cp_dir / "eff_fake_cls{}_ivar{}.png".format(icls, ivar))
plt.savefig(image_path, bbox_inches="tight")
plt.close("all")

Expand Down Expand Up @@ -454,6 +454,7 @@ def on_epoch_end(self, epoch, logs=None):

if icls!=0:
self.plot_eff_and_fake_rate(epoch, icls, msk, ypred_id, cp_dir_cls)
self.plot_eff_and_fake_rate(epoch, icls, msk, ypred_id, cp_dir_cls, ivar=2, bins=np.linspace(-5,5,100))

for variable in ["pt", "eta", "sin_phi", "cos_phi", "energy"]:
self.plot_reg_distribution(epoch, cp_dir_cls, ypred, ypred_id, icls, variable)
Expand Down
34 changes: 23 additions & 11 deletions mlpf/tfmodel/timing.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,47 @@
import numpy as np
import time
import subprocess
import shlex
import pynvml

#pip install only onnxruntime_gpu, not onnxruntime!
import onnxruntime

if __name__ == "__main__":
EP_list = ['CUDAExecutionProvider']
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)

nvidia_smi_call = "nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f nvidia_smi_log.csv"
p = subprocess.Popen(shlex.split(nvidia_smi_call))
EP_list = ['CUDAExecutionProvider']

time.sleep(5)

mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
mem_initial = mem.used/1000/1000
print("mem_initial", mem_initial)

onnx_sess = onnxruntime.InferenceSession("model.onnx", providers=EP_list)
time.sleep(5)

# Read the GPU memory usage again, after the ONNX session has been created.
mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
mem_onnx = mem.used/1000/1000
# Print the value just measured; the label says mem_onnx but the original
# line printed mem_initial, leaving mem_onnx unused.
print("mem_onnx", mem_onnx)

for num_elems in [3200, 6400, 12800, 25600, 12800, 6400, 3200]:
for num_elems in range(1600, 25600, 320):
times = []
for i in range(250):
mem_used = []

#average over 100 events
for i in range(100):

#allocate array in system RAM
X = np.array(np.random.randn(1, num_elems, 15), np.float32)
X = np.array(np.random.randn(1, num_elems, 18), np.float32)

#transfer data to GPU, run model, transfer data back
t0 = time.time()
pred_onx = onnx_sess.run(None, {"x:0": X})
t1 = time.time()
dt = t1 - t0
times.append(dt)
mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
mem_used.append(mem.used/1000/1000)

print("Nelem={} mean_time={:.2f}ms stddev_time={:.2f} ms".format(num_elems, 1000.0*np.mean(times), 1000.0*np.std(times)))
print("Nelem={} mean_time={:.2f} ms stddev_time={:.2f} ms mem_used={:.0f} MB".format(num_elems, 1000.0*np.mean(times), 1000.0*np.std(times), np.max(mem_used)))
time.sleep(5)

p.terminate()
Loading

0 comments on commit 4cf744f

Please sign in to comment.