From 97d30e8c6efb391856ce6650c22d2dac122aa310 Mon Sep 17 00:00:00 2001
From: Chao Peng <cpeng@anl.gov>
Date: Tue, 22 Jun 2021 21:51:04 +0000
Subject: [PATCH] Add ML benchmark for imaging calorimetry

---
 .gitlab-ci.yml                                |   1 +
 benchmarks/imaging_shower_ML/config.yml       |  42 ++++++
 .../options/imaging_ml_data.py                |  76 +++++++++++
 .../scripts/gen_particles.py                  | 104 ++++++++++++++
 .../scripts/prepare_tf_dataset.py             | 128 ++++++++++++++++++
 benchmarks/imaging_shower_ML/sim_rec_tag.py   |  99 ++++++++++++++
 6 files changed, 450 insertions(+)
 create mode 100644 benchmarks/imaging_shower_ML/config.yml
 create mode 100644 benchmarks/imaging_shower_ML/options/imaging_ml_data.py
 create mode 100644 benchmarks/imaging_shower_ML/scripts/gen_particles.py
 create mode 100644 benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py
 create mode 100755 benchmarks/imaging_shower_ML/sim_rec_tag.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index dbcc78e1..3c943649 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -61,6 +61,7 @@ include:
   - local: 'benchmarks/clustering/config.yml'
   - local: 'benchmarks/rich/config.yml'
   - local: 'benchmarks/imaging_ecal/config.yml'
+  - local: 'benchmarks/imaging_shower_ML/config.yml'
 
 
 final_report:
diff --git a/benchmarks/imaging_shower_ML/config.yml b/benchmarks/imaging_shower_ML/config.yml
new file mode 100644
index 00000000..5f2bd4ff
--- /dev/null
+++ b/benchmarks/imaging_shower_ML/config.yml
@@ -0,0 +1,42 @@
+ml_shower:tagging_epi :
+  extends: .rec_benchmark
+  when: manual
+  stage: benchmarks1
+  script:
+    - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_epi -n 100000 
+      --particles "electron,pion-,pion-" --pmin 0.5 --pmax 10
+
+ml_shower:tagging_e :
+  extends: .rec_benchmark
+  when: manual
+  stage: benchmarks1
+  script:
+    - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_e -n 100000 
+      --particles "electron" --pmin 0.5 --pmax 10
+
+ml_shower:tagging_pi :
+  extends: .rec_benchmark
+  when: manual
+  stage: benchmarks1
+  script:
+    - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_pi -n 100000 
+      --particles "pion-" --pmin 0.5 --pmax 10
+
+ml_shower:training:
+  extends: .rec_benchmark
+  when: manual
+  stage: process
+  needs: ["ml_shower:tagging_epi", "ml_shower:tagging_e", "ml_shower:tagging_pi"]
+  script:
+    - ls -lrth
+    # TODO
+
+ml_shower:test:
+  extends: .rec_benchmark
+  when: manual
+  stage: collect
+  needs: ["ml_shower:training"]
+  script:
+    - ls -lrth
+    # TODO
+
diff --git a/benchmarks/imaging_shower_ML/options/imaging_ml_data.py b/benchmarks/imaging_shower_ML/options/imaging_ml_data.py
new file mode 100644
index 00000000..4c46af45
--- /dev/null
+++ b/benchmarks/imaging_shower_ML/options/imaging_ml_data.py
@@ -0,0 +1,76 @@
+import os
+import ROOT
+from Gaudi.Configuration import *
+from GaudiKernel import SystemOfUnits as units
+
+from GaudiKernel.DataObjectHandleBase import DataObjectHandleBase
+from Configurables import ApplicationMgr, EICDataSvc, PodioOutput, GeoSvc
+
+from Configurables import PodioInput
+from Configurables import Jug__Base__InputCopier_dd4pod__Geant4ParticleCollection_dd4pod__Geant4ParticleCollection_ as MCCopier
+from Configurables import Jug__Digi__CalorimeterHitDigi as CalorimeterHitDigi
+from Configurables import Jug__Reco__ImagingPixelReco as ImagingPixelReco
+from Configurables import Jug__Reco__ImagingPixelMerger as ImagingPixelMerger
+
+
+# input arguments through environment variables
+kwargs = dict()
+kwargs['input'] = os.environ.get('JUGGLER_SIM_FILE', '')
+kwargs['output'] = os.environ.get('JUGGLER_REC_FILE', '')
+kwargs['compact'] = os.environ.get('JUGGLER_COMPACT_PATH', '')
+kwargs['nev'] = int(os.environ.get('JUGGLER_N_EVENTS', -1))
+kwargs['nlayers'] = int(os.environ.get('IMCAL_ML_N_LAYERS', 20))
+kwargs['nhits'] = int(os.environ.get('IMCAL_ML_N_HITS', 20))
+
+if kwargs['nev'] < 1:
+    f = ROOT.TFile(kwargs['input'])
+    kwargs['nev'] = f.events.GetEntries()
+
+print(kwargs)
+# get sampling fraction from system environment variable, 1.0 by default
+sf = float(os.environ.get('JUGGLER_SAMP_FRAC', '1.0'))
+
+geo_service  = GeoSvc('GeoSvc', detectors=[f.strip() for f in kwargs['compact'].split(',')])
+podev = EICDataSvc('EventDataSvc', inputs=[f.strip() for f in kwargs['input'].split(',')])
+
+podin = PodioInput('PodioReader', collections=['mcparticles', 'EcalBarrelHits'])
+podout = PodioOutput('out', filename=kwargs['output'])
+
+copier = MCCopier('MCCopier',
+        OutputLevel=DEBUG,
+        inputCollection='mcparticles',
+        outputCollection='mcparticles2')
+imcaldigi = CalorimeterHitDigi('imcal_digi',
+        inputHitCollection='EcalBarrelHits',
+        outputHitCollection='EcalBarrelHitsDigi',
+        energyResolutions=[0., 0.02, 0.],
+        dynamicRangeADC=3*units.MeV,
+        pedestalSigma=40)
+imcalreco = ImagingPixelReco('imcal_reco',
+        inputHitCollection='EcalBarrelHitsDigi',
+        outputHitCollection='EcalBarrelHitsReco',
+        dynamicRangeADC=3.*units.MeV,
+        pedestalSigma=40,
+        thresholdFactor=3.5,
+        readoutClass='EcalBarrelHits',
+        layerField='layer',
+        sectorField='module')
+imcaldata = ImagingPixelMerger('imcal_merger',
+#        OutputLevel=DEBUG,
+        inputHitCollection='EcalBarrelHitsReco',
+        outputHitCollection='EcalBarrelHitsML',
+        etaSize=0.001,
+        phiSize=0.001,
+        numberOfHits=kwargs['nhits'],
+        numberOfLayers=kwargs['nlayers'])
+
+podout.outputCommands = ['keep *']
+
+ApplicationMgr(
+    TopAlg=[podin, copier, imcaldigi, imcalreco, imcaldata, podout],
+    EvtSel='NONE',
+    EvtMax=kwargs['nev'],
+    ExtSvc=[podev],
+    OutputLevel=DEBUG
+)
+
diff --git a/benchmarks/imaging_shower_ML/scripts/gen_particles.py b/benchmarks/imaging_shower_ML/scripts/gen_particles.py
new file mode 100644
index 00000000..2d0b81c8
--- /dev/null
+++ b/benchmarks/imaging_shower_ML/scripts/gen_particles.py
@@ -0,0 +1,104 @@
+import os
+from pyHepMC3 import HepMC3 as hm
+import numpy as np
+import argparse
+
+
+PARTICLES = {
+    "pion0": (111, 0.1349766),       # pi0
+    "pion+": (211, 0.13957018),      # pi+
+    "pion-": (-211, 0.13957018),     # pi-
+    "kaon0": (311, 0.497648),        # K0
+    "kaon+": (321, 0.493677),        # K+
+    "kaon-": (-321, 0.493677),       # K-
+    "proton": (2212, 0.938272),      # proton
+    "neutron": (2112, 0.939565),     # neutron
+    "electron": (11, 0.51099895e-3), # electron
+    "positron": (-11, 0.51099895e-3),# positron
+    "photon": (22, 0),               # photon
+}
+
+
+def gen_event(p, theta, phi, pid, mass):
+    evt = hm.GenEvent(momentum_unit=hm.Units.MomentumUnit.GEV, length_unit=hm.Units.LengthUnit.MM)
+    # final state
+    state = 1
+    e0 = np.sqrt(p*p + mass*mass)
+    px = np.cos(phi)*np.sin(theta)
+    py = np.sin(phi)*np.sin(theta)
+    pz = np.cos(theta)
+
+    # beam
+    pbeam = hm.GenParticle(hm.FourVector(0, 0, 0, 0.938272), 2212, 4)
+    ebeam = hm.GenParticle(hm.FourVector(0, 0, e0, np.sqrt(e0*e0 + 0.511e-3*0.511e-3)), 11, 4)
+
+    # out particle
+    hout = hm.GenParticle(hm.FourVector(px*p, py*p, pz*p, e0), pid, state)
+
+    # vertex
+    vert = hm.GenVertex()
+    vert.add_particle_in(ebeam)
+    vert.add_particle_in(pbeam)
+    vert.add_particle_out(hout)
+    evt.add_vertex(vert)
+    return evt
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('output', help='path to the output file')
+    parser.add_argument('-n', type=int, default=1000, dest='nev', help='number of events to generate')
+    parser.add_argument('-s', type=int, default=-1, dest='seed', help='seed for random generator')
+    parser.add_argument('--parray', type=str, default="", dest='parray',
+                        help='an array of momenta in GeV, separated by \",\"')
+    parser.add_argument('--pmin', type=float, default=8.0, dest='pmin', help='minimum momentum in GeV')
+    parser.add_argument('--pmax', type=float, default=100.0, dest='pmax', help='maximum momentum in GeV')
+    parser.add_argument('--angmin', type=float, default=0.0, dest='angmin', help='minimum angle in degree')
+    parser.add_argument('--angmax', type=float, default=20.0, dest='angmax', help='maximum angle in degree')
+    parser.add_argument('--phmin', type=float, default=0.0, dest='phmin', help='minimum angle in degree')
+    parser.add_argument('--phmax', type=float, default=360.0, dest='phmax', help='maximum angle in degree')
+    parser.add_argument('--particles', type=str, default='electron', dest='particles',
+                        help='particle names, support {}'.format(list(PARTICLES.keys())))
+
+    args = parser.parse_args()
+
+    # random seed (< 0 will get it from enviroment variable 'SEED', or a system random number)
+    if args.seed < 0:
+        args.seed = os.environ.get('SEED', int.from_bytes(os.urandom(4), byteorder='big', signed=False))
+    print("Random seed is {}".format(args.seed))
+    np.random.seed(args.seed)
+
+    output = hm.WriterAscii(args.output);
+    if output.failed():
+        print("Cannot open file \"{}\"".format(args.output))
+        sys.exit(2)
+
+    # build particle info
+    parts = []
+    for pid in args.particles.split(','):
+        pid = pid.strip()
+        if pid not in PARTICLES.keys():
+            print('pid {:d} not found in dictionary, ignored.'.format(pid))
+            continue
+        parts.append(PARTICLES[pid])
+
+    # p values
+    pvals = np.random.uniform(args.pmin, args.pmax, args.nev) if not args.parray else \
+            np.random.choice([float(p.strip()) for p in args.parray.split(',')], args.nev)
+    thvals = np.random.uniform(args.angmin, args.angmax, args.nev)/180.*np.pi
+    phivals = np.random.uniform(args.phmin, args.phmax, args.nev)/180.*np.pi
+    partvals = [parts[i] for i in np.random.choice(len(parts), args.nev)]
+
+    count = 0
+    for p, theta, phi, (pid, mass) in zip(pvals, thvals, phivals, partvals):
+        if (count % 1000 == 0):
+            print("Generated {} events".format(count), end='\r')
+        evt = gen_event(p, theta, phi, pid, mass)
+        output.write_event(evt)
+        evt.clear()
+        count += 1
+
+    print("Generated {} events".format(args.nev))
+    output.close()
+
diff --git a/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py b/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py
new file mode 100644
index 00000000..2b722a45
--- /dev/null
+++ b/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py
@@ -0,0 +1,128 @@
+import ROOT
+import os
+import numpy as np
+import pandas as pd
+import argparse
+import sys
+
+
+# read hits data from root file
+def get_ml_data(path, evnums=None, shape=(20, 20, 3), branch='EcalBarrelHitsML'):
+    f = ROOT.TFile(path)
+    events = f.events
+    if evnums is None:
+        evnums = np.arange(events.GetEntries())
+    elif isinstance(evnums, int):
+        evnums = [evnums]
+
+    dbuf = np.zeros(shape=np.hstack([len(evnums), shape]))
+    idb = 0
+    for iev in evnums:
+        if iev >= events.GetEntries():
+            print('Error: event {:d} is out of range (0 - {:d})'.format(iev, events.GetEntries() - 1))
+            continue
+
+        events.GetEntry(iev)
+        for hit in getattr(events, branch):
+            dbuf[iev][hit.layerID][hit.hitID] = (hit.edep, hit.eta, hit.polar.phi)
+
+    return dbuf
+
+
+# execute this script
+# read mc particles from root file
+def get_mcp_simple(path, evnums=None, branch='mcparticles2'):
+    f = ROOT.TFile(path)
+    events = f.events
+    if evnums is None:
+        evnums = np.arange(events.GetEntries())
+    elif isinstance(evnums, int):
+        evnums = [evnums]
+
+    dbuf = np.zeros(shape=(len(evnums), 6))
+    idb = 0
+    for iev in evnums:
+        if iev >= events.GetEntries():
+            print('Error: event {:d} is out of range (0 - {:d})'.format(iev, events.GetEntries() - 1))
+            continue
+
+        events.GetEntry(iev)
+        # extract full mc particle data
+        part = getattr(events, branch)[2]
+        dbuf[idb] = (iev, part.psx, part.psy, part.psz, part.pdgID, part.status)
+        idb += 1
+    return pd.DataFrame(data=dbuf[:idb], columns=['event', 'px', 'py', 'pz', 'pid', 'status'])
+
+
+# load root macros, input is an argument string
+def load_root_macros(arg_macros):
+    for path in arg_macros.split(','):
+        path = path.strip()
+        if os.path.exists(path):
+            ROOT.gROOT.Macro(path)
+        else:
+            print('\"{}\" does not exist, skip loading it.'.format(path))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Visualize the cluster (layer-wise) from analysis')
+    parser.add_argument('file', type=str, help='path to root file')
+    parser.add_argument('-o', type=str, default='.', dest='outdir', help='output directory')
+    # parser.add_argument('--compact', type=str, default='', dest='compact', help='compact file')
+    parser.add_argument('-m', '--macros', type=str, default='rootlogon.C', dest='macros',
+                         help='root macros to load (accept multiple paths separated by \",\")')
+    parser.add_argument('--topo-size', type=float, default=2.0, dest='topo_size',
+                        help='bin size for projection plot (mrad)')
+    parser.add_argument('--topo-range', type=float, default=200.0, dest='topo_range',
+                        help='half range for projection plot (mrad)')
+    parser.add_argument('--shape', type=str, default="20, 20, 3", dest='shape',
+                        help='shape of data (3D), default (20, 20, 3)')
+    args = parser.parse_args()
+
+
+    os.makedirs(args.outdir, exist_ok=True)
+    load_root_macros(args.macros)
+    data = get_ml_data(args.file, shape=[int(i.strip()) for i in args.shape.split(',')])
+    dfmcp = get_mcp_simple(args.file, branch='mcparticles2')
+    # dfmcp = dfmcp[dfmcp['status'] == 24578]
+    dfmcp.loc[:, 'p'] = np.linalg.norm(dfmcp[['px', 'py', 'pz']].values, axis=1)
+    dfmcp['phi'] = np.arctan2(dfmcp['py'].values, dfmcp['px'].values)
+    dfmcp['theta'] = np.arccos(dfmcp['pz'].values/dfmcp['p'].values)
+    dfmcp['eta'] = -np.log(np.tan(dfmcp['theta'].values/2.))
+
+    tag = np.zeros(shape=(data.shape[0], 7))
+    trg = args.topo_range/1000.
+
+    for iev in np.arange(data.shape[0]):
+        ev = data[iev].transpose(2,0,1).reshape(3,-1)
+        truth = dfmcp[dfmcp['event'] == iev]
+        if not truth.shape[0]:
+            continue
+        mask = ev[0] > 0.
+        if not sum(mask):
+            continue
+        eta = ev[1][ev[0] > 0.]
+        phi = ev[2][ev[0] > 0.]
+        # normalize
+        # print(ev.T.reshape(20, 20, 3) == data[iev])
+        ev[0] /= 10.
+        # ev[1][mask] = (ev[1][mask] - eta.mean() + trg)/(2.*trg)
+        # ev[2][mask] = (ev[2][mask] - phi.mean() + trg)/(2.*trg)
+        ev[1][mask] = (ev[1][mask] - truth['eta'].values + trg)/(2.*trg)
+        ev[2][mask] = (ev[2][mask] - truth['phi'].values + trg)/(2.*trg)
+        # ev[1][mask] = (ev[1][mask] + 1.0)/2.
+        # ev[2][mask] = (ev[2][mask] + np.pi)/2./np.pi
+        out_range = (ev[1] > 1.) | (ev[1] < 0.) | (ev[2] > 1.) | (ev[2] < 0.)
+        ev.T[out_range] = (0., 0., 0.)
+        test = data[iev].transpose(2,0,1).reshape(3,-1)
+        tag[iev] = np.hstack([truth[['pid', 'eta', 'phi', 'p']].values[0], (eta.mean(), phi.mean(), trg)])
+        # print(ev[1])
+        # print((eta.shape, phi.shape), (eta.mean(), phi.mean()), truth[['eta', 'phi']].values)
+
+    # momentum
+    mask = tag.T[3] > 0.
+    print('training data shape: {}'.format(data[mask].shape))
+    print('tagging data shape: {}'.format(tag[mask].shape))
+    np.save(os.path.join(args.outdir, 'data.npy'), data[mask])
+    np.save(os.path.join(args.outdir, 'tag.npy'), tag[mask])
+
diff --git a/benchmarks/imaging_shower_ML/sim_rec_tag.py b/benchmarks/imaging_shower_ML/sim_rec_tag.py
new file mode 100755
index 00000000..1b227979
--- /dev/null
+++ b/benchmarks/imaging_shower_ML/sim_rec_tag.py
@@ -0,0 +1,99 @@
+#! /usr/local/bin/python3
+'''
+    A python script to facilitate the ML training benchmark for imaging Calorimeter:
+    simulation -> reconstruction -> prepare tagging data for tensorflow
+    Author: Chao Peng (ANL)
+    Date: 06/20/2021
+'''
+import os
+import sys
+import subprocess
+import argparse
+
+
+default_compact = os.path.join(os.environ.get('DETECTOR_PATH', ''),
+        '{}.xml'.format(os.environ.get('JUGGLER_DETECTOR', '')))
+parser = argparse.ArgumentParser()
+parser.add_argument('-n', '--numberOfEvents', dest='nev', type=int, default=100, help='Number of events to process.')
+parser.add_argument('-t', '--nametag', type=str, default='IMCAL_ML', help='Name tag for output files.')
+parser.add_argument('--seed', type=int, default=-1, help='Random seed to scripts.')
+parser.add_argument('--process', type=str, default='sim, rec, tag', help='Processes to be executed (sim, rec, tag).')
+parser.add_argument('--numberOfLayers', dest='nlayer', type=int, default=20, help='number of layers in ML data.')
+parser.add_argument('--numberOfHits', dest='nhit', type=int, default=20, help='number of hits in ML data.')
+parser.add_argument('--particles', type=str, default='electron', help='Partcile names, separated by \",\".')
+parser.add_argument('--pmin', type=float, default=0.5, help='Minimum momentum of particles.')
+parser.add_argument('--pmax', type=float, default=10, help='Maximum momentum of particles.')
+parser.add_argument('--compact', type=str, default=default_compact, help='Path to detector compact file.')
+
+args = parser.parse_args()
+kwargs = vars(args)
+
+for mdir in ['gen_data', 'sim_data', 'rec_data', 'tag_data']:
+    os.makedirs(mdir, exist_ok=True)
+
+gen_file = os.path.join('gen_data', '{nametag}_{pmin}_{pmax}.hepmc'.format(**kwargs))
+sim_file = os.path.join('sim_data', '{nametag}_{pmin}_{pmax}.root'.format(**kwargs))
+rec_file = os.path.join('rec_data', '{nametag}_{pmin}_{pmax}.root'.format(**kwargs))
+tag_dir = os.path.join('tag_data', '{nametag}_{pmin}_{pmax}'.format(**kwargs))
+os.makedirs(tag_dir, exist_ok=True)
+
+procs = [p.strip() for p in args.process.split(',')]
+sdir = os.path.dirname(os.path.realpath(__file__))
+
+if 'sim' in procs:
+    # generate particles
+    gen_cmd = ['python', os.path.join(sdir, 'scripts', 'gen_particles.py'), gen_file,
+            '-n', '{}'.format(args.nev),
+            '-s', '{}'.format(args.seed),
+            '--angmin', '50', '--angmax', '130',
+            '--pmin', '{}'.format(args.pmin), '--pmax', '{}'.format(args.pmax),
+            '--particles', args.particles]
+    subprocess.run(gen_cmd)
+    # simulation
+    sim_cmd = ['npsim',
+            '--part.minimalKineticEnergy', '1*TeV',
+            '--numberOfEvents', '{}'.format(args.nev),
+            '--runType', 'batch',
+            '--inputFiles', gen_file,
+            '--outputFile', sim_file,
+            '--compact', args.compact,
+            '-v', 'WARNING']
+    if args.seed > 0:
+        sim_cmd += ['--random.seed', args.seed]
+    return_code = subprocess.run(sim_cmd).returncode
+    print(return_code)
+    if return_code is not None and return_code < 0:
+        print("ERROR running simulation!")
+        exit(return_code)
+    subprocess.run(['rootls', '-t', sim_file])
+
+
+if 'rec' in procs:
+    # export to environment variables (used to pass arguments to the option file)
+    os.environ['JUGGLER_SIM_FILE'] = sim_file
+    os.environ['JUGGLER_REC_FILE'] = rec_file
+    os.environ['JUGGLER_COMPACT_PATH'] = args.compact
+    os.environ['JUGGLER_N_EVENTS'] = '{}'.format(args.nev)
+    os.environ['IMCAL_ML_N_LAYERS'] = '{}'.format(args.nlayer)
+    os.environ['IMCAL_ML_N_HITS'] = '{}'.format(args.nhit)
+
+    juggler_xenv = os.path.join(os.environ.get('JUGGLER_INTALL_PREFIX', '../local'), 'Juggler.xenv')
+    rec_cmd = ['xenv', '-x', juggler_xenv, 'gaudirun.py', os.path.join(sdir, 'options', 'imaging_ml_data.py')]
+    return_code = subprocess.run(rec_cmd).returncode
+    print(return_code)
+    if return_code is not None and return_code < 0:
+        print("ERROR running juggler (reconstruction)!")
+        exit(return_code)
+    process = subprocess.run(['rootls', '-t', rec_file])
+
+
+if 'tag' in procs:
+    tag_cmd = ['python', os.path.join(sdir, 'scripts', 'prepare_tf_dataset.py'), rec_file,
+            '-o', tag_dir,
+            '--shape', '{nlayer},{nhit},3'.format(**kwargs)]
+    return_code = subprocess.run(tag_cmd).returncode
+    print(return_code)
+    if return_code is not None and return_code < 0:
+        print("ERROR running ML data tagging!")
+        exit(return_code)
+
-- 
GitLab