From 97d30e8c6efb391856ce6650c22d2dac122aa310 Mon Sep 17 00:00:00 2001
From: Chao Peng <cpeng@anl.gov>
Date: Tue, 22 Jun 2021 21:51:04 +0000
Subject: [PATCH] Add ML benchmark for imaging calorimetry

---
 .gitlab-ci.yml                              |   1 +
 benchmarks/imaging_shower_ML/config.yml     |  42 ++++++
 .../options/imaging_ml_data.py              |  76 +++++++++++
 .../scripts/gen_particles.py                | 104 ++++++++++++++
 .../scripts/prepare_tf_dataset.py           | 128 ++++++++++++++++++
 benchmarks/imaging_shower_ML/sim_rec_tag.py |  99 ++++++++++++++
 6 files changed, 450 insertions(+)
 create mode 100644 benchmarks/imaging_shower_ML/config.yml
 create mode 100644 benchmarks/imaging_shower_ML/options/imaging_ml_data.py
 create mode 100644 benchmarks/imaging_shower_ML/scripts/gen_particles.py
 create mode 100644 benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py
 create mode 100755 benchmarks/imaging_shower_ML/sim_rec_tag.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index dbcc78e1..3c943649 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -61,6 +61,7 @@ include:
   - local: 'benchmarks/clustering/config.yml'
   - local: 'benchmarks/rich/config.yml'
   - local: 'benchmarks/imaging_ecal/config.yml'
+  - local: 'benchmarks/imaging_shower_ML/config.yml'
 
 final_report:
diff --git a/benchmarks/imaging_shower_ML/config.yml b/benchmarks/imaging_shower_ML/config.yml
new file mode 100644
index 00000000..5f2bd4ff
--- /dev/null
+++ b/benchmarks/imaging_shower_ML/config.yml
@@ -0,0 +1,42 @@
+ml_shower:tagging_epi:
+  extends: .rec_benchmark
+  when: manual
+  stage: benchmarks1
+  script:
+    - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_epi -n 100000
+      --particles "electron,pion-,pion-" --pmin 0.5 --pmax 10
+
+ml_shower:tagging_e:
+  extends: .rec_benchmark
+  when: manual
+  stage: benchmarks1
+  script:
+    - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_e -n 100000
+      --particles "electron" --pmin 0.5 --pmax 10
+
+ml_shower:tagging_pi:
+  extends: .rec_benchmark
+  when: manual
+  stage: benchmarks1
+  script:
+    - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_pi -n 100000
+      --particles "pion-" --pmin 0.5 --pmax 10
+
+ml_shower:training:
+  extends: .rec_benchmark
+  when: manual
+  stage: process
+  needs: ["ml_shower:tagging_epi", "ml_shower:tagging_e", "ml_shower:tagging_pi"]
+  script:
+    - ls -lrth
+    # TODO
+
+ml_shower:test:
+  extends: .rec_benchmark
+  when: manual
+  stage: collect
+  needs: ["ml_shower:training"]
+  script:
+    - ls -lrth
+    # TODO
+
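The training and test stages above are placeholders (TODO). As a point of reference only, one possible starting point is a minimal Keras classifier over the data.npy / tag.npy arrays written by scripts/prepare_tf_dataset.py further below; the file locations, the network layout, and the electron-vs-pion labelling in this sketch are assumptions, not something the patch defines:

    # Sketch for the TODO training stage (illustrative only).
    # Assumes data.npy of shape (nev, nlayers, nhits, 3) and tag.npy with
    # columns (pid, eta, phi, p, mean hit eta, mean hit phi, range), as
    # written by scripts/prepare_tf_dataset.py.
    import numpy as np
    import tensorflow as tf

    data = np.load('data.npy')                           # placeholder path
    tag = np.load('tag.npy')                             # placeholder path
    labels = (np.abs(tag[:, 0]) == 11).astype('int32')   # 1 = electron, 0 = pion

    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=data.shape[1:]),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(data, labels, epochs=20, batch_size=256, validation_split=0.2)
    model.save('e_pi_tagger.h5')
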
diff --git a/benchmarks/imaging_shower_ML/options/imaging_ml_data.py b/benchmarks/imaging_shower_ML/options/imaging_ml_data.py
new file mode 100644
index 00000000..4c46af45
--- /dev/null
+++ b/benchmarks/imaging_shower_ML/options/imaging_ml_data.py
@@ -0,0 +1,76 @@
+import os
+import ROOT
+from Gaudi.Configuration import *
+from GaudiKernel import SystemOfUnits as units
+
+from GaudiKernel.DataObjectHandleBase import DataObjectHandleBase
+from Configurables import ApplicationMgr, EICDataSvc, PodioOutput, GeoSvc
+
+from Configurables import PodioInput
+from Configurables import Jug__Base__InputCopier_dd4pod__Geant4ParticleCollection_dd4pod__Geant4ParticleCollection_ as MCCopier
+from Configurables import Jug__Digi__CalorimeterHitDigi as CalorimeterHitDigi
+from Configurables import Jug__Reco__ImagingPixelReco as ImagingPixelReco
+from Configurables import Jug__Reco__ImagingPixelMerger as ImagingPixelMerger
+
+
+# input arguments through environment variables
+kwargs = dict()
+kwargs['input'] = os.environ.get('JUGGLER_SIM_FILE', '')
+kwargs['output'] = os.environ.get('JUGGLER_REC_FILE', '')
+kwargs['compact'] = os.environ.get('JUGGLER_COMPACT_PATH', '')
+kwargs['nev'] = int(os.environ.get('JUGGLER_N_EVENTS', -1))
+kwargs['nlayers'] = int(os.environ.get('IMCAL_ML_N_LAYERS', 20))
+kwargs['nhits'] = int(os.environ.get('IMCAL_ML_N_HITS', 20))
+
+if kwargs['nev'] < 1:
+    f = ROOT.TFile(kwargs['input'])
+    kwargs['nev'] = f.events.GetEntries()
+
+print(kwargs)
+# get sampling fraction from system environment variable, 1.0 by default
+sf = float(os.environ.get('JUGGLER_SAMP_FRAC', '1.0'))
+
+geo_service = GeoSvc('GeoSvc', detectors=[f.strip() for f in kwargs['compact'].split(',')])
+podev = EICDataSvc('EventDataSvc', inputs=[f.strip() for f in kwargs['input'].split(',')])
+
+podin = PodioInput('PodioReader', collections=['mcparticles', 'EcalBarrelHits'])
+podout = PodioOutput('out', filename=kwargs['output'])
+
+copier = MCCopier('MCCopier',
+                  OutputLevel=DEBUG,
+                  inputCollection='mcparticles',
+                  outputCollection='mcparticles2')
+imcaldigi = CalorimeterHitDigi('imcal_digi',
+                               inputHitCollection='EcalBarrelHits',
+                               outputHitCollection='EcalBarrelHitsDigi',
+                               energyResolutions=[0., 0.02, 0.],
+                               dynamicRangeADC=3*units.MeV,
+                               pedestalSigma=40)
+imcalreco = ImagingPixelReco('imcal_reco',
+                             inputHitCollection='EcalBarrelHitsDigi',
+                             outputHitCollection='EcalBarrelHitsReco',
+                             dynamicRangeADC=3.*units.MeV,
+                             pedestalSigma=40,
+                             thresholdFactor=3.5,
+                             readoutClass='EcalBarrelHits',
+                             layerField='layer',
+                             sectorField='module')
+imcaldata = ImagingPixelMerger('imcal_merger',
+                               # OutputLevel=DEBUG,
+                               inputHitCollection='EcalBarrelHitsReco',
+                               outputHitCollection='EcalBarrelHitsML',
+                               etaSize=0.001,
+                               phiSize=0.001,
+                               numberOfHits=kwargs['nhits'],
+                               numberOfLayers=kwargs['nlayers'])
+
+podout.outputCommands = ['keep *']
+
+ApplicationMgr(
+    TopAlg=[podin, copier, imcaldigi, imcalreco, imcaldata, podout],
+    EvtSel='NONE',
+    EvtMax=kwargs['nev'],
+    ExtSvc=[podev],
+    OutputLevel=DEBUG
+)
+
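The options file above reads all of its parameters from environment variables. It is normally driven by sim_rec_tag.py (at the end of this patch), but it can also be run by hand; a minimal sketch, assuming the Juggler environment is already set up (gaudirun.py on the PATH; the CI wraps this in xenv) and using placeholder file names:

    # Hand-driven run of the reconstruction options file (illustrative only).
    import os
    import subprocess

    os.environ['JUGGLER_SIM_FILE'] = 'sim_data/test.root'   # placeholder
    os.environ['JUGGLER_REC_FILE'] = 'rec_data/test.root'   # placeholder
    # same compact-file convention as sim_rec_tag.py below
    os.environ['JUGGLER_COMPACT_PATH'] = os.path.join(
        os.environ.get('DETECTOR_PATH', ''),
        '{}.xml'.format(os.environ.get('JUGGLER_DETECTOR', '')))
    os.environ['JUGGLER_N_EVENTS'] = '1000'
    os.environ['IMCAL_ML_N_LAYERS'] = '20'
    os.environ['IMCAL_ML_N_HITS'] = '20'

    subprocess.run(['gaudirun.py',
                    'benchmarks/imaging_shower_ML/options/imaging_ml_data.py'],
                   check=True)
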
diff --git a/benchmarks/imaging_shower_ML/scripts/gen_particles.py b/benchmarks/imaging_shower_ML/scripts/gen_particles.py
new file mode 100644
index 00000000..2d0b81c8
--- /dev/null
+++ b/benchmarks/imaging_shower_ML/scripts/gen_particles.py
@@ -0,0 +1,104 @@
+import os, sys
+from pyHepMC3 import HepMC3 as hm
+import numpy as np
+import argparse
+
+
+PARTICLES = {
+    "pion0": (111, 0.1349766),        # pi0
+    "pion+": (211, 0.13957018),       # pi+
+    "pion-": (-211, 0.13957018),      # pi-
+    "kaon0": (311, 0.497648),         # K0
+    "kaon+": (321, 0.493677),         # K+
+    "kaon-": (-321, 0.493677),        # K-
+    "proton": (2212, 0.938272),       # proton
+    "neutron": (2112, 0.939565),      # neutron
+    "electron": (11, 0.51099895e-3),  # electron
+    "positron": (-11, 0.51099895e-3), # positron
+    "photon": (22, 0),                # photon
+}
+
+
+def gen_event(p, theta, phi, pid, mass):
+    evt = hm.GenEvent(momentum_unit=hm.Units.MomentumUnit.GEV, length_unit=hm.Units.LengthUnit.MM)
+    # final state
+    state = 1
+    e0 = np.sqrt(p*p + mass*mass)
+    px = np.cos(phi)*np.sin(theta)
+    py = np.sin(phi)*np.sin(theta)
+    pz = np.cos(theta)
+
+    # beam
+    pbeam = hm.GenParticle(hm.FourVector(0, 0, 0, 0.938272), 2212, 4)
+    ebeam = hm.GenParticle(hm.FourVector(0, 0, e0, np.sqrt(e0*e0 + 0.511e-3*0.511e-3)), 11, 4)
+
+    # out particle
+    hout = hm.GenParticle(hm.FourVector(px*p, py*p, pz*p, e0), pid, state)
+
+    # vertex
+    vert = hm.GenVertex()
+    vert.add_particle_in(ebeam)
+    vert.add_particle_in(pbeam)
+    vert.add_particle_out(hout)
+    evt.add_vertex(vert)
+    return evt
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('output', help='path to the output file')
+    parser.add_argument('-n', type=int, default=1000, dest='nev', help='number of events to generate')
+    parser.add_argument('-s', type=int, default=-1, dest='seed', help='seed for random generator')
+    parser.add_argument('--parray', type=str, default="", dest='parray',
+                        help='an array of momenta in GeV, separated by \",\"')
+    parser.add_argument('--pmin', type=float, default=8.0, dest='pmin', help='minimum momentum in GeV')
+    parser.add_argument('--pmax', type=float, default=100.0, dest='pmax', help='maximum momentum in GeV')
+    parser.add_argument('--angmin', type=float, default=0.0, dest='angmin', help='minimum polar angle in degree')
+    parser.add_argument('--angmax', type=float, default=20.0, dest='angmax', help='maximum polar angle in degree')
+    parser.add_argument('--phmin', type=float, default=0.0, dest='phmin', help='minimum azimuthal angle in degree')
+    parser.add_argument('--phmax', type=float, default=360.0, dest='phmax', help='maximum azimuthal angle in degree')
+    parser.add_argument('--particles', type=str, default='electron', dest='particles',
+                        help='particle names, support {}'.format(list(PARTICLES.keys())))
+
+    args = parser.parse_args()
+
+    # random seed (< 0 will get it from environment variable 'SEED', or a system random number)
+    if args.seed < 0:
+        args.seed = int(os.environ.get('SEED', int.from_bytes(os.urandom(4), byteorder='big', signed=False)))
+    print("Random seed is {}".format(args.seed))
+    np.random.seed(args.seed)
+
+    output = hm.WriterAscii(args.output)
+    if output.failed():
+        print("Cannot open file \"{}\"".format(args.output))
+        sys.exit(2)
+
+    # build particle info
+    parts = []
+    for pid in args.particles.split(','):
+        pid = pid.strip()
+        if pid not in PARTICLES.keys():
+            print('particle name \"{}\" not found in dictionary, ignored.'.format(pid))
+            continue
+        parts.append(PARTICLES[pid])
+
+    # p values
+    pvals = np.random.uniform(args.pmin, args.pmax, args.nev) if not args.parray else \
+            np.random.choice([float(p.strip()) for p in args.parray.split(',')], args.nev)
+    thvals = np.random.uniform(args.angmin, args.angmax, args.nev)/180.*np.pi
+    phivals = np.random.uniform(args.phmin, args.phmax, args.nev)/180.*np.pi
+    partvals = [parts[i] for i in np.random.choice(len(parts), args.nev)]
+
+    count = 0
+    for p, theta, phi, (pid, mass) in zip(pvals, thvals, phivals, partvals):
+        if (count % 1000 == 0):
+            print("Generated {} events".format(count), end='\r')
+        evt = gen_event(p, theta, phi, pid, mass)
+        output.write_event(evt)
+        evt.clear()
+        count += 1
+
+    print("Generated {} events".format(args.nev))
+    output.close()
+
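A quick way to sanity-check the generator output before feeding it to npsim, assuming the same pyHepMC3 bindings used above (the file name is a placeholder):

    # Read back a generated HepMC3 ASCII file and print the final-state particles.
    from pyHepMC3 import HepMC3 as hm

    reader = hm.ReaderAscii('gen_data/test.hepmc')   # placeholder path
    evt = hm.GenEvent()
    nev = 0
    while True:
        reader.read_event(evt)
        if reader.failed():
            break
        for part in evt.particles():
            if part.status() == 1:   # final-state particle written by gen_particles.py
                mom = part.momentum()
                print(nev, part.pid(), mom.e())
        evt.clear()
        nev += 1
    reader.close()
    print('read {} events'.format(nev))
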
diff --git a/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py b/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py
new file mode 100644
index 00000000..2b722a45
--- /dev/null
+++ b/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py
@@ -0,0 +1,128 @@
+import ROOT
+import os
+import numpy as np
+import pandas as pd
+import argparse
+import sys
+
+
+# read hits data from root file
+def get_ml_data(path, evnums=None, shape=(20, 20, 3), branch='EcalBarrelHitsML'):
+    f = ROOT.TFile(path)
+    events = f.events
+    if evnums is None:
+        evnums = np.arange(events.GetEntries())
+    elif isinstance(evnums, int):
+        evnums = [evnums]
+
+    dbuf = np.zeros(shape=np.hstack([len(evnums), shape]))
+    idb = 0
+    for iev in evnums:
+        if iev >= events.GetEntries():
+            print('Error: event {:d} is out of range (0 - {:d})'.format(iev, events.GetEntries() - 1))
+            continue
+
+        events.GetEntry(iev)
+        for hit in getattr(events, branch):
+            dbuf[iev][hit.layerID][hit.hitID] = (hit.edep, hit.eta, hit.polar.phi)
+
+    return dbuf
+
+
+# read mc particles from root file
+# (uses the truth particle stored at index 2 of each event)
+def get_mcp_simple(path, evnums=None, branch='mcparticles2'):
+    f = ROOT.TFile(path)
+    events = f.events
+    if evnums is None:
+        evnums = np.arange(events.GetEntries())
+    elif isinstance(evnums, int):
+        evnums = [evnums]
+
+    dbuf = np.zeros(shape=(len(evnums), 6))
+    idb = 0
+    for iev in evnums:
+        if iev >= events.GetEntries():
+            print('Error: event {:d} is out of range (0 - {:d})'.format(iev, events.GetEntries() - 1))
+            continue
+
+        events.GetEntry(iev)
+        # extract full mc particle data
+        part = getattr(events, branch)[2]
+        dbuf[idb] = (iev, part.psx, part.psy, part.psz, part.pdgID, part.status)
+        idb += 1
+    return pd.DataFrame(data=dbuf[:idb], columns=['event', 'px', 'py', 'pz', 'pid', 'status'])
+
+
+# load root macros, input is an argument string
+def load_root_macros(arg_macros):
+    for path in arg_macros.split(','):
+        path = path.strip()
+        if os.path.exists(path):
+            ROOT.gROOT.Macro(path)
+        else:
+            print('\"{}\" does not exist, skip loading it.'.format(path))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Visualize the cluster (layer-wise) from analysis')
+    parser.add_argument('file', type=str, help='path to root file')
+    parser.add_argument('-o', type=str, default='.', dest='outdir', help='output directory')
+    # parser.add_argument('--compact', type=str, default='', dest='compact', help='compact file')
+    parser.add_argument('-m', '--macros', type=str, default='rootlogon.C', dest='macros',
+                        help='root macros to load (accept multiple paths separated by \",\")')
+    parser.add_argument('--topo-size', type=float, default=2.0, dest='topo_size',
+                        help='bin size for projection plot (mrad)')
+    parser.add_argument('--topo-range', type=float, default=200.0, dest='topo_range',
+                        help='half range for projection plot (mrad)')
+    parser.add_argument('--shape', type=str, default="20, 20, 3", dest='shape',
+                        help='shape of data (3D), default (20, 20, 3)')
+    args = parser.parse_args()
+
+
+    os.makedirs(args.outdir, exist_ok=True)
+    load_root_macros(args.macros)
+    data = get_ml_data(args.file, shape=[int(i.strip()) for i in args.shape.split(',')])
+    dfmcp = get_mcp_simple(args.file, branch='mcparticles2')
+    # dfmcp = dfmcp[dfmcp['status'] == 24578]
+    dfmcp.loc[:, 'p'] = np.linalg.norm(dfmcp[['px', 'py', 'pz']].values, axis=1)
+    dfmcp['phi'] = np.arctan2(dfmcp['py'].values, dfmcp['px'].values)
+    dfmcp['theta'] = np.arccos(dfmcp['pz'].values/dfmcp['p'].values)
+    dfmcp['eta'] = -np.log(np.tan(dfmcp['theta'].values/2.))
+
+    tag = np.zeros(shape=(data.shape[0], 7))
+    trg = args.topo_range/1000.
+
+    for iev in np.arange(data.shape[0]):
+        ev = data[iev].transpose(2,0,1).reshape(3,-1)
+        truth = dfmcp[dfmcp['event'] == iev]
+        if not truth.shape[0]:
+            continue
+        mask = ev[0] > 0.
+        if not sum(mask):
+            continue
+        eta = ev[1][ev[0] > 0.]
+        phi = ev[2][ev[0] > 0.]
+        # normalize
+        # print(ev.T.reshape(20, 20, 3) == data[iev])
+        ev[0] /= 10.
+        # ev[1][mask] = (ev[1][mask] - eta.mean() + trg)/(2.*trg)
+        # ev[2][mask] = (ev[2][mask] - phi.mean() + trg)/(2.*trg)
+        ev[1][mask] = (ev[1][mask] - truth['eta'].values + trg)/(2.*trg)
+        ev[2][mask] = (ev[2][mask] - truth['phi'].values + trg)/(2.*trg)
+        # ev[1][mask] = (ev[1][mask] + 1.0)/2.
+        # ev[2][mask] = (ev[2][mask] + np.pi)/2./np.pi
+        out_range = (ev[1] > 1.) | (ev[1] < 0.) | (ev[2] > 1.) | (ev[2] < 0.)
+        ev.T[out_range] = (0., 0., 0.)
+        test = data[iev].transpose(2,0,1).reshape(3,-1)
+        tag[iev] = np.hstack([truth[['pid', 'eta', 'phi', 'p']].values[0], (eta.mean(), phi.mean(), trg)])
+        # print(ev[1])
+        # print((eta.shape, phi.shape), (eta.mean(), phi.mean()), truth[['eta', 'phi']].values)
+
+    # momentum
+    mask = tag.T[3] > 0.
+    print('training data shape: {}'.format(data[mask].shape))
+    print('tagging data shape: {}'.format(tag[mask].shape))
+    np.save(os.path.join(args.outdir, 'data.npy'), data[mask])
+    np.save(os.path.join(args.outdir, 'tag.npy'), tag[mask])
+
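For later stages, the two arrays written above have the layout sketched below (the directory name is a placeholder):

    # Inspect the prepared TensorFlow input (illustrative only).
    import numpy as np

    data = np.load('tag_data/test/data.npy')   # placeholder path
    tag = np.load('tag_data/test/tag.npy')

    # data: (nev, nlayers, nhits, 3) -> (edep/10, eta, phi) per hit, with eta/phi
    #       normalized to a window of +- topo_range around the truth direction
    # tag:  (nev, 7) -> (pid, truth eta, truth phi, truth p,
    #                    mean hit eta, mean hit phi, topo_range)
    print(data.shape, tag.shape)
    print(tag[:5])
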
diff --git a/benchmarks/imaging_shower_ML/sim_rec_tag.py b/benchmarks/imaging_shower_ML/sim_rec_tag.py
new file mode 100755
index 00000000..1b227979
--- /dev/null
+++ b/benchmarks/imaging_shower_ML/sim_rec_tag.py
@@ -0,0 +1,99 @@
+#! /usr/local/bin/python3
+'''
+    A python script to facilitate the ML training benchmark for the imaging calorimeter:
+    simulation -> reconstruction -> prepare tagging data for tensorflow
+    Author: Chao Peng (ANL)
+    Date: 06/20/2021
+'''
+import os
+import sys
+import subprocess
+import argparse
+
+
+default_compact = os.path.join(os.environ.get('DETECTOR_PATH', ''),
+                               '{}.xml'.format(os.environ.get('JUGGLER_DETECTOR', '')))
+parser = argparse.ArgumentParser()
+parser.add_argument('-n', '--numberOfEvents', dest='nev', type=int, default=100, help='Number of events to process.')
+parser.add_argument('-t', '--nametag', type=str, default='IMCAL_ML', help='Name tag for output files.')
+parser.add_argument('--seed', type=int, default=-1, help='Random seed to scripts.')
+parser.add_argument('--process', type=str, default='sim, rec, tag', help='Processes to be executed (sim, rec, tag).')
+parser.add_argument('--numberOfLayers', dest='nlayer', type=int, default=20, help='number of layers in ML data.')
+parser.add_argument('--numberOfHits', dest='nhit', type=int, default=20, help='number of hits in ML data.')
+parser.add_argument('--particles', type=str, default='electron', help='Particle names, separated by \",\".')
+parser.add_argument('--pmin', type=float, default=0.5, help='Minimum momentum of particles.')
+parser.add_argument('--pmax', type=float, default=10, help='Maximum momentum of particles.')
+parser.add_argument('--compact', type=str, default=default_compact, help='Path to detector compact file.')
+
+args = parser.parse_args()
+kwargs = vars(args)
+
+for mdir in ['gen_data', 'sim_data', 'rec_data', 'tag_data']:
+    os.makedirs(mdir, exist_ok=True)
+
+gen_file = os.path.join('gen_data', '{nametag}_{pmin}_{pmax}.hepmc'.format(**kwargs))
+sim_file = os.path.join('sim_data', '{nametag}_{pmin}_{pmax}.root'.format(**kwargs))
+rec_file = os.path.join('rec_data', '{nametag}_{pmin}_{pmax}.root'.format(**kwargs))
+tag_dir = os.path.join('tag_data', '{nametag}_{pmin}_{pmax}'.format(**kwargs))
+os.makedirs(tag_dir, exist_ok=True)
+
+procs = [p.strip() for p in args.process.split(',')]
+sdir = os.path.dirname(os.path.realpath(__file__))
+
+if 'sim' in procs:
+    # generate particles
+    gen_cmd = ['python', os.path.join(sdir, 'scripts', 'gen_particles.py'), gen_file,
+               '-n', '{}'.format(args.nev),
+               '-s', '{}'.format(args.seed),
+               '--angmin', '50', '--angmax', '130',
+               '--pmin', '{}'.format(args.pmin), '--pmax', '{}'.format(args.pmax),
+               '--particles', args.particles]
+    subprocess.run(gen_cmd)
+    # simulation
+    sim_cmd = ['npsim',
+               '--part.minimalKineticEnergy', '1*TeV',
+               '--numberOfEvents', '{}'.format(args.nev),
+               '--runType', 'batch',
+               '--inputFiles', gen_file,
+               '--outputFile', sim_file,
+               '--compact', args.compact,
+               '-v', 'WARNING']
+    if args.seed > 0:
+        sim_cmd += ['--random.seed', '{}'.format(args.seed)]
+    return_code = subprocess.run(sim_cmd).returncode
+    print(return_code)
+    if return_code is not None and return_code < 0:
+        print("ERROR running simulation!")
+        exit(return_code)
+    subprocess.run(['rootls', '-t', sim_file])
+
+
+if 'rec' in procs:
+    # export to environment variables (used to pass arguments to the option file)
+    os.environ['JUGGLER_SIM_FILE'] = sim_file
+    os.environ['JUGGLER_REC_FILE'] = rec_file
+    os.environ['JUGGLER_COMPACT_PATH'] = args.compact
+    os.environ['JUGGLER_N_EVENTS'] = '{}'.format(args.nev)
+    os.environ['IMCAL_ML_N_LAYERS'] = '{}'.format(args.nlayer)
+    os.environ['IMCAL_ML_N_HITS'] = '{}'.format(args.nhit)
+
+    juggler_xenv = os.path.join(os.environ.get('JUGGLER_INTALL_PREFIX', '../local'), 'Juggler.xenv')
+    rec_cmd = ['xenv', '-x', juggler_xenv, 'gaudirun.py', os.path.join(sdir, 'options', 'imaging_ml_data.py')]
+    return_code = subprocess.run(rec_cmd).returncode
+    print(return_code)
+    if return_code is not None and return_code < 0:
+        print("ERROR running juggler (reconstruction)!")
+        exit(return_code)
+    process = subprocess.run(['rootls', '-t', rec_file])
+
+
+if 'tag' in procs:
+    tag_cmd = ['python', os.path.join(sdir, 'scripts', 'prepare_tf_dataset.py'), rec_file,
+               '-o', tag_dir,
+               '--shape', '{nlayer},{nhit},3'.format(**kwargs)]
+    return_code = subprocess.run(tag_cmd).returncode
+    print(return_code)
+    if return_code is not None and return_code < 0:
+        print("ERROR running ML data tagging!")
+        exit(return_code)
+
-- 
GitLab