diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bcccd797d90fcfefbedd54c15b899dace82e4062..1739e4cc03d69d725cded7bba5e630c3895c4531 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -102,7 +102,7 @@ common:detector: - runner_system_failure interruptible: true -include: - local: 'benchmarks/ecal/config.yml' - local: 'benchmarks/track_finding/config.yml' - local: 'benchmarks/track_fitting/config.yml' diff --git a/benchmarks/imaging_shower_ML/config.yml b/benchmarks/imaging_shower_ML/config.yml index 15730f2cf08ac161074756ebc420685176502bb2..5d4429b835ec4a99085885fd59ce57bee48f92e4 100644 --- a/benchmarks/imaging_shower_ML/config.yml +++ b/benchmarks/imaging_shower_ML/config.yml @@ -1,4 +1,4 @@ -ml_shower:tagging_epimuphka_100: +ml_shower:epi_separation: extends: .rec_benchmark stage: benchmarks1 script: @@ -7,74 +7,7 @@ ml_shower:tagging_epimuphka_100: - | if [[ ${DETECTOR} =~ athena || ${DETECTOR} =~ ecce && ${DETECTOR_CONFIG} =~ imaging ]] ; then - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_epimuphka_100 -n 100 --particles "electron,pion-,muon,photon,kaon-" --pmin 0.5 --pmax 10 + pip3 install -r benchmarks/imaging_shower_ML/requirements.txt + python3 benchmarks/imaging_shower_ML/run_benchmark.py -t imcal_epi -n 1000 --p-min 1.8 --p-max 2.2 --no-epcut fi -ml_shower:tagging_epimuphka: - extends: .rec_benchmark - when: manual - stage: benchmarks1 - script: - - ls -hal - - | - if [[ ${DETECTOR} =~ athena - || ${DETECTOR} =~ ecce && ${DETECTOR_CONFIG} =~ imaging ]] ; then - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_epimuphka -n 10000 --particles "electron,pion-,muon,photon,kaon-" --pmin 0.5 --pmax 10 - fi - -ml_shower:tagging_e: - extends: .rec_benchmark - when: manual - stage: benchmarks1 - script: - - | - if [[ ${DETECTOR} =~ athena - || ${DETECTOR} =~ ecce && ${DETECTOR_CONFIG} =~ imaging ]] ; then - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_e -n 100 --particles "electron" --pmin 0.5 --pmax 10 - fi - -ml_shower:tagging_pi: - extends: .rec_benchmark - when: manual - stage: benchmarks1 - script: - - | - if [[ ${DETECTOR} =~ athena - || ${DETECTOR} =~ ecce && ${DETECTOR_CONFIG} =~ imaging ]] ; then - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_pi -n 100 --particles "pion-" --pmin 0.5 --pmax 10 - fi - -ml_shower:training_100: - extends: .rec_benchmark - stage: process - needs: ["ml_shower:tagging_epimuphka_100"]#, "ml_shower:tagging_e", "ml_shower:tagging_pi"] - script: - - python3 -m pip install tensorflow particle - - | - if [[ ${DETECTOR} =~ athena - || ${DETECTOR} =~ ecce && ${DETECTOR_CONFIG} =~ imaging ]] ; then - python3 benchmarks/imaging_shower_ML/scripts/ml_training.py -t imcal_epimuphka_100 --pmin 0.5 --pmax 10 - fi - -ml_shower:training: - extends: .rec_benchmark - stage: process - when: manual - needs: ["ml_shower:tagging_epimuphka"]#, "ml_shower:tagging_e", "ml_shower:tagging_pi"] - script: - - python3 -m pip install tensorflow particle - - | - if [[ ${DETECTOR} =~ athena - || ${DETECTOR} =~ ecce && ${DETECTOR_CONFIG} =~ imaging ]] ; then - python3 benchmarks/imaging_shower_ML/scripts/ml_training.py -t imcal_epimuphka --pmin 0.5 --pmax 10 - fi - -ml_shower:test: - extends: .rec_benchmark - when: manual - stage: collect - needs: ["ml_shower:training"] - script: - - ls -lrth - # TODO - diff --git a/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py b/benchmarks/imaging_shower_ML/deprecated/prepare_tf_dataset.py similarity index 100% rename from
benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py rename to benchmarks/imaging_shower_ML/deprecated/prepare_tf_dataset.py diff --git a/benchmarks/imaging_shower_ML/sim_rec_tag.py b/benchmarks/imaging_shower_ML/deprecated/sim_rec_tag.py similarity index 100% rename from benchmarks/imaging_shower_ML/sim_rec_tag.py rename to benchmarks/imaging_shower_ML/deprecated/sim_rec_tag.py diff --git a/benchmarks/imaging_shower_ML/options/imaging_ml_data.py b/benchmarks/imaging_shower_ML/options/imaging_ml_data.py index 096f68da5e6269b0e44a5c66b78fa7be14de372f..15a9d5b793815a965ab82e30c54b93bc78bf1a2d 100644 --- a/benchmarks/imaging_shower_ML/options/imaging_ml_data.py +++ b/benchmarks/imaging_shower_ML/options/imaging_ml_data.py @@ -9,9 +9,9 @@ from GaudiKernel.SystemOfUnits import MeV, GeV, mm, cm, mrad from Configurables import Jug__Digi__CalorimeterHitDigi as CalHitDigi from Configurables import Jug__Reco__CalorimeterHitReco as CalHitReco from Configurables import Jug__Reco__CalorimeterHitsMerger as CalHitsMerger -from Configurables import Jug__Reco__ImagingPixelMerger as MLDataMerger -from Configurables import Jug__Reco__ImagingPixelDataSorter as MLDataSorter -from Configurables import Jug__Reco__ImagingPixelDataCombiner as MLDataCombiner +# from Configurables import Jug__Reco__ImagingPixelMerger as MLDataMerger +# from Configurables import Jug__Reco__ImagingPixelDataSorter as MLDataSorter +# from Configurables import Jug__Reco__ImagingPixelDataCombiner as MLDataCombiner # input arguments through environment variables @@ -20,9 +20,6 @@ kwargs['input'] = os.environ.get('JUGGLER_SIM_FILE', '') kwargs['output'] = os.environ.get('JUGGLER_REC_FILE', '') kwargs['compact'] = os.environ.get('JUGGLER_COMPACT_PATH', '') kwargs['nev'] = int(os.environ.get('JUGGLER_N_EVENTS', -1)) -kwargs['combine'] = os.environ.get('IMCAL_ML_COMBINE', 'concatenate') -kwargs['img_nlayers'] = int(os.environ.get('IMCAL_ML_IMG_NLAYERS', 9)) -kwargs['nhits'] = int(os.environ.get('IMCAL_ML_NHITS', 20)) if kwargs['nev'] < 1: f = ROOT.TFile(kwargs['input']) @@ -34,56 +31,66 @@ sf = float(os.environ.get('JUGGLER_SAMP_FRAC', '1.0')) geo_service = GeoSvc('GeoSvc', detectors=[f.strip() for f in kwargs['compact'].split(',')]) podev = EICDataSvc('EventDataSvc', inputs=[f.strip() for f in kwargs['input'].split(',')]) -podin = PodioInput('PodioReader', collections=['MCParticles', 'EcalBarrelImagingHits', 'EcalBarrelImagingHitsContributions', 'EcalBarrelScFiHits', 'EcalBarrelScFiHitsContributions']) +podin = PodioInput('PodioReader', collections=[ + 'MCParticles', + 'EcalBarrelImagingHits', + 'EcalBarrelImagingHitsContributions', + 'EcalBarrelScFiHits', + 'EcalBarrelScFiHitsContributions' + ]) podout = PodioOutput('out', filename=kwargs['output']) -# Central Barrel Ecal (Imaging Cal.) 
+# Barrel Ecal AstroPix becal_img_daq = dict( dynamicRangeADC=3*MeV, - capacityADC=8192, + capacityADC=2**13, pedestalMean=400, - pedestalSigma=20) # about 6 keV + pedestalSigma=20 + ) # about 6 keV pedestal -becal_img_digi = CalHitDigi('becal_img_digi', +becal_img_digi = CalHitDigi( + 'becal_img_digi', inputHitCollection='EcalBarrelImagingHits', outputHitCollection='EcalBarrelImagingHitsDigi', energyResolutions=[0., 0.02, 0.], # 2% flat resolution - **becal_img_daq) + **becal_img_daq + ) -becal_img_reco = CalHitReco('becal_img_reco', +becal_img_reco = CalHitReco( + 'becal_img_reco', inputHitCollection=becal_img_digi.outputHitCollection, outputHitCollection='EcalBarrelImagingHitsReco', thresholdFactor=3, # about 20 keV readoutClass='EcalBarrelImagingHits', # readout class layerField='layer', # field to get layer id sectorField='module', # field to get sector id - **becal_img_daq) + **becal_img_daq + ) -becal_img_merger = MLDataMerger('becal_img_merger', +""" +becal_img_merger = MLDataMerger( + 'becal_img_merger', inputHits=becal_img_reco.outputHitCollection, outputHits='EcalBarrelImagingHitsSeg', etaSize=0.001, - phiSize=0.001) + phiSize=0.001 + ) becal_img_sorter = MLDataSorter('becal_img_sorter', inputHitCollection=becal_img_merger.outputHits, outputHitCollection='EcalBarrelImagingHitsML', - numberOfLayers=kwargs['img_nlayers'], - numberOfHits=kwargs['nhits']) + numberOfLayers=6, + numberOfHits=20 + ) +""" - -#Central ECAL SciFi +# Barrel ECAL ScFi # use the same daq_setting for digi/reco pair becal_scfi_daq = dict( dynamicRangeADC=50.*MeV, - capacityADC=32768, + capacityADC=2**15, pedestalMean=400, pedestalSigma=10) -# becal_scfi_daq = dict( -# dynamicRangeADC=50.*MeV, -# capacityADC=2**16, -# pedestalMean=100, -# pedestalSigma=0) becal_scfi_digi = CalHitDigi('becal_scfi_digi', inputHitCollection='EcalBarrelScFiHits', @@ -105,25 +112,25 @@ becal_scfi_merger = CalHitsMerger('becal_scfi_merger', # OutputLevel=DEBUG, inputHitCollection=becal_scfi_reco.outputHitCollection, outputHitCollection='EcalBarrelScFiGridReco', - fields=['fiber'], - fieldRefNumbers=[1], + fields=['fiber', 'z'], + fieldRefNumbers=[1, 1], readoutClass='EcalBarrelScFiHits') +""" becal_scfi_sorter = MLDataSorter('becal_scfi_sorter', inputHitCollection=becal_scfi_merger.outputHitCollection, outputHitCollection='EcalBarrelScFiHitsML', numberOfLayers=20, - numberOfHits=kwargs['nhits']) + numberOfHits=20) # combine layers becal_combiner = MLDataCombiner('becal_combiner', inputHits1=becal_img_sorter.outputHitCollection, inputHits2=becal_scfi_sorter.outputHitCollection, - outputHits='EcalBarrelImagingHitsCombinedML', + outputHits='EcalBarrelCombinedHitsML', layerIncrement=100, - rule=kwargs['combine']) - - + rule='concatenate') +""" podout.outputCommands = [ # 'keep *', @@ -135,11 +142,15 @@ podout.outputCommands = [ ] ApplicationMgr( - TopAlg=[podin, - becal_img_digi, becal_img_reco, becal_img_merger, becal_img_sorter, - becal_scfi_digi, becal_scfi_reco, becal_scfi_merger, becal_scfi_sorter, - becal_combiner, - podout], + TopAlg=[ + podin, + becal_img_digi, becal_img_reco, + # becal_img_merger, becal_img_sorter, + becal_scfi_digi, becal_scfi_reco, becal_scfi_merger, + # becal_scfi_sorter, + # becal_combiner, + podout + ], EvtSel='NONE', EvtMax=kwargs['nev'], ExtSvc=[podev], diff --git a/benchmarks/imaging_shower_ML/requirements.txt b/benchmarks/imaging_shower_ML/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4088c9a02f807b1d8433fc29813c14f0519d7601 --- /dev/null +++ 
b/benchmarks/imaging_shower_ML/requirements.txt @@ -0,0 +1,2 @@ +tensorflow >= 2.10.0 +tables >= 3.7.0 diff --git a/benchmarks/imaging_shower_ML/run_benchmark.py b/benchmarks/imaging_shower_ML/run_benchmark.py new file mode 100755 index 0000000000000000000000000000000000000000..2b2ee476290dd5c6c52d54beb02899f454eaabd6 --- /dev/null +++ b/benchmarks/imaging_shower_ML/run_benchmark.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +''' + A Python script to facilitate the ML benchmarks for e/pi separation with the imaging calorimeter (single particles). + This process follows the steps below: + 1. Simulation to generate training samples + 2. Study and apply E/p cut to reduce the training samples + 3. Train and test ML models with the "cleaned" (after E/p cut) samples + 4. Benchmark the performance + + Author: Chao Peng (ANL) + Date: 11/11/2022 +''' +import os +import sys +import subprocess +import argparse + + +SDIR = os.path.dirname(os.path.realpath(__file__)) +# {var} placeholders are filled from args +FILE_NAMES = dict( + gen_script = os.path.join(SDIR, 'scripts', 'gen_particles.py'), + rec_script = os.path.join(SDIR, 'options', 'imaging_ml_data.py'), + epscan_script = os.path.join(SDIR, 'scripts', 'epcut_scan.py'), + prep_script = os.path.join(SDIR, 'scripts', 'ml_data_preprocess.py'), + ml_script = os.path.join(SDIR, 'scripts', 'ml_training.py'), + + sim_dir = os.path.join('{outdir}', 'sim_data'), + epscan_dir = os.path.join('{outdir}', 'epcut'), + ml_dir = os.path.join('{outdir}', 'ml_result'), + + gen_file = os.path.join('{outdir}', 'sim_data', '{ntag}_gen.hepmc'), + sim_file = os.path.join('{outdir}', 'sim_data', '{ntag}_sim.edm4hep.root'), + rec_file = os.path.join('{outdir}', 'sim_data', '{ntag}_rec.root'), + ml_file = os.path.join('{outdir}', 'sim_data', '{ntag}_data.h5'), +) +# default values for argument parser +DEFAULT_COMPACT = os.path.join( + os.environ.get('DETECTOR_PATH', ''), + '{}.xml'.format(os.environ.get('DETECTOR_CONFIG', '')) + ) +# defined steps +SCRIPT_STEPS = ( + 'sim', # step 1: simulation to generate samples + 'epscan', # step 2: e/p cut scan to find the optimized cut + 'prep', # step 3: prepare tagging data (apply e/p cut) + 'train', # step 4: train ML model and benchmark it + # 'bmk', # step 5: benchmarking +) + + +# simulation and reconstruction +def gen_sim_rec(**kwargs): + # generate particles + gen_cmd = [ + 'python {gen_script} {gen_file}', + '-n {nev}', + '-s {seed}', + '--angmin {angmin} --angmax {angmax}', + '--pmin {pmin} --pmax {pmax}', + '--particles {particles}', + ] + gen_cmd = ' '.join(gen_cmd).format(**kwargs).split(' ') + subprocess.run(gen_cmd) + + # simulation + sim_cmd = [ + 'ddsim --runType batch --part.minimalKineticEnergy 1*TeV --filter.tracker edep0', + '-v WARNING', + '--numberOfEvents {nev}', + # '--physics.list {physics_list}', + '--inputFiles {gen_file}', + '--outputFile {sim_file}', + '--compact {compact}', + ] + if 'seed' in kwargs and kwargs['seed'] > 0: + sim_cmd += ['--random.seed {seed}'] + sim_cmd = ' '.join(sim_cmd).format(**kwargs).split(' ') + return_code = subprocess.run(sim_cmd).returncode + print(return_code) + if return_code is not None and return_code < 0: + print("ERROR running simulation!") + exit(return_code) + subprocess.run(['rootls', '-t', kwargs['sim_file']]) + + # reconstruction with juggler + # export to environment variables (used to pass arguments to the option file) + run_env = os.environ.copy() + juggler_vars = [ + 'JUGGLER_SIM_FILE {sim_file}', + 'JUGGLER_REC_FILE {rec_file}', + 'JUGGLER_COMPACT_PATH {compact}',
'JUGGLER_N_EVENTS {nev}', + ] + lst = ' '.join(juggler_vars).format(**kwargs).split(' ') + run_env.update({lst[i]: lst[i + 1] for i in range(0, len(lst), 2)}) + + rec_cmd = 'gaudirun.py {rec_script}'.format(**kwargs).split(' ') + print(rec_cmd) + return_code = subprocess.run(rec_cmd, env=run_env).returncode + print(return_code) + if return_code is not None and return_code < 0: + print("ERROR running juggler (reconstruction)!") + exit(return_code) + process = subprocess.run(['rootls', '-t', kwargs['rec_file']]) + + + +if __name__ == '__main__': + + # argument parser + parser = argparse.ArgumentParser() + + parser.add_argument( + '-n', '--n-events', type=int, + dest='nev', + default=100, + help='number of events.' + ) + parser.add_argument( + '-o', '--outdir', type=str, + dest='outdir', + default='sim_output', + help='output directory.' + ) + parser.add_argument( + '-t', '--name-tag', type=str, + dest='ntag', + default='imcal_ml', + help='name tag for output files.' + ) + parser.add_argument( + '-c', '--compact', type=str, + dest='compact', + default=DEFAULT_COMPACT, + help='path to detector compact file.' + ) + parser.add_argument( + '-s', '--seed', type=int, + default=-1, + help='random seed passed to child scripts (only used if > 0).' + ) + parser.add_argument( + '--batch-size', type=int, + dest='batch', + default=100000, + help='batch size to process data.' + ) + parser.add_argument( + '--p-min', type=float, + dest='pmin', + default=1.5, + help='minimum momentum of particles.' + ) + parser.add_argument( + '--p-max', type=float, + dest='pmax', + default=2.5, + help='maximum momentum of particles.' + ) + parser.add_argument( + '--angle-min', type=float, + dest='angmin', + default=75, + help='minimum scattering angle of particles.' + ) + parser.add_argument( + '--angle-max', type=float, + dest='angmax', + default=105, + help='maximum scattering angle of particles.' + ) + parser.add_argument( + '--weight-pion', type=float, + dest='wpi', + default=10., + help='weights for pions.' + ) + parser.add_argument( + '--particles', type=str, + default='electron,pion-', + help='particle names, separated by \",\".' + ) + parser.add_argument( + '--test-size', type=float, + default=0.2, + help='relative size of test samples (between 0-1).' + ) + parser.add_argument( + '--no-epcut', action='store_true', + default=False, + help='do not apply E/p cut to ML samples; may be useful for test runs with small samples.'
+ ) + parser.add_argument( + '--steps', type=str, + default=', '.join(SCRIPT_STEPS), + help='FOR DEV: choose the steps to be executed ({}).'.format(', '.join(SCRIPT_STEPS)) + ) + + args = parser.parse_args() + kwargs = vars(args) + + # prepare + steps = [p.strip() for p in args.steps.split(',')] + + # make dirs, add paths to kwargs + FILE_NAMES.update({key: val.format(**kwargs) for key, val in FILE_NAMES.items()}) + for key, val in FILE_NAMES.items(): + if key.endswith('_dir'): + os.makedirs(val, exist_ok=True) + kwargs.update(FILE_NAMES) + + # simulation for benchmark samples + if SCRIPT_STEPS[0] in steps: + gen_sim_rec(**kwargs) + + # ep cut scan + if SCRIPT_STEPS[1] in steps: + cmd = [ + 'python {epscan_script} {rec_file}', + '-o {epscan_dir} --name-tag {ntag}', + '--batch-size {batch}', + '-s {seed}', + '-e 0.97' + ] + cmd = ' '.join(cmd).format(**kwargs).split(' ') + return_code = subprocess.run(cmd).returncode + if return_code is not None and return_code < 0: + print("ERROR running epcut scan script!") + exit(return_code) + + # prepare ML data + if SCRIPT_STEPS[2] in steps: + cmd = [ + 'python {prep_script} {rec_file}', + '-o {ml_file}', + '--batch-size {batch}', + '-s {seed}', + '--nhits 50', + ] + if not args.no_epcut: + cmd.append('--epcut {epscan_dir}/{ntag}_result.json') + cmd = ' '.join(cmd).format(**kwargs).split(' ') + return_code = subprocess.run(cmd).returncode + if return_code is not None and return_code < 0: + print("ERROR running data preprocessing script!") + exit(return_code) + + # train ML model and benchmarking + if SCRIPT_STEPS[3] in steps: + cmd = [ + 'python {ml_script} {ml_file}', + '-o {ml_dir} --name-tag {ntag}', + '--weight-pion-minus {wpi}', + '-s {seed}', + '--batch-size 256', + '--epochs 20', + '--test-size {test_size}', + ] + cmd = ' '.join(cmd).format(**kwargs).split(' ') + return_code = subprocess.run(cmd).returncode + if return_code is not None and return_code < 0: + print("ERROR running ML training script!") + exit(return_code) + + # TODO: collect the two-step results (e/p cut and ML) + diff --git a/benchmarks/imaging_shower_ML/scripts/check_edep_dists.py b/benchmarks/imaging_shower_ML/scripts/check_edep_dists.py deleted file mode 100644 index 8e4932cdbb060fa169e41a6599476e14ef677b45..0000000000000000000000000000000000000000 --- a/benchmarks/imaging_shower_ML/scripts/check_edep_dists.py +++ /dev/null @@ -1,135 +0,0 @@ -import ROOT -import os -import gc -import numpy as np -import pandas as pd -import argparse -import sys -import matplotlib.pyplot as plt - - -# read from RDataFrame and flatten a given collection, return pandas dataframe -def flatten_collection(rdf, collection, cols=None, event_colname='event'): - if not cols: - cols = [str(c) for c in rdf.GetColumnNames() if str(c).startswith('{}.'.format(collection))] - else: - cols = ['{}.{}'.format(collection, c) for c in cols] - if not cols: - print('cannot find any branch under collection {}'.format(collection)) - return pd.DataFrame() - - data = rdf.AsNumpy(cols) - # flatten the data, add an event id to identify clusters from different events - evns = [] - for i, vec in enumerate(data[cols[0]]): - evns += [i]*vec.size() - for n, vals in data.items(): - # make sure ints are not converted to floats - typename = vals[0].__class__.__name__.lower() - dtype = np.int64 if 'int' in typename or 'long' in typename else np.float32 - # type safe creation - data[n] = np.asarray([v for vec in vals for v in vec], dtype=dtype) - # build data frame - dfp = pd.DataFrame({c: pd.Series(v) for c, v in data.items()}) - 
dfp.loc[:, event_colname] = evns - return dfp - - -# load root macros, input is an argument string -def load_root_macros(arg_macros): - for path in arg_macros.split(','): - path = path.strip() - if os.path.exists(path): - ROOT.gROOT.Macro(path) - else: - print('\"{}\" does not exist, skip loading it.'.format(path)) - - -def cartesian_to_polar(x, y, z): - r = np.sqrt(x**2 + y**2 + z**2) - rc = np.sqrt(x**2 + y**2) - theta = np.arccos(z / r) - phi = np.arctan2(y, x) - eta = -np.log(np.tan(theta / 2.)) - return r, theta, phi, rc, eta - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('file', type=str, help='path to root file') - parser.add_argument('--sim', action='store_true', help='flag switch between sim and rec data (default rec)') - parser.add_argument('-o', type=str, default='.', dest='outdir', help='output directory') - # parser.add_argument('--compact', type=str, default='', dest='compact', help='compact file') - parser.add_argument('-m', '--macros', type=str, default='rootlogon.C', dest='macros', - help='root macros to load (accept multiple paths separated by \",\")') - parser.add_argument('--branch', type=str, default='EcalBarrelImagingHitsReco', help='name of data branch (edm4eic::CalorimeterHitCollection)') - parser.add_argument('--truth-branch', type=str, default='MCParticles', help='name of truth mc branch') - parser.add_argument('--edep-max', type=float, default=0., help='maximum edep (GeV) to plot') - parser.add_argument('--edep-nbins', type=int, default=200, help='number of bins') - parser.add_argument('--name-tag', type=str, default='test', help='name tag to save the file') - parser.add_argument('--samp-frac', type=float, default=1.0, help='sampling fraction') - args = parser.parse_args() - - os.makedirs(args.outdir, exist_ok=True) - load_root_macros(args.macros) - - # read data and MCParticles - rdf = ROOT.RDataFrame("events", args.file) - - mc_branch = args.truth_branch - dfm = flatten_collection(rdf, mc_branch, ['generatorStatus', 'PDG', 'momentum.x', 'momentum.y', 'momentum.z', 'mass']) - dfm.rename(columns={c: c.replace(mc_branch + '.', '') for c in dfm.columns}, inplace=True) - # selete incident particles - dfm = dfm[dfm['generatorStatus'].isin([0, 1])] - # NOTE: assumed single particles - dfm = dfm.groupby('event').first() - # p, theta, phi, pT, eta = cartesian_to_polar(*dfm[['momentum.x', 'momentum.y', 'momentum.z']].values.T) - - if args.sim: - df = flatten_collection(rdf, args.branch, ['energyDeposit']) - df.rename(columns={c: c.replace(args.branch + '.', '') for c in df.columns}, inplace=True) - df.rename(columns={'energyDeposit': 'energy'}, inplace=True) - else: - df = flatten_collection(rdf, args.branch, ['layer', 'energy', 'position.x', 'position.y', 'position.z']) - df.rename(columns={c: c.replace(args.branch + '.', '') for c in df.columns}, inplace=True) - - dfe = df.groupby('event')['energy'].sum().reset_index() - # determine histrogram bins - if args.edep_max <= 0.: - args.edep_max = dfe['energy'].quantile(0.99)*1.2 - bins = np.linspace(0., args.edep_max, args.edep_nbins + 1) - bincenters = (bins[1:] + bins[:-1])/2. 
- - # get particle types - fig, ax = plt.subplots(figsize=(16, 9), dpi=120, gridspec_kw={'left': 0.15, 'right': 0.95}) - ax.set_xlabel('Energy Deposit / {:.2f} (GeV)'.format(args.samp_frac), fontsize=24) - ax.set_ylabel('Normalized Counts', fontsize=24) - ax.set_yscale('log') - ax.grid(linestyle=':') - ax.tick_params(labelsize=24) - ax.set_axisbelow(True) - - hist_vals, hist_cols = [], [] - pdgbase = ROOT.TDatabasePDG() - for pdgid in dfm['PDG'].unique(): - particle = pdgbase.GetParticle(int(pdgid)) - if not particle: - print("Unknown pdgcode {}, they are ignored".format(int(pdgid))) - continue - events_indices = dfm[dfm.loc[:, 'PDG'] == pdgid].index.unique() - print("{} entries of particle {}".format(len(events_indices), particle.GetName())) - - dfe_part = dfe.loc[dfe['event'].isin(events_indices)] - - edep_vals, _, _ = ax.hist(dfe_part['energy'] / args.samp_frac, - weights = [1. / dfe_part.shape[0]]*dfe_part.shape[0], - histtype='step', bins=bins, label=particle.GetName()) - hist_vals.append(edep_vals) - hist_cols.append(particle.GetName()) - - pd.DataFrame(index=bincenters, data=np.vstack(hist_vals).T, columns=hist_cols)\ - .to_csv('{}.csv'.format(args.name_tag)) - ax.legend(fontsize=24) - fig.savefig('{}.png'.format(args.name_tag)) - - diff --git a/benchmarks/imaging_shower_ML/scripts/epcut_scan.py b/benchmarks/imaging_shower_ML/scripts/epcut_scan.py new file mode 100644 index 0000000000000000000000000000000000000000..f153d70751991e8348eff6b5078c7c19055c91bf --- /dev/null +++ b/benchmarks/imaging_shower_ML/scripts/epcut_scan.py @@ -0,0 +1,229 @@ +''' + A script to scan the optimized cut on layer and E/p. + It scans all the possible ScFi layers (20 in the EPIC brycecanyon configuration). + The results give the best cut (highest pion rejection) on [layer, E/p] with a targeted electron efficiency. + + Chao Peng (ANL) + 2022/11/13 +''' +import os +import gc +import sys +import json +import ROOT +import argparse +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import matplotlib.backends.backend_pdf as mpdf +from collections import OrderedDict +from utils import flatten_collection, imcal_info + + +# default color cycle of matplotlib +prop_cycle = plt.rcParams['axes.prop_cycle'] +colors = prop_cycle.by_key()['color'] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument( + 'file', type=str, + help='path to root file.' + ) + parser.add_argument( + '-o', type=str, + default='epcuts', + dest='outdir', + help='output directory.' + ) + parser.add_argument( + '--batch-size', type=int, + default=100000, + help='batch size to process data.' + ) + parser.add_argument( + '--name-tag', type=str, + default='imcal', + dest='ntag', + help='name tag for output file.' + ) + # parser.add_argument('--compact', type=str, default='', dest='compact', help='compact file') + parser.add_argument( + '-b', '--branch', type=str, + default='EcalBarrelScFiGridReco', + help='name of data branch (edm4eic::CalorimeterHitCollection).' + ) + parser.add_argument( + '-b2', '--truth-branch', type=str, + default='MCParticles', + help='name of truth mc branch.' + ) + parser.add_argument( + '-s', '--seed', type=int, + default=-1, + help='random seed.' + ) + # flat resolution for now; may need to add P-dependent terms + parser.add_argument( + '-r', '--tracking-resolution', type=float, + dest='res', + default=5e-3, + help='relative resolution of tracking (used to smear truth momentum).'
+ ) + parser.add_argument( + '-e', '--efficiency', type=float, + dest='eff', + default=0.97, + help='targeted efficiency from the cuts.' + ) + parser.add_argument( + '--sampling-fractions', type=str, + default='0,0.11,2200', + help='sampling fractions range (min,max,nbins).' + ) + args = parser.parse_args() + + os.makedirs(args.outdir, exist_ok=True) + + if args.seed > 0: + np.random.seed(args.seed) + + # read data and MCParticles + rdf = ROOT.RDataFrame('events', args.file) + # event range + ntotal = rdf.Count().GetValue() + ev_bins = np.append(np.arange(0, ntotal, args.batch_size), [ntotal]) + + # data container + nlayers = int(imcal_info.nlayers_scfi) + samps = args.sampling_fractions.split(',') + samp_range = (float(samps[0]), float(samps[1])) + samp_nbins = int(samps[2]) + # print(nlayers) + ep_bins = np.linspace(*samp_range, samp_nbins + 1) + ep_centers = (ep_bins[:-1] + ep_bins[1:])/2. + el_hist = np.zeros(shape=(nlayers, samp_nbins)) + pi_hist = np.zeros(shape=(nlayers, samp_nbins)) + + # process events + # avoid insane amount of memory use with a large dataset + for ev_begin, ev_end in zip(ev_bins[:-1], ev_bins[1:]): + gc.collect() + print('E/p cut: processing events {:d} - {:d}'.format(ev_begin, ev_end)) + + dfm = flatten_collection(rdf, args.truth_branch, (int(ev_begin), int(ev_end)), cols=[ + 'generatorStatus', + 'PDG', + 'momentum.x', 'momentum.y', 'momentum.z', + 'mass' + ]) + + # select incident particles + dfm = dfm[dfm['generatorStatus'].isin([0, 1])] + # NOTE: assumed single particles + dfm = dfm.groupby('event').first().reset_index() + # True momentum + dfm.loc[:, 'P'] = np.sqrt(dfm['momentum.x']**2 + dfm['momentum.y']**2 + dfm['momentum.z']**2) + # Smear with tracking resolution (0.5%) + dfm.loc[:, 'Ptrack'] = dfm['P']*np.random.normal(1.0, args.res, len(dfm)) + # print(dfm[['PDG', 'P', 'Ptrack']]) + # print(dfm['PDG'].value_counts()) + + # reconstructed simulation hits + df = flatten_collection(rdf, args.branch, (int(ev_begin), int(ev_end)), cols=[ + 'layer', + 'energy', + ]) + # cumulative energy sum for layer + dfe = df.groupby(['event', 'layer'])['energy'].sum().groupby(level=0).cumsum() + # fill missing layer (no hits in that layer) edep values with the previous value + dfe = dfe.reset_index(level='event').groupby('event', as_index=False)\ + .apply(lambda grp: grp.reindex(np.arange(nlayers) + 1, method='ffill'))\ + .reset_index().drop('level_0', axis=1) + # print(dfe) + dfe = dfe.reset_index().merge(dfm[['event', 'Ptrack', 'PDG']], on='event') + dfe.loc[:, 'eoverp'] = dfe['energy']/dfe['Ptrack'] + + # NOTE: assumed only e- and pi- + el_mask = dfe['PDG'] == 11 + dfel = dfe[el_mask] + dfpi = dfe[~el_mask] + + # fill data layer by layer + for hist, dftmp in zip([el_hist, pi_hist], [dfel, dfpi]): + for i in np.arange(nlayers): + vals = dftmp[dftmp['layer'] == i + 1]['eoverp'].values + hvals, _ = np.histogram(vals, bins=ep_bins) + hist[i] += hvals + + # save binned data + for hist, label in zip([el_hist, pi_hist], ['el', 'pi']): + dfr = pd.DataFrame( + columns=['bin_left', 'bin_right'] + ['layer_{:d}'.format(i+1) for i in np.arange(nlayers)], + data=np.vstack([ep_bins[:-1], ep_bins[1:], hist]).T + ) + dfr.to_csv(os.path.join(args.outdir, '{}_eop_{}.csv'.format(args.ntag, label)), index=False) + # print(label, np.sum(hist, axis=1)) + # print(dfr) + + + # study the epcut performance with binned data + best = { + 'layer': int(nlayers + 1), + 'ep_cut': 0., + 'el_eff': 0., + 'pi_rej': 0., + } + ep_dict = OrderedDict([ + ('info', { + 'nsamples': int(ntotal), + 
'targeted_efficiency': args.eff, + 'tracking_resolution': args.res + }), + ('best', best), + ]) + pdf = mpdf.PdfPages(os.path.join(args.outdir, '{}_layers.pdf'.format(args.ntag))) + for i in np.arange(nlayers): + elvals, pivals = el_hist[i], pi_hist[i] + # cut position + # NOTE: larger error with wider bins + perc = np.cumsum(elvals)/np.sum(elvals) + idx = len(perc[perc < 1. - args.eff]) + ep_cut = ep_centers[idx] + # efficiency + eff = np.sum(elvals[idx:])/np.sum(elvals) + # rejection power + rej = np.sum(pivals[:idx])/np.sum(pivals) + ep_dict['layer_{:d}'.format(i + 1)] = {'ep_cut': ep_cut, 'el_eff': eff, 'pi_rej': rej} + # greater or equal here because we always favor the cut on more layers + if rej >= best['pi_rej']: + best.update({ + 'layer': int(i + 1), + 'ep_cut': ep_cut, + 'el_eff': eff, + 'pi_rej': rej, + }) + # plot showing the results + fig, ax = plt.subplots(figsize=(8, 8)) + ax.hist(ep_centers, weights=pivals, bins=50, label='$\pi^-$', color=colors[0], ec=colors[0], alpha=0.5) + ax.hist(ep_centers, weights=elvals, bins=50, label='$e^-$', color=colors[1], ec=colors[1], alpha=0.5) + ax.legend(fontsize=20, ncol=2, loc='upper center', bbox_to_anchor=(0.5, 1.12),) + ax.tick_params(labelsize=20) + ax.set_xlabel('$E/p$', fontsize=20) + ax.set_ylabel('Counts', fontsize=20) + ax.axvline(x=ep_cut, color='k', ls='--', lw=2) + props = dict(boxstyle='round', facecolor='white', alpha=0.5) + ax.text(0.5, 0.97, + 'Layer $\leq${:d}\n$\epsilon_e={:.2f}$%\n$R_{{\pi}}={:.2f}$%'.format(i + 1, eff*100., rej*100.), + transform=ax.transAxes, fontsize=20, va='top', ha='center', bbox=props) + pdf.savefig(fig) + pdf.close() + + # save cut position and performance + ep_dict.update({'best': best}) + ep_json = json.dumps(ep_dict, indent=4) + with open(os.path.join(args.outdir, '{}_result.json'.format(args.ntag)), 'w') as outfile: + outfile.write(ep_json) + diff --git a/benchmarks/imaging_shower_ML/scripts/ml_data_preprocess.py b/benchmarks/imaging_shower_ML/scripts/ml_data_preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..88fdf6427e0f0d3aa74eb672f7152237f31d2fa8 --- /dev/null +++ b/benchmarks/imaging_shower_ML/scripts/ml_data_preprocess.py @@ -0,0 +1,288 @@ +''' + A script to prepare datasets for ML training +''' +import ROOT +import os +import gc +import sys +import json +import numpy as np +import pandas as pd +import argparse +from utils import flatten_collection, cartesian_to_polar, imcal_info + + +pd.set_option('display.max_rows', 500) + +# normalizing range for features +# NOTE: assumed for barrel ecal, using the range of 0.5 - 2 meters to normalize the r0 values +WIN_ETA = (-0.2, 0.2) # unitless +WIN_PHI = (-0.4, 0.4) # rad +WIN_R0 = (500, 2000) # mm +WIN_EH = (0, 0.05) # GeV + + +# only positive weights are returned +# basis defines the minimum contribution (4.6 is about 1%) +def log_weights(val, basis=4.6): + w = np.log(val) + basis + return np.clip(w, 0., None) + + +def read_epcut(path): + if not path or not os.path.exists(path): + print('Prepare ML data: not applying E/p cut because E/p file is not provided or missing.') + return None + with open(path, 'r') as f: + data = json.load(f) + epcut = data.get('best', {}) + epcut.update(data.get('info', {})) + return epcut + + +def lin_norm(vals, window, val_name='', var_name='', warn_win_size=True): + # statistically meaningful + if warn_win_size and len(vals) > 1000: + perc = (np.percentile(vals, 5), np.percentile(vals, 95)) + if perc[0] < window[0] or perc[1] > window[1]: + warnstr = 'WARNING = Prepare ML data: 
normalization window {} does not fit {} values {}.'\ .format(window, val_name, perc) + if var_name: + warnstr += ' Try adjusting {}'.format(var_name) + print(warnstr) + return np.clip((vals - window[0])/(window[1] - window[0]), 0., 1.) + + +# prepare ML datasets from dataframe +def format_ml_data(dfh, dft, nlayers=20, nhits=50, + win_eta=WIN_ETA, win_phi=WIN_PHI, win_r0=WIN_R0, win_eh=WIN_EH): + # sanity check + dfh = dfh[dfh['energy'] > 0.] + + event_group = dfh.groupby('event') + # convert eta, phi, edep so they are within [0, 1] + dfh.loc[:, 'etotal'] = event_group['energy'].transform('sum') + # get the hits energy ratio + dfh.loc[:, 'eratio'] = dfh['energy']/dfh['etotal'] + + # calculate weighted center of the event, in the future this can be replaced by tracking input + # NOTE: assumed single particle events + # log weighting (basis 4.6, about 1% minimum contribution; see log_weights) + dfh.loc[:, 'logw'] = log_weights(dfh['eratio'].values) + dfh.loc[:, 'wtotal'] = event_group['logw'].transform('sum') + # sanity check + dfh = dfh[dfh['wtotal'] > 0.] + # regroup because of mutation + event_group = dfh.groupby('event') + + dfh.loc[:, 'logw'] = dfh['logw']/dfh['wtotal'] + dfh.loc[:, 'xw'] = dfh['position.x']*dfh['logw'] + dfh.loc[:, 'yw'] = dfh['position.y']*dfh['logw'] + dfh.loc[:, 'zw'] = dfh['position.z']*dfh['logw'] + + # position center + dfh.loc[:, 'xc'] = event_group['xw'].transform('sum') + dfh.loc[:, 'yc'] = event_group['yw'].transform('sum') + dfh.loc[:, 'zc'] = event_group['zw'].transform('sum') + + # single hits features + r, theta, phi, r0, eta = cartesian_to_polar(*dfh[['position.x', 'position.y', 'position.z']].values.T) + # center + rc, thetac, phic, r0c, etac = cartesian_to_polar(*dfh[['xc', 'yc', 'zc']].values.T) + + # data for ML + dfml = dfh[['event', 'layer']].copy() + # NOTE: not much of a difference observed between eratio or normalized E with a narrow momentum range (e.g., 1.5-2.5 GeV/c) + dfml.loc[:, 'eh'] = dfh['eratio'].values + # dfml.loc[:, 'eh'] = lin_norm(dfh['energy'].values, win_eh, "Ehit") + dfml.loc[:, 'r0'] = lin_norm(r0, win_r0, "r0") + dfml.loc[:, 'eta'] = lin_norm(eta - etac, win_eta, "eta") + dfml.loc[:, 'phi'] = lin_norm(phi - phic, win_phi, "phi") + + # sort by energy + dfml = dfml.sort_values(['event', 'layer', 'eh'], ascending=[True, True, False]) + dfml.loc[:, 'hit'] = dfml.groupby(['event', 'layer']).cumcount() + 1 + dfml.set_index(['event', 'layer', 'hit'], inplace=True) + + # shape data (nevents, nlayers, nhits) + # padding with zeros + # NOTE: if more than nhits per layer, the least energetic hits are dropped because of the previous energy sorting + dfml = dfml.reindex( + pd.MultiIndex.from_product([dfml.index.levels[0], np.arange(nlayers) + 1, np.arange(nhits) + 1], + names=['event', 'layer', 'hit']), + fill_value=0 + ) + + return dfml, list(event_group.groups.keys()) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Prepare ML datasets from reconstruction output') + parser.add_argument( + 'file', type=str, + help='path to the input file.' + ) + parser.add_argument( + '-o', type=str, + default='.', + dest='outpath', + help='path to output data file (hdf5).' + ) + parser.add_argument( + '--batch-size', type=int, + default=100000, + help='batch size to process data.' + ) + parser.add_argument( + '-s', '--seed', type=int, + default=-1, + help='random seed.' 
+ ) + parser.add_argument( + '--nhits', type=int, + default=50, + dest='nhits', + help='number of hits per layer.' + ) + parser.add_argument( + '-b', '--img-branch', type=str, + default='EcalBarrelImagingHitsReco', + help='name of data branch (edm4eic::CalorimeterHitCollection).' + ) + parser.add_argument( + '-b2', '--scfi-branch', type=str, + default='EcalBarrelScFiGridReco', + help='name of data branch (edm4eic::CalorimeterHitCollection).' + ) + parser.add_argument( + '-b3', '--truth-branch', type=str, + default='MCParticles', + help='name of truth mc branch.' + ) + parser.add_argument( + '--epcut', type=str, + default=None, + help='A JSON file containing epcut information (the best cut will be used).' + ) + parser.add_argument( + '--append-data', action='store_true', + default=False, + help='Append data to pre-existing data file.' + ) + args = parser.parse_args() + + data_store = pd.HDFStore(args.outpath) + # print(data_store.keys()) + # clear data store + if not args.append_data: + for dkey in [imcal_info.ml_data_key, imcal_info.truth_data_key]: + if dkey in data_store.keys(): + data_store.remove(dkey) + print('Prepare ML data: removed pre-existing dataset {}'.format(dkey)) + + if args.seed > 0: + np.random.seed(args.seed) + + # read data and MCParticles + rdf = ROOT.RDataFrame("events", args.file) + # event range + ntotal = rdf.Count().GetValue() + ev_bins = np.append(np.arange(0, ntotal, args.batch_size), [ntotal]) + + # epcut info + epcut = read_epcut(args.epcut) + # print(epcut) + + # process events + # avoid insane amount of memory use with a large dataset + for ev_begin, ev_end in zip(ev_bins[:-1], ev_bins[1:]): + gc.collect() + print('Prepare ML data: processing events {:d} - {:d}'.format(ev_begin, ev_end)) + + dfm = flatten_collection(rdf, args.truth_branch, (int(ev_begin), int(ev_end)), cols=[ + 'generatorStatus', + 'PDG', + 'momentum.x', 'momentum.y', 'momentum.z', + 'mass' + ]) + + # select incident particles + dfm = dfm[dfm['generatorStatus'].isin([0, 1])] + # NOTE: assumed single particles + dfm = dfm.groupby('event').first() + # True momentum + dfm.loc[:, 'P'] = np.sqrt(dfm['momentum.x']**2 + dfm['momentum.y']**2 + dfm['momentum.z']**2) + # print(dfm) + + # scfi information + dfs = flatten_collection(rdf, args.scfi_branch, (int(ev_begin), int(ev_end)), cols=[ + 'layer', + 'energy', + 'position.x', 'position.y', 'position.z', + ]) + + # imaging information + dfi = flatten_collection(rdf, args.img_branch, (int(ev_begin), int(ev_end)), cols=[ + 'layer', + 'energy', + 'position.x', 'position.y', 'position.z', + ]) + + # apply ep_cut + if epcut: + # cumulative energy sum for layer + dfe = dfs.groupby(['event', 'layer'])['energy'].sum().groupby(level=0).cumsum() + # fill missing layer (no hits in that layer) edep values with the previous value + dfe = dfe.reset_index(level='event').groupby('event', as_index=False)\ + .apply(lambda grp: grp.reindex(np.arange(imcal_info.nlayers_scfi) + 1, method='ffill'))\ + .reset_index().drop('level_0', axis=1) + # unstack layer edeps + dfe = dfe.set_index(['event', 'layer'])\ + .stack().unstack(level='layer') + dfe.columns = ['layer_{}'.format(i) for i in dfe.columns] + dfe.columns.name = None + dfe = dfe.reset_index().drop('level_1', axis=1).set_index('event', drop=True) + # print(dfe) + + p_smear = dfm['P']*np.random.normal(1.0, epcut['tracking_resolution'], len(dfm)) + ep_mask = (dfe['layer_{layer}'.format(**epcut)]/p_smear) >= epcut['ep_cut'] + # print(np.sum(ep_mask)) + dfm = dfm[ep_mask] + dfs = dfs[dfs['event'].isin(dfm.index)] + dfi = 
dfi[dfi['event'].isin(dfm.index)] + + # reformat data for ML + dfi, ievs = format_ml_data(dfi, dfm, nlayers=imcal_info.nlayers_img, nhits=args.nhits) + dfs, sevs = format_ml_data(dfs, dfm, nlayers=imcal_info.nlayers_scfi, nhits=args.nhits) + if ievs != sevs: + common = np.intersect1d(ievs, sevs) + dfi = dfi[dfi.index.isin(common, level='event')] + dfs = dfs[dfs.index.isin(common, level='event')] + dfm = dfm[dfm.index.isin(common)].sort_index() + else: + dfm = dfm[dfm.index.isin(ievs)].sort_index() + + # assign layer type to different dataset + dfi.loc[:, 'ltype'] = 'img' + dfi.loc[:, 'lval'] = 0 + dfs.loc[:, 'ltype'] = 'scfi' + dfs.loc[:, 'lval'] = 1 + + # null eta value for ScFi layer + # NOTE: add it back in the future with timing-z resolution + dfs['eta'] = 0. + + # combine two datasets + df = pd.concat([dfi.reset_index(), dfs.reset_index()], ignore_index=True)\ + .set_index(['event', 'ltype', 'layer', 'hit']).sort_index() + # print(df.head(100)) + data_store.append(imcal_info.ml_data_key, df, format='t', data_columns=True) + data_store.append(imcal_info.truth_data_key, dfm[['PDG', 'P', 'mass']], format='t', data_columns=True) + + data_store.close() + + diff --git a/benchmarks/imaging_shower_ML/scripts/ml_training.py b/benchmarks/imaging_shower_ML/scripts/ml_training.py index 82517e3b9b53a65a04115ef464d07abffa78e2c1..730eccf6408edf6874178008424b4845afd140ad 100644 --- a/benchmarks/imaging_shower_ML/scripts/ml_training.py +++ b/benchmarks/imaging_shower_ML/scripts/ml_training.py @@ -1,283 +1,263 @@ +''' + Script for supervised training of a simple ML model using Tensorflow 2. + ML e/pi separation for imaging barrel ecal. + + Chao Peng (ANL) + 2022/11/11 +''' import os +import sys +import json +import argparse import pandas as pd import numpy as np +import ROOT from collections import OrderedDict -from scipy.interpolate import CubicSpline -import uproot as uproot + +import matplotlib as mpl import matplotlib.pyplot as plt -import logging -import sys -import subprocess -import argparse +from matplotlib.ticker import MultipleLocator, FixedLocator, MaxNLocator +from matplotlib.colors import LogNorm import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers -import matplotlib as mpl -from matplotlib.ticker import MultipleLocator, FixedLocator, MaxNLocator -from matplotlib.colors import LogNorm -from particle import Particle - -print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU'))) - -# os.environ["CUDA_VISIBLE_DEVICES"] = "-1" -# 5 layers 0-1-2-6-9 -# 6 layers 0-1-2-3-4-6-9 - - -def get_layer_cum_edeps(edep): - return np.tile(np.arange(edep.shape[1]), (edep.shape[0], 1)).flatten(), np.cumsum(edep, axis=1).flatten() - - -def draw_layer_edep(edep, label=''): - layer_num, edep_val = get_layer_cum_edeps(edep) - figure, axis = plt.subplots(figsize=(16, 9), dpi=120) - axis.hist2d(layer_num, edep_val, - bins=(np.arange(0.5, edep.shape[1] + 0.5, step=1.0), 100), norm=mpl.colors.LogNorm()) - axis.grid(linestyle=':') - axis.set_axisbelow(True) - axis.tick_params(labelsize=24, direction='in', which='both') - axis.set_ylabel('Cumulative Edep (MeV)', fontsize=24) - axis.set_xlabel('Layer number', fontsize=24) - axis.xaxis.set_major_locator(MaxNLocator(edep.shape[1], integer=True)) - if label: - axis.set_title(label, fontsize=26) - return figure, axis - -parser = argparse.ArgumentParser() -parser.add_argument('-t', '--nametag', type=str, default='IMCAL_ML', help='Name tag for output files.') -parser.add_argument('--pmin', type=float, default=0.5, 
help='Minimum momentum of particles.') -parser.add_argument('--pmax', type=float, default=10, help='Maximum momentum of particles.') -args = parser.parse_args() -kwargs = vars(args) - -train_model = True -do_epcut = False -#nametag = 'imcal' -#partag = 'epimuphka' -#prange = (0.5, 10.0) -#etag = '{:.1f}_{:.1f}_combined'.format(*prange) -epoch = 20 -data_shape = (29, 20, 5) -#data_dir = r'sim_output/tag_data/{name}_{part}_{etag}'.format(name=nametag, part=partag, etag=etag) -#out_dir = r'sim_output/tag_data/output/{name}_{part}_{etag}'.format(name=nametag, part=partag, etag=etag) -data_dir = os.path.join('sim_output/tag_data/', '{nametag}_{pmin}_{pmax}_combined'.format(**kwargs)) -out_dir = os.path.join('sim_output/tag_data/output/', '{nametag}_{pmin}_{pmax}_combined'.format(**kwargs)) -raw_data = np.load(os.path.join(data_dir, 'data.npy')) -raw_tags = np.load(os.path.join(data_dir, 'tag.npy')) -edeps = np.load(os.path.join(data_dir, 'edep.npy')) - -# for interlayered data, null the last few layers -astro_nlayers = 9 -astro_nlayers_used = [0, 1, 2, 3, 4] -astro_nlayers_not_used = [2*i for i in np.arange(astro_nlayers) if i not in astro_nlayers_used] -if len(astro_nlayers_used) and len(astro_nlayers_not_used): - raw_data[:, astro_nlayers_not_used] = 0 - -os.makedirs(out_dir, exist_ok=True) -# label converter -labels = OrderedDict([ - (pid, [i, Particle.from_pdgid(pid).name, '']) - for i, pid in enumerate(sorted(np.unique(raw_tags.T[0].astype(int)), key=lambda x: np.abs(x))) -]) - -# add some labels in latex format -name_dict = {'gamma': r'$\gamma$', '-': '$^-$', '+': '$^+$', 'pi': r'$\pi$', 'mu': r'$\mu$'} -for pid, (i, name, _) in labels.items(): - labelstr = name - for str1, str2 in name_dict.items(): - labelstr = labelstr.replace(str1, str2) - labels[pid][2] = labelstr - -part_weights = {pid: 1.0 for pid, _ in labels.items()} -part_weights.update({ - 11: 1.0, - -211: 10.0, -}) - -begin_layer, end_layer = 0, 9 -epcut = 0.085 -if do_epcut: - epi_ids = (11, -211) # (11, -211) - # Eoverp cut in tags data - eovp = np.sum(edeps[:, begin_layer:end_layer], axis=1)*1e-3/raw_tags.T[1] - emask = raw_tags.T[0] == epi_ids[0] - pimask = raw_tags.T[0] == epi_ids[1] - - epmask = tuple([eovp > epcut]) - ep_eff = sum(raw_tags[epmask].T[0] == 11) / sum(emask) - ep_rej = 1. - sum(raw_tags[epmask].T[0] == -211) / sum(pimask) - - bins = np.linspace(0, 0.15, 151) - fig, ax = plt.subplots(figsize=(12, 9), dpi=120) - ax.hist(eovp[emask], histtype='step', density=True, bins=bins, label=labels[epi_ids[0]][2]) - ax.hist(eovp[pimask], histtype='step', density=True, bins=bins, label=labels[epi_ids[1]][2]) - ax.axvline(epcut, color='k') - ax.set_xlabel('Edep / p', fontsize=24) - ax.set_ylabel('Normalized Counts', fontsize=24) - ax.tick_params(labelsize=24) - ax.legend(fontsize=24) - ax.text(0.2, 0.9, '$e$ eff. = {:.2f}%\n$\pi$ rej. = {:.2f}%'.format(ep_eff*100., ep_rej*100.), - fontsize=24, transform=ax.transAxes, ha='left', va='top') - ax.set_title('$E/p > {:.2f}$% @ {:d} - {:d} X$_0$'.format(epcut*100., begin_layer, end_layer), fontsize=24) - fig.savefig(os.path.join('results/ml/models/emcal/', 'pre_epcut.png')) - - fig, _ = draw_layer_edep(edeps[emask], 'Cumulative Edep over layer for electrons') - fig.savefig(os.path.join('results/ml/models/emcal/', 'cum_edep_el.png')) - fig, _ = draw_layer_edep(edeps[pimask], 'Cumulative Edep over layer for pions') - fig.savefig(os.path.join('results/ml/models/emcal/', 'cum_edep_pi.png')) - - data = raw_data[epmask] - tags = raw_tags[epmask] - -else: - ep_eff = 1.0 - ep_rej = 0. 
- data = raw_data - tags = raw_tags - -pid = np.vectorize(lambda x: labels[x][0])(tags.T[0]) -weights = np.vectorize(lambda x: part_weights[x])(tags.T[0]) - -if train_model: - indices = np.arange(len(data)) - np.random.shuffle(indices) - id_train = indices[:int(len(data)*0.8)] - id_valid = indices[int(len(data)*0.8):] - x_train = data[id_train] - y_train = pid[id_train] - w_train = weights[id_train] - - x_valid = data[id_valid] - y_valid = pid[id_valid] - w_valid = weights[id_valid] - - model = keras.Sequential([ - keras.layers.Conv2D(64, (3, 3), padding='same', activation='relu', input_shape=data_shape), - # keras.layers.MaxPooling2D((2, 2), strides=2), - keras.layers.Dropout(0.25), - keras.layers.Conv2D(128, (2, 2), padding='same', activation='relu'), - # keras.layers.MaxPooling2D((2, 2), strides=2), - keras.layers.Dropout(0.3), - keras.layers.Conv2D(64, (2, 2), padding='same', activation='relu'), - # keras.layers.MaxPooling2D((2, 2), strides=2), - - keras.layers.Flatten(), - keras.layers.Dense(128, activation='relu'), - keras.layers.Dropout(0.25), - keras.layers.Dense(32, activation='relu'), - keras.layers.Dense(len(labels), activation='softmax') + +from utils import imcal_info + + +# default color cycle of matplotlib +prop_cycle = plt.rcParams['axes.prop_cycle'] +colors = prop_cycle.by_key()['color'] + + +# full name, short name, latex str +PARTICLE_DICT = OrderedDict([ + (11, ('electron', 'electron', '$e^-$')), + (-211, ('pion-minus', 'pion-', '$\pi^-$')), ]) - model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - metrics=['accuracy']) - history = model.fit(x_train, y_train, - epochs=epoch, sample_weight=w_train, validation_data=(x_valid, y_valid, w_valid)) - model.save(os.path.join('results/ml/models/emcal/', 'model_epch{:d}'.format(epoch))) - - converter = tf.lite.TFLiteConverter.from_keras_model(model) - tflite_model = converter.convert() - - with open(os.path.join('results/ml/models/emcal/', 'lite_model_{nametag}.tflite'.format(**kwargs)), 'wb') as f: - f.write(tflite_model) -else: - model = keras.models.load_model(os.path.join('results/ml/models/emcal/', 'model_epch{:d}'.format(epoch))) -# ---- general ---- -# x_test = data -# y_test = pid -# prediction = model.predict(x_test) -# result = prediction.argmax(axis=1) -# pid_probs = [] -# for _, part in labels.items(): -# mask = (y_test == part[0]) -# res = result[mask] -# pid_probs.append(np.bincount(res, minlength=len(labels))/sum(mask)) -# pid_probs = np.asarray(pid_probs) - -# predictions -x_test = data -y_test = pid -prediction = model.predict(x_test) - -# two labels (pion - electron) -if len(labels) == 2: - # --- pion - electron ---- - pion_prob_cut = 0.5 + + +def count_pid_labels(labels, pid_label_dict): + result = OrderedDict() + result['total'] = len(labels) + counts = np.bincount(labels) + label_pid_dict = {i: pid for pid, i in pid_label_dict.items()} + for i, cnt in enumerate(counts): + if cnt == 0 and i not in label_pid_dict: + continue + pid = label_pid_dict.get(i) + particle = PARTICLE_DICT.get(pid, (pid, pid, pid)) + # print(i, pid, cnt, particle) + result[str(particle[1])] = '{:d}'.format(cnt) + return result + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument( + 'data_store', type=str, + help='path to data store.' + ) + parser.add_argument( + '-o', type=str, + dest='outdir', + default='ml_result', + help='output directory.' 
+ ) + parser.add_argument( + '-s', '--seed', type=int, + default=-1, + help='random seed.' + ) + parser.add_argument( + '-t', '--name-tag', type=str, + dest='ntag', + default='imcal_ml', + help='name tag for output files/models.' + ) + parser.add_argument( + '--test-size', type=float, + default=0.2, + help='relative size of test samples (between 0-1).' + ) + parser.add_argument( + '--epochs', type=int, + default=20, + help='epochs for ML training.' + ) + parser.add_argument( + '--batch-size', type=int, + default=512, + help='batch size for ML training.' + ) + parser.add_argument( + '-e', '--efficiency', type=float, + dest='eff', + default=0.98, + help='targeted efficiency from the cuts.' + ) + parser.add_argument( + '--read-model', action='store_true', + default=False, + help='read a trained model.' + ) + parser.add_argument( + '--read-model-path', type=str, + default='', + help='only used when --read-model is enabled; if not specified, the model is read from the default saving path.' + ) + # weights + for key, (name, sname, latex) in PARTICLE_DICT.items(): + parser.add_argument( + '--weight-{}'.format(name), type=float, + default=1.0, + help='estimator weight for {} particles.'.format(name) + ) + args = parser.parse_args() + kwargs = vars(args) + + os.makedirs(args.outdir, exist_ok=True) + + if args.seed > 0: + np.random.seed(args.seed) + tf.random.set_seed(args.seed) + + df = pd.read_hdf(args.data_store, key=imcal_info.ml_data_key) + # NOTE: assumed event index is exactly the same as df + dft = pd.read_hdf(args.data_store, key=imcal_info.truth_data_key) + print(dft['PDG'].value_counts()) + + # fill dictionary if not pre-defined + data_pid = dft['PDG'].unique() + for pid in data_pid: + if pid not in PARTICLE_DICT.keys(): + print('WARNING = ML training: particle ({}) not found in dictionary, update the script to include it.'\ + .format(pid)) + PARTICLE_DICT[pid] = ('particle_{:d}'.format(pid), 'PDG_{:d}'.format(pid), 'PDG_{:d}'.format(pid)) + + # build labels and weights for particles + # NOTE: pid follows the dictionary orders + pid_labels = OrderedDict() + pid_weights = OrderedDict() + for pid, (pname, _, _) in PARTICLE_DICT.items(): + if pid not in data_pid: + continue + pid_weights[pid] = kwargs.get('weight_{}'.format(pname.replace('-', '_')), 1.0) + pid_labels[pid] = len(pid_labels) + print(pid_labels) + print(pid_weights) + + # data shape + # NOTE: number of layers for combined data + nlayers = imcal_info.nlayers_img + imcal_info.nlayers_scfi + # NOTE: index levels should be [event, layer_type, layer, hit], change the code if data structure is changed + nhits = df.index.levels[-1].max() + + # training datasets + xdata = df.values.reshape(len(dft), nlayers, nhits, len(df.columns)) + ydata = dft['PDG'].map(pid_labels).values + wdata = dft['PDG'].map(pid_weights).values + + # separate train and test data + indices = np.arange(len(xdata)) + np.random.shuffle(indices) + id_train = indices[int(len(indices)*args.test_size):] + id_test = indices[:int(len(indices)*args.test_size)] + # test data will be used for benchmarking, no need for sampling weights + xtrain, ytrain, wtrain = xdata[id_train], ydata[id_train], wdata[id_train] + xtest, ytest = xdata[id_test], ydata[id_test] + + # try to use GPUs + # tf.debugging.set_log_device_placement(True) + gpus = tf.config.list_logical_devices('GPU') + print("Number of GPUs Available: ", len(gpus)) + strategy = tf.distribute.MirroredStrategy(gpus) + if args.read_model: + model_path = os.path.join(args.outdir, '{}_model'.format(args.ntag)) + if args.read_model_path: 
model_path = args.read_model_path + model = keras.models.load_model(model_path) + else: + with strategy.scope(): + train_dataset = tf.data.Dataset.from_tensor_slices((xtrain, ytrain, wtrain)) + model = keras.Sequential([ + keras.layers.Conv2D(64, (3, 3), padding='same', activation='relu', input_shape=xdata.shape[1:]), + # keras.layers.MaxPooling2D((2, 2), strides=2), + keras.layers.Dropout(0.25), + keras.layers.Conv2D(128, (2, 2), padding='same', activation='relu'), + # keras.layers.MaxPooling2D((2, 2), strides=2), + keras.layers.Dropout(0.3), + keras.layers.Conv2D(64, (2, 2), padding='same', activation='relu'), + # keras.layers.MaxPooling2D((2, 2), strides=2), + + keras.layers.Flatten(), + keras.layers.Dense(128, activation='relu'), + keras.layers.Dropout(0.25), + keras.layers.Dense(32, activation='relu'), + keras.layers.Dense(len(pid_labels), activation='softmax') + ]) + + model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), + metrics=['accuracy']) + history = model.fit(train_dataset.batch(args.batch_size), epochs=args.epochs) + model.save(os.path.join(args.outdir, '{}_model'.format(args.ntag))) + + # benchmark + prediction = model.predict(xtest) + + # cut label probability + # NOTE: assumed two-labels (electron, pion) and ordered (see dictionary) pid_probs = np.zeros(shape=(2, 2)) - fig, ax = plt.subplots(1, 1, figsize=(8, 6), dpi=160) + fig, ax = plt.subplots(1, 1, figsize=(12, 9), dpi=160) + + # cut position (try to keep the targeted efficiency) + epi_probs = prediction[ytest == 0].T[1] + pion_prob_cut = np.percentile(epi_probs, args.eff*100.) + eff_string = '' + for pid, ilbl in pid_labels.items(): + mask = (ytest == ilbl) + probs = prediction[mask] + label = PARTICLE_DICT.get(pid, [''])[-1] + + ax.hist(probs.T[1], bins=np.linspace(0, 1, 101), label=label, color=colors[ilbl], ec=colors[ilbl], alpha=0.5) + + pid_probs[ilbl][0] = sum(probs.T[1] < pion_prob_cut) / float(len(probs)) + pid_probs[ilbl][1] = 1. - pid_probs[ilbl][0] + if pid == -211 or pid == 211: + eff_string += '{} rej. = {:.2f}%'.format(label, pid_probs[ilbl][1]*100.) + else: + eff_string += '{} eff. = {:.2f}%\n'.format(label, pid_probs[ilbl][0]*100.) + ax.axvline(x=pion_prob_cut, lw=2, color='k', ls='--') ax.set_yscale('log') ax.set_ylabel('Counts', fontsize=24) ax.set_xlabel(r'$P_{\pi}$', fontsize=24) ax.tick_params(direction='in', which='both', labelsize=24) - for i, (_, part) in enumerate(labels.items()): - mask = (y_test == part[0]) - probs = prediction[mask] - ax.hist(probs.T[1], histtype='step', bins=np.linspace(0, 1, 101), label=part[1]) - pid_probs[i][0] = sum(probs.T[1] < pion_prob_cut) / float(len(probs)) - pid_probs[i][1] = 1. - pid_probs[i][0] - ax.axvline(x=pion_prob_cut, lw=2, color='k', ls='--') - ax.text(0.55, 0.9, '$e$ eff. = {:.2f}%\n$\pi$ rej. = {:.2f}%'.format(pid_probs[0][0]*100., pid_probs[1][1]*100.), - fontsize=24, transform=ax.transAxes, ha='left', va='top') - ax.legend(fontsize=24, loc='lower left') - fig.savefig(os.path.join('results/ml/models/emcal/', 'pid_tag_hist.png')) - fig.savefig(os.path.join('results/ml/models/emcal/', 'pid_tag_hist.pdf')) - # --- pion - electron ---- - result_text = open(os.path.join(out_dir, 'rejection_result.txt'), 'w') - lines = [ - 'E/p cut position: {:.4f}, layer {:d} to {:d}, E/p efficiency: {:.4f}%, rejection power: {:.4f}'\ - .format(epcut, begin_layer, end_layer, ep_eff*100., 1./(1. 
- ep_rej)), - ] if do_epcut else [] - lines += [ - 'ML training weight: {}, pion cut position: {:.4f}, ML efficiency: {:.4f}%, rejection power: {:.4f}'\ - .format(list(part_weights.values()), pion_prob_cut, pid_probs[0][0]*100., 1./(1. - pid_probs[1][1])), - 'combined efficiency: {:.2f}%, rejection power: {:.1f}'\ - .format(pid_probs[0][0]*ep_eff*100., 1./((1. - ep_rej)*(1. - pid_probs[1][1]))), - ] - result_text.write('\n'.join(lines)) - result_text.close() - print('\n'.join(lines)) - -else: - # --- multi-particles --- - pid_probs = np.zeros(shape=(len(labels), len(labels))) - fig, ax = plt.subplots(1, 1, figsize=(12, 8), dpi=160, gridspec_kw={'left': 0.15, 'right': 0.95}) - ax.set_yscale('log') - ax.set_ylabel('Counts', fontsize=26) - ax.set_xlabel(r'Likelihood of the Corresponding Label', fontsize=26) - ax.tick_params(direction='in', which='both', labelsize=26) - # label with maximum likelihood - for _, (i, part, label) in labels.items(): - mask = (y_test == i) - probs = prediction[mask] - pred_labels = probs.argmax(axis=1) - ax.hist(probs.T[i], histtype='step', bins=np.linspace(0, 1, 101), label=label) - for _, (j, _, _) in labels.items(): - pid_probs[i][j] = np.sum(pred_labels == j)/float(np.sum(mask)) - - ax.legend(fontsize=26, loc='upper center', ncol=3, handletextpad=0.3, columnspacing=0.6) - fig.savefig(os.path.join('results/ml/models/emcal/', 'pid_tag_hist.png')) - fig.savefig(os.path.join('results/ml/models/emcal/', 'pid_tag_hist.pdf')) - - # --- multi-particles --- -fig, ax = plt.subplots(1, 1, figsize=(8, 8), dpi=160, gridspec_kw={'left': 0.15, 'right': 0.95}) -ax.imshow(pid_probs, cmap='PuBu', norm=LogNorm(vmin=0.01, vmax=1)) -ax.xaxis.set_major_locator(FixedLocator(np.arange(0, pid_probs.shape[0], step=1.0))) -ax.yaxis.set_major_locator(FixedLocator(np.arange(0, pid_probs.shape[1], step=1.0))) -ax.xaxis.set_minor_locator(FixedLocator(np.arange(0.5, pid_probs.shape[0], step=1.0))) -ax.yaxis.set_minor_locator(FixedLocator(np.arange(0.5, pid_probs.shape[1], step=1.0))) -ax.set_xticklabels([x[2] for _, x in labels.items()], fontsize=26, va='bottom') -ax.set_yticklabels([x[2] for _, x in labels.items()], fontsize=26, ha='left') -ax.tick_params(bottom=False, left=False, which='both') -ax.tick_params(pad=30, axis='x') -ax.tick_params(pad=40, axis='y') -ax.set_ylabel('Truth', fontsize=26) -ax.set_xlabel('Classification', fontsize=26) -for i in range(pid_probs.shape[0]): - for j in range(pid_probs.shape[1]): - color = 'k' if pid_probs[i, j] < 0.15 else 'w' - text = ax.text(j, i, '{:.2f}%'.format(pid_probs[i, j]*100.), ha='center', va='center', - color=color, fontsize=28 - len(labels)*2) -ax.grid('-', color='k', which='minor') -fig.savefig(os.path.join('results/ml/models/emcal/', 'pid_tag.png')) -fig.savefig(os.path.join('results/ml/models/emcal/', 'pid_tag.pdf')) + ax.text(0.55, 0.9, eff_string, fontsize=24, transform=ax.transAxes, ha='left', va='top') + ax.legend(fontsize=24, ncol=4, loc='upper center', bbox_to_anchor=(0.5, 1.12),) + fig.savefig(os.path.join(args.outdir, '{}_pid_hist.pdf'.format(args.ntag))) + + # save result + result = OrderedDict( + training= OrderedDict( + epochs=args.epochs, + batch_size=args.batch_size, + sample_size=count_pid_labels(ytrain, pid_labels), + sample_weights={PARTICLE_DICT.get(pid)[1]: weight for pid, weight in pid_weights.items()}, + targeted_efficiency=args.eff, + pion_prob_cut=pion_prob_cut, + ), + + test=OrderedDict( + sample_size=count_pid_labels(ytest, pid_labels), + el_eff=pid_probs[0][0], + pi_rej=pid_probs[1][1], + ) + ) + res_json = 
json.dumps(result, indent=4) + with open(os.path.join(args.outdir, '{}_result.json'.format(args.ntag)), 'w') as outfile: + outfile.write(res_json) + diff --git a/benchmarks/imaging_shower_ML/scripts/utils.py b/benchmarks/imaging_shower_ML/scripts/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2ddbc61608d0f494ee3d8209f7362f5669418897 --- /dev/null +++ b/benchmarks/imaging_shower_ML/scripts/utils.py @@ -0,0 +1,63 @@ +import ROOT +import pandas as pd +import numpy as np + + +class dotdict(dict): + """dot.notation access to dictionary attributes""" + __getattr__ = dict.get + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + +# basic information share among all scripts +imcal_info = dotdict( + nlayers_img=6, + nlayers_scfi=20, + ml_data_key='/ml_data', + truth_data_key='/truth', + ) + +# read from RDataFrame and flatten a given collection, return pandas dataframe +def flatten_collection(rdf, collection, ev_range=None, cols=None, event_colname='event'): + if not cols: + cols = [str(c) for c in rdf.GetColumnNames() if str(c).startswith('{}.'.format(collection))] + else: + cols = ['{}.{}'.format(collection, c) for c in cols] + if not cols: + print('cannot find any branch under collection {}'.format(collection)) + return pd.DataFrame() + # take data + if not ev_range: + data = rdf.AsNumpy(cols) + else: + data = rdf.Range(*ev_range).AsNumpy(cols) + # flatten the data, add an event id to identify clusters from different events + evns = [] + for i, vec in enumerate(data[cols[0]]): + evns += [i]*vec.size() + for n, vals in data.items(): + # make sure ints are not converted to floats + typename = vals[0].__class__.__name__.lower() + dtype = np.int64 if 'int' in typename or 'long' in typename else np.float32 + # type safe creation + data[n] = np.asarray([v for vec in vals for v in vec], dtype=dtype) + # build data frame + dfp = pd.DataFrame({c: pd.Series(v) for c, v in data.items()}) + evns = np.asarray(evns) + if ev_range is not None: + evns += ev_range[0] + dfp.loc[:, event_colname] = evns + dfp.rename(columns={c: c.replace(collection + '.', '') for c in dfp.columns}, inplace=True) + return dfp + + +def cartesian_to_polar(x, y, z): + r = np.sqrt(x**2 + y**2 + z**2) + rc = np.sqrt(x**2 + y**2) + theta = np.arccos(z / r) + phi = np.arctan2(y, x) + eta = -np.log(np.tan(theta / 2.)) + return r, theta, phi, rc, eta + +
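
# Usage sketch (not part of the patch) for the helpers in scripts/utils.py,
# assuming a reconstructed file produced by the pipeline above; the file path
# follows the run_benchmark.py defaults with the CI name tag, but is illustrative.
import ROOT
from utils import flatten_collection, cartesian_to_polar, imcal_info

rdf = ROOT.RDataFrame('events', 'sim_output/sim_data/imcal_epi_rec.root')
# flatten the truth collection of the first 1000 events into a pandas
# DataFrame; the 'MCParticles.' prefix is stripped from the column names
dfm = flatten_collection(rdf, 'MCParticles', ev_range=(0, 1000),
                         cols=['PDG', 'momentum.x', 'momentum.y', 'momentum.z'])
# momentum magnitude, polar/azimuthal angles, transverse momentum, pseudo-rapidity
p, theta, phi, pt, eta = cartesian_to_polar(
    *dfm[['momentum.x', 'momentum.y', 'momentum.z']].values.T)
print(imcal_info.nlayers_img, imcal_info.nlayers_scfi)  # 6 imaging, 20 ScFi layers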
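
# Sketch of how the HDF5 store written by ml_data_preprocess.py is consumed,
# mirroring the reshape done in ml_training.py; the store path is illustrative.
import pandas as pd
from utils import imcal_info

store_path = 'sim_output/sim_data/imcal_epi_data.h5'
df = pd.read_hdf(store_path, key=imcal_info.ml_data_key)      # hit-level features
dft = pd.read_hdf(store_path, key=imcal_info.truth_data_key)  # per-event PDG, P, mass
# index levels are [event, ltype, layer, hit], zero-padded to a fixed size,
# so the table reshapes into a dense (nevents, nlayers, nhits, nfeatures) array
nlayers = imcal_info.nlayers_img + imcal_info.nlayers_scfi
nhits = df.index.levels[-1].max()
xdata = df.values.reshape(len(dft), nlayers, nhits, len(df.columns))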
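
# A possible starting point for the TODO in run_benchmark.py (collect the
# two-step results): combine the e/p cut and the ML cut into a single
# performance number, using the same combination formula as the previous
# ml_training.py (step efficiencies multiply, and so do the pion suppression
# factors). The result file paths are illustrative.
import json

with open('sim_output/epcut/imcal_epi_result.json') as f:
    ep = json.load(f)['best']       # e/p scan: el_eff, pi_rej at the best cut
with open('sim_output/ml_result/imcal_epi_result.json') as f:
    ml = json.load(f)['test']       # ML benchmark: el_eff, pi_rej on test data

el_eff = ep['el_eff']*ml['el_eff']
rej_power = 1./((1. - ep['pi_rej'])*(1. - ml['pi_rej']))
print('combined efficiency: {:.2f}%, rejection power: {:.1f}'.format(el_eff*100., rej_power))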