Commit f0717c76 authored by Chao Peng

Update imaging ml benchmarks

parent 43c01cc4
1 merge request: !280 Update imaging ml benchmarks

Showing with 1149 additions and 511 deletions
ml_shower:epi_separation:
  extends: .rec_benchmark
  stage: benchmarks1
  script:
@@ -7,74 +7,7 @@ ml_shower:tagging_epimuphka_100:
    - |
      if [[ ${DETECTOR} =~ athena
            || ${DETECTOR} =~ ecce && ${DETECTOR_CONFIG} =~ imaging ]] ; then
        pip3 install -r benchmarks/imaging_shower_ML/requirements.txt
        python3 benchmarks/imaging_shower_ML/run_benchmark.py -t imcal_epi -n 1000 --pmin 1.8 --pmax 2.2 --nocut-samples
      fi
ml_shower:tagging_epimuphka:
  extends: .rec_benchmark
  when: manual
  stage: benchmarks1
  script:
    - ls -hal
    - |
      if [[ ${DETECTOR} =~ athena
            || ${DETECTOR} =~ ecce && ${DETECTOR_CONFIG} =~ imaging ]] ; then
        python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_epimuphka -n 10000 --particles "electron,pion-,muon,photon,kaon-" --pmin 0.5 --pmax 10
      fi

ml_shower:tagging_e:
  extends: .rec_benchmark
  when: manual
  stage: benchmarks1
  script:
    - |
      if [[ ${DETECTOR} =~ athena
            || ${DETECTOR} =~ ecce && ${DETECTOR_CONFIG} =~ imaging ]] ; then
        python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_e -n 100 --particles "electron" --pmin 0.5 --pmax 10
      fi

ml_shower:tagging_pi:
  extends: .rec_benchmark
  when: manual
  stage: benchmarks1
  script:
    - |
      if [[ ${DETECTOR} =~ athena
            || ${DETECTOR} =~ ecce && ${DETECTOR_CONFIG} =~ imaging ]] ; then
        python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_pi -n 100 --particles "pion-" --pmin 0.5 --pmax 10
      fi

ml_shower:training_100:
  extends: .rec_benchmark
  stage: process
  needs: ["ml_shower:tagging_epimuphka_100"]  # , "ml_shower:tagging_e", "ml_shower:tagging_pi"]
  script:
    - python3 -m pip install tensorflow particle
    - |
      if [[ ${DETECTOR} =~ athena
            || ${DETECTOR} =~ ecce && ${DETECTOR_CONFIG} =~ imaging ]] ; then
        python3 benchmarks/imaging_shower_ML/scripts/ml_training.py -t imcal_epimuphka_100 --pmin 0.5 --pmax 10
      fi

ml_shower:training:
  extends: .rec_benchmark
  stage: process
  when: manual
  needs: ["ml_shower:tagging_epimuphka"]  # , "ml_shower:tagging_e", "ml_shower:tagging_pi"]
  script:
    - python3 -m pip install tensorflow particle
    - |
      if [[ ${DETECTOR} =~ athena
            || ${DETECTOR} =~ ecce && ${DETECTOR_CONFIG} =~ imaging ]] ; then
        python3 benchmarks/imaging_shower_ML/scripts/ml_training.py -t imcal_epimuphka --pmin 0.5 --pmax 10
      fi

ml_shower:test:
  extends: .rec_benchmark
  when: manual
  stage: collect
  needs: ["ml_shower:training"]
  script:
    - ls -lrth
    # TODO
@@ -9,9 +9,9 @@ from GaudiKernel.SystemOfUnits import MeV, GeV, mm, cm, mrad
from Configurables import Jug__Digi__CalorimeterHitDigi as CalHitDigi
from Configurables import Jug__Reco__CalorimeterHitReco as CalHitReco
from Configurables import Jug__Reco__CalorimeterHitsMerger as CalHitsMerger
# from Configurables import Jug__Reco__ImagingPixelMerger as MLDataMerger
# from Configurables import Jug__Reco__ImagingPixelDataSorter as MLDataSorter
# from Configurables import Jug__Reco__ImagingPixelDataCombiner as MLDataCombiner

# input arguments through environment variables
@@ -20,9 +20,6 @@ kwargs['input'] = os.environ.get('JUGGLER_SIM_FILE', '')
kwargs['output'] = os.environ.get('JUGGLER_REC_FILE', '')
kwargs['compact'] = os.environ.get('JUGGLER_COMPACT_PATH', '')
kwargs['nev'] = int(os.environ.get('JUGGLER_N_EVENTS', -1))
kwargs['combine'] = os.environ.get('IMCAL_ML_COMBINE', 'concatenate')
kwargs['img_nlayers'] = int(os.environ.get('IMCAL_ML_IMG_NLAYERS', 9))
kwargs['nhits'] = int(os.environ.get('IMCAL_ML_NHITS', 20))

if kwargs['nev'] < 1:
    f = ROOT.TFile(kwargs['input'])
@@ -34,56 +31,66 @@ sf = float(os.environ.get('JUGGLER_SAMP_FRAC', '1.0'))
geo_service = GeoSvc('GeoSvc', detectors=[f.strip() for f in kwargs['compact'].split(',')])
podev = EICDataSvc('EventDataSvc', inputs=[f.strip() for f in kwargs['input'].split(',')])

podin = PodioInput('PodioReader', collections=[
    'MCParticles',
    'EcalBarrelImagingHits',
    'EcalBarrelImagingHitsContributions',
    'EcalBarrelScFiHits',
    'EcalBarrelScFiHitsContributions'
])
podout = PodioOutput('out', filename=kwargs['output'])

# Barrel Ecal AstroPix
becal_img_daq = dict(
    dynamicRangeADC=3*MeV,
    capacityADC=2**13,
    pedestalMean=400,
    pedestalSigma=20
)  # about 6 keV pedestal

becal_img_digi = CalHitDigi(
    'becal_img_digi',
    inputHitCollection='EcalBarrelImagingHits',
    outputHitCollection='EcalBarrelImagingHitsDigi',
    energyResolutions=[0., 0.02, 0.],  # 2% flat resolution
    **becal_img_daq
)

becal_img_reco = CalHitReco(
    'becal_img_reco',
    inputHitCollection=becal_img_digi.outputHitCollection,
    outputHitCollection='EcalBarrelImagingHitsReco',
    thresholdFactor=3,  # about 20 keV
    readoutClass='EcalBarrelImagingHits',  # readout class
    layerField='layer',  # field to get layer id
    sectorField='module',  # field to get sector id
    **becal_img_daq
)

"""
becal_img_merger = MLDataMerger(
    'becal_img_merger',
    inputHits=becal_img_reco.outputHitCollection,
    outputHits='EcalBarrelImagingHitsSeg',
    etaSize=0.001,
    phiSize=0.001
)

becal_img_sorter = MLDataSorter(
    'becal_img_sorter',
    inputHitCollection=becal_img_merger.outputHits,
    outputHitCollection='EcalBarrelImagingHitsML',
    numberOfLayers=6,
    numberOfHits=20
)
"""

# Barrel ECAL ScFi
# use the same daq_setting for digi/reco pair
becal_scfi_daq = dict(
    dynamicRangeADC=50.*MeV,
    capacityADC=2**15,
    pedestalMean=400,
    pedestalSigma=10)
# becal_scfi_daq = dict(
#     dynamicRangeADC=50.*MeV,
#     capacityADC=2**16,
#     pedestalMean=100,
#     pedestalSigma=0)

becal_scfi_digi = CalHitDigi('becal_scfi_digi',
    inputHitCollection='EcalBarrelScFiHits',
@@ -105,25 +112,25 @@ becal_scfi_merger = CalHitsMerger('becal_scfi_merger',
    # OutputLevel=DEBUG,
    inputHitCollection=becal_scfi_reco.outputHitCollection,
    outputHitCollection='EcalBarrelScFiGridReco',
    fields=['fiber', 'z'],
    fieldRefNumbers=[1, 1],
    readoutClass='EcalBarrelScFiHits')

"""
becal_scfi_sorter = MLDataSorter('becal_scfi_sorter',
    inputHitCollection=becal_scfi_merger.outputHitCollection,
    outputHitCollection='EcalBarrelScFiHitsML',
    numberOfLayers=20,
    numberOfHits=20)

# combine layers
becal_combiner = MLDataCombiner('becal_combiner',
    inputHits1=becal_img_sorter.outputHitCollection,
    inputHits2=becal_scfi_sorter.outputHitCollection,
    outputHits='EcalBarrelCombinedHitsML',
    layerIncrement=100,
    rule='concatenate')
"""

podout.outputCommands = [
    # 'keep *',
@@ -135,11 +142,15 @@ podout.outputCommands = [
]

ApplicationMgr(
    TopAlg=[
        podin,
        becal_img_digi, becal_img_reco,
        # becal_img_merger, becal_img_sorter,
        becal_scfi_digi, becal_scfi_reco, becal_scfi_merger,
        # becal_scfi_sorter,
        # becal_combiner,
        podout
    ],
    EvtSel='NONE',
    EvtMax=kwargs['nev'],
    ExtSvc=[podev],
...
tensorflow >= 2.10.0
tables >= 3.7.0
#! /usr/local/bin/python3
'''
A python script to facilitate the ML benchmarks for e/pi separation with the imaging calorimeter (single particles).
This process follows the steps below:
1. Simulation to generate training samples
2. Study and apply E/p cut to reduce the training samples
3. Train and test ML models with the "cleaned" (after E/p cut) samples
4. Benchmark the performance
Author: Chao Peng (ANL)
Date: 11/11/2022
'''
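# Example invocation (a sketch; the flags are defined by this script's argument
# parser below, the tag and momentum values are only an illustration):
#   python3 <this script> -t imcal_epi -n 1000 --particles "electron,pion-" --p-min 1.8 --p-max 2.2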
import os
import sys
import subprocess
import argparse


SDIR = os.path.dirname(os.path.realpath(__file__))

# {var} is from args
FILE_NAMES = dict(
    gen_script=os.path.join(SDIR, 'scripts', 'gen_particles.py'),
    rec_script=os.path.join(SDIR, 'options', 'imaging_ml_data.py'),
    epscan_script=os.path.join(SDIR, 'scripts', 'epcut_scan.py'),
    prep_script=os.path.join(SDIR, 'scripts', 'ml_data_preprocess.py'),
    ml_script=os.path.join(SDIR, 'scripts', 'ml_training.py'),
    sim_dir=os.path.join('{outdir}', 'sim_data'),
    epscan_dir=os.path.join('{outdir}', 'epcut'),
    ml_dir=os.path.join('{outdir}', 'ml_result'),
    gen_file=os.path.join('{outdir}', 'sim_data', '{ntag}_gen.hepmc'),
    sim_file=os.path.join('{outdir}', 'sim_data', '{ntag}_sim.edm4hep.root'),
    rec_file=os.path.join('{outdir}', 'sim_data', '{ntag}_rec.root'),
    ml_file=os.path.join('{outdir}', 'sim_data', '{ntag}_data.h5'),
)
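# e.g., with the default outdir='sim_output' and ntag='imcal_ml', gen_file
# expands to 'sim_output/sim_data/imcal_ml_gen.hepmc' after the .format(**kwargs)
# call in __main__ below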
# default values for argument parser
DEFAULT_COMPACT = os.path.join(
    os.environ.get('DETECTOR_PATH', ''),
    '{}.xml'.format(os.environ.get('DETECTOR_CONFIG', ''))
)

# defined steps
SCRIPT_STEPS = (
    'sim',     # step 1: simulation to generate samples
    'epscan',  # step 2: e/p cut scan to find the optimized cut
    'prep',    # step 3: prepare tagging data (apply e/p cut)
    'train',   # step 4: train ML model and benchmark it
    # 'bmk',   # step 5: benchmarking
)
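# the --steps flag selects a subset of these, e.g. --steps "epscan, prep, train"
# re-uses existing sim_data files and skips the (slow) simulation step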
# simulation and reconstruction
def gen_sim_rec(**kwargs):
    # generate particles
    gen_cmd = [
        'python {gen_script} {gen_file}',
        '-n {nev}',
        '-s {seed}',
        '--angmin {angmin} --angmax {angmax}',
        '--pmin {pmin} --pmax {pmax}',
        '--particles {particles}',
    ]
    gen_cmd = ' '.join(gen_cmd).format(**kwargs).split(' ')
    subprocess.run(gen_cmd)

    # simulation
    sim_cmd = [
        'ddsim --runType batch --part.minimalKineticEnergy 1*TeV --filter.tracker edep0',
        '-v WARNING',
        '--numberOfEvents {nev}',
        # '--physics.list {physics_list}',
        '--inputFiles {gen_file}',
        '--outputFile {sim_file}',
        '--compact {compact}',
    ]
    if 'seed' in kwargs and kwargs['seed'] > 0:
        sim_cmd += ['--random.seed {seed}']
    sim_cmd = ' '.join(sim_cmd).format(**kwargs).split(' ')
    # a negative return code means the process was terminated by a signal
    return_code = subprocess.run(sim_cmd).returncode
    print(return_code)
    if return_code is not None and return_code < 0:
        print("ERROR running simulation!")
        exit(return_code)
    subprocess.run(['rootls', '-t', kwargs['sim_file']])

    # reconstruction with juggler
    # export to environment variables (used to pass arguments to the option file)
    run_env = os.environ.copy()
    juggler_vars = [
        'JUGGLER_SIM_FILE {sim_file}',
        'JUGGLER_REC_FILE {rec_file}',
        'JUGGLER_COMPACT_PATH {compact}',
        'JUGGLER_N_EVENTS {nev}',
    ]
    # 'NAME value' pairs -> {NAME: value} environment updates
    lst = ' '.join(juggler_vars).format(**kwargs).split(' ')
    run_env.update({lst[i]: lst[i + 1] for i in range(0, len(lst), 2)})

    rec_cmd = 'gaudirun.py {rec_script}'.format(**kwargs).split(' ')
    print(rec_cmd)
    return_code = subprocess.run(rec_cmd, env=run_env).returncode
    print(return_code)
    if return_code is not None and return_code < 0:
        print("ERROR running juggler (reconstruction)!")
        exit(return_code)
    process = subprocess.run(['rootls', '-t', kwargs['rec_file']])
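# For the defaults, the three subprocess calls above expand to roughly
# (paths abbreviated, an illustration only):
#   python .../scripts/gen_particles.py sim_output/sim_data/imcal_ml_gen.hepmc -n 100 ...
#   ddsim --runType batch ... --inputFiles .../imcal_ml_gen.hepmc --outputFile .../imcal_ml_sim.edm4hep.root
#   gaudirun.py .../options/imaging_ml_data.py   # reads the JUGGLER_* variables from run_env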
if __name__ == '__main__':
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-n', '--n-events', type=int,
        dest='nev',
        default=100,
        help='number of events.'
    )
    parser.add_argument(
        '-o', '--outdir', type=str,
        dest='outdir',
        default='sim_output',
        help='output directory.'
    )
    parser.add_argument(
        '-t', '--name-tag', type=str,
        dest='ntag',
        default='imcal_ml',
        help='name tag for output files.'
    )
    parser.add_argument(
        '-c', '--compact', type=str,
        dest='compact',
        default=DEFAULT_COMPACT,
        help='path to detector compact file.'
    )
    parser.add_argument(
        '-s', '--seed', type=int,
        default=-1,
        help='random seed for child scripts (only passed if > 0).'
    )
    parser.add_argument(
        '--batch-size', type=int,
        dest='batch',
        default=100000,
        help='batch size to process data.'
    )
    parser.add_argument(
        '--p-min', type=float,
        dest='pmin',
        default=1.5,
        help='minimum momentum of particles.'
    )
    parser.add_argument(
        '--p-max', type=float,
        dest='pmax',
        default=2.5,
        help='maximum momentum of particles.'
    )
    parser.add_argument(
        '--angle-min', type=float,
        dest='angmin',
        default=75,
        help='minimum scattering angle of particles.'
    )
    parser.add_argument(
        '--angle-max', type=float,
        dest='angmax',
        default=105,
        help='maximum scattering angle of particles.'
    )
    parser.add_argument(
        '--weight-pion', type=float,
        dest='wpi',
        default=10.,
        help='weights for pions.'
    )
    parser.add_argument(
        '--particles', type=str,
        default='electron,pion-',
        help='particle names, separated by ",".'
    )
    parser.add_argument(
        '--test-size', type=float,
        default=0.2,
        help='relative size of test samples (between 0-1).'
    )
    parser.add_argument(
        '--no-epcut', action='store_true',
        default=False,
        help='do not apply E/p cut to ML samples; might be useful for test runs with small samples.'
    )
    parser.add_argument(
        '--steps', type=str,
        default=', '.join(SCRIPT_STEPS),
        help='FOR DEV: choose the steps to be executed ({}).'.format(', '.join(SCRIPT_STEPS))
    )
    args = parser.parse_args()
    kwargs = vars(args)

    # prepare
    steps = [p.strip() for p in args.steps.split(',')]

    # make dirs, add paths to kwargs
    FILE_NAMES.update({key: val.format(**kwargs) for key, val in FILE_NAMES.items()})
    for key, val in FILE_NAMES.items():
        if key.endswith('_dir'):
            os.makedirs(val, exist_ok=True)
    kwargs.update(FILE_NAMES)

    # simulation for benchmark samples
    if SCRIPT_STEPS[0] in steps:
        gen_sim_rec(**kwargs)

    # ep cut scan
    if SCRIPT_STEPS[1] in steps:
        cmd = [
            'python {epscan_script} {rec_file}',
            '-o {epscan_dir} --name-tag {ntag}',
            '--batch-size {batch}',
            '-s {seed}',
            '-e 0.97',
        ]
        cmd = ' '.join(cmd).format(**kwargs).split(' ')
        return_code = subprocess.run(cmd).returncode
        if return_code is not None and return_code < 0:
            print("ERROR running epcut scan script!")
            exit(return_code)

    # prepare ML data
    if SCRIPT_STEPS[2] in steps:
        cmd = [
            'python {prep_script} {rec_file}',
            '-o {ml_file}',
            '--batch-size {batch}',
            '-s {seed}',
            '--nhits 50',
        ]
        if not args.no_epcut:
            cmd.append('--epcut {epscan_dir}/{ntag}_result.json')
        cmd = ' '.join(cmd).format(**kwargs).split(' ')
        return_code = subprocess.run(cmd).returncode
        if return_code is not None and return_code < 0:
            print("ERROR running data preprocessing script!")
            exit(return_code)

    # train ML model and benchmark it
    if SCRIPT_STEPS[3] in steps:
        cmd = [
            'python {ml_script} {ml_file}',
            '-o {ml_dir} --name-tag {ntag}',
            '--weight-pion-minus {wpi}',
            '-s {seed}',
            '--batch-size 256',
            '--epochs 20',
            '--test-size {test_size}',
        ]
        cmd = ' '.join(cmd).format(**kwargs).split(' ')
        return_code = subprocess.run(cmd).returncode
        if return_code is not None and return_code < 0:
            print("ERROR running ML training script!")
            exit(return_code)

    # TODO: collect the two-step results (e/p cut and ML)
import ROOT
import os
import gc
import numpy as np
import pandas as pd
import argparse
import sys
import matplotlib.pyplot as plt
# read from RDataFrame and flatten a given collection, return pandas dataframe
def flatten_collection(rdf, collection, cols=None, event_colname='event'):
    if not cols:
        cols = [str(c) for c in rdf.GetColumnNames() if str(c).startswith('{}.'.format(collection))]
    else:
        cols = ['{}.{}'.format(collection, c) for c in cols]
    if not cols:
        print('cannot find any branch under collection {}'.format(collection))
        return pd.DataFrame()

    data = rdf.AsNumpy(cols)
    # flatten the data, add an event id to identify clusters from different events
    evns = []
    for i, vec in enumerate(data[cols[0]]):
        evns += [i]*vec.size()
    for n, vals in data.items():
        # make sure ints are not converted to floats
        typename = vals[0].__class__.__name__.lower()
        dtype = np.int64 if 'int' in typename or 'long' in typename else np.float32
        # type safe creation
        data[n] = np.asarray([v for vec in vals for v in vec], dtype=dtype)
    # build data frame
    dfp = pd.DataFrame({c: pd.Series(v) for c, v in data.items()})
    dfp.loc[:, event_colname] = evns
    return dfp
# load root macros, input is an argument string
def load_root_macros(arg_macros):
    for path in arg_macros.split(','):
        path = path.strip()
        if os.path.exists(path):
            ROOT.gROOT.Macro(path)
        else:
            print('"{}" does not exist, skip loading it.'.format(path))


def cartesian_to_polar(x, y, z):
    r = np.sqrt(x**2 + y**2 + z**2)
    rc = np.sqrt(x**2 + y**2)
    theta = np.arccos(z / r)
    phi = np.arctan2(y, x)
    eta = -np.log(np.tan(theta / 2.))
    return r, theta, phi, rc, eta
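# quick numeric check of the conversion (values rounded):
#   cartesian_to_polar(0., 1., 1.) -> r=1.414, theta=0.785, phi=1.571, rc=1.0, eta=0.881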
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to root file')
    parser.add_argument('--sim', action='store_true', help='flag to switch between sim and rec data (default rec)')
    parser.add_argument('-o', type=str, default='.', dest='outdir', help='output directory')
    # parser.add_argument('--compact', type=str, default='', dest='compact', help='compact file')
    parser.add_argument('-m', '--macros', type=str, default='rootlogon.C', dest='macros',
                        help='root macros to load (accept multiple paths separated by ",")')
    parser.add_argument('--branch', type=str, default='EcalBarrelImagingHitsReco', help='name of data branch (edm4eic::CalorimeterHitCollection)')
    parser.add_argument('--truth-branch', type=str, default='MCParticles', help='name of truth mc branch')
    parser.add_argument('--edep-max', type=float, default=0., help='maximum edep (GeV) to plot')
    parser.add_argument('--edep-nbins', type=int, default=200, help='number of bins')
    parser.add_argument('--name-tag', type=str, default='test', help='name tag to save the file')
    parser.add_argument('--samp-frac', type=float, default=1.0, help='sampling fraction')
    args = parser.parse_args()

    os.makedirs(args.outdir, exist_ok=True)
    load_root_macros(args.macros)

    # read data and MCParticles
    rdf = ROOT.RDataFrame("events", args.file)

    mc_branch = args.truth_branch
    dfm = flatten_collection(rdf, mc_branch, ['generatorStatus', 'PDG', 'momentum.x', 'momentum.y', 'momentum.z', 'mass'])
    dfm.rename(columns={c: c.replace(mc_branch + '.', '') for c in dfm.columns}, inplace=True)

    # select incident particles
    dfm = dfm[dfm['generatorStatus'].isin([0, 1])]
    # NOTE: assumed single particles
    dfm = dfm.groupby('event').first()
    # p, theta, phi, pT, eta = cartesian_to_polar(*dfm[['momentum.x', 'momentum.y', 'momentum.z']].values.T)

    if args.sim:
        df = flatten_collection(rdf, args.branch, ['energyDeposit'])
        df.rename(columns={c: c.replace(args.branch + '.', '') for c in df.columns}, inplace=True)
        df.rename(columns={'energyDeposit': 'energy'}, inplace=True)
    else:
        df = flatten_collection(rdf, args.branch, ['layer', 'energy', 'position.x', 'position.y', 'position.z'])
        df.rename(columns={c: c.replace(args.branch + '.', '') for c in df.columns}, inplace=True)

    dfe = df.groupby('event')['energy'].sum().reset_index()

    # determine histogram bins
    if args.edep_max <= 0.:
        args.edep_max = dfe['energy'].quantile(0.99)*1.2
    bins = np.linspace(0., args.edep_max, args.edep_nbins + 1)
    bincenters = (bins[1:] + bins[:-1])/2.

    # get particle types
    fig, ax = plt.subplots(figsize=(16, 9), dpi=120, gridspec_kw={'left': 0.15, 'right': 0.95})
    ax.set_xlabel('Energy Deposit / {:.2f} (GeV)'.format(args.samp_frac), fontsize=24)
    ax.set_ylabel('Normalized Counts', fontsize=24)
    ax.set_yscale('log')
    ax.grid(linestyle=':')
    ax.tick_params(labelsize=24)
    ax.set_axisbelow(True)

    hist_vals, hist_cols = [], []
    pdgbase = ROOT.TDatabasePDG()
    for pdgid in dfm['PDG'].unique():
        particle = pdgbase.GetParticle(int(pdgid))
        if not particle:
            print("Unknown pdgcode {}, it is ignored".format(int(pdgid)))
            continue
        events_indices = dfm[dfm.loc[:, 'PDG'] == pdgid].index.unique()
        print("{} entries of particle {}".format(len(events_indices), particle.GetName()))

        dfe_part = dfe.loc[dfe['event'].isin(events_indices)]
        edep_vals, _, _ = ax.hist(dfe_part['energy'] / args.samp_frac,
                                  weights=[1. / dfe_part.shape[0]]*dfe_part.shape[0],
                                  histtype='step', bins=bins, label=particle.GetName())
        hist_vals.append(edep_vals)
        hist_cols.append(particle.GetName())

    pd.DataFrame(index=bincenters, data=np.vstack(hist_vals).T, columns=hist_cols)\
      .to_csv('{}.csv'.format(args.name_tag))
    ax.legend(fontsize=24)
    fig.savefig('{}.png'.format(args.name_tag))
'''
A script to scan for the optimized cut on layer and E/p.
It scans all the possible ScFi layers (20 in the EPIC brycecanyon configuration).
The results give the best cut (highest pion rejection) on [layer, E/p] for a targeted electron efficiency.

Chao Peng (ANL)
2022/11/13
'''
import os
import gc
import sys
import json
import ROOT
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf as mpdf
from collections import OrderedDict
from utils import flatten_collection, imcal_info
# default color cycle of matplotlib
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'file', type=str,
        help='path to root file.'
    )
    parser.add_argument(
        '-o', type=str,
        default='epcuts',
        dest='outdir',
        help='output directory.'
    )
    parser.add_argument(
        '--batch-size', type=int,
        default=100000,
        help='batch size to process data.'
    )
    parser.add_argument(
        '--name-tag', type=str,
        default='imcal',
        dest='ntag',
        help='nametag for output file.'
    )
    # parser.add_argument('--compact', type=str, default='', dest='compact', help='compact file')
    parser.add_argument(
        '-b', '--branch', type=str,
        default='EcalBarrelScFiGridReco',
        help='name of data branch (edm4eic::CalorimeterHitCollection).'
    )
    parser.add_argument(
        '-b2', '--truth-branch', type=str,
        default='MCParticles',
        help='name of truth mc branch.'
    )
    parser.add_argument(
        '-s', '--seed', type=int,
        default=-1,
        help='random seed.'
    )
    # flat resolution now, may need to add P-dependent terms?
    parser.add_argument(
        '-r', '--tracking-resolution', type=float,
        dest='res',
        default=5e-3,
        help='relative resolution of tracking (used to smear truth momentum).'
    )
    parser.add_argument(
        '-e', '--efficiency', type=float,
        dest='eff',
        default=0.97,
        help='targeted efficiency from the cuts.'
    )
    parser.add_argument(
        '--sampling-fractions', type=str,
        default='0,0.11,2200',
        help='sampling fractions range (min,max,nbins).'
    )
    args = parser.parse_args()

    os.makedirs(args.outdir, exist_ok=True)
    if args.seed > 0:
        np.random.seed(args.seed)
    # read data and MCParticles
    rdf = ROOT.RDataFrame('events', args.file)
    # event range
    ntotal = rdf.Count().GetValue()
    ev_bins = np.append(np.arange(0, ntotal, args.batch_size), [ntotal])

    # data container
    nlayers = int(imcal_info.nlayers_scfi)
    samps = args.sampling_fractions.split(',')
    samp_range = (float(samps[0]), float(samps[1]))
    samp_nbins = int(samps[2])
    # print(nlayers)
    ep_bins = np.linspace(*samp_range, samp_nbins + 1)
    ep_centers = (ep_bins[:-1] + ep_bins[1:])/2.
    el_hist = np.zeros(shape=(nlayers, samp_nbins))
    pi_hist = np.zeros(shape=(nlayers, samp_nbins))
    # process events
    # avoid insane amount of memory use with a large dataset
    for ev_begin, ev_end in zip(ev_bins[:-1], ev_bins[1:]):
        gc.collect()
        print('E/p cut: processing events {:d} - {:d}'.format(ev_begin, ev_end))
        dfm = flatten_collection(rdf, args.truth_branch, (int(ev_begin), int(ev_end)), cols=[
            'generatorStatus',
            'PDG',
            'momentum.x', 'momentum.y', 'momentum.z',
            'mass'
        ])
        # select incident particles
        dfm = dfm[dfm['generatorStatus'].isin([0, 1])]
        # NOTE: assumed single particles
        dfm = dfm.groupby('event').first().reset_index()
        # True momentum
        dfm.loc[:, 'P'] = np.sqrt(dfm['momentum.x']**2 + dfm['momentum.y']**2 + dfm['momentum.z']**2)
        # Smear with tracking resolution (0.5%)
        dfm.loc[:, 'Ptrack'] = dfm['P']*np.random.normal(1.0, args.res, len(dfm))
        # print(dfm[['PDG', 'P', 'Ptrack']])
        # print(dfm['PDG'].value_counts())

        # reconstructed simulation hits
        df = flatten_collection(rdf, args.branch, (int(ev_begin), int(ev_end)), cols=[
            'layer',
            'energy',
        ])
        # cumulative energy sum for layer
        dfe = df.groupby(['event', 'layer'])['energy'].sum().groupby(level=0).cumsum()
        # fill missing layer (no hits in that layer) edep values with the previous value
        dfe = dfe.reset_index(level='event').groupby('event', as_index=False)\
                 .apply(lambda grp: grp.reindex(np.arange(nlayers) + 1, method='ffill'))\
                 .reset_index().drop('level_0', axis=1)
        # print(dfe)
        dfe = dfe.reset_index().merge(dfm[['event', 'Ptrack', 'PDG']], on='event')
        dfe.loc[:, 'eoverp'] = dfe['energy']/dfe['Ptrack']
        # NOTE: assumed only e- and pi-
        el_mask = dfe['PDG'] == 11
        dfel = dfe[el_mask]
        dfpi = dfe[~el_mask]
        # fill data layer by layer
        for hist, dftmp in zip([el_hist, pi_hist], [dfel, dfpi]):
            for i in np.arange(nlayers):
                vals = dftmp[dftmp['layer'] == i + 1]['eoverp'].values
                hvals, _ = np.histogram(vals, bins=ep_bins)
                hist[i] += hvals
    # save binned data
    for hist, label in zip([el_hist, pi_hist], ['el', 'pi']):
        dfr = pd.DataFrame(
            columns=['bin_left', 'bin_right'] + ['layer_{:d}'.format(i+1) for i in np.arange(nlayers)],
            data=np.vstack([ep_bins[:-1], ep_bins[1:], hist]).T
        )
        dfr.to_csv(os.path.join(args.outdir, '{}_eop_{}.csv'.format(args.ntag, label)), index=False)
        # print(label, np.sum(hist, axis=1))
        # print(dfr)
    # study the epcut performance with binned data
    best = {
        'layer': int(nlayers + 1),
        'ep_cut': 0.,
        'el_eff': 0.,
        'pi_rej': 0.,
    }
    ep_dict = OrderedDict([
        ('info', {
            'nsamples': int(ntotal),
            'targeted_efficiency': args.eff,
            'tracking_resolution': args.res
        }),
        ('best', best),
    ])
    pdf = mpdf.PdfPages(os.path.join(args.outdir, '{}_layers.pdf'.format(args.ntag)))
    for i in np.arange(nlayers):
        elvals, pivals = el_hist[i], pi_hist[i]
        # cut position
        # NOTE: larger error with wider bins
        perc = np.cumsum(elvals)/np.sum(elvals)
        idx = len(perc[perc < 1. - args.eff])
        ep_cut = ep_centers[idx]
        # efficiency
        eff = np.sum(elvals[idx:])/np.sum(elvals)
        # rejection power
        rej = np.sum(pivals[:idx])/np.sum(pivals)
        ep_dict['layer_{:d}'.format(i + 1)] = {'ep_cut': ep_cut, 'el_eff': eff, 'pi_rej': rej}
        # greater or equal here because we always favor the cut on more layers
        if rej >= best['pi_rej']:
            best.update({
                'layer': int(i + 1),
                'ep_cut': ep_cut,
                'el_eff': eff,
                'pi_rej': rej,
            })
        # plot showing the results
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.hist(ep_centers, weights=pivals, bins=50, label=r'$\pi^-$', color=colors[0], ec=colors[0], alpha=0.5)
        ax.hist(ep_centers, weights=elvals, bins=50, label=r'$e^-$', color=colors[1], ec=colors[1], alpha=0.5)
        ax.legend(fontsize=20, ncol=2, loc='upper center', bbox_to_anchor=(0.5, 1.12))
        ax.tick_params(labelsize=20)
        ax.set_xlabel('$E/p$', fontsize=20)
        ax.set_ylabel('Counts', fontsize=20)
        ax.axvline(x=ep_cut, color='k', ls='--', lw=2)
        props = dict(boxstyle='round', facecolor='white', alpha=0.5)
        ax.text(0.5, 0.97,
                'Layer $\\leq${:d}\n$\\epsilon_e={:.2f}$%\n$R_{{\\pi}}={:.2f}$%'.format(i + 1, eff*100., rej*100.),
                transform=ax.transAxes, fontsize=20, va='top', ha='center', bbox=props)
        pdf.savefig(fig)
    pdf.close()
    # save cut position and performance
    ep_dict.update({'best': best})
    ep_json = json.dumps(ep_dict, indent=4)
    with open(os.path.join(args.outdir, '{}_result.json'.format(args.ntag)), 'w') as outfile:
        outfile.write(ep_json)
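    # The json file written above is organized as follows (a sketch of the
    # structure; values differ per run):
    #   {
    #       "info": {"nsamples": ..., "targeted_efficiency": 0.97, "tracking_resolution": 0.005},
    #       "best": {"layer": ..., "ep_cut": ..., "el_eff": ..., "pi_rej": ...},
    #       "layer_1": {"ep_cut": ..., "el_eff": ..., "pi_rej": ...},
    #       ...
    #   }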
'''
A script to prepare datasets for ML training
'''
import ROOT
import os
import gc
import sys
import json
import numpy as np
import pandas as pd
import argparse
from utils import flatten_collection, cartesian_to_polar, imcal_info
pd.set_option('display.max_rows', 500)
# normalizing range for features
# NOTE: assumed for barrel ecal, using the range of 0.5 - 2 meters to normalize the r0 values
WIN_ETA = (-0.2, 0.2) # unitless
WIN_PHI = (-0.4, 0.4) # rad
WIN_R0 = (500, 2000) # mm
WIN_EH = (0, 0.05) # GeV
# only positive weights are returned
# basis defines the minimum contribution (4.6 is about 1%)
def log_weights(val, basis=4.6):
    w = np.log(val) + basis
    return np.clip(w, 0., None)
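# e.g., with the default basis, exp(-4.6) is about 0.01, so a hit carrying less
# than ~1% of the event energy gets weight 0:
#   log_weights(0.01) -> ~0.0 (clipped);  log_weights(0.1) -> ~2.3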
def read_epcut(path):
    if not path or not os.path.exists(path):
        print('Prepare ML data: not applying E/p cut because E/p file is not provided or missing.')
        return None
    with open(path, 'r') as f:
        data = json.load(f)
    epcut = data.get('best', {})
    epcut.update(data.get('info', {}))
    return epcut
def lin_norm(vals, window, val_name='', var_name='', warn_win_size=True):
    # only warn when the sample is statistically meaningful
    if warn_win_size and len(vals) > 1000:
        perc = (np.percentile(vals, 5), np.percentile(vals, 95))
        if perc[0] < window[0] or perc[1] > window[1]:
            warnstr = 'WARNING - Prepare ML data: normalization window {} does not fit {} values {}.'\
                      .format(window, val_name, perc)
            if var_name:
                warnstr += ' Try adjusting {}'.format(var_name)
            print(warnstr)
    return np.clip((vals - window[0])/(window[1] - window[0]), 0., 1.)
# prepare ML datasets from dataframe
def format_ml_data(dfh, dft, nlayers=20, nhits=50,
                   win_eta=WIN_ETA, win_phi=WIN_PHI, win_r0=WIN_R0, win_eh=WIN_EH):
    # sanity check
    dfh = dfh[dfh['energy'] > 0.]

    # add more data
    r, theta, phi, rc, eta = cartesian_to_polar(*dfh[['position.x', 'position.y', 'position.z']].values.T)
    event_group = dfh.groupby('event')

    # convert eta, phi, edep so they are within [0, 1]
    dfh.loc[:, 'etotal'] = event_group['energy'].transform('sum')
    # get the hits energy ratio
    dfh.loc[:, 'eratio'] = dfh['energy']/dfh['etotal']

    # calculate weighted center of the event, in the future this can be replaced by tracking input
    # NOTE: assumed single particle events
    # log weighting
    dfh.loc[:, 'logw'] = log_weights(dfh['eratio'].values)
    dfh.loc[:, 'wtotal'] = event_group['logw'].transform('sum')
    # sanity check
    dfh = dfh[dfh['wtotal'] > 0.]
    # regroup because of mutation
    event_group = dfh.groupby('event')

    dfh.loc[:, 'logw'] = dfh['logw']/dfh['wtotal']
    dfh.loc[:, 'xw'] = dfh['position.x']*dfh['logw']
    dfh.loc[:, 'yw'] = dfh['position.y']*dfh['logw']
    dfh.loc[:, 'zw'] = dfh['position.z']*dfh['logw']

    # position center
    dfh.loc[:, 'xc'] = event_group['xw'].transform('sum')
    dfh.loc[:, 'yc'] = event_group['yw'].transform('sum')
    dfh.loc[:, 'zc'] = event_group['zw'].transform('sum')

    # single hits features
    r, theta, phi, r0, eta = cartesian_to_polar(*dfh[['position.x', 'position.y', 'position.z']].values.T)
    # center
    rc, thetac, phic, r0c, etac = cartesian_to_polar(*dfh[['xc', 'yc', 'zc']].values.T)

    # data for ML
    dfml = dfh[['event', 'layer']].copy()
    # NOTE: not much of a difference observed between eratio and normalized E with a narrow momentum range (e.g., 1.5-2.5 GeV/c)
    dfml.loc[:, 'eh'] = dfh['eratio'].values
    # dfml.loc[:, 'eh'] = lin_norm(dfh['energy'].values, win_eh, "Ehit")
    dfml.loc[:, 'r0'] = lin_norm(r0, win_r0, "r0")
    dfml.loc[:, 'eta'] = lin_norm(eta - etac, win_eta, "eta")
    dfml.loc[:, 'phi'] = lin_norm(phi - phic, win_phi, "phi")

    # sort by energy
    dfml = dfml.sort_values(['event', 'layer', 'eh'], ascending=[True, True, False])
    dfml.loc[:, 'hit'] = dfml.groupby(['event', 'layer']).cumcount() + 1
    dfml.set_index(['event', 'layer', 'hit'], inplace=True)

    # shape data (nevents, nlayers, nhits)
    # padding with zeros
    # NOTE: if more than nhits per layer, the least energetic hits are dropped because of the previous energy sorting
    dfml = dfml.reindex(
        pd.MultiIndex.from_product([dfml.index.levels[0], np.arange(nlayers) + 1, np.arange(nhits) + 1],
                                   names=['event', 'layer', 'hit']),
        fill_value=0
    )
    return dfml, list(event_group.groups.keys())
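# e.g., with nlayers=20 and nhits=50, every event becomes a fixed 20 x 50 grid of
# (eh, r0, eta, phi) features, zero-padded, ready to be reshaped for the ML model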
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Visualize the cluster (layer-wise) from analysis')
    parser.add_argument(
        'file', type=str,
        help='path to the input file.'
    )
    parser.add_argument(
        '-o', type=str,
        default='.',
        dest='outpath',
        help='path to output data file (hdf5).'
    )
    parser.add_argument(
        '--batch-size', type=int,
        default=100000,
        help='batch size to process data.'
    )
    parser.add_argument(
        '-s', '--seed', type=int,
        default=-1,
        help='random seed.'
    )
    parser.add_argument(
        '--nhits', type=int,
        default=50,
        dest='nhits',
        help='number of hits per layer.'
    )
    parser.add_argument(
        '-b', '--img-branch', type=str,
        default='EcalBarrelImagingHitsReco',
        help='name of data branch (edm4eic::CalorimeterHitCollection).'
    )
    parser.add_argument(
        '-b2', '--scfi-branch', type=str,
        default='EcalBarrelScFiGridReco',
        help='name of data branch (edm4eic::CalorimeterHitCollection).'
    )
    parser.add_argument(
        '-b3', '--truth-branch', type=str,
        default='MCParticles',
        help='name of truth mc branch.'
    )
    parser.add_argument(
        '--epcut', type=str,
        default=None,
        help='a json file containing epcut information (the "best" entry will be used).'
    )
    parser.add_argument(
        '--append-data', action='store_true',
        default=False,
        help='append data to a pre-existing data file.'
    )
    args = parser.parse_args()

    data_store = pd.HDFStore(args.outpath)
    # print(data_store.keys())
    # clear data store
    if not args.append_data:
        for dkey in [imcal_info.ml_data_key, imcal_info.truth_data_key]:
            if dkey in data_store.keys():
                data_store.remove(dkey)
                print('Prepare ML data: removed pre-existing dataset {}'.format(dkey))
    if args.seed > 0:
        np.random.seed(args.seed)

    # read data and MCParticles
    rdf = ROOT.RDataFrame("events", args.file)
    # event range
    ntotal = rdf.Count().GetValue()
    ev_bins = np.append(np.arange(0, ntotal, args.batch_size), [ntotal])

    # epcut info
    epcut = read_epcut(args.epcut)
    # print(epcut)

    # process events
    # avoid insane amount of memory use with a large dataset
    for ev_begin, ev_end in zip(ev_bins[:-1], ev_bins[1:]):
        gc.collect()
        print('Prepare ML data: processing events {:d} - {:d}'.format(ev_begin, ev_end))
        dfm = flatten_collection(rdf, args.truth_branch, (int(ev_begin), int(ev_end)), cols=[
            'generatorStatus',
            'PDG',
            'momentum.x', 'momentum.y', 'momentum.z',
            'mass'
        ])
        # select incident particles
        dfm = dfm[dfm['generatorStatus'].isin([0, 1])]
        # NOTE: assumed single particles
        dfm = dfm.groupby('event').first()
        # True momentum
        dfm.loc[:, 'P'] = np.sqrt(dfm['momentum.x']**2 + dfm['momentum.y']**2 + dfm['momentum.z']**2)
        # print(dfm)

        # scfi information
        dfs = flatten_collection(rdf, args.scfi_branch, (int(ev_begin), int(ev_end)), cols=[
            'layer',
            'energy',
            'position.x', 'position.y', 'position.z',
        ])
        # imaging layer information
        dfi = flatten_collection(rdf, args.img_branch, (int(ev_begin), int(ev_end)), cols=[
            'layer',
            'energy',
            'position.x', 'position.y', 'position.z',
        ])

        # apply ep_cut
        if epcut:
            # cumulative energy sum for layer
            dfe = dfs.groupby(['event', 'layer'])['energy'].sum().groupby(level=0).cumsum()
            # fill missing layer (no hits in that layer) edep values with the previous value
            dfe = dfe.reset_index(level='event').groupby('event', as_index=False)\
                     .apply(lambda grp: grp.reindex(np.arange(imcal_info.nlayers_scfi) + 1, method='ffill'))\
                     .reset_index().drop('level_0', axis=1)
            # unstack layer edeps
            dfe = dfe.set_index(['event', 'layer'])\
                     .stack().unstack(level='layer')
            dfe.columns = ['layer_{}'.format(i) for i in dfe.columns]
            dfe.columns.name = None
            dfe = dfe.reset_index().drop('level_1', axis=1).set_index('event', drop=True)
            # print(dfe)
            p_smear = dfm['P']*np.random.normal(1.0, epcut['tracking_resolution'], len(dfm))
            ep_mask = (dfe['layer_{layer}'.format(**epcut)]/p_smear) >= epcut['ep_cut']
            # print(np.sum(ep_mask))
            dfm = dfm[ep_mask]
            dfs = dfs[dfs['event'].isin(dfm.index)]
            dfi = dfi[dfi['event'].isin(dfm.index)]

        # reformat data for ML
        dfi, ievs = format_ml_data(dfi, dfm, nlayers=imcal_info.nlayers_img, nhits=args.nhits)
        dfs, sevs = format_ml_data(dfs, dfm, nlayers=imcal_info.nlayers_scfi, nhits=args.nhits)
        if ievs != sevs:
            common = np.intersect1d(ievs, sevs)
            dfi = dfi[dfi.index.isin(common, level='event')]
            dfs = dfs[dfs.index.isin(common, level='event')]
            dfm = dfm[dfm.index.isin(common)].sort_index()
        else:
            dfm = dfm[dfm.index.isin(ievs)].sort_index()

        # assign layer type to different datasets
        dfi.loc[:, 'ltype'] = 'img'
        dfi.loc[:, 'lval'] = 0
        dfs.loc[:, 'ltype'] = 'scfi'
        dfs.loc[:, 'lval'] = 1
        # null eta value for ScFi layer
        # NOTE: add it back in the future with timing-z resolution
        dfs['eta'] = 0.

        # combine two datasets
        df = pd.concat([dfi.reset_index(), dfs.reset_index()], ignore_index=True)\
               .set_index(['event', 'ltype', 'layer', 'hit']).sort_index()
        # print(df.head(100))
        data_store.append(imcal_info.ml_data_key, df, format='t', data_columns=True)
        data_store.append(imcal_info.truth_data_key, dfm[['PDG', 'P', 'mass']], format='t', data_columns=True)
    data_store.close()
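    # Reading the prepared dataset back (a sketch):
    #   store = pd.HDFStore('imcal_ml_data.h5', mode='r')
    #   df_hits = store[imcal_info.ml_data_key]      # MultiIndex: (event, ltype, layer, hit)
    #   df_truth = store[imcal_info.truth_data_key]  # per-event PDG, P, mass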
import ROOT
import pandas as pd
import numpy as np
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


# basic information shared among all scripts
imcal_info = dotdict(
    nlayers_img=6,
    nlayers_scfi=20,
    ml_data_key='/ml_data',
    truth_data_key='/truth',
)
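# dotdict allows attribute-style access, e.g. imcal_info.nlayers_img -> 6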
# read from RDataFrame and flatten a given collection, return pandas dataframe
def flatten_collection(rdf, collection, ev_range=None, cols=None, event_colname='event'):
    if not cols:
        cols = [str(c) for c in rdf.GetColumnNames() if str(c).startswith('{}.'.format(collection))]
    else:
        cols = ['{}.{}'.format(collection, c) for c in cols]
    if not cols:
        print('cannot find any branch under collection {}'.format(collection))
        return pd.DataFrame()
    # take data
    if not ev_range:
        data = rdf.AsNumpy(cols)
    else:
        data = rdf.Range(*ev_range).AsNumpy(cols)
    # flatten the data, add an event id to identify clusters from different events
    evns = []
    for i, vec in enumerate(data[cols[0]]):
        evns += [i]*vec.size()
    for n, vals in data.items():
        # make sure ints are not converted to floats
        typename = vals[0].__class__.__name__.lower()
        dtype = np.int64 if 'int' in typename or 'long' in typename else np.float32
        # type safe creation
        data[n] = np.asarray([v for vec in vals for v in vec], dtype=dtype)
    # build data frame
    dfp = pd.DataFrame({c: pd.Series(v) for c, v in data.items()})
    evns = np.asarray(evns)
    if ev_range is not None:
        evns += ev_range[0]
    dfp.loc[:, event_colname] = evns
    dfp.rename(columns={c: c.replace(collection + '.', '') for c in dfp.columns}, inplace=True)
    return dfp
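# Typical usage (a sketch, assuming a reconstructed file with an 'events' tree):
#   rdf = ROOT.RDataFrame('events', 'rec.root')
#   df = flatten_collection(rdf, 'MCParticles', (0, 1000), cols=['PDG', 'mass'])
#   # -> one row per particle, columns 'PDG', 'mass' plus an 'event' id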
def cartesian_to_polar(x, y, z):
    r = np.sqrt(x**2 + y**2 + z**2)
    rc = np.sqrt(x**2 + y**2)
    theta = np.arccos(z / r)
    phi = np.arctan2(y, x)
    eta = -np.log(np.tan(theta / 2.))
    return r, theta, phi, rc, eta