Skip to content
Snippets Groups Projects
Commit 3842b0d4 authored by Chao Peng's avatar Chao Peng
Browse files

update for latest benchmarks

parent c528fa60
No related tags found
1 merge request!201Update ECal Barrel ML: combine Imaging and ScFi layers
......@@ -8,6 +8,7 @@ from GaudiKernel.SystemOfUnits import MeV, GeV, mm, cm, mrad
from Configurables import Jug__Base__InputCopier_dd4pod__Geant4ParticleCollection_dd4pod__Geant4ParticleCollection_ as MCCopier
from Configurables import Jug__Digi__CalorimeterHitDigi as CalHitDigi
from Configurables import Jug__Digi__CalorimeterBirksCorr as CalBirksCorr
from Configurables import Jug__Reco__CalorimeterHitReco as CalHitReco
from Configurables import Jug__Reco__CalorimeterHitsMerger as CalHitsMerger
from Configurables import Jug__Reco__ImagingPixelMerger as MLDataMerger
......@@ -20,8 +21,10 @@ kwargs = dict()
kwargs['input'] = os.environ.get('JUGGLER_SIM_FILE', '')
kwargs['output'] = os.environ.get('JUGGLER_REC_FILE', '')
kwargs['compact'] = os.environ.get('JUGGLER_COMPACT_PATH', '')
kwargs['combine'] = os.environ.get('IMCAL_ML_COMBINE', 'concatenate')
kwargs['nev'] = int(os.environ.get('JUGGLER_N_EVENTS', -1))
kwargs['combine'] = os.environ.get('IMCAL_ML_COMBINE', 'concatenate')
kwargs['img_nlayers'] = int(os.environ.get('IMCAL_ML_IMG_NLAYERS', 9))
kwargs['nhits'] = int(os.environ.get('IMCAL_ML_NHITS', 20))
if kwargs['nev'] < 1:
f = ROOT.TFile(kwargs['input'])
......@@ -72,8 +75,8 @@ becal_img_merger = MLDataMerger('becal_img_merger',
becal_img_sorter = MLDataSorter('becal_img_sorter',
inputHitCollection=becal_img_merger.outputHitCollection,
outputHitCollection='EcalBarrelImagingHitsML',
numberOfLayers=9,
numberOfHits=50)
numberOfLayers=kwargs['img_nlayers'],
numberOfHits=kwargs['nhits'])
#Central ECAL SciFi
......@@ -83,9 +86,20 @@ becal_scfi_daq = dict(
capacityADC=32768,
pedestalMean=400,
pedestalSigma=10)
# becal_scfi_daq = dict(
# dynamicRangeADC=50.*MeV,
# capacityADC=2**16,
# pedestalMean=100,
# pedestalSigma=0)
becal_scfi_digi = CalHitDigi('becal_scfi_digi',
becal_scfi_birks = CalBirksCorr('becal_scfi_birks',
inputHitCollection='EcalBarrelScFiHits',
outputHitCollection='EcalBarrelScFiHitsCorr',
birksConstant=0.126*mm/MeV)
becal_scfi_digi = CalHitDigi('becal_scfi_digi',
inputHitCollection=becal_scfi_birks.outputHitCollection,
# inputHitCollection='EcalBarrelScFiHits',
outputHitCollection='EcalBarrelScFiHitsDigi',
**becal_scfi_daq)
......@@ -111,8 +125,8 @@ becal_scfi_merger = CalHitsMerger('becal_scfi_merger',
becal_scfi_sorter = MLDataSorter('becal_scfi_sorter',
inputHitCollection=becal_scfi_merger.outputHitCollection,
outputHitCollection='EcalBarrelScFiHitsML',
numberOfLayers=9,
numberOfHits=50)
numberOfLayers=20,
numberOfHits=kwargs['nhits'])
# combine layers
becal_combiner = MLDataCombiner('becal_combiner',
......@@ -123,13 +137,17 @@ becal_combiner = MLDataCombiner('becal_combiner',
rule=kwargs['combine'])
podout.outputCommands = [
# 'keep *',
'drop *',
'keep mcparticles2',
'keep EcalBarrel*ML']
'keep EcalBarrel*ML',
'keep *Corr',
]
ApplicationMgr(
TopAlg=[podin, copier,
becal_img_digi, becal_img_reco, becal_img_merger, becal_img_sorter,
becal_scfi_birks,
becal_scfi_digi, becal_scfi_reco, becal_scfi_merger, becal_scfi_sorter,
becal_combiner,
podout],
......
import ROOT
import os
import gc
import numpy as np
import pandas as pd
import argparse
......@@ -97,8 +98,10 @@ if __name__ == '__main__':
dfm.loc[:, 'pT'] = pT
dfm.loc[:, 'eta'] = eta
# energy unit is in GeV, this put 1.0 = 5 MeV, which is enough for imaging pixels
df.loc[:, 'energy'] = df['energy']*200
# energy unit is in GeV, converting to MeV first
df.loc[:, 'eMeV'] = df['energy']*1000
# scaled by 5 MeV to have most of them in [0, 1]
df.loc[:, 'energy'] = df['eMeV']/5.
# convert eta, phi, edep so they are within [0, 1]
df.loc[:, 'etotal'] = df.groupby('event')['energy'].transform('sum')
df = df[df['etotal'] > 0.]
......@@ -134,6 +137,7 @@ if __name__ == '__main__':
for col in nullcols:
df.loc[:, col] = 0.
gc.collect()
# features and data-shape
featcols = [f.strip() for f in args.features.split(',')]
dshape = [int(i.strip()) for i in args.shape.split(',')]
......@@ -142,7 +146,10 @@ if __name__ == '__main__':
event_ids = df['event'].unique()
data = df.set_index('event')[featcols].clip(0, 1).values.reshape([len(event_ids)] + dshape)
tags = dfm.loc[event_ids, ['pdgID', 'p', 'pT', 'eta', 'phi', 'mass']]
tags.loc[:, 'Emeas'] = event_group['energy'].sum()
# also save Edep per layers
# merge the sandwich layers
df['layer'] = df['layer'] % 100
edeps = df.groupby(['event', 'layer'])['eMeV'].sum().unstack()
""" TEST
from matplotlib import pyplot as plt
......@@ -155,7 +162,9 @@ if __name__ == '__main__':
"""
print('Training data set shape = {}'.format(data.shape))
print('Tagging data set shape = {}'.format(tags.shape))
print('Edep/layer data set shape = {}'.format(edeps.shape))
np.save(os.path.join(args.outdir, 'data.npy'), data)
np.save(os.path.join(args.outdir, 'tag.npy'), tags.values)
np.save(os.path.join(args.outdir, 'edep.npy'), edeps.values)
import ROOT
import os
import gc
import numpy as np
import pandas as pd
import argparse
import sys
import matplotlib.pyplot as plt
# read from RDataFrame and flatten a given collection, return pandas dataframe
def flatten_collection(rdf, collection, cols=None, event_colname='event'):
    """Flatten a vector-valued collection from an RDataFrame into a pandas DataFrame.

    Each per-event vector branch is exploded into one row per element, and an
    integer event id column (``event_colname``) is attached so hits from
    different events can be told apart.

    :param rdf: ROOT RDataFrame (or any object providing ``GetColumnNames``
        and ``AsNumpy``).
    :param collection: branch prefix, e.g. ``'EcalBarrelHits'``.
    :param cols: optional list of sub-branch names; when empty, every branch
        starting with ``'<collection>.'`` is discovered automatically.
    :param event_colname: name of the generated event-id column.
    :return: flattened ``pandas.DataFrame`` (empty when no branch matches).
    """
    prefix = '{}.'.format(collection)
    if cols:
        branches = ['{}{}'.format(prefix, c) for c in cols]
    else:
        branches = [str(c) for c in rdf.GetColumnNames() if str(c).startswith(prefix)]
    if not branches:
        print('cannot find any branch under collection {}'.format(collection))
        return pd.DataFrame()

    data = rdf.AsNumpy(branches)
    # one event id per vector element, so rows from different events stay distinguishable
    event_ids = []
    for ievt, vec in enumerate(data[branches[0]]):
        event_ids.extend([ievt] * vec.size())
    for name, vecs in data.items():
        # keep integer branches as integers instead of letting numpy promote them
        # to floats; the vector's class name (e.g. RVec<int>) encodes the element type
        kind = type(vecs[0]).__name__.lower()
        dtype = np.int64 if ('int' in kind or 'long' in kind) else np.float32
        # type-safe creation of the flattened column
        data[name] = np.asarray([val for vec in vecs for val in vec], dtype=dtype)

    # assemble the data frame and attach the event ids
    flat = pd.DataFrame({c: pd.Series(v) for c, v in data.items()})
    flat.loc[:, event_colname] = event_ids
    return flat
# load root macros, input is an argument string
def load_root_macros(arg_macros):
    """Load ROOT macros given as a comma-separated string of file paths.

    Paths that do not exist on disk are reported and skipped instead of
    being passed to ROOT.
    """
    for path in (p.strip() for p in arg_macros.split(',')):
        if not os.path.exists(path):
            print('\"{}\" does not exist, skip loading it.'.format(path))
            continue
        ROOT.gROOT.Macro(path)
def cartesian_to_polar(x, y, z):
    """Convert Cartesian coordinates to spherical/cylindrical quantities.

    :return: tuple ``(r, theta, phi, rc, eta)`` — spherical radius, polar
        angle, azimuthal angle, cylindrical (transverse) radius, and
        pseudorapidity. Works element-wise on numpy arrays as well as scalars.
    """
    # transverse radius squared is shared by r and rc
    rho2 = x**2 + y**2
    r = np.sqrt(rho2 + z**2)
    rc = np.sqrt(rho2)
    theta = np.arccos(z / r)
    phi = np.arctan2(y, x)
    # pseudorapidity; diverges as theta approaches 0 or pi
    eta = -np.log(np.tan(theta / 2.))
    return r, theta, phi, rc, eta
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Visualize the cluster (layer-wise) from analysis')
    parser.add_argument('file', type=str, help='path to root file')
    parser.add_argument('--sim', action='store_true', help='flag switch between sim and rec data (default rec)')
    parser.add_argument('-o', type=str, default='.', dest='outdir', help='output directory')
    # parser.add_argument('--compact', type=str, default='', dest='compact', help='compact file')
    parser.add_argument('-m', '--macros', type=str, default='rootlogon.C', dest='macros',
                        help='root macros to load (accept multiple paths separated by \",\")')
    parser.add_argument('--topo-range', type=float, default=1000.0, dest='topo_range',
                        help='half range for projection plot (mrad)')
    parser.add_argument('--shape', type=str, default='20, 20', dest='shape',
                        help='shape of data (Nlayer, Nhits), default (20, 20)')
    # FIX: help text used to list the features in a different order than the actual default
    parser.add_argument('--features', type=str, default='energy, eta, phi',
                        help='features of hit data, default (energy, eta, phi)')
    parser.add_argument('--branch', type=str, default='EcalBarrelImagingHitsML', help='name of data branch')
    parser.add_argument('--edep-max', type=float, default=0., help='maximum edep (GeV) to plot')
    parser.add_argument('--edep-nbins', type=int, default=200, help='number of bins')
    parser.add_argument('--null-columns', type=str, default='', help='make the columns zeros')
    parser.add_argument('--name-tag', type=str, default='test', help='name tag to save the file')
    parser.add_argument('--samp', type=float, default=0.103, help='sampling fraction')
    args = parser.parse_args()

    os.makedirs(args.outdir, exist_ok=True)
    load_root_macros(args.macros)

    # read data and mcparticles
    rdf = ROOT.RDataFrame("events", args.file)

    # FIX: was `'mcparticles2' if args.sim else 'mcparticles2'` -- both branches
    # identical, so the ternary was dead code (behavior unchanged here).
    # TODO(review): confirm whether sim files are supposed to use a different branch name.
    mc_branch = 'mcparticles2'
    dfm = flatten_collection(rdf, mc_branch, ['genStatus', 'pdgID', 'ps.x', 'ps.y', 'ps.z', 'mass'])
    dfm.rename(columns={c: c.replace(mc_branch + '.', '') for c in dfm.columns}, inplace=True)

    # select incident particles
    dfm = dfm[dfm['genStatus'].isin([0, 1])]
    # NOTE: assumed single particles
    dfm = dfm.groupby('event').first()
    # p, theta, phi, pT, eta = cartesian_to_polar(*dfm[['ps.x', 'ps.y', 'ps.z']].values.T)

    # split event ids by incident-particle species (pi- / e-)
    pions_events = dfm[dfm.loc[:, 'pdgID'] == -211].index.unique()
    els_events = dfm[dfm.loc[:, 'pdgID'] == 11].index.unique()
    print(len(pions_events), len(els_events))

    # hit-level data: sim files store raw energyDeposit, rec files store reco hits
    if args.sim:
        df = flatten_collection(rdf, args.branch, ['energyDeposit'])
        df.rename(columns={c: c.replace(args.branch + '.', '') for c in df.columns}, inplace=True)
        df.rename(columns={'energyDeposit': 'energy'}, inplace=True)
    else:
        df = flatten_collection(rdf, args.branch, ['layer', 'energy', 'position.x', 'position.y', 'position.z'])
        df.rename(columns={c: c.replace(args.branch + '.', '') for c in df.columns}, inplace=True)

    # total deposited energy per event, split by species
    dfe = df.groupby('event')['energy'].sum().reset_index()
    dfe_pions = dfe.loc[dfe['event'].isin(pions_events)]
    dfe_els = dfe.loc[dfe['event'].isin(els_events)]

    fig, ax = plt.subplots(figsize=(16, 9), dpi=120)
    if args.edep_max <= 0.:
        # auto-range: slightly beyond the 99th percentile of all events
        args.edep_max = dfe['energy'].quantile(0.99)*1.2
    bins = np.linspace(0., args.edep_max, args.edep_nbins + 1)
    # correct for the calorimeter sampling fraction
    scale = 1. / args.samp

    def _unit_weights(nrows):
        # per-entry weight 1/N normalizes each histogram to unit total;
        # FIX: guards against ZeroDivisionError when a species has no events
        # (the old `[1./df.shape[0]]*df.shape[0]` evaluated 1./0 first)
        return [1. / nrows] * nrows if nrows else []

    pion_vals, _, _ = ax.hist(dfe_pions['energy']*scale,
                              weights=_unit_weights(dfe_pions.shape[0]),
                              # density=True,
                              histtype='step', bins=bins, label='pions')
    el_vals, _, _ = ax.hist(dfe_els['energy']*scale,
                            weights=_unit_weights(dfe_els.shape[0]),
                            # density=True,
                            histtype='step', bins=bins, label='electrons')
    bincenters = (bins[1:] + bins[:-1])/2.
    # NOTE(review): outputs go to the current directory, not args.outdir -- confirm intended
    pd.DataFrame(index=bincenters, data=np.vstack([pion_vals, el_vals]).T, columns=['pions', 'electrons'])\
      .to_csv('{}.csv'.format(args.name_tag))

    ax.set_xlabel('Energy Deposit (GeV)', fontsize=24)
    ax.set_ylabel('Normalized Counts', fontsize=24)
    ax.set_yscale('log')
    ax.grid(linestyle=':')
    ax.legend(fontsize=24)
    ax.tick_params(labelsize=24)
    fig.savefig('{}.png'.format(args.name_tag))
......@@ -18,13 +18,14 @@ parser.add_argument('-n', '--numberOfEvents', dest='nev', type=int, default=100,
parser.add_argument('-t', '--nametag', type=str, default='IMCAL_ML', help='Name tag for output files.')
parser.add_argument('--seed', type=int, default=-1, help='Random seed to scripts.')
parser.add_argument('--process', type=str, default='sim, rec, tag', help='Processes to be executed (sim, rec, tag).')
parser.add_argument('--numberOfLayers', dest='nlayer', type=int, default=9, help='number of layers in ML data.')
parser.add_argument('--numberOfHits', dest='nhit', type=int, default=50, help='number of hits in ML data.')
parser.add_argument('--nlayers', dest='nlayers', type=int, default=9, help='number of layers in ML data.')
parser.add_argument('--nhits', dest='nhits', type=int, default=20, help='number of hits in ML data.')
parser.add_argument('--particles', type=str, default='electron', help='Partcile names, separated by \",\".')
parser.add_argument('--pmin', type=float, default=0.5, help='Minimum momentum of particles.')
parser.add_argument('--pmax', type=float, default=10, help='Maximum momentum of particles.')
parser.add_argument('--compact', type=str, default=default_compact, help='Path to detector compact file.')
parser.add_argument('--combine-method', type=str, default='interlayer', help='Path to detector compact file.')
parser.add_argument('--physics-list', type=str, default='FTFP_BERT', help='Path to detector compact file.')
args = parser.parse_args()
kwargs = vars(args)
......@@ -46,7 +47,7 @@ if 'sim' in procs:
gen_cmd = ['python', os.path.join(sdir, 'scripts', 'gen_particles.py'), gen_file,
'-n', '{}'.format(args.nev),
'-s', '{}'.format(args.seed),
'--angmin', '50', '--angmax', '130',
'--angmin', '45', '--angmax', '135',
'--pmin', '{}'.format(args.pmin), '--pmax', '{}'.format(args.pmax),
'--particles', args.particles]
subprocess.run(gen_cmd)
......@@ -55,6 +56,7 @@ if 'sim' in procs:
'--part.minimalKineticEnergy', '1*TeV',
'--numberOfEvents', '{}'.format(args.nev),
'--runType', 'batch',
# '--physics.list', args.physics_list,
'--inputFiles', gen_file,
'--outputFile', sim_file,
'--compact', args.compact,
......@@ -71,20 +73,23 @@ if 'sim' in procs:
if 'rec' in procs:
# export to environment variables (used to pass arguments to the option file)
os.environ['JUGGLER_SIM_FILE'] = sim_file
os.environ['JUGGLER_REC_FILE'] = rec_file
os.environ['JUGGLER_COMPACT_PATH'] = args.compact
os.environ['JUGGLER_N_EVENTS'] = '{}'.format(args.nev)
os.environ['IMCAL_ML_N_LAYERS'] = '{}'.format(args.nlayer)
os.environ['IMCAL_ML_N_HITS'] = '{}'.format(args.nhit)
os.environ['IMCAL_ML_COMBINE'] = '{}'.format(args.combine_method)
run_env = os.environ.copy()
run_env.update({
'JUGGLER_SIM_FILE': sim_file,
'JUGGLER_REC_FILE': rec_file,
'JUGGLER_COMPACT_PATH': args.compact,
'JUGGLER_N_EVENTS': str(args.nev),
'IMCAL_ML_IMG_NLAYERS': str(args.nlayers),
'IMCAL_ML_NHITS': str(args.nhits),
'IMCAL_ML_COMBINE': str(args.combine_method),
})
juggler_xenv = os.path.join(os.environ.get('JUGGLER_INTALL_PREFIX', '../local'), 'Juggler.xenv')
rec_cmd = [
'xenv', '-x', juggler_xenv, # v35+ do not need xenv anymore
'gaudirun.py', os.path.join(sdir, 'options', 'imaging_ml_data.py')
]
return_code = subprocess.run(rec_cmd).returncode
return_code = subprocess.run(rec_cmd, env=run_env).returncode
print(return_code)
if return_code is not None and return_code < 0:
print("ERROR running juggler (reconstruction)!")
......@@ -98,7 +103,7 @@ if 'tag' in procs:
tag_cmd = ['python', os.path.join(sdir, 'scripts', 'prepare_tf_dataset.py'), rec_file,
'-o', tag_dir,
'--branch', 'EcalBarrelImagingHitsML',
'--shape', '{nlayer}, {nhit}'.format(**kwargs),
'--shape', '{nlayers}, {nhits}'.format(**kwargs),
'--features', 'energy, rc, eta, phi',
]
return_code = subprocess.run(tag_cmd).returncode
......@@ -111,8 +116,8 @@ if 'tag' in procs:
tag_cmd = ['python', os.path.join(sdir, 'scripts', 'prepare_tf_dataset.py'), rec_file,
'-o', tag_dir + '_scfi',
'--branch', 'EcalBarrelScFiHitsML',
'--shape', '{nlayer}, {nhit}'.format(**kwargs),
# '--shape', '{}, {}'.format(20 - kwargs['nlayer'], kwargs['nhit']),
'--shape', '{nlayers}, {nhits}'.format(**kwargs),
# '--shape', '{}, {}'.format(20 - kwargs['nlayers'], kwargs['nhits']),
'--features', 'energy, rc, eta, phi',
'--null-columns', 'eta',
]
......@@ -126,7 +131,7 @@ if 'tag' in procs:
tag_cmd = ['python', os.path.join(sdir, 'scripts', 'prepare_tf_dataset.py'), rec_file,
'-o', tag_dir + '_combined',
'--branch', 'EcalBarrelHitsCombinedML',
'--shape', '{}, {}'.format(kwargs['nlayer']*2, kwargs['nhit']),
'--shape', '{}, {}'.format(20 + kwargs['nlayers'], kwargs['nhits']),
'--features', 'layer_type, energy, rc, eta, phi',
]
return_code = subprocess.run(tag_cmd).returncode
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment