Commit e0bb61b1 authored by Chao Peng

add analysis script for ecal benchmarks

parent 5fae2c48
1 merge request: !122 add analysis script for ecal benchmarks
@@ -21,98 +21,77 @@ def load_root_macros(arg_macros):
print('Root macro \'{}\' does not exist, skipping it.'.format(path))
def thrown_particles_figure(df, save, mcbranch="mcparticles"):
# read from RDataFrame and flatten a given collection, return pandas dataframe
def flatten_collection(rdf, collection, cols=None):
if not cols:
cols = [str(c) for c in rdf.GetColumnNames() if str(c).startswith('{}.'.format(collection))]
else:
cols = ['{}.{}'.format(collection, c) for c in cols]
data = rdf.AsNumpy(cols)
# flatten the data, add an event id to identify clusters from different events
evns = []
for i, vec in enumerate(data[cols[0]]):
evns += [i]*vec.size()
for n, vals in data.items():
data[n] = np.asarray([v for vec in vals for v in vec])
# build data frame
dfp = pd.DataFrame(columns=cols, data=np.vstack(list(data.values())).T)
dfp.loc[:, 'event'] = evns
return dfp
def thrown_particles_figure(rdf, save, mcbranch="mcparticles"):
# define truth particle info
df = df.Define('isThrown', '{}.genStatus == 1'.format(mcbranch))\
.Define('thrownParticles', '{}[isThrown]'.format(mcbranch))\
.Define('thrownP', 'fourvec(thrownParticles)')\
.Define('thrownPID', 'pid(thrownParticles)')\
.Define('thrownEta', 'eta(thrownParticles)')\
.Define('thrownTheta', 'theta(thrownP)')\
.Define('thrownMomentum', 'momentum(thrownP)')
df = flatten_collection(rdf, mcbranch, ['genStatus', 'pdgID', 'psx', 'psy', 'psz', 'mass'])
df.rename(columns={c: c.replace(mcbranch + '.', '') for c in df.columns}, inplace=True)
# select thrown particles
df = df[df['genStatus'] == 1]
# figure
fig, axs = plt.subplots(2, 2, figsize=(16, 12), dpi=120)
# pid info needs some special treatment
pids = df.AsNumpy(['thrownPID'])['thrownPID']
# unpack vectors
pids = np.asarray([v for vec in pids for v in vec], dtype=int)
# build a dictionary for particle id and particle name
# convert pid to particle names
pdgbase = ROOT.TDatabasePDG()
pdict = {pid: pdgbase.GetParticle(int(pid)).GetName() for pid in np.unique(pids)}
pmap = {pid: i*2 for i, pid in enumerate(list(pdict))}
# convert pid to index in dictionary
pmaps = [pmap[i] for i in pids]
get_pname = np.vectorize(lambda pid: pdgbase.GetParticle(int(pid)).GetName())
# enumerate particle names
df.loc[:, 'pname'] = get_pname(df['pdgID'].values)
penum = {pname: i for i, pname in enumerate(df['pname'].unique())}
df.loc[:, 'pname_id'] = df['pname'].map(penum)
ax = axs.flat[0]
ax.hist(pmaps, bins=np.arange(-4, len(pmap)*2 + 4), align='left', ec='k')
ax.hist(df['pname_id'].values, bins=np.arange(-4, len(penum)*2 + 4), align='left', ec='k')
ax.set_ylabel('Particle Counts', fontsize=24)
ax.tick_params(labelsize=22)
ax.set_axisbelow(True)
ax.grid(linestyle=':', which='both', axis='y')
ax.xaxis.set_major_locator(ticker.FixedLocator(list(pmap.values())))
ax.set_xticklabels(list(pdict.values()))
# kinematics
data = df.AsNumpy(['thrownMomentum', 'thrownTheta', 'thrownEta'])
labels = {
'thrownMomentum': (r'$p$ (GeV)', 'Counts'),
'thrownTheta': (r'$\theta$ (degree)', 'Counts'),
'thrownEta': (r'$\eta$', 'Counts'),
}
for ax, (name, vals) in zip(axs.flat[1:], data.items()):
hvals = [v for vec in vals for v in vec]
ax.hist(hvals, bins=50, ec='k')
ax.tick_params(labelsize=22, direction='in', which='both')
ax.grid(linestyle=':', which='both')
label = labels[name]
ax.set_axisbelow(True)
ax.set_xlabel(label[0], fontsize=24)
ax.set_ylabel(label[1], fontsize=24)
fig.text(0.5, 0.95, 'Thrown Particles', ha='center', fontsize=24)
fig.savefig(save)
plt.close(fig)
def general_clusters_figure(df, collection, save, min_nhits=3):
data = df.AsNumpy([
'{}.nhits'.format(collection),
'{}.energy'.format(collection),
'{}.polar.theta'.format(collection),
'{}.polar.phi'.format(collection),
])
# build a dataframe
evns = []
for i, vec in enumerate(data['{}.nhits'.format(collection)]):
evns += [i]*vec.size()
for n, vals in data.items():
data[n] = np.asarray([v for vec in vals for v in vec])
dfp = pd.DataFrame(columns=['nhits', 'edep', 'theta', 'phi'],
data=np.vstack(list(data.values())).T)
dfp.loc[:, 'evn'] = evns
# select the max. energy cluster for each event
dfc = dfp.loc[dfp.groupby('evn')['edep'].idxmax()]
dfc = dfc.loc[dfc['nhits'] >= min_nhits]
# figure
fig, axs = plt.subplots(2, 2, figsize=(16, 12), dpi=120)
labels = [
('Number of Hits', 'Counts'),
(r'$E$ (GeV)', 'Counts'),
(r'$\theta$ (rad)', 'Counts'),
(r'$\phi$ (rad)', 'Counts'),
ax.xaxis.set_major_locator(ticker.FixedLocator(list(penum.values())))
ax.set_xticklabels(list(penum), rotation=30)
# calculate kinematics
get_4vecs = np.vectorize(lambda x, y, z, m: ROOT.Math.PxPyPzMVector(x, y, z, m))
fourvecs = get_4vecs(*df[['psx', 'psy', 'psz', 'mass']].values.T)
df.loc[:, 'p'] = [v.P() for v in fourvecs]
df.loc[:, 'eta'] = [v.Eta() for v in fourvecs]
df.loc[:, 'pt'] = [v.Pt() for v in fourvecs]
# column name, x label, y label, bins
plots = [
('p', r'$P$ (GeV)', 'Counts', 50),
('eta', r'$\eta$', 'Counts', 50),
('pt', r'$P_T$ (GeV)', 'Counts', 50),
]
for ax, label, vals in zip(axs.flat, labels, dfc[['nhits', 'edep', 'theta', 'phi']].values.T):
ax.hist(vals, bins=50, ec='k')
for ax, (col, xlabel, ylabel, bins) in zip(axs.flat[1:], plots):
ax.hist(df[col].values, bins=bins, align='left', ec='k')
ax.tick_params(labelsize=22, direction='in', which='both')
ax.grid(linestyle=':', which='both')
ax.set_axisbelow(True)
ax.set_xlabel(label[0], fontsize=24)
ax.set_ylabel(label[1], fontsize=24)
ax.set_xlabel(xlabel, fontsize=24)
ax.set_ylabel(ylabel, fontsize=24)
fig.text(0.5, 0.95, collection, ha='center', fontsize=24)
fig.text(0.5, 0.95, 'Thrown Particles', ha='center', fontsize=24)
fig.savefig(save)
plt.close(fig)
return dfp
if __name__ == '__main__':
@@ -132,30 +111,67 @@ if __name__ == '__main__':
print('CPU available {}, using {}'.format(os.cpu_count(), nthreads))
# declare some functions from c++ script, needed for data frame processing
script_dir = os.path.dirname(os.path.realpath(__file__))
fcode = open(os.path.join(script_dir, 'barrel_clusters.cxx')).read()
ROOT.gInterpreter.Declare(fcode)
# script_dir = os.path.dirname(os.path.realpath(__file__))
# fcode = open(os.path.join(script_dir, 'barrel_clusters.cxx')).read()
# ROOT.gInterpreter.Declare(fcode)
os.makedirs(args.outdir, exist_ok=True)
# load macros (add libraries/headers here that ROOT cannot automatically find)
load_root_macros(args.macros)
rdf_sim = ROOT.RDataFrame('events', args.sim_file)
rdf_rec = ROOT.RDataFrame('events', args.rec_file)
thrown_particles_figure(rdf_sim, save=os.path.join(args.outdir, 'thrown_particles.png'), mcbranch=args.mc)
general_clusters_figure(rdf_rec, collection='EcalEndcapNClusters', min_nhits=10,
save=os.path.join(args.outdir, 'ecal_electron_endcap_clusters.png'))
general_clusters_figure(rdf_rec, collection='EcalEndcapPClusters', min_nhits=5,
save=os.path.join(args.outdir, 'ecal_hadron_endcap_clusters.png'))
general_clusters_figure(rdf_rec, collection='EcalBarrelClusters',
save=os.path.join(args.outdir, 'ecal_barrel_clusters.png'))
general_clusters_figure(rdf_rec, collection='HcalElectronEndcapClusters', min_nhits=5,
save=os.path.join(args.outdir, 'hcal_electron_endcap_clusters.png'))
general_clusters_figure(rdf_rec, collection='HcalHadronEndcapClusters', min_nhits=10,
save=os.path.join(args.outdir, 'hcal_hadron_endcap_clusters.png'))
general_clusters_figure(rdf_rec, collection='HcalBarrelClusters',
save=os.path.join(args.outdir, 'hcal_barrel_clusters.png'))
rdf_rec = ROOT.RDataFrame('events', args.rec_file)
# cluster collections
colls = [
'EcalEndcapNClusters',
'EcalEndcapPClusters',
'EcalBarrelClusters',
'HcalElectronEndcapClusters',
'HcalHadronEndcapClusters',
'HcalBarrelClusters',
]
# a function to extract eta
ROOT.gInterpreter.Declare('''
template<typename T>
ROOT::VecOps::RVec<T> eta(const ROOT::VecOps::RVec<T> &theta) {
ROOT::VecOps::RVec<double> result;
for (size_t i = 0; i < theta.size(); ++ i) { result.push_back(-std::log(std::tan(theta[i]/2.))); }
return result;
}
''')
# branch name, x label, x bins
plots = [
('nhits', 'Number of Hits', 50),
('energy', 'Energy (GeV)', 50),
('eta', r'$\eta$', 50),
]
for coll in colls:
df = flatten_collection(rdf_rec, coll)
if not df.shape[0]:
print('{} does not have valid entries, skipping it'.format(coll))
continue
df.rename(columns={c: c.replace(coll + '.', '') for c in df.columns}, inplace=True)
# calculate eta
if 'eta' not in df.columns:
df.loc[:, 'eta'] = -np.log(np.tan(df['polar.theta'].values/2.))
fig, axs = plt.subplots(2, 2, figsize=(12, 8), dpi=160)
ncl = df.groupby('event')['clusterID'].nunique().values
axs[0][0].hist(ncl, weights=np.repeat(1./float(ncl.shape[0]), ncl.shape[0]),
bins=np.arange(0, 10), align='mid', ec='k')
axs[0][0].set_xlabel('Number of Clusters', fontsize=16)
for ax, (branch, xlabel, bins) in zip(axs.flat[1:], plots):
ax.hist(df[branch].values, bins=bins, align='mid', ec='k')
ax.set_xlabel(xlabel, fontsize=16)
# universal axis settings
for ax in axs.flat:
ax.tick_params(labelsize=16, direction='in')
ax.set_axisbelow(True)
ax.grid(linestyle=':')
fig.text(0.05, 0.5, 'Normalized Counts', rotation=90, va='center', fontsize=18)
fig.savefig(os.path.join(args.outdir, '{}.png'.format(coll)))
@@ -11,12 +11,24 @@ import subprocess
import argparse
# type: option files, analysis files
# type: option files, analysis files, cluster collections
default_type = {
'endcap_e': [['endcap_e.py'], []],
'endcap_i': [['endcap_i.py'], []],
'barrel': [['barrel.py'], []],
# 'all': [['all_ecal.py'], []],
'endcap_e': [
['endcap_e.py'],
['draw_cluters.py'],
['EcalEndcapNClusters']],
'endcap_i': [
['endcap_i.py'],
['draw_cluters.py'],
['EcalEndcapPClusters']],
'barrel': [
['barrel.py'],
['draw_cluters.py'],
['EcalBarrelClusters']],
# 'all': [
# ['all_ecal.py'],
# ['draw_cluters.py'],
# ['EcalEndcapNClusters', 'EcalEndcapPClusters', 'EcalBarrelClusters']],
}
default_compact = os.path.join(os.environ.get('JUGGLER_DETECTOR_PATH', os.environ.get('DETECTOR_PATH', '')),
@@ -43,7 +55,7 @@ kwargs = vars(args)
if args.run_type not in default_type:
print('ERROR unsupported run type {}, must be in {}'.format(args.run_type, list(default_type.keys())))
exit(1)
opt_scripts, ana_scripts = default_type[args.run_type]
opt_scripts, ana_scripts, collections = default_type[args.run_type]
gen_file = 'gen_{nametag}_{pmin}_{pmax}.hepmc'.format(**kwargs)
sim_file = 'sim_{nametag}_{pmin}_{pmax}.root'.format(**kwargs)
@@ -72,7 +84,6 @@ if 'sim' in procs:
'-v', 'WARNING']
if args.seed > 0:
sim_cmd += ['--random.seed', args.seed]
print(sim_cmd)
return_code = subprocess.run(sim_cmd).returncode
if return_code is not None and return_code < 0:
print("ERROR running simulation!")
@@ -101,7 +112,9 @@ if 'rec' in procs:
if 'ana' in procs:
os.makedirs('results', exist_ok=True)
for ana in ana_scripts:
ana_cmd = ['python', os.path.join(sdir, 'scripts', ana), rec_file, '-o', 'results']
ana_cmd = ['python', os.path.join(sdir, 'scripts', ana), rec_file,
'--collections', ','.join(collections),
'-o', 'results']
return_code = subprocess.run(ana_cmd).returncode
if return_code is not None and return_code < 0:
print('ERROR running analysis ({})!'.format(ana))
......
'''
A simple analysis script to extract some basic info of clusters
Chao Peng (ANL), 07/08/2021
'''
import os
import ROOT
import pandas as pd
import numpy as np
import argparse
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
# load root macros, input is an argument string
def load_root_macros(arg_macros):
for path in arg_macros.split(','):
path = path.strip()
if os.path.exists(path):
ROOT.gROOT.Macro(path)
print('Root macro \'{}\' loaded.'.format(path))
else:
print('Root macro \'{}\' does not exist, skipping it.'.format(path))
# read from RDataFrame and flatten a given collection, return pandas dataframe
def flatten_collection(rdf, collection, cols=None):
if not cols:
cols = [str(c) for c in rdf.GetColumnNames() if str(c).startswith('{}.'.format(collection))]
else:
cols = ['{}.{}'.format(collection, c) for c in cols]
data = rdf.AsNumpy(cols)
# flatten the data, add an event id to identify clusters from different events
evns = []
for i, vec in enumerate(data[cols[0]]):
evns += [i]*vec.size()
for n, vals in data.items():
data[n] = np.asarray([v for vec in vals for v in vec])
# build data frame
dfp = pd.DataFrame(columns=cols, data=np.vstack(list(data.values())).T)
dfp.loc[:, 'event'] = evns
return dfp
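# Illustrative usage sketch (not executed here; 'rec.root' is a placeholder
# reconstruction file containing an 'EcalBarrelClusters' branch):
#   rdf = ROOT.RDataFrame('events', 'rec.root')
#   dfp = flatten_collection(rdf, 'EcalBarrelClusters', ['nhits', 'energy'])
#   # dfp has one row per cluster, columns 'EcalBarrelClusters.nhits' and
#   # 'EcalBarrelClusters.energy', plus an 'event' column identifying the event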
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('rec_file', help='Path to reconstruction output file.')
parser.add_argument('-o', dest='outdir', default='.', help='Output directory.')
parser.add_argument('-m', '--macros', dest='macros', default='rootlogon.C',
help='Macro files to be loaded by root, separated by \",\".')
parser.add_argument('--collections', dest='coll', default='',
help='Collection names for clusters, separated by \",\"')
args = parser.parse_args()
# multi-threading for RDataFrame
nthreads = os.cpu_count()//2
ROOT.ROOT.EnableImplicitMT(nthreads)
print('CPU available {}, using {}'.format(os.cpu_count(), nthreads))
os.makedirs(args.outdir, exist_ok=True)
# load macros (add libraries/headers here that ROOT cannot automatically find)
load_root_macros(args.macros)
rdf_rec = ROOT.RDataFrame('events', args.rec_file)
if not args.coll:
print('No collection names provided, no plots generated.')
exit(-1)
# a function to extract eta
ROOT.gInterpreter.Declare('''
template<typename T>
ROOT::VecOps::RVec<T> eta(const ROOT::VecOps::RVec<T> &theta) {
ROOT::VecOps::RVec<double> result;
for (size_t i = 0; i < theta.size(); ++ i) { result.push_back(-std::log(std::tan(theta[i]/2.))); }
return result;
}
''')
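# The helper above computes pseudorapidity, eta = -log(tan(theta/2)).
# It could also be used in a JIT-ed Define (illustrative; assumes the collection
# exposes a 'polar.theta' column), e.g.
#   rdf_rec.Define('cl_eta', 'eta(EcalBarrelClusters.polar.theta)')
# The loop below instead evaluates the same formula with numpy on the
# flattened dataframe.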
# branch name, x label, x bins
plots = [
('nhits', 'Number of Hits', 50),
('energy', 'Energy (GeV)', 50),
('eta', r'$\eta$', 50),
]
for coll in [c.strip() for c in args.coll.split(',')]:
df = flatten_collection(rdf_rec, coll)
# calculate eta
df.loc[:, '{}.eta'.format(coll)] = -np.log(np.tan(df['{}.polar.theta'.format(coll)].values/2.))
fig, axs = plt.subplots(2, 2, figsize=(12, 8), dpi=160)
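# cluster multiplicity: count distinct cluster IDs per event and weight each
# entry by 1/N_events so the histogram shows the per-event fraction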
ncl = df.groupby('event')['{}.clusterID'.format(coll)].nunique().values
axs[0][0].hist(ncl, weights=np.repeat(1./float(ncl.shape[0]), ncl.shape[0]),
bins=np.arange(0, 10), align='mid', ec='k')
axs[0][0].set_xlabel('Number of Clusters', fontsize=16)
for ax, (branch, xlabel, bins) in zip(axs.flat[1:], plots):
ax.hist(df['{}.{}'.format(coll, branch)].values, bins=bins, align='mid', ec='k')
ax.set_xlabel(xlabel, fontsize=16)
# universal axis settings
for ax in axs.flat:
ax.tick_params(labelsize=16, direction='in')
ax.set_axisbelow(True)
ax.grid(linestyle=':')
fig.text(0.05, 0.5, 'Normalized Counts', rotation=90, va='center', fontsize=18)
fig.savefig(os.path.join(args.outdir, '{}.png'.format(coll)))
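# Example invocation (illustrative; the input file name is a placeholder),
# matching how the benchmark runner above calls this script:
#   python draw_cluters.py rec_file.root --collections EcalBarrelClusters -o results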