
Improve ML training script

Merged Chao Peng requested to merge update_imcal_ml_benchmarks into master
Files: 5
'''
"""
A script to scan for the optimal cut on layer and E/p.
It scans all ScFi layers (20 in the EPIC brycecanyon configuration).
The result is the best cut (highest pion rejection) on [layer, E/p] at a targeted electron efficiency.
Chao Peng (ANL)
2022/11/13
'''
"""
import os
import gc
import sys
@@ -18,7 +18,7 @@ import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf as mpdf
from matplotlib.ticker import MultipleLocator
from collections import OrderedDict
from utils import flatten_collection, imcal_info
from utils import flatten_collection, imcal_info, par_table, NpEncoder
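# The newly imported NpEncoder is used at the bottom of the script for
# json.dumps(..., cls=NpEncoder); a typical encoder of this kind (the exact one in
# utils may differ) converts numpy types to plain Python types:
#   class NpEncoder(json.JSONEncoder):
#       def default(self, obj):
#           if isinstance(obj, np.integer):
#               return int(obj)
#           if isinstance(obj, np.floating):
#               return float(obj)
#           if isinstance(obj, np.ndarray):
#               return obj.tolist()
#           return super().default(obj)
# par_table is expected to map PDG codes to entries with at least the attributes
# name, full and latex (see the fallback below that fills in missing codes).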
# default color cycle of matplotlib
@@ -74,10 +74,16 @@ if __name__ == '__main__':
help='relative resolution of tracking (used to smear truth momentum).'
)
parser.add_argument(
'-e', '--efficiency', type=float,
dest='eff',
'--optimize-for', type=int,
dest='opt_pid',
default=11,
help='optimize efficiency for the specified particle (by PDG code).'
)
parser.add_argument(
'--optimize-efficiency', type=float,
dest='opt_eff',
default=0.97,
help='targeted efficiency from the cuts.'
help='targeted efficiency for the cuts.'
)
parser.add_argument(
'--sampling-fractions', type=str,
@@ -97,18 +103,18 @@ if __name__ == '__main__':
ntotal = rdf.Count().GetValue()
ev_bins = np.append(np.arange(0, ntotal, args.batch_size), [ntotal])
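# e.g. ntotal = 1000 with args.batch_size = 300 gives
#   ev_bins = [0, 300, 600, 900, 1000]
# so zip(ev_bins[:-1], ev_bins[1:]) walks the batches (0, 300), ..., (900, 1000)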
# data container
nlayers = int(imcal_info.nlayers_scfi)
# sampling fraction bins
samps = args.sampling_fractions.split(',')
samp_range = (float(samps[0]), float(samps[1]))
samp_nbins = int(samps[2])
# print(nlayers)
ep_bins = np.linspace(*samp_range, samp_nbins + 1)
ep_centers = (ep_bins[:-1] + ep_bins[1:])/2.
el_hist = np.zeros(shape=(nlayers, samp_nbins))
pi_hist = np.zeros(shape=(nlayers, samp_nbins))
# data container
nlayers = int(imcal_info.nlayers_scfi)
hists = np.zeros(shape=(0, nlayers, samp_nbins)) # place holder
# process events
npars = pd.Series(dtype=int)
# avoid an excessive amount of memory use with a large dataset
for ev_begin, ev_end in zip(ev_bins[:-1], ev_bins[1:]):
gc.collect()
@@ -129,8 +135,15 @@ if __name__ == '__main__':
dfm.loc[:, 'P'] = np.sqrt(dfm['momentum.x']**2 + dfm['momentum.y']**2 + dfm['momentum.z']**2)
# Smear with tracking resolution (0.5%)
dfm.loc[:, 'Ptrack'] = dfm['P']*np.random.normal(1.0, args.res, len(dfm))
# print(dfm[['PDG', 'P', 'Ptrack']])
# print(dfm['PDG'].value_counts())
# accumulate the number of particles of each PDG type
npars = npars.add(dfm['PDG'].value_counts(), fill_value=0)
sorted_pids = [p for p in par_table.keys() if p in npars.index] \
+ [p for p in npars.index if p not in par_table.keys()]
npars = npars[sorted_pids]
# enlarge data container
if len(npars) > hists.shape[0]:
hists = np.vstack([hists, np.zeros(shape=(len(npars) - hists.shape[0], *hists.shape[1:]))])
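# Illustrative behaviour of the two steps above (toy values, not from this patch):
#   npars = pd.Series(dtype=int)
#   npars = npars.add(pd.Series({11: 40, -211: 60}), fill_value=0)  # batch 1
#   npars = npars.add(pd.Series({11: 35, 2212: 5}), fill_value=0)   # batch 2
#   # running counts: {11: 75, -211: 60, 2212: 5}; hists grows from shape
#   # (2, nlayers, samp_nbins) to (3, nlayers, samp_nbins) when the third code appears
# NOTE: the new zero rows are appended at the end of hists, so the row order matches
# npars.index only if newly seen PDG codes sort after the ones already counted.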
# reconstructed simulation hits
df = flatten_collection(rdf, args.branch, (int(ev_begin), int(ev_end)), cols=[
@@ -147,123 +160,119 @@ if __name__ == '__main__':
dfe = dfe.reset_index().merge(dfm[['event', 'Ptrack', 'PDG']], on='event')
dfe.loc[:, 'eoverp'] = dfe['energy']/dfe['Ptrack']
# NOTE: assumed only e- and pi-
el_mask = dfe['PDG'] == 11
dfel = dfe[el_mask]
dfpi = dfe[~el_mask]
# fill data layer by layer
for hist, dftmp in zip([el_hist, pi_hist], [dfel, dfpi]):
for pid, hist in zip(npars.index, hists):
mask = dfe['PDG'] == pid
dft = dfe[mask]
for i in np.arange(nlayers):
vals = dftmp[dftmp['layer'] == i + 1]['eoverp'].values
vals = dft[dft['layer'] == i + 1]['eoverp'].values
hvals, _ = np.histogram(vals, bins=ep_bins)
hist[i] += hvals
hist[i] = hist[i] + hvals
# fill dictionary if not pre-defined
for pid in npars.index:
if pid not in par_table.keys():
print('WARNING - ML training: particle ({}) not found in par_table, updating it.'.format(pid))
par_table[pid] = dotdict(
name='PDG_{:d}'.format(pid),
full='PDG-{:d}'.format(pid),
latex='PDG_{:d}'.format(pid)
)
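# dotdict is not among the names imported in this hunk; it is assumed to be
# available from utils (or imported elsewhere in the file). A minimal version of
# that common pattern:
#   class dotdict(dict):
#       __getattr__ = dict.get
#       __setattr__ = dict.__setitem__
#       __delattr__ = dict.__delitem__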
# save binned data
for hist, label in zip([el_hist, pi_hist], ['el', 'pi']):
dfr = pd.DataFrame(
dfr = pd.DataFrame()
for pid, hist in zip(npars.index, hists):
dft = pd.DataFrame(
columns=['bin_left', 'bin_right'] + ['layer_{:d}'.format(i+1) for i in np.arange(nlayers)],
data=np.vstack([ep_bins[:-1], ep_bins[1:], hist]).T
)
dfr.to_csv(os.path.join(args.outdir, '{}_eop_{}.csv'.format(args.ntag, label)), index=False)
# print(label, np.sum(hist, axis=1))
# print(dfr)
dft.loc[:, 'PDG'] = pid
dfr = pd.concat([dfr, dft], ignore_index=True)
dfr.to_csv(os.path.join(args.outdir, '{}_binned_data.csv'.format(args.ntag)), index=False)
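# Resulting long-format CSV, one row per E/p bin and PDG code (placeholder values):
#   bin_left,bin_right,layer_1,...,layer_20,PDG
#   0.00,0.01,12.0,...,3.0,11
#   0.01,0.02,8.0,...,5.0,11
#   ...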
# study E/p cut
opt_par = par_table.get(args.opt_pid)
if args.opt_pid not in npars.index:
print('Error - E/p cut: cannot find the targeted particle {} in the samples. Stopping the E/p cut scan.'.format(args.opt_pid))
exit(-1)
# NOTE assumed e-pi only
nel = np.sum(el_hist, axis=1)[0]
npi = ntotal - nel
# find the hist for targeted particle
opt_idx = {p: i for i, p in enumerate(npars.index)}[args.opt_pid]
opt_hist = hists[opt_idx]
# prepare output container
best = OrderedDict(
layer=int(nlayers + 1),
ep_cut=0.,
el_eff=0.,
pi_rej=0.,
)
ep_dict = OrderedDict(
info= OrderedDict(
nsamples=OrderedDict(
total=int(ntotal),
electron=nel,
pion=npi,
),
targeted_efficiency=args.eff,
tracking_resolution=args.res,
),
best=best,
)
# scan layers
pdf = mpdf.PdfPages(os.path.join(args.outdir, '{}_layers.pdf'.format(args.ntag)))
# distribution plot by layer
box_props = dict(boxstyle='round', facecolor='white', alpha=0.5)
cut_data = []
for i in np.arange(nlayers):
elvals, pivals = el_hist[i], pi_hist[i]
# cut position
# NOTE: larger error with wider bins
perc = np.cumsum(elvals)/np.sum(elvals)
idx = len(perc[perc < 1. - args.eff])
perc = np.cumsum(opt_hist[i])/np.sum(opt_hist[i])
idx = len(perc[perc < 1. - args.opt_eff])
ep_cut = ep_centers[idx]
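# The cut can only fall on a bin center, so with e.g. --sampling-fractions "0,1.0,100"
# it moves in steps of 0.01 and the achieved efficiency can overshoot the target by
# up to the content of one bin (hence the NOTE above about wider bins).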
# efficiency
eff = np.sum(elvals[idx:])/np.sum(elvals)
# rejection power
rej = np.sum(pivals[:idx])/np.sum(pivals)
ep_dict['layer_{:d}'.format(i + 1)] = OrderedDict(
ep_cut=ep_cut,
el_eff=eff,
pi_rej=rej,
nsamples_after_cut=OrderedDict(
total=np.sum(elvals[idx:] + pivals[idx:]) ,
electron=np.sum(elvals[idx:]),
pion=np.sum(pivals[idx:]),
)
)
# greater or [equal] here because always favor the cut on more layers
if rej >= best['pi_rej']:
best.update({
'layer': int(i + 1),
'ep_cut': ep_cut,
'el_eff': eff,
'pi_rej': rej,
})
# plot showing the results
fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(ep_centers, weights=pivals, bins=50, label='$\pi^-$', color=colors[0], ec=colors[0], alpha=0.5)
ax.hist(ep_centers, weights=elvals, bins=50, label='$e^-$', color=colors[1], ec=colors[1], alpha=0.5)
texts = [r'Layer $\leq${:d}'.format(i + 1)]
# cut results
cut_result = OrderedDict(ep_cut=ep_cut)
for k, (pid, hist) in enumerate(zip(npars.index, hists)):
vals = hist[i]
npar = npars[pid]
ncut = np.sum(vals[idx:])
eff = ncut/float(npar)
# uncertainties: binomial variance npq
nerr = np.sqrt(npar*eff*(1. - eff))
# d(x/N) = dx/N
eff_err = nerr/float(npar)
par = par_table.get(pid)
cut_result[par.name] = OrderedDict(
efficiency=eff,
error=eff_err,
# nsamples=npar, # this is redundant
nsamples_after_cut=ncut,
)
ax.hist(ep_centers, weights=vals, bins=50, label='${}$'.format(par.latex),
color=colors[k], ec=colors[k], alpha=0.5)
texts.append(r'$\epsilon_{{{}}}={:.2f}$%'.format(par.latex, eff*100.))
cut_data.append(cut_result)
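# Binomial error check with toy numbers: npar = 10000 and eff = 0.97 give
#   eff_err = sqrt(0.97*0.03/10000) ~ 0.0017,
# i.e. the ~0.17% absolute uncertainty that nerr/npar evaluates to above.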
ax.legend(fontsize=20, ncol=2, loc='upper center', bbox_to_anchor=(0.5, 1.12),)
ax.tick_params(labelsize=20)
ax.set_xlabel('$E/p$', fontsize=20)
ax.set_ylabel('Counts', fontsize=20)
ax.axvline(x=ep_cut, color='k', ls='--', lw=2)
ax.text(0.5, 0.97,
'Layer $\leq${:d}\n$\epsilon_e={:.2f}$%\n$R_{{\pi}}={:.2f}$%'.format(i + 1, eff*100., rej*100.),
ax.text(0.5, 0.97, '\n'.join(texts),
transform=ax.transAxes, fontsize=20, va='top', ha='center', bbox=box_props)
pdf.savefig(fig)
plt.close(fig)
# a plot for the cut scan
cuts = [ep_dict.get('layer_{:d}'.format(i + 1)) for i in np.arange(nlayers)]
cuts_pos = np.array([c.get('ep_cut') for c in cuts])
cuts_rej = np.array([c.get('pi_rej') for c in cuts])
# estimated uncertainty (binomial)
nerr = np.sqrt(cuts_rej*(1. - cuts_rej)*npi) # npq
# leftover pions
nres = npi * (1. - cuts_rej)
nres_lo = np.clip(nres - nerr, 1, npi)
nres_hi = np.clip(nres + nerr, 1, npi)
# rejection power
rej_pow = npi/nres
rej_err = ((rej_pow - npi/nres_hi), (npi/nres_lo - rej_pow))
# 2d-scan plot
fig, ax1 = plt.subplots(figsize=(8, 8))
ax2 = ax1.twinx()
ax2.set_yscale('log')
ax1.plot(np.arange(nlayers) + 1, cuts_pos, ls='-', color=colors[0])
ax2.errorbar(np.arange(nlayers) + 1, rej_pow, yerr=rej_err,
fmt='o', capsize=3, color=colors[1])
# cut position
cut_pos = [c.get('ep_cut') for c in cut_data]
ax1.plot(np.arange(nlayers) + 1, cut_pos, ls='-', color=colors[0])
# rejection power for other particles
npars2 = npars[npars.index != args.opt_pid]
# use alpha to identify different particles
alphas = 1.0 - np.linspace(0, 0.7, len(npars2))
rej_tot = np.zeros(shape=(nlayers,))
for pid, alpha in zip(npars2.index, alphas):
par = par_table.get(pid)
pdata = [c.get(par.name) for c in cut_data]
eff = np.array([c.get('efficiency') for c in pdata])
eff_err = np.array([c.get('error') for c in pdata])
rej_pow = np.reciprocal(eff)
# d(1/x) = -dx/x^2
rej_err = rej_pow**2*eff_err
rej_tot += rej_pow * npars2[pid]/float(npars2.sum())
# NOTE: low and high errors are switched when calculating rejection
ax2.errorbar(np.arange(nlayers) + 1, rej_pow, yerr=rej_err,
fmt='o', capsize=3, color=colors[1], alpha=alpha, label='${}$'.format(par.latex))
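# Propagation check with toy numbers: a pion efficiency of eff = 0.02 +- 0.002 gives
#   rej_pow = 1/0.02 = 50 and rej_err = 0.002/0.02**2 = 5,
# i.e. a rejection power of 50 +- 5; rej_tot accumulates the count-weighted average
# of these per-species rejection powers and is used below to pick the best layer.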
ax1.set_xlabel('Layer Number', fontsize=20)
ax1.set_ylabel('Cut Position (E/p)', color=colors[0], fontsize=20)
@@ -271,21 +280,43 @@ if __name__ == '__main__':
ax2.grid(axis='both', which='both', ls=':')
ax2.xaxis.set_major_locator(MultipleLocator(5))
ax2.xaxis.set_minor_locator(MultipleLocator(1))
ax2.set_ylabel('$\pi^-$ Rejection Power', color=colors[1], fontsize=20)
ax2.set_ylabel('Rejection Power', color=colors[1], fontsize=20)
# ax2.legend(fontsize=20, ncol=4, loc='upper center', bbox_to_anchor=(0.5, 1.12),)
ax1.set_title('2D Scan of E/p Cut', fontsize=22)
ax1.tick_params(labelsize=20)
ax2.tick_params(labelsize=20)
ax1.set_title('2D Scan of E/p Cut', fontsize=22)
ax1.text(0.5, 0.03, '$\epsilon_e \geq$ {:.2f}%'.format(args.eff*100.),
ax1.text(0.5, 0.03, r'$\epsilon_{{{}}}\geq$ {:.2f}%'.format(opt_par.latex, args.opt_eff*100.),
transform=ax1.transAxes, fontsize=20, va='bottom', ha='center', bbox=box_props)
fig.subplots_adjust(left=0.15, right=0.85)
pdf.savefig(fig)
plt.close(fig)
pdf.close()
# save cut position and performance
ep_dict.update({'best': best})
ep_json = json.dumps(ep_dict, indent=4)
# save cut position and benchmarks
nsamples = OrderedDict(total=npars.sum())
for pid, count in npars.items():
par = par_table.get(pid)
nsamples[par.name] = count
ep_dict = OrderedDict(
info=OrderedDict(
nsamples=nsamples,
optimize_for=opt_par.name,
optimize_efficiency=args.opt_eff,
tracking_resolution=args.res,
),
best=OrderedDict(layer=0),
)
for i, data in enumerate(cut_data):
ep_dict['layer_{:d}'.format(i + 1)] = data
# best layer
idx = np.argmax(rej_tot)
ep_dict['best'].update({'layer': idx + 1, 'rejection_power': rej_tot[idx]})
ep_dict['best'].update(cut_data[idx])
# save to json
ep_json = json.dumps(ep_dict, indent=4, cls=NpEncoder)
with open(os.path.join(args.outdir, '{}_result.json'.format(args.ntag)), 'w') as outfile:
outfile.write(ep_json)
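# Rough shape of the resulting {ntag}_result.json (placeholder values; the species
# keys come from par_table[pid].name, so the exact names depend on utils):
#   {
#     "info": {
#       "nsamples": {"total": 100000, "electron": 50000, "pion": 50000},
#       "optimize_for": "electron",
#       "optimize_efficiency": 0.97,
#       "tracking_resolution": 0.005
#     },
#     "best": {
#       "layer": 6, "rejection_power": 125.0, "ep_cut": 0.52,
#       "electron": {"efficiency": 0.97, "error": 0.0008, "nsamples_after_cut": 48500},
#       "pion": {"efficiency": 0.008, "error": 0.0004, "nsamples_after_cut": 400}
#     },
#     "layer_1": {"ep_cut": ..., "electron": {...}, "pion": {...}},
#     ...
#     "layer_20": {...}
#   }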