
Improve ML training script

Merged Chao Peng requested to merge update_imcal_ml_benchmarks into master
Files: 5
'''
"""
A script to scan for the optimal cut on layer and E/p.
It scans all ScFi layers (20 in the EPIC brycecanyon configuration).
The result is the best cut (highest pion rejection) on [layer, E/p] at a targeted electron efficiency.
Chao Peng (ANL)
2022/11/13
'''
"""
import os
import gc
import sys
@@ -18,7 +18,7 @@ import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf as mpdf
from matplotlib.ticker import MultipleLocator
from collections import OrderedDict
from utils import flatten_collection, imcal_info
from utils import flatten_collection, imcal_info, par_table, NpEncoder
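# The newly imported NpEncoder is used at the bottom of the script for
# json.dumps(..., cls=NpEncoder); a typical encoder of this kind (the exact one in
# utils may differ) converts numpy types to plain Python types:
#   class NpEncoder(json.JSONEncoder):
#       def default(self, obj):
#           if isinstance(obj, np.integer):
#               return int(obj)
#           if isinstance(obj, np.floating):
#               return float(obj)
#           if isinstance(obj, np.ndarray):
#               return obj.tolist()
#           return super().default(obj)
# par_table is expected to map PDG codes to entries with at least the attributes
# name, full and latex (see the fallback below that fills in missing codes).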
# default color cycle of matplotlib
@@ -74,10 +74,16 @@ if __name__ == '__main__':
help='relative resolution of tracking (used to smear truth momentum).'
)
parser.add_argument(
'-e', '--efficiency', type=float,
dest='eff',
'--optimize-for', type=int,
dest='opt_pid',
default=11,
help='optimize efficiency for the specified particle (by PDG code).'
)
parser.add_argument(
'--optimize-efficiency', type=float,
dest='opt_eff',
default=0.97,
help='targeted efficiency from the cuts.'
help='targeted efficiency for the cuts.'
)
parser.add_argument(
'--sampling-fractions', type=str,
@@ -97,18 +103,18 @@ if __name__ == '__main__':
ntotal = rdf.Count().GetValue()
ev_bins = np.append(np.arange(0, ntotal, args.batch_size), [ntotal])
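# e.g. ntotal = 1000 with args.batch_size = 300 gives
#   ev_bins = [0, 300, 600, 900, 1000]
# so zip(ev_bins[:-1], ev_bins[1:]) walks the batches (0, 300), ..., (900, 1000)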
# data container
nlayers = int(imcal_info.nlayers_scfi)
# sampling fraction bins
samps = args.sampling_fractions.split(',')
samp_range = (float(samps[0]), float(samps[1]))
samp_nbins = int(samps[2])
# print(nlayers)
ep_bins = np.linspace(*samp_range, samp_nbins + 1)
ep_centers = (ep_bins[:-1] + ep_bins[1:])/2.
el_hist = np.zeros(shape=(nlayers, samp_nbins))
pi_hist = np.zeros(shape=(nlayers, samp_nbins))
# data container
nlayers = int(imcal_info.nlayers_scfi)
hists = np.zeros(shape=(0, nlayers, samp_nbins)) # place holder
# process events
npars = pd.Series(dtype=int)
# avoid an excessive amount of memory use with a large dataset
for ev_begin, ev_end in zip(ev_bins[:-1], ev_bins[1:]):
gc.collect()
@@ -129,8 +135,15 @@ if __name__ == '__main__':
dfm.loc[:, 'P'] = np.sqrt(dfm['momentum.x']**2 + dfm['momentum.y']**2 + dfm['momentum.z']**2)
# Smear with tracking resolution (0.5%)
dfm.loc[:, 'Ptrack'] = dfm['P']*np.random.normal(1.0, args.res, len(dfm))
# print(dfm[['PDG', 'P', 'Ptrack']])
# print(dfm['PDG'].value_counts())
# accumulate the number of particles of each PDG type
npars = npars.add(dfm['PDG'].value_counts(), fill_value=0)
sorted_pids = [p for p in par_table.keys() if p in npars.index] \
+ [p for p in npars.index if p not in par_table.keys()]
npars = npars[sorted_pids]
# enlarge data container
if len(npars) > hists.shape[0]:
hists = np.vstack([hists, np.zeros(shape=(len(npars) - hists.shape[0], *hists.shape[1:]))])
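# Illustrative behaviour of the two steps above (toy values, not from this patch):
#   npars = pd.Series(dtype=int)
#   npars = npars.add(pd.Series({11: 40, -211: 60}), fill_value=0)  # batch 1
#   npars = npars.add(pd.Series({11: 35, 2212: 5}), fill_value=0)   # batch 2
#   # running counts: {11: 75, -211: 60, 2212: 5}; hists grows from shape
#   # (2, nlayers, samp_nbins) to (3, nlayers, samp_nbins) when the third code appears
# NOTE: the new zero rows are appended at the end of hists, so the row order matches
# npars.index only if newly seen PDG codes sort after the ones already counted.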
# reconstructed simulation hits
df = flatten_collection(rdf, args.branch, (int(ev_begin), int(ev_end)), cols=[
@@ -147,123 +160,119 @@ if __name__ == '__main__':
dfe = dfe.reset_index().merge(dfm[['event', 'Ptrack', 'PDG']], on='event')
dfe.loc[:, 'eoverp'] = dfe['energy']/dfe['Ptrack']
# NOTE: assumed only e- and pi-
el_mask = dfe['PDG'] == 11
dfel = dfe[el_mask]
dfpi = dfe[~el_mask]
# fill data layer by layer
for hist, dftmp in zip([el_hist, pi_hist], [dfel, dfpi]):
for pid, hist in zip(npars.index, hists):
mask = dfe['PDG'] == pid
dft = dfe[mask]
for i in np.arange(nlayers):
vals = dftmp[dftmp['layer'] == i + 1]['eoverp'].values
vals = dft[dft['layer'] == i + 1]['eoverp'].values
hvals, _ = np.histogram(vals, bins=ep_bins)
hist[i] += hvals
hist[i] = hist[i] + hvals
# fill dictionary if not pre-defined
for pid in npars.index:
if pid not in par_table.keys():
print('WARNING - ML training: particle ({}) not found in par_table, updating it.'.format(pid))
par_table[pid] = dotdict(
name='PDG_{:d}'.format(pid),
full='PDG-{:d}'.format(pid),
latex='PDG_{:d}'.format(pid)
)
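# dotdict is not among the names imported in this hunk; it is assumed to be
# available from utils (or imported elsewhere in the file). A minimal version of
# that common pattern:
#   class dotdict(dict):
#       __getattr__ = dict.get
#       __setattr__ = dict.__setitem__
#       __delattr__ = dict.__delitem__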
# save binned data
for hist, label in zip([el_hist, pi_hist], ['el', 'pi']):
dfr = pd.DataFrame(
dfr = pd.DataFrame()
for pid, hist in zip(npars.index, hists):
dft = pd.DataFrame(
columns=['bin_left', 'bin_right'] + ['layer_{:d}'.format(i+1) for i in np.arange(nlayers)],
data=np.vstack([ep_bins[:-1], ep_bins[1:], hist]).T
)
dfr.to_csv(os.path.join(args.outdir, '{}_eop_{}.csv'.format(args.ntag, label)), index=False)
# print(label, np.sum(hist, axis=1))
# print(dfr)
dft.loc[:, 'PDG'] = pid
dfr = pd.concat([dfr, dft], ignore_index=True)
dfr.to_csv(os.path.join(args.outdir, '{}_binned_data.csv'.format(args.ntag)), index=False)
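# Resulting long-format CSV, one row per E/p bin and PDG code (placeholder values):
#   bin_left,bin_right,layer_1,...,layer_20,PDG
#   0.00,0.01,12.0,...,3.0,11
#   0.01,0.02,8.0,...,5.0,11
#   ...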
# study E/p cut
opt_par = par_table.get(args.opt_pid)
if args.opt_pid not in npars.index:
print('Error - E/p cut: cannot find the targeted particle {} in the samples. Stopping the E/p cut scan.'.format(args.opt_pid))
exit(-1)
# NOTE assumed e-pi only
nel = np.sum(el_hist, axis=1)[0]
npi = ntotal - nel
# find the hist for targeted particle
opt_idx = {p: i for i, p in enumerate(npars.index)}[args.opt_pid]
opt_hist = hists[opt_idx]
# prepare output container
best = OrderedDict(
layer=int(nlayers + 1),
ep_cut=0.,
el_eff=0.,
pi_rej=0.,
)
ep_dict = OrderedDict(
info= OrderedDict(
nsamples=OrderedDict(
total=int(ntotal),
electron=nel,
pion=npi,
),
targeted_efficiency=args.eff,
tracking_resolution=args.res,
),
best=best,
)
# scan layers
pdf = mpdf.PdfPages(os.path.join(args.outdir, '{}_layers.pdf'.format(args.ntag)))
# distribution plot by layer
box_props = dict(boxstyle='round', facecolor='white', alpha=0.5)
cut_data = []
for i in np.arange(nlayers):
elvals, pivals = el_hist[i], pi_hist[i]
# cut position
# NOTE: larger error with wider bins
perc = np.cumsum(elvals)/np.sum(elvals)
idx = len(perc[perc < 1. - args.eff])
perc = np.cumsum(opt_hist[i])/np.sum(opt_hist[i])
idx = len(perc[perc < 1. - args.opt_eff])
ep_cut = ep_centers[idx]
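# The cut can only fall on a bin center, so with e.g. --sampling-fractions "0,1.0,100"
# it moves in steps of 0.01 and the achieved efficiency can overshoot the target by
# up to the content of one bin (hence the NOTE above about wider bins).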
# efficiency
eff = np.sum(elvals[idx:])/np.sum(elvals)
# rejection power
rej = np.sum(pivals[:idx])/np.sum(pivals)
ep_dict['layer_{:d}'.format(i + 1)] = OrderedDict(
ep_cut=ep_cut,
el_eff=eff,
pi_rej=rej,
nsamples_after_cut=OrderedDict(
total=np.sum(elvals[idx:] + pivals[idx:]) ,
electron=np.sum(elvals[idx:]),
pion=np.sum(pivals[idx:]),
)
)
# greater or [equal] here because always favor the cut on more layers
if rej >= best['pi_rej']:
best.update({
'layer': int(i + 1),
'ep_cut': ep_cut,
'el_eff': eff,
'pi_rej': rej,
})
# plot showing the results
fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(ep_centers, weights=pivals, bins=50, label='$\pi^-$', color=colors[0], ec=colors[0], alpha=0.5)
ax.hist(ep_centers, weights=elvals, bins=50, label='$e^-$', color=colors[1], ec=colors[1], alpha=0.5)
texts = [r'Layer $\leq${:d}'.format(i + 1)]
# cut results
cut_result = OrderedDict(ep_cut=ep_cut)
for k, (pid, hist) in enumerate(zip(npars.index, hists)):
vals = hist[i]
npar = npars[pid]
ncut = np.sum(vals[idx:])
eff = ncut/float(npar)
# uncertainties: binomial variance npq
nerr = np.sqrt(npar*eff*(1. - eff))
# d(x/N) = dx/N
eff_err = nerr/float(npar)
par = par_table.get(pid)
cut_result[par.name] = OrderedDict(
efficiency=eff,
error=eff_err,
# nsamples=npar, # this is redundant
nsamples_after_cut=ncut,
)
ax.hist(ep_centers, weights=vals, bins=50, label='${}$'.format(par.latex),
color=colors[k], ec=colors[k], alpha=0.5)
texts.append(r'$\epsilon_{{{}}}={:.2f}$%'.format(par.latex, eff*100.))
cut_data.append(cut_result)
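# Binomial error check with toy numbers: npar = 10000 and eff = 0.97 give
#   eff_err = sqrt(0.97*0.03/10000) ~ 0.0017,
# i.e. the ~0.17% absolute uncertainty that nerr/npar evaluates to above.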
ax.legend(fontsize=20, ncol=2, loc='upper center', bbox_to_anchor=(0.5, 1.12),)
ax.tick_params(labelsize=20)
ax.set_xlabel('$E/p$', fontsize=20)
ax.set_ylabel('Counts', fontsize=20)
ax.axvline(x=ep_cut, color='k', ls='--', lw=2)
ax.text(0.5, 0.97,
'Layer $\leq${:d}\n$\epsilon_e={:.2f}$%\n$R_{{\pi}}={:.2f}$%'.format(i + 1, eff*100., rej*100.),
ax.text(0.5, 0.97, '\n'.join(texts),
transform=ax.transAxes, fontsize=20, va='top', ha='center', bbox=box_props)
pdf.savefig(fig)
plt.close(fig)
# a plot for the cut scan
cuts = [ep_dict.get('layer_{:d}'.format(i + 1)) for i in np.arange(nlayers)]
cuts_pos = np.array([c.get('ep_cut') for c in cuts])
cuts_rej = np.array([c.get('pi_rej') for c in cuts])
# estimated uncertainty (binomial)
nerr = np.sqrt(cuts_rej*(1. - cuts_rej)*npi) # npq
# leftover pions
nres = npi * (1. - cuts_rej)
nres_lo = np.clip(nres - nerr, 1, npi)
nres_hi = np.clip(nres + nerr, 1, npi)
# rejection power
rej_pow = npi/nres
rej_err = ((rej_pow - npi/nres_hi), (npi/nres_lo - rej_pow))
# 2d-scan plot
fig, ax1 = plt.subplots(figsize=(8, 8))
ax2 = ax1.twinx()
ax2.set_yscale('log')
ax1.plot(np.arange(nlayers) + 1, cuts_pos, ls='-', color=colors[0])
ax2.errorbar(np.arange(nlayers) + 1, rej_pow, yerr=rej_err,
fmt='o', capsize=3, color=colors[1])
# cut position
cut_pos = [c.get('ep_cut') for c in cut_data]
ax1.plot(np.arange(nlayers) + 1, cut_pos, ls='-', color=colors[0])
# rejection power for other particles
npars2 = npars[npars.index != args.opt_pid]
# use alpha to identify different particles
alphas = 1.0 - np.linspace(0, 0.7, len(npars2))
rej_tot = np.zeros(shape=(nlayers,))
for pid, alpha in zip(npars2.index, alphas):
par = par_table.get(pid)
pdata = [c.get(par.name) for c in cut_data]
eff = np.array([c.get('efficiency') for c in pdata])
eff_err = np.array([c.get('error') for c in pdata])
rej_pow = np.reciprocal(eff)
# d(1/x) = -dx/x^2
rej_err = rej_pow**2*eff_err
rej_tot += rej_pow * npars2[pid]/float(npars2.sum())
# NOTE: low and high errors are switched when calculating rejection
ax2.errorbar(np.arange(nlayers) + 1, rej_pow, yerr=rej_err,
fmt='o', capsize=3, color=colors[1], alpha=alpha, label='${}$'.format(par.latex))
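# Propagation check with toy numbers: a pion efficiency of eff = 0.02 +- 0.002 gives
#   rej_pow = 1/0.02 = 50 and rej_err = 0.002/0.02**2 = 5,
# i.e. a rejection power of 50 +- 5; rej_tot accumulates the count-weighted average
# of these per-species rejection powers and is used below to pick the best layer.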
ax1.set_xlabel('Layer Number', fontsize=20)
ax1.set_ylabel('Cut Position (E/p)', color=colors[0], fontsize=20)
@@ -271,21 +280,43 @@ if __name__ == '__main__':
ax2.grid(axis='both', which='both', ls=':')
ax2.xaxis.set_major_locator(MultipleLocator(5))
ax2.xaxis.set_minor_locator(MultipleLocator(1))
ax2.set_ylabel('$\pi^-$ Rejection Power', color=colors[1], fontsize=20)
ax2.set_ylabel('Rejection Power', color=colors[1], fontsize=20)
# ax2.legend(fontsize=20, ncol=4, loc='upper center', bbox_to_anchor=(0.5, 1.12),)
ax1.set_title('2D Scan of E/p Cut', fontsize=22)
ax1.tick_params(labelsize=20)
ax2.tick_params(labelsize=20)
ax1.set_title('2D Scan of E/p Cut', fontsize=22)
ax1.text(0.5, 0.03, '$\epsilon_e \geq$ {:.2f}%'.format(args.eff*100.),
ax1.text(0.5, 0.03, r'$\epsilon_{{{}}}\geq$ {:.2f}%'.format(opt_par.latex, args.opt_eff*100.),
transform=ax1.transAxes, fontsize=20, va='bottom', ha='center', bbox=box_props)
fig.subplots_adjust(left=0.15, right=0.85)
pdf.savefig(fig)
plt.close(fig)
pdf.close()
# save cut position and performance
ep_dict.update({'best': best})
ep_json = json.dumps(ep_dict, indent=4)
# save cut position and benchmarks
nsamples = OrderedDict(total=npars.sum())
for pid, count in npars.items():
par = par_table.get(pid)
nsamples[par.name] = count
ep_dict = OrderedDict(
info=OrderedDict(
nsamples=nsamples,
optimize_for=opt_par.name,
optimize_efficiency=args.opt_eff,
tracking_resolution=args.res,
),
best=OrderedDict(layer=0),
)
for i, data in enumerate(cut_data):
ep_dict['layer_{:d}'.format(i + 1)] = data
# best layer
idx = np.argmax(rej_tot)
ep_dict['best'].update({'layer': idx + 1, 'rejection_power': rej_tot[idx]})
ep_dict['best'].update(cut_data[idx])
# save to json
ep_json = json.dumps(ep_dict, indent=4, cls=NpEncoder)
with open(os.path.join(args.outdir, '{}_result.json'.format(args.ntag)), 'w') as outfile:
outfile.write(ep_json)
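# Rough shape of the resulting {ntag}_result.json (placeholder values; the species
# keys come from par_table[pid].name, so the exact names depend on utils):
#   {
#     "info": {
#       "nsamples": {"total": 100000, "electron": 50000, "pion": 50000},
#       "optimize_for": "electron",
#       "optimize_efficiency": 0.97,
#       "tracking_resolution": 0.005
#     },
#     "best": {
#       "layer": 6, "rejection_power": 125.0, "ep_cut": 0.52,
#       "electron": {"efficiency": 0.97, "error": 0.0008, "nsamples_after_cut": 48500},
#       "pion": {"efficiency": 0.008, "error": 0.0004, "nsamples_after_cut": 400}
#     },
#     "layer_1": {"ep_cut": ..., "electron": {...}, "pion": {...}},
#     ...
#     "layer_20": {...}
#   }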