diff --git a/benchmarks/imaging_shower_ML/run_benchmark.py b/benchmarks/imaging_shower_ML/run_benchmark.py
index 2b2ee476290dd5c6c52d54beb02899f454eaabd6..0620e040e9bfe9d199e1ac352666626c73a6abef 100755
--- a/benchmarks/imaging_shower_ML/run_benchmark.py
+++ b/benchmarks/imaging_shower_ML/run_benchmark.py
@@ -221,7 +221,8 @@ if __name__ == '__main__':
         '-o {epscan_dir} --name-tag {ntag}',
         '--batch-size {batch}',
         '-s {seed}',
-        '-e 0.97'
+        '--optimize-for 11',
+        '--optimize-efficiency 0.97',
     ]
     cmd = ' '.join(cmd).format(**kwargs).split(' ')
     return_code = subprocess.run(cmd).returncode
@@ -255,6 +256,8 @@ if __name__ == '__main__':
         '-s {seed}',
         '--batch-size 256',
         '--epochs 20',
+        '--optimize-for 11',
+        '--optimize-efficiency 0.98',
         '--test-size {test_size}',
     ]
     cmd = ' '.join(cmd).format(**kwargs).split(' ')
diff --git a/benchmarks/imaging_shower_ML/scripts/epcut_scan.py b/benchmarks/imaging_shower_ML/scripts/epcut_scan.py
index 6250aebd2da59e3c78f5607a634e65e11df229d7..0c35f4a247556a18fb65c07a39e915c24cd2618e 100644
--- a/benchmarks/imaging_shower_ML/scripts/epcut_scan.py
+++ b/benchmarks/imaging_shower_ML/scripts/epcut_scan.py
@@ -1,11 +1,11 @@
-'''
+"""
 A script to scan the optimized cut on layer and E/p.
 It scan all the possible ScFi layers (20 in the EPIC brycecanyon configuration)
 The results give the best cut (highest pion rejection) on [layer, E/p] with a targeted electron efficiency
 
 Chao Peng (ANL)
 2022/11/13
-'''
+"""
 import os
 import gc
 import sys
@@ -18,7 +18,7 @@ import matplotlib.pyplot as plt
 import matplotlib.backends.backend_pdf as mpdf
 from matplotlib.ticker import MultipleLocator
 from collections import OrderedDict
-from utils import flatten_collection, imcal_info
+from utils import flatten_collection, imcal_info, par_table, NpEncoder, dotdict


 # default color cycle of matplotlib
@@ -74,10 +74,16 @@ if __name__ == '__main__':
         help='relative resolution of tracking (used to smear truth momentum).'
     )
     parser.add_argument(
-        '-e', '--efficiency', type=float,
-        dest='eff',
+        '--optimize-for', type=int,
+        dest='opt_pid',
+        default=11,
+        help='optimize efficiency for the specified particle (PDGCODE).'
+    )
+    parser.add_argument(
+        '--optimize-efficiency', type=float,
+        dest='opt_eff',
         default=0.97,
-        help='targeted efficiency from the cuts.'
+        help='targeted efficiency for the cuts.'
     )
     parser.add_argument(
         '--sampling-fractions', type=str,
@@ -97,18 +103,18 @@ if __name__ == '__main__':
     ntotal = rdf.Count().GetValue()
     ev_bins = np.append(np.arange(0, ntotal, args.batch_size), [ntotal])

-    # data container
-    nlayers = int(imcal_info.nlayers_scfi)
+    # sampling fraction bins
     samps = args.sampling_fractions.split(',')
     samp_range = (float(samps[0]), float(samps[1]))
     samp_nbins = int(samps[2])
-    # print(nlayers)
     ep_bins = np.linspace(*samp_range, samp_nbins + 1)
     ep_centers = (ep_bins[:-1] + ep_bins[1:])/2.

-    el_hist = np.zeros(shape=(nlayers, samp_nbins))
-    pi_hist = np.zeros(shape=(nlayers, samp_nbins))
+    # data container
+    nlayers = int(imcal_info.nlayers_scfi)
+    hists = np.zeros(shape=(0, nlayers, samp_nbins))  # place holder

     # process events
+    npars = pd.Series(dtype=int)
     # avoid insane amount of memory use with a large dataset
     for ev_begin, ev_end in zip(ev_bins[:-1], ev_bins[1:]):
         gc.collect()
@@ -129,8 +135,15 @@ if __name__ == '__main__':
         dfm.loc[:, 'P'] = np.sqrt(dfm['momentum.x']**2 + dfm['momentum.y']**2 + dfm['momentum.z']**2)
         # Smear with tracking resolution (0.5%)
         dfm.loc[:, 'Ptrack'] = dfm['P']*np.random.normal(1.0, args.res, len(dfm))
-        # print(dfm[['PDG', 'P', 'Ptrack']])
-        # print(dfm['PDG'].value_counts())
+
+        # count how many types of particles
+        npars = npars.add(dfm['PDG'].value_counts(), fill_value=0)
+        sorted_pids = [p for p in par_table.keys() if p in npars.index] \
+            + [p for p in npars.index if p not in par_table.keys()]
+        npars = npars[sorted_pids]
+        # enlarge data container
+        if len(npars) > hists.shape[0]:
+            hists = np.vstack([hists, np.zeros(shape=(len(npars) - hists.shape[0], *hists.shape[1:]))])

         # reconstructed simulation hits
         df = flatten_collection(rdf, args.branch, (int(ev_begin), int(ev_end)), cols=[
@@ -147,123 +160,119 @@ if __name__ == '__main__':

         dfe = dfe.reset_index().merge(dfm[['event', 'Ptrack', 'PDG']], on='event')
         dfe.loc[:, 'eoverp'] = dfe['energy']/dfe['Ptrack']
-        # NOTE: assumed only e- and pi-
-        el_mask = dfe['PDG'] == 11
-        dfel = dfe[el_mask]
-        dfpi = dfe[~el_mask]
-
-        # fill data layer by layer
-        for hist, dftmp in zip([el_hist, pi_hist], [dfel, dfpi]):
+        for pid, hist in zip(npars.index, hists):
+            mask = dfe['PDG'] == pid
+            dft = dfe[mask]
             for i in np.arange(nlayers):
-                vals = dftmp[dftmp['layer'] == i + 1]['eoverp'].values
+                vals = dft[dft['layer'] == i + 1]['eoverp'].values
                 hvals, _ = np.histogram(vals, bins=ep_bins)
-                hist[i] += hvals
+                hist[i] = hist[i] + hvals
+
+    # fill dictionary if not pre-defined
+    for pid in npars.index:
+        if pid not in par_table.keys():
+            print('WARNING = E/p cut scan: particle ({}) not found in table, updating it.'.format(pid))
+            par_table[pid] = dotdict(
+                name='PDG_{:d}'.format(pid),
+                full='PDG-{:d}'.format(pid),
+                latex='PDG_{:d}'.format(pid)
+                )

     # save binned data
-    for hist, label in zip([el_hist, pi_hist], ['el', 'pi']):
-        dfr = pd.DataFrame(
+    dfr = pd.DataFrame()
+    for pid, hist in zip(npars.index, hists):
+        dft = pd.DataFrame(
             columns=['bin_left', 'bin_right'] + ['layer_{:d}'.format(i+1) for i in np.arange(nlayers)],
             data=np.vstack([ep_bins[:-1], ep_bins[1:], hist]).T
             )
-        dfr.to_csv(os.path.join(args.outdir, '{}_eop_{}.csv'.format(args.ntag, label)), index=False)
-        # print(label, np.sum(hist, axis=1))
-        # print(dfr)
+        dft.loc[:, 'PDG'] = pid
+        dfr = pd.concat([dfr, dft], ignore_index=True)
+    dfr.to_csv(os.path.join(args.outdir, '{}_binned_data.csv'.format(args.ntag)), index=False)

+    # study E/p cut
+    opt_par = par_table.get(args.opt_pid)
+    if args.opt_pid not in npars.index:
+        print('Error = E/p cut: cannot find targeted particle {} in samples. Stop E/p cut scan.'.format(args.opt_pid))
+        exit(-1)

-    # NOTE assumed e-pi only
-    nel = np.sum(el_hist, axis=1)[0]
-    npi = ntotal - nel
+    # find the hist for targeted particle
+    opt_idx = {p: i for i, p in enumerate(npars.index)}[args.opt_pid]
+    opt_hist = hists[opt_idx]

-    # prepare output container
-    best = OrderedDict(
-        layer=int(nlayers + 1),
-        ep_cut=0.,
-        el_eff=0.,
-        pi_rej=0.,
-        )
-    ep_dict = OrderedDict(
-        info= OrderedDict(
-            nsamples=OrderedDict(
-                total=int(ntotal),
-                electron=nel,
-                pion=npi,
-                ),
-            targeted_efficiency=args.eff,
-            tracking_resolution=args.res,
-            ),
-        best=best,
-        )
-
-    # scan layers
     pdf = mpdf.PdfPages(os.path.join(args.outdir, '{}_layers.pdf'.format(args.ntag)))
+
+    # distribution plot by layer
     box_props = dict(boxstyle='round', facecolor='white', alpha=0.5)
+    cut_data = []
     for i in np.arange(nlayers):
-        elvals, pivals = el_hist[i], pi_hist[i]
         # cut position
         # NOTE: larger error with wider bins
-        perc = np.cumsum(elvals)/np.sum(elvals)
-        idx = len(perc[perc < 1. - args.eff])
+        perc = np.cumsum(opt_hist[i])/np.sum(opt_hist[i])
+        idx = len(perc[perc < 1. - args.opt_eff])
         ep_cut = ep_centers[idx]
-        # efficiency
-        eff = np.sum(elvals[idx:])/np.sum(elvals)
-        # rejection power
-        rej = np.sum(pivals[:idx])/np.sum(pivals)
-        ep_dict['layer_{:d}'.format(i + 1)] = OrderedDict(
-            ep_cut=ep_cut,
-            el_eff=eff,
-            pi_rej=rej,
-            nsamples_after_cut=OrderedDict(
-                total=np.sum(elvals[idx:] + pivals[idx:]) ,
-                electron=np.sum(elvals[idx:]),
-                pion=np.sum(pivals[idx:]),
-                )
-            )
-        # greater or [equal] here because always favor the cut on more layers
-        if rej >= best['pi_rej']:
-            best.update({
-                'layer': int(i + 1),
-                'ep_cut': ep_cut,
-                'el_eff': eff,
-                'pi_rej': rej,
-            })
-        # plot showing the results
+
         fig, ax = plt.subplots(figsize=(8, 8))
-        ax.hist(ep_centers, weights=pivals, bins=50, label='$\pi^-$', color=colors[0], ec=colors[0], alpha=0.5)
-        ax.hist(ep_centers, weights=elvals, bins=50, label='$e^-$', color=colors[1], ec=colors[1], alpha=0.5)
+        texts = [r'Layer $\leq${:d}'.format(i + 1)]
+        # cut results
+        cut_result = OrderedDict(ep_cut=ep_cut)
+        for k, (pid, hist) in enumerate(zip(npars.index, hists)):
+            vals = hist[i]
+            npar = npars[pid]
+            ncut = np.sum(vals[idx:])
+            eff = ncut/float(npar)
+            # uncertainties: binomial variance npq
+            nerr = np.sqrt(npar*eff*(1. - eff))
+            # d(x/N) = dx/N
+            eff_err = nerr/float(npar)
+
+            par = par_table.get(pid)
+            cut_result[par.name] = OrderedDict(
+                efficiency=eff,
+                error=eff_err,
+                # nsamples=npar,  # this is redundant
+                nsamples_after_cut=ncut,
+                )
+            ax.hist(ep_centers, weights=vals, bins=50, label='${}$'.format(par.latex),
+                    color=colors[k], ec=colors[k], alpha=0.5)
+            texts.append(r'$\epsilon_{{{}}}={:.2f}$%'.format(par.latex, eff*100.))
+        cut_data.append(cut_result)
+
         ax.legend(fontsize=20, ncol=2, loc='upper center', bbox_to_anchor=(0.5, 1.12),)
         ax.tick_params(labelsize=20)
         ax.set_xlabel('$E/p$', fontsize=20)
         ax.set_ylabel('Counts', fontsize=20)
         ax.axvline(x=ep_cut, color='k', ls='--', lw=2)
-        ax.text(0.5, 0.97,
-                'Layer $\leq${:d}\n$\epsilon_e={:.2f}$%\n$R_{{\pi}}={:.2f}$%'.format(i + 1, eff*100., rej*100.),
+        ax.text(0.5, 0.97, '\n'.join(texts),
                 transform=ax.transAxes, fontsize=20, va='top', ha='center', bbox=box_props)
         pdf.savefig(fig)
         plt.close(fig)

-    # a plot for the cut scan
-    cuts = [ep_dict.get('layer_{:d}'.format(i + 1)) for i in np.arange(nlayers)]
-    cuts_pos = np.array([c.get('ep_cut') for c in cuts])
-    cuts_rej = np.array([c.get('pi_rej') for c in cuts])
-
-    # estimated uncertainty (binomial)
-    nerr = np.sqrt(cuts_rej*(1. - cuts_rej)*npi)  # npq
-    # leftover pions
-    nres = npi * (1. - cuts_rej)
-    nres_lo = np.clip(nres - nerr, 1, npi)
-    nres_hi = np.clip(nres + nerr, 1, npi)
-
-    # rejection power
-    rej_pow = npi/nres
-    rej_err = ((rej_pow - npi/nres_hi), (npi/nres_lo - rej_pow))
-
+    # 2d-scan plot
     fig, ax1 = plt.subplots(figsize=(8, 8))
     ax2 = ax1.twinx()
     ax2.set_yscale('log')
-    ax1.plot(np.arange(nlayers) + 1, cuts_pos, ls='-', color=colors[0])
-    ax2.errorbar(np.arange(nlayers) + 1, rej_pow, yerr=rej_err,
-                 fmt='o', capsize=3, color=colors[1])
+    # cut position
+    cut_pos = [c.get('ep_cut') for c in cut_data]
+    ax1.plot(np.arange(nlayers) + 1, cut_pos, ls='-', color=colors[0])
+
+    # rejection power for other particles
+    npars2 = npars[npars.index != args.opt_pid]
+    # use alpha to identify different particles
+    alphas = 1.0 - np.linspace(0, 0.7, len(npars2))
+
+    rej_tot = np.zeros(shape=(nlayers,))
+    for pid, alpha in zip(npars2.index, alphas):
+        par = par_table.get(pid)
+        pdata = [c.get(par.name) for c in cut_data]
+        eff = np.array([c.get('efficiency') for c in pdata])
+        eff_err = np.array([c.get('error') for c in pdata])
+        rej_pow = np.reciprocal(eff)
+        # d(1/x) = -dx/x^2
+        rej_err = rej_pow**2*eff_err
+        rej_tot += rej_pow * npars2[pid]/float(npars2.sum())
+        # NOTE: low and high errors are switched when calculating rejection
+        ax2.errorbar(np.arange(nlayers) + 1, rej_pow, yerr=rej_err,
+                     fmt='o', capsize=3, color=colors[1], alpha=alpha, label='${}$'.format(par.latex))

     ax1.set_xlabel('Layer Number', fontsize=20)
     ax1.set_ylabel('Cut Position (E/p)', color=colors[0], fontsize=20)
@@ -271,21 +280,43 @@ if __name__ == '__main__':
     ax2.grid(axis='both', which='both', ls=':')
     ax2.xaxis.set_major_locator(MultipleLocator(5))
     ax2.xaxis.set_minor_locator(MultipleLocator(1))
-    ax2.set_ylabel('$\pi^-$ Rejection Power', color=colors[1], fontsize=20)
+    ax2.set_ylabel('Rejection Power', color=colors[1], fontsize=20)
+    # ax2.legend(fontsize=20, ncol=4, loc='upper center', bbox_to_anchor=(0.5, 1.12),)
+    ax1.set_title('2D Scan of E/p Cut', fontsize=22)
     ax1.tick_params(labelsize=20)
     ax2.tick_params(labelsize=20)
-    ax1.set_title('2D Scan of E/p Cut', fontsize=22)
-    ax1.text(0.5, 0.03, '$\epsilon_e \geq$ {:.2f}%'.format(args.eff*100.),
+    ax1.text(0.5, 0.03, r'$\epsilon_{{{}}}\geq$ {:.2f}%'.format(opt_par.latex, args.opt_eff*100.),
             transform=ax1.transAxes, fontsize=20, va='bottom', ha='center', bbox=box_props)
     fig.subplots_adjust(left=0.15, right=0.85)
-
     pdf.savefig(fig)
     plt.close(fig)
+    pdf.close()

-    # save cut position and performance
-    ep_dict.update({'best': best})
-    ep_json = json.dumps(ep_dict, indent=4)
+    # save cut position and benchmarks
+    nsamples = OrderedDict(total=npars.sum())
+    for pid, count in npars.items():
+        par = par_table.get(pid)
+        nsamples[par.name] = count
+    ep_dict = OrderedDict(
+        info=OrderedDict(
+            nsamples=nsamples,
+            optimize_for=opt_par.name,
+            optimize_efficiency=args.opt_eff,
+            tracking_resolution=args.res,
+            ),
+        best=OrderedDict(layer=0),
+        )
+
+    for i, data in enumerate(cut_data):
+        ep_dict['layer_{:d}'.format(i + 1)] = data
+
+    # best layer
+    idx = np.argmax(rej_tot)
+    ep_dict['best'].update({'layer': idx + 1, 'rejection_power': rej_tot[idx]})
+    ep_dict['best'].update(cut_data[idx])
+    # save to json
+    ep_json = json.dumps(ep_dict, indent=4, cls=NpEncoder)
     with open(os.path.join(args.outdir, '{}_result.json'.format(args.ntag)), 'w') as outfile:
         outfile.write(ep_json)
diff --git a/benchmarks/imaging_shower_ML/scripts/ml_data_preprocess.py b/benchmarks/imaging_shower_ML/scripts/ml_data_preprocess.py
index 34c5c5de2e17077cb29a10cd4764fe0476de1f1f..dd605f7337ec33d353d08a0398e7ebb953cf85cd 100644
--- a/benchmarks/imaging_shower_ML/scripts/ml_data_preprocess.py
+++ b/benchmarks/imaging_shower_ML/scripts/ml_data_preprocess.py
@@ -1,10 +1,10 @@
-'''
+"""
 A script to prepare datasets for ML training.
 It may also apply E/p cut to clean-up the samples

 Chao Peng (ANL)
 2022/11/13
-'''
+"""
 import ROOT
 import os
 import gc
@@ -20,10 +20,10 @@ pd.set_option('display.max_rows', 500)

 # normalizing range for features
 # NOTE: assumed for barrel ecal, using the range of 0.5 - 2 meters to normalize the r0 values
-WIN_ETA = (-0.2, 0.2) # unitless
-WIN_PHI = (-0.4, 0.4) # rad
-WIN_R0 = (500, 2000) # mm
-WIN_EH = (0, 0.05) # GeV
+WIN_ETA = (-0.2, 0.2)   # unitless
+WIN_PHI = (-0.4, 0.4)   # rad
+WIN_R0 = (500, 2000)    # mm
+WIN_EH = (0, 0.05)      # GeV


 # only positive weights are returned
@@ -39,9 +39,9 @@ def read_epcut(path):
         return None
     with open(args.epcut, 'r') as f:
         data = json.load(f)
-        epcut = data.get('best', {})
-        epcut.update(data.get('info', {}))
-        return epcut
+        thecut = data.get('best', {})
+        thecut.update(data.get('info', {}))
+        return thecut


 def lin_norm(vals, window, val_name='', var_name='', warn_win_size=True):
@@ -99,7 +99,8 @@ def format_ml_data(dfh, dft, nlayers=20, nhits=50,

     # data for ML
     dfml = dfh[['event', 'layer']].copy()
-    # NOTE: not much a difference observed between eratio or normalized E with a narrow momentum range (e.g., 1.5-2.5 GeV/c)
+    # NOTE: not much of a difference observed between eratio or normalized E with a narrow momentum range
+    # (e.g., 1.5-2.5 GeV/c)
     dfml.loc[:, 'eh'] = dfh['eratio'].values
     # dfml.loc[:, 'eh'] = lin_norm(dfh['energy'].values, win_eh, "Ehit")
     dfml.loc[:, 'r0'] = lin_norm(r0, win_r0, "r0")
@@ -115,15 +116,15 @@ def format_ml_data(dfh, dft, nlayers=20, nhits=50,
     # padding with zeros
     # NOTE: if more than nhits per layer, the least energetic hits are dropped because of the previous energy sorting
     dfml = dfml.reindex(
-        pd.MultiIndex.from_product([dfml.index.levels[0], np.arange(nlayers) + 1, np.arange(nhits) + 1],
-                                   names=['event', 'layer', 'hit']),
+        pd.MultiIndex.from_product(
+            [dfml.index.levels[0], np.arange(nlayers) + 1, np.arange(nhits) + 1],
+            names=['event', 'layer', 'hit']),
         fill_value=0
     )

     return dfml, list(event_group.groups.keys())

-
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Visualize the cluster (layer-wise) from analysis')
     parser.add_argument(
diff --git a/benchmarks/imaging_shower_ML/scripts/ml_training.py b/benchmarks/imaging_shower_ML/scripts/ml_training.py
index 730eccf6408edf6874178008424b4845afd140ad..6955ac24c6b79be4caad515d1802fd20423ee5a6 100644
--- a/benchmarks/imaging_shower_ML/scripts/ml_training.py
+++ b/benchmarks/imaging_shower_ML/scripts/ml_training.py
@@ -1,17 +1,16 @@
-'''
+"""
 Script for supervised training of a simple ML model using Tensorflow 2.
 ML e/pi separation for imaging barrel ecal.

 Chao Peng (ANL)
 2022/11/11
-'''
+"""
 import os
 import sys
 import json
 import argparse
 import pandas as pd
 import numpy as np
-import ROOT
 from collections import OrderedDict

 import matplotlib as mpl
@@ -23,7 +22,7 @@ import tensorflow as tf
 from tensorflow import keras
 from tensorflow.keras import layers

-from utils import imcal_info
+from utils import imcal_info, par_table, NpEncoder, dotdict


 # default color cycle of matplotlib
@@ -31,27 +30,32 @@ prop_cycle = plt.rcParams['axes.prop_cycle']
 colors = prop_cycle.by_key()['color']


-# full name, short name, latex str
-PARTICLE_DICT = OrderedDict([
-    (11, ('electron', 'electron', '$e^-$')),
-    (-211, ('pion-minus', 'pion-', '$\pi^-$')),
-    ])
+def count_pid_labels(label_lst, pid_list):
+    res = OrderedDict()
+    res['total'] = len(label_lst)
+    for p, cnt in zip(pid_list, np.bincount(label_lst)):
+        res[par_table.get(p).name] = '{:d}'.format(cnt)
+    return res
+
+
+def build_model():
+    my_model = keras.Sequential([
+        keras.layers.Conv2D(64, (3, 3), padding='same', activation='relu', input_shape=xdata.shape[1:]),
+        keras.layers.MaxPooling2D((2, 2), strides=2),
+        keras.layers.Dropout(0.25),
+        keras.layers.Conv2D(128, (2, 2), padding='same', activation='relu'),
+        keras.layers.MaxPooling2D((2, 2), strides=2),
+        keras.layers.Conv2D(64, (2, 2), padding='same', activation='relu'),
+        keras.layers.MaxPooling2D((2, 2), strides=2),
+        keras.layers.Dropout(0.25),


-def count_pid_labels(labels, pid_label_dict):
-    result = OrderedDict()
-    result['total'] = len(labels)
-    counts = np.bincount(labels)
-    label_pid_dict = {i: pid for pid, i in pid_label_dict.items()}
-    for i, cnt in enumerate(counts):
-        if cnt == 0 and i not in label_pid_dict.values():
-            continue
-        pid = label_pid_dict.get(i)
-        particle = PARTICLE_DICT.get(pid, (pid, pid, pid))
-        # print(i, pid, cnt, particle)
-        result[str(particle[1])] = '{:d}'.format(cnt)
-    return result
+        keras.layers.Flatten(),
+        keras.layers.Dense(128, activation='relu'),
+        # keras.layers.Dropout(0.25),
+        keras.layers.Dense(32, activation='relu'),
+        keras.layers.Dense(len(pid_labels), activation='softmax')
+    ])
+    return my_model


 if __name__ == '__main__':
@@ -94,10 +98,14 @@ if __name__ == '__main__':
         help='batch size for ML training.'
     )
     parser.add_argument(
-        '-e', '--efficiency', type=float,
-        dest='eff',
+        '--optimize-for', type=int,
+        default=11,
+        help='optimize cut for a target particle (PDGCODE).'
+    )
+    parser.add_argument(
+        '--optimize-efficiency', type=float,
         default=0.98,
-        help='targeted efficiency from the cuts.'
+        help='used when --optimize-for is specified, try to keep the efficiency with optimized cuts.'
     )
     parser.add_argument(
         '--read-model', action='store_true',
@@ -110,11 +118,11 @@ if __name__ == '__main__':
         help='only used when --read-model is enabled, if not specified it reads the model from saving path.'
     )
     # weights
-    for key, (name, sname, latex) in PARTICLE_DICT.items():
+    for key, val in par_table.items():
         parser.add_argument(
-            '--weight-{}'.format(name), type=float,
+            '--weight-{}'.format(val.full), type=float,
             default=1.0,
-            help='estimator weight for {} particles.'.format(name)
+            help='sample weight for {}.'.format(val.name)
         )
     args = parser.parse_args()
     kwargs = vars(args)
@@ -128,27 +136,32 @@ if __name__ == '__main__':
     df = pd.read_hdf(args.data_store, key=imcal_info.ml_data_key)
     # NOTE: assumed event index is exactly the same as df
     dft = pd.read_hdf(args.data_store, key=imcal_info.truth_data_key)
+    print('ML training: input sample size.')
     print(dft['PDG'].value_counts())

     # fill dictionary if not pre-defined
     data_pid = dft['PDG'].unique()
     for pid in data_pid:
-        if pid not in PARTICLE_DICT.keys():
-            print('WARNING = ML training: particle ({}) not found in dictionary, update the script to include it.'\
-                  .format(pid))
-            PARTICLE_DICT[pid] = ('particle_{:d}'.format(pid), 'PDG_{:d}'.format(pid), 'PDG_{:d}'.format(pid))
+        if pid not in par_table.keys():
+            print('WARNING = ML training: particle ({}) not found in table, updating it.'.format(pid))
+            par_table[pid] = dotdict(
+                name='PDG_{:d}'.format(pid),
+                full='PDG-{:d}'.format(pid),
+                latex='PDG_{:d}'.format(pid)
+                )

     # build labels and weights for particles
     # NOTE: pid follows the dictionary orders
     pid_labels = OrderedDict()
     pid_weights = OrderedDict()
-    for pid, (pname, _, _) in PARTICLE_DICT.items():
+    for pid, part in par_table.items():
         if pid not in data_pid:
             continue
-        pid_weights[pid] = kwargs.get('weight_{}'.format(pname.replace('-', '_')), 1.0)
+        pid_weights[pid] = kwargs.get('weight_{}'.format(part.full.replace('-', '_')), 1.0)
         pid_labels[pid] = len(pid_labels)
-    print(pid_labels)
-    print(pid_weights)
+    pids = list(pid_labels.keys())
+    # print(pid_labels)
+    # print(pid_weights)

     # data shape
     # NOTE: number of layers for combined data
@@ -157,7 +170,7 @@ if __name__ == '__main__':
     nhits = df.index.levels[-1].max()

     # training datasets
-    xdata = df.values.reshape(len(dft), nlayers, nhits, len(df.columns))
+    xdata = df.values.reshape(dft.shape[0], nlayers, nhits, len(df.columns))
     ydata = dft['PDG'].map(pid_labels).values
     wdata = dft['PDG'].map(pid_weights).values

@@ -173,7 +186,7 @@ if __name__ == '__main__':
     # try to use GPUs
     # tf.debugging.set_log_device_placement(True)
     gpus = tf.config.list_logical_devices('GPU')
-    print("Number of GPUs Available: ", len(gpus))
+    print("ML training: number of GPUs available: ", len(gpus))
     strategy = tf.distribute.MirroredStrategy(gpus)
     if args.read_model:
         model_path = os.path.join(args.outdir, '{}_model'.format(args.ntag))
@@ -183,23 +196,7 @@ if __name__ == '__main__':
     else:
         with strategy.scope():
             train_dataset = tf.data.Dataset.from_tensor_slices((xtrain, ytrain, wtrain))
-            model = keras.Sequential([
-                keras.layers.Conv2D(64, (3, 3), padding='same', activation='relu', input_shape=xdata.shape[1:]),
-                # keras.layers.MaxPooling2D((2, 2), strides=2),
-                keras.layers.Dropout(0.25),
-                keras.layers.Conv2D(128, (2, 2), padding='same', activation='relu'),
-                # keras.layers.MaxPooling2D((2, 2), strides=2),
-                keras.layers.Dropout(0.3),
-                keras.layers.Conv2D(64, (2, 2), padding='same', activation='relu'),
-                # keras.layers.MaxPooling2D((2, 2), strides=2),
-
-                keras.layers.Flatten(),
-                keras.layers.Dense(128, activation='relu'),
-                keras.layers.Dropout(0.25),
-                keras.layers.Dense(32, activation='relu'),
-                keras.layers.Dense(len(pid_labels), activation='softmax')
-            ])
-
+            model = build_model()
             model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4),
                           loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                           metrics=['accuracy'])
@@ -208,56 +205,79 @@ if __name__ == '__main__':

     # benchmark
     prediction = model.predict(xtest)
-
-    # cut label probability
-    # NOTE: assumed two-labels (electron, pion) and ordered (see dictionary)
-    pid_probs = np.zeros(shape=(2, 2))
-    fig, ax = plt.subplots(1, 1, figsize=(12, 9), dpi=160)
-
-    # cut position (try to keep the targeted efficiency)
-    epi_probs = prediction[ytest == 0].T[1]
-    pion_prob_cut = np.percentile(epi_probs, args.eff*100.)
-    eff_string = ''
+    pred_weights = np.ones(len(pid_labels))
+
+    # results
+    npar = len(pid_labels)
+    opt_i = 0
+    opt_cut = 0.
+
+    # optimize for target particle's efficiency
+    opt_param = (None, 0, 1.0)
+    if args.optimize_for != 0:
+        opt_pid = args.optimize_for
+        if opt_pid not in pid_labels.keys():
+            print('WARNING = ML training: ' +
+                  'particle ({}) [PDGCODE] not found in samples, skip optimizing.'.format(opt_pid))
+        else:
+            opt_i = pid_labels.get(opt_pid)
+            # find optimal prob weight
+            opt_probs = prediction[ytest == opt_i].T[opt_i]
+            opt_cut = np.percentile(opt_probs, (1. - args.optimize_efficiency)*100.)
+            # NOTE: this weight calculation only works for two labels
+            opt_wt = (1. - opt_cut)/opt_cut
+            pred_weights[opt_i] = opt_wt
+            opt_param = (par_table.get(opt_pid).name, args.optimize_efficiency, opt_wt)
+
+    # pick labels with the highest probability
+    pred_labels = np.argmax(prediction*pred_weights, axis=1)
+    pred_probs = np.zeros(shape=(npar, npar))
+    opt_label = par_table.get(pids[opt_i]).latex
+
+    fig, ax = plt.subplots(figsize=(12, 9), dpi=160)
+    effs = []
     for pid, ilbl in pid_labels.items():
         mask = (ytest == ilbl)
-        probs = prediction[mask]
-        label = PARTICLE_DICT.get(pid, [''])[-1]
-
-        ax.hist(probs.T[1], bins=np.linspace(0, 1, 101), label=label, color=colors[ilbl], ec=colors[ilbl], alpha=0.5)
-
-        pid_probs[ilbl][0] = sum(probs.T[1] < pion_prob_cut) / float(len(probs))
-        pid_probs[ilbl][1] = 1. - pid_probs[ilbl][0]
-        if pid == -211 or pid == 211:
-            eff_string += '{} rej. = {:.2f}%'.format(label, pid_probs[ilbl][1]*100.)
-        else:
-            eff_string += '{} eff. = {:.2f}%\n'.format(label, pid_probs[ilbl][0]*100.)
-    ax.axvline(x=pion_prob_cut, lw=2, color='k', ls='--')
+        pred_probs[ilbl] = np.bincount(pred_labels[mask])/float(np.sum(mask))
+        labels = par_table.get(pid)
+
+        ax.hist(prediction[mask].T[opt_i], bins=np.linspace(0, 1, 101), label='${}$'.format(labels.latex),
+                color=colors[ilbl], ec=colors[ilbl], alpha=0.5)
+
+        effs.append((labels, pred_probs[ilbl][opt_i]))
+    if opt_cut > 0.:
+        ax.axvline(x=opt_cut, lw=2, color='k', ls='--')
+    eff_string = '\n'.join([r'$\epsilon_{{{}}} = {:.2f}$%'.format(labels.latex, eff*100.) for labels, eff in effs])
+    data_to_axis = (ax.transAxes + ax.transData.inverted()).inverted()
+    ax.text(data_to_axis.transform((opt_cut, 1))[0] + 0.01, 0.99, eff_string, fontsize=24,
+            transform=ax.transAxes, ha='left', va='top')
     ax.set_yscale('log')
     ax.set_ylabel('Counts', fontsize=24)
-    ax.set_xlabel(r'$P_{\pi}$', fontsize=24)
+    ax.set_xlabel(r'$P_{{{}}}$'.format(opt_label), fontsize=24)
     ax.tick_params(direction='in', which='both', labelsize=24)
-    ax.text(0.55, 0.9, eff_string, fontsize=24, transform=ax.transAxes, ha='left', va='top')
     ax.legend(fontsize=24, ncol=4, loc='upper center', bbox_to_anchor=(0.5, 1.12),)
     fig.savefig(os.path.join(args.outdir, '{}_pid_hist.pdf'.format(args.ntag)))

-    # save result
+    # save results
+    # all results
     result = OrderedDict(
         training= OrderedDict(
             epochs=args.epochs,
             batch_size=args.batch_size,
-            sample_size=count_pid_labels(ytrain, pid_labels),
-            sample_weights={PARTICLE_DICT.get(pid)[1]: weight for pid, weight in pid_weights.items()},
-            targeted_efficiency=args.eff,
-            pion_prob_cut=pion_prob_cut,
+            sample_size=count_pid_labels(ytrain, pids),
+            sample_weights={par_table.get(pid).name: weight for pid, weight in pid_weights.items()},
+            optimal_cut=OrderedDict(
+                target_particle=opt_param[0],
+                target_efficiency=opt_param[1],
+                target_label_weight=opt_param[2],
+            ),
         ),
-        test=OrderedDict(
-            sample_size=count_pid_labels(ytest, pid_labels),
-            el_eff=pid_probs[0][0],
-            pi_rej=pid_probs[1][1],
-        )
+        test=OrderedDict(
+            sample_size=count_pid_labels(ytest, pids),
+            efficiencies=OrderedDict([(labels.name, eff) for labels, eff in effs])
+        ),
     )
-    res_json = json.dumps(result, indent=4)
+    res_json = json.dumps(result, indent=4, cls=NpEncoder)
     with open(os.path.join(args.outdir, '{}_result.json'.format(args.ntag)), 'w') as outfile:
         outfile.write(res_json)
diff --git a/benchmarks/imaging_shower_ML/scripts/utils.py b/benchmarks/imaging_shower_ML/scripts/utils.py
index 8403c2330b0ac7f7160a66fd4a7420b6f8584b85..f3cd7538f612447e88afa82a5d016bd6a46cb992 100644
--- a/benchmarks/imaging_shower_ML/scripts/utils.py
+++ b/benchmarks/imaging_shower_ML/scripts/utils.py
@@ -4,9 +4,10 @@
 Chao Peng (ANL)
 2022/11/13
 '''
-import ROOT
+import json
 import pandas as pd
 import numpy as np
+from collections import OrderedDict


 class dotdict(dict):
@@ -24,6 +25,24 @@ imcal_info = dotdict(
     truth_data_key='/truth',
 )

+# a lookup table to convert pdgcode to text
+par_table = OrderedDict()
+par_table[11] = dotdict(name='electron', full='electron', latex='e^-')
+par_table[-211] = dotdict(name='pion-', full='pion-minus', latex=r'\pi^-')
+
+
+# an encoder to serialize numpy data
+class NpEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.integer):
+            return int(obj)
+        if isinstance(obj, np.floating):
+            return float(obj)
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        return super(NpEncoder, self).default(obj)
+
+
 # read from RDataFrame and flatten a given collection, return pandas dataframe
 def flatten_collection(rdf, collection, ev_range=None, cols=None, event_colname='event'):
     if not cols: