From 84f3cb75518212288c22f08d1f28bd99ede57e80 Mon Sep 17 00:00:00 2001 From: Chao Peng <cpeng@anl.gov> Date: Mon, 13 Dec 2021 09:09:38 -0600 Subject: [PATCH] update ML to date --- .../options/imaging_ml_data.py | 148 +++++++--- .../scripts/gen_particles.py | 1 + .../imaging_shower_ML/scripts/ml_training.py | 264 ++++++++++++++++++ .../scripts/prepare_tf_dataset.py | 232 ++++++++------- benchmarks/imaging_shower_ML/sim_rec_tag.py | 70 ++++- 5 files changed, 574 insertions(+), 141 deletions(-) create mode 100644 benchmarks/imaging_shower_ML/scripts/ml_training.py diff --git a/benchmarks/imaging_shower_ML/options/imaging_ml_data.py b/benchmarks/imaging_shower_ML/options/imaging_ml_data.py index 22c87d9d..8a570bec 100644 --- a/benchmarks/imaging_shower_ML/options/imaging_ml_data.py +++ b/benchmarks/imaging_shower_ML/options/imaging_ml_data.py @@ -3,12 +3,16 @@ import ROOT from Gaudi.Configuration import * from GaudiKernel import SystemOfUnits as units -from Configurables import ApplicationMgr, EICDataSvc, PodioOutput, GeoSvc +from Configurables import ApplicationMgr, EICDataSvc, PodioInput, PodioOutput, GeoSvc +from GaudiKernel.SystemOfUnits import MeV, GeV, mm, cm, mrad -from Configurables import PodioInput -from Configurables import Jug__Digi__CalorimeterHitDigi as CalorimeterHitDigi -from Configurables import Jug__Reco__ImagingPixelReco as ImagingPixelReco -from Configurables import Jug__Reco__ImagingPixelMerger as ImagingPixelMerger +from Configurables import Jug__Base__InputCopier_dd4pod__Geant4ParticleCollection_dd4pod__Geant4ParticleCollection_ as MCCopier +from Configurables import Jug__Digi__CalorimeterHitDigi as CalHitDigi +from Configurables import Jug__Reco__CalorimeterHitReco as CalHitReco +from Configurables import Jug__Reco__CalorimeterHitsMerger as CalHitsMerger +from Configurables import Jug__Reco__ImagingPixelMerger as MLDataMerger +from Configurables import Jug__Reco__ImagingPixelDataSorter as MLDataSorter +from Configurables import Jug__Reco__ImagingPixelDataCombiner as MLDataCombiner # input arguments through environment variables @@ -17,13 +21,13 @@ kwargs['input'] = os.environ.get('JUGGLER_SIM_FILE', '') kwargs['output'] = os.environ.get('JUGGLER_REC_FILE', '') kwargs['compact'] = os.environ.get('JUGGLER_COMPACT_PATH', '') kwargs['nev'] = int(os.environ.get('JUGGLER_N_EVENTS', -1)) -kwargs['nlayers'] = int(os.environ.get('IMCAL_ML_N_LAYERS', 20)) -kwargs['nhits'] = int(os.environ.get('IMCAL_ML_N_HITS', 20)) +kwargs['combine'] = os.environ.get('IMCAL_ML_COMBINE', 'concatenate') +kwargs['img_nlayers'] = int(os.environ.get('IMCAL_ML_IMG_NLAYERS', 9)) +kwargs['nhits'] = int(os.environ.get('IMCAL_ML_NHITS', 20)) if kwargs['nev'] < 1: f = ROOT.TFile(kwargs['input']) kwargs['nev'] = f.events.GetEntries() - print(kwargs) # get sampling fraction from system environment variable, 1.0 by default sf = float(os.environ.get('JUGGLER_SAMP_FRAC', '1.0')) @@ -31,40 +35,118 @@ sf = float(os.environ.get('JUGGLER_SAMP_FRAC', '1.0')) geo_service = GeoSvc('GeoSvc', detectors=[f.strip() for f in kwargs['compact'].split(',')]) podev = EICDataSvc('EventDataSvc', inputs=[f.strip() for f in kwargs['input'].split(',')]) -podin = PodioInput('PodioReader', collections=['mcparticles', 'EcalBarrelHits']) +podin = PodioInput('PodioReader', collections=['mcparticles', 'EcalBarrelHits', 'EcalBarrelScFiHits']) podout = PodioOutput('out', filename=kwargs['output']) -imcaldigi = CalorimeterHitDigi('imcal_digi', +copier = MCCopier('MCCopier', + OutputLevel=WARNING, + inputCollection='mcparticles', + outputCollection='mcparticles2') + +# Central Barrel Ecal (Imaging Cal.) +becal_img_daq = dict( + dynamicRangeADC=3*MeV, + capacityADC=8192, + pedestalMean=400, + pedestalSigma=20) # about 6 keV + +becal_img_digi = CalHitDigi('becal_img_digi', inputHitCollection='EcalBarrelHits', - outputHitCollection='EcalBarrelHitsDigi', - energyResolutions=[0., 0.02, 0.], - dynamicRangeADC=3*units.MeV, - pedestalSigma=40) -imcalreco = ImagingPixelReco('imcal_reco', - inputHitCollection=imcaldigi.outputHitCollection, - outputHitCollection='EcalBarrelHitsReco', - dynamicRangeADC=3.*units.MeV, - pedestalSigma=40, - thresholdFactor=3.5, - readoutClass='EcalBarrelHits', - layerField='layer', - sectorField='module') -imcaldata = ImagingPixelMerger('imcal_merger', -# OutputLevel=DEBUG, - inputHitCollection=imcalreco.outputHitCollection, - outputHitCollection='EcalBarrelHitsML', + outputHitCollection='EcalBarrelImagingHitsDigi', + energyResolutions=[0., 0.02, 0.], # 2% flat resolution + **becal_img_daq) + +becal_img_reco = CalHitReco('becal_img_reco', + inputHitCollection=becal_img_digi.outputHitCollection, + outputHitCollection='EcalBarrelImagingHitsReco', + thresholdFactor=3, # about 20 keV + readoutClass='EcalBarrelHits', # readout class + layerField='layer', # field to get layer id + sectorField='module', # field to get sector id + **becal_img_daq) + +becal_img_merger = MLDataMerger('becal_img_merger', + inputHitCollection=becal_img_reco.outputHitCollection, + outputHitCollection='EcalBarrelImagingHitsSeg', etaSize=0.001, - phiSize=0.001, - numberOfHits=kwargs['nhits'], - numberOfLayers=kwargs['nlayers']) + phiSize=0.001) + +becal_img_sorter = MLDataSorter('becal_img_sorter', + inputHitCollection=becal_img_merger.outputHitCollection, + outputHitCollection='EcalBarrelImagingHitsML', + numberOfLayers=kwargs['img_nlayers'], + numberOfHits=kwargs['nhits']) + + +#Central ECAL SciFi +# use the same daq_setting for digi/reco pair +becal_scfi_daq = dict( + dynamicRangeADC=50.*MeV, + capacityADC=32768, + pedestalMean=400, + pedestalSigma=10) +# becal_scfi_daq = dict( +# dynamicRangeADC=50.*MeV, +# capacityADC=2**16, +# pedestalMean=100, +# pedestalSigma=0) + +becal_scfi_digi = CalHitDigi('becal_scfi_digi', + inputHitCollection='EcalBarrelScFiHits', + outputHitCollection='EcalBarrelScFiHitsDigi', + **becal_scfi_daq) + +becal_scfi_reco = CalHitReco('becal_scfi_reco', + inputHitCollection=becal_scfi_digi.outputHitCollection, + outputHitCollection='EcalBarrelScFiHitsReco', + thresholdFactor=5.0, + readoutClass='EcalBarrelScFiHits', + layerField='layer', + sectorField='module', + localDetFields=['system', 'module'], # use local coordinates in each module (stave) + **becal_scfi_daq) + +# merge hits in different layer (projection to local x-y plane) +becal_scfi_merger = CalHitsMerger('becal_scfi_merger', + # OutputLevel=DEBUG, + inputHitCollection=becal_scfi_reco.outputHitCollection, + outputHitCollection='EcalBarrelScFiGridReco', + fields=['fiber'], + fieldRefNumbers=[1], + readoutClass='EcalBarrelScFiHits') + +becal_scfi_sorter = MLDataSorter('becal_scfi_sorter', + inputHitCollection=becal_scfi_merger.outputHitCollection, + outputHitCollection='EcalBarrelScFiHitsML', + numberOfLayers=20, + numberOfHits=kwargs['nhits']) + +# combine layers +becal_combiner = MLDataCombiner('becal_combiner', + inputHitCollection1=becal_img_sorter.outputHitCollection, + inputHitCollection2=becal_scfi_sorter.outputHitCollection, + outputHitCollection='EcalBarrelHitsCombinedML', + layerIncrement=100, + rule=kwargs['combine']) -podout.outputCommands = ['keep *'] +podout.outputCommands = [ +# 'keep *', + 'drop *', + 'keep EcalBarrel*Reco', + 'keep mcparticles*', + 'keep EcalBarrel*ML', + 'keep *Corr', +] ApplicationMgr( - TopAlg=[podin, imcaldigi, imcalreco, imcaldata, podout], + TopAlg=[podin, copier, + becal_img_digi, becal_img_reco, becal_img_merger, becal_img_sorter, + becal_scfi_digi, becal_scfi_reco, becal_scfi_merger, becal_scfi_sorter, + becal_combiner, + podout], EvtSel='NONE', EvtMax=kwargs['nev'], ExtSvc=[podev], - OutputLevel=DEBUG + OutputLevel=WARNING ) diff --git a/benchmarks/imaging_shower_ML/scripts/gen_particles.py b/benchmarks/imaging_shower_ML/scripts/gen_particles.py index 2d0b81c8..af22f35d 100644 --- a/benchmarks/imaging_shower_ML/scripts/gen_particles.py +++ b/benchmarks/imaging_shower_ML/scripts/gen_particles.py @@ -16,6 +16,7 @@ PARTICLES = { "electron": (11, 0.51099895e-3), # electron "positron": (-11, 0.51099895e-3),# positron "photon": (22, 0), # photon + "muon": (13, 105.6583755), # muon } diff --git a/benchmarks/imaging_shower_ML/scripts/ml_training.py b/benchmarks/imaging_shower_ML/scripts/ml_training.py new file mode 100644 index 00000000..f5cd917c --- /dev/null +++ b/benchmarks/imaging_shower_ML/scripts/ml_training.py @@ -0,0 +1,264 @@ +import os +import pandas as pd +import numpy as np +from collections import OrderedDict +from scipy.interpolate import CubicSpline +# import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import matplotlib as mpl +from matplotlib import pyplot as plt +from matplotlib.ticker import MultipleLocator, FixedLocator, MaxNLocator +from matplotlib.colors import LogNorm +from particle import Particle + +# os.environ["CUDA_VISIBLE_DEVICES"] = "-1" +# 5 layers 0-1-2-6-9 +# 6 layers 0-1-2-3-4-6-9 + + +def get_layer_cum_edeps(edep): + return np.tile(np.arange(edep.shape[1]), (edep.shape[0], 1)).flatten(), np.cumsum(edep, axis=1).flatten() + + +def draw_layer_edep(edep, label=''): + layer_num, edep_val = get_layer_cum_edeps(edep) + figure, axis = plt.subplots(figsize=(16, 9), dpi=120) + axis.hist2d(layer_num, edep_val, + bins=(np.arange(0.5, edep.shape[1] + 0.5, step=1.0), 100), norm=mpl.colors.LogNorm()) + axis.grid(linestyle=':') + axis.set_axisbelow(True) + axis.tick_params(labelsize=24, direction='in', which='both') + axis.set_ylabel('Cumulative Edep (MeV)', fontsize=24) + axis.set_xlabel('Layer number', fontsize=24) + axis.xaxis.set_major_locator(MaxNLocator(edep.shape[1], integer=True)) + if label: + axis.set_title(label, fontsize=26) + return figure, axis + + +train_model = True +do_epcut = False +nametag = 'real_imcal' +partag = 'epimuphka' +# partag = 'epi' +prange = (2.0, 2.0) +etag = '{:.1f}_{:.1f}_combined'.format(*prange) +epoch = 20 +# data_shape = (20, 20, 3) +data_shape = (29, 20, 5) +data_dir = r'D:\Data\EIC\ImagingCalorimetry\{name}_{part}_{etag}'.format(name=nametag, part=partag, etag=etag) +out_dir = r'C:\Users\cpeng\OneDrive\EIC\ImagingCalorimeter\AstroPix\hybrid\{name}_{part}_{etag}'\ + .format(name=nametag, part=partag, etag=etag) +raw_data = np.load(os.path.join(data_dir, 'data.npy')) +raw_tags = np.load(os.path.join(data_dir, 'tag.npy')) +edeps = np.load(os.path.join(data_dir, 'edep.npy')) + +# for interlayered data, null the last few layers +astro_nlayers = 9 +astro_nlayers_used = [0, 1, 2, 3, 4] +astro_nlayers_not_used = [2*i for i in np.arange(astro_nlayers) if i not in astro_nlayers_used] +if len(astro_nlayers_used) and len(astro_nlayers_not_used): + raw_data[:, astro_nlayers_not_used] = 0 + +os.makedirs(out_dir, exist_ok=True) +# label converter +labels = OrderedDict([ + (pid, [i, Particle.from_pdgid(pid).name, '']) + for i, pid in enumerate(sorted(np.unique(raw_tags.T[0].astype(int)), key=lambda x: np.abs(x))) +]) + +# add some labels in latex format +name_dict = {'gamma': r'$\gamma$', '-': '$^-$', '+': '$^+$', 'pi': r'$\pi$', 'mu': r'$\mu$'} +for pid, (i, name, _) in labels.items(): + labelstr = name + for str1, str2 in name_dict.items(): + labelstr = labelstr.replace(str1, str2) + labels[pid][2] = labelstr + +part_weights = {pid: 1.0 for pid, _ in labels.items()} +part_weights.update({ + 11: 1.0, + -211: 10.0, +}) + +begin_layer, end_layer = 0, 9 +epcut = 0.085 +if do_epcut: + epi_ids = (11, -211) # (11, -211) + # Eoverp cut in tags data + eovp = np.sum(edeps[:, begin_layer:end_layer], axis=1)*1e-3/raw_tags.T[1] + emask = raw_tags.T[0] == epi_ids[0] + pimask = raw_tags.T[0] == epi_ids[1] + + epmask = tuple([eovp > epcut]) + ep_eff = sum(raw_tags[epmask].T[0] == 11) / sum(emask) + ep_rej = 1. - sum(raw_tags[epmask].T[0] == -211) / sum(pimask) + + bins = np.linspace(0, 0.15, 151) + fig, ax = plt.subplots(figsize=(12, 9), dpi=120) + ax.hist(eovp[emask], histtype='step', density=True, bins=bins, label=labels[epi_ids[0]][2]) + ax.hist(eovp[pimask], histtype='step', density=True, bins=bins, label=labels[epi_ids[1]][2]) + ax.axvline(epcut, color='k') + ax.set_xlabel('Edep / p', fontsize=24) + ax.set_ylabel('Normalized Counts', fontsize=24) + ax.tick_params(labelsize=24) + ax.legend(fontsize=24) + ax.text(0.2, 0.9, '$e$ eff. = {:.2f}%\n$\pi$ rej. = {:.2f}%'.format(ep_eff*100., ep_rej*100.), + fontsize=24, transform=ax.transAxes, ha='left', va='top') + ax.set_title('$E/p > {:.2f}$% @ {:d} - {:d} X$_0$'.format(epcut*100., begin_layer, end_layer), fontsize=24) + fig.savefig(os.path.join(out_dir, 'pre_epcut.png')) + + fig, _ = draw_layer_edep(edeps[emask], 'Cumulative Edep over layer for electrons') + fig.savefig(os.path.join(out_dir, 'cum_edep_el.png')) + fig, _ = draw_layer_edep(edeps[pimask], 'Cumulative Edep over layer for pions') + fig.savefig(os.path.join(out_dir, 'cum_edep_pi.png')) + + data = raw_data[epmask] + tags = raw_tags[epmask] + +else: + ep_eff = 1.0 + ep_rej = 0. + data = raw_data + tags = raw_tags + +pid = np.vectorize(lambda x: labels[x][0])(tags.T[0]) +weights = np.vectorize(lambda x: part_weights[x])(tags.T[0]) + +if train_model: + indices = np.arange(len(data)) + np.random.shuffle(indices) + id_train = indices[:int(len(data)*0.8)] + id_valid = indices[int(len(data)*0.8):] + x_train = data[id_train] + y_train = pid[id_train] + w_train = weights[id_train] + + x_valid = data[id_valid] + y_valid = pid[id_valid] + w_valid = weights[id_valid] + + model = keras.Sequential([ + keras.layers.Conv2D(64, (3, 3), padding='same', activation='selu', input_shape=data_shape), + # keras.layers.MaxPooling2D((2, 2), strides=2), + keras.layers.Dropout(0.25), + keras.layers.Conv2D(128, (2, 2), padding='same', activation='selu'), + # keras.layers.MaxPooling2D((2, 2), strides=2), + keras.layers.Dropout(0.3), + keras.layers.Conv2D(64, (2, 2), padding='same', activation='selu'), + # keras.layers.MaxPooling2D((2, 2), strides=2), + + keras.layers.Flatten(), + keras.layers.Dense(128, activation='selu'), + keras.layers.Dropout(0.25), + keras.layers.Dense(32, activation='selu'), + keras.layers.Dense(len(labels), activation='softmax') + ]) + + model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), + metrics=['accuracy']) + history = model.fit(x_train, y_train, + epochs=epoch, sample_weight=w_train, validation_data=(x_valid, y_valid, w_valid)) + model.save(os.path.join(out_dir, 'model_epch{:d}'.format(epoch))) +else: + model = keras.models.load_model(os.path.join(out_dir, 'model_epch{:d}'.format(epoch))) +# ---- general ---- +# x_test = data +# y_test = pid +# prediction = model.predict(x_test) +# result = prediction.argmax(axis=1) +# pid_probs = [] +# for _, part in labels.items(): +# mask = (y_test == part[0]) +# res = result[mask] +# pid_probs.append(np.bincount(res, minlength=len(labels))/sum(mask)) +# pid_probs = np.asarray(pid_probs) + +# predictions +x_test = data +y_test = pid +prediction = model.predict(x_test) + +# two labels (pion - electron) +if len(labels) == 2: + # --- pion - electron ---- + pion_prob_cut = 0.5 + pid_probs = np.zeros(shape=(2, 2)) + fig, ax = plt.subplots(1, 1, figsize=(12, 9), dpi=160) + ax.set_yscale('log') + ax.set_ylabel('Counts', fontsize=24) + ax.set_xlabel(r'$P_{\pi}$', fontsize=24) + ax.tick_params(direction='in', which='both', labelsize=24) + for i, (_, part) in enumerate(labels.items()): + mask = (y_test == part[0]) + probs = prediction[mask] + ax.hist(probs.T[1], histtype='step', bins=np.linspace(0, 1, 101), label=part[1]) + pid_probs[i][0] = sum(probs.T[1] < pion_prob_cut) / float(len(probs)) + pid_probs[i][1] = 1. - pid_probs[i][0] + ax.axvline(x=pion_prob_cut, lw=2, color='k', ls='--') + ax.text(0.55, 0.9, '$e$ eff. = {:.2f}%\n$\pi$ rej. = {:.2f}%'.format(pid_probs[0][0]*100., pid_probs[1][1]*100.), + fontsize=24, transform=ax.transAxes, ha='left', va='top') + ax.legend(fontsize=24, loc='lower left') + fig.savefig(os.path.join(out_dir, 'pid_tag_hist.png')) + fig.savefig(os.path.join(out_dir, 'pid_tag_hist.pdf')) + # --- pion - electron ---- + result_text = open(os.path.join(out_dir, 'rejection_result.txt'), 'w') + lines = [ + 'E/p cut position: {:.4f}, layer {:d} to {:d}, E/p efficiency: {:.4f}%, rejection power: {:.4f}'\ + .format(epcut, begin_layer, end_layer, ep_eff*100., 1./(1. - ep_rej)), + ] if do_epcut else [] + lines += [ + 'ML training weight: {}, pion cut position: {:.4f}, ML efficiency: {:.4f}%, rejection power: {:.4f}'\ + .format(list(part_weights.values()), pion_prob_cut, pid_probs[0][0]*100., 1./(1. - pid_probs[1][1])), + 'combined efficiency: {:.2f}%, rejection power: {:.1f}'\ + .format(pid_probs[0][0]*ep_eff*100., 1./((1. - ep_rej)*(1. - pid_probs[1][1]))), + ] + result_text.write('\n'.join(lines)) + result_text.close() + print('\n'.join(lines)) + +else: + # --- multi-particles --- + pid_probs = np.zeros(shape=(len(labels), len(labels))) + fig, ax = plt.subplots(1, 1, figsize=(8, 8), dpi=160, gridspec_kw={'left': 0.15, 'right': 0.95}) + ax.set_yscale('log') + ax.set_ylabel('Counts', fontsize=26) + ax.set_xlabel(r'Likelihood of the Corresponding Label', fontsize=26) + ax.tick_params(direction='in', which='both', labelsize=26) + # label with maximum likelihood + for _, (i, part, label) in labels.items(): + mask = (y_test == i) + probs = prediction[mask] + pred_labels = probs.argmax(axis=1) + ax.hist(probs.T[i], histtype='step', bins=np.linspace(0, 1, 101), label=label) + for _, (j, _, _) in labels.items(): + pid_probs[i][j] = np.sum(pred_labels == j)/float(np.sum(mask)) + + ax.legend(fontsize=26, loc='upper center', ncol=3, handletextpad=0.3, columnspacing=0.6) + fig.savefig(os.path.join(out_dir, 'pid_tag_hist.png')) + fig.savefig(os.path.join(out_dir, 'pid_tag_hist.pdf')) + + # --- multi-particles --- +fig, ax = plt.subplots(1, 1, figsize=(8, 8), dpi=160, gridspec_kw={'left': 0.15, 'right': 0.95}) +ax.imshow(pid_probs, cmap='PuBu', norm=LogNorm(vmin=0.01, vmax=1)) +ax.xaxis.set_major_locator(FixedLocator(np.arange(0, pid_probs.shape[0], step=1.0))) +ax.yaxis.set_major_locator(FixedLocator(np.arange(0, pid_probs.shape[1], step=1.0))) +ax.xaxis.set_minor_locator(FixedLocator(np.arange(0.5, pid_probs.shape[0], step=1.0))) +ax.yaxis.set_minor_locator(FixedLocator(np.arange(0.5, pid_probs.shape[1], step=1.0))) +ax.set_xticklabels([x[2] for _, x in labels.items()], fontsize=26, va='bottom') +ax.set_yticklabels([x[2] for _, x in labels.items()], fontsize=26, ha='left') +ax.tick_params(bottom=False, left=False, which='both') +ax.tick_params(pad=30, axis='x') +ax.tick_params(pad=40, axis='y') +ax.set_ylabel('Truth', fontsize=26) +ax.set_xlabel('Classification', fontsize=26) +for i in range(pid_probs.shape[0]): + for j in range(pid_probs.shape[1]): + color = 'k' if pid_probs[i, j] < 0.15 else 'w' + text = ax.text(j, i, '{:.2f}%'.format(pid_probs[i, j]*100.), ha='center', va='center', + color=color, fontsize=28 - len(labels)*2) +ax.grid('-', color='k', which='minor') +fig.savefig(os.path.join(out_dir, 'pid_tag.png')) +fig.savefig(os.path.join(out_dir, 'pid_tag.pdf')) diff --git a/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py b/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py index ea5ac2a1..52b2d529 100644 --- a/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py +++ b/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py @@ -1,57 +1,37 @@ import ROOT import os +import gc import numpy as np import pandas as pd import argparse import sys -# read hits data from root file -def get_ml_data(path, evnums=None, shape=(20, 20, 3), branch='EcalBarrelHitsML'): - f = ROOT.TFile(path) - events = f.events - if evnums is None: - evnums = np.arange(events.GetEntries()) - elif isinstance(evnums, int): - evnums = [evnums] - - dbuf = np.zeros(shape=np.hstack([len(evnums), shape])) - idb = 0 - for iev in evnums: - if iev >= events.GetEntries(): - print('Error: event {:d} is out of range (0 - {:d})'.format(iev, events.GetEntries() - 1)) - continue - - events.GetEntry(iev) - for hit in getattr(events, branch): - dbuf[iev][hit.layerID][hit.hitID] = (hit.edep, hit.eta, hit.polar.phi) - - return dbuf - - -# execute this script -# read mc particles from root file -def get_mcp_simple(path, evnums=None, branch='mcparticles'): - f = ROOT.TFile(path) - events = f.events - if evnums is None: - evnums = np.arange(events.GetEntries()) - elif isinstance(evnums, int): - evnums = [evnums] - - dbuf = np.zeros(shape=(len(evnums), 6)) - idb = 0 - for iev in evnums: - if iev >= events.GetEntries(): - print('Error: event {:d} is out of range (0 - {:d})'.format(iev, events.GetEntries() - 1)) - continue - - events.GetEntry(iev) - # extract full mc particle data - part = getattr(events, branch)[2] - dbuf[idb] = (iev, part.ps.x, part.ps.y, part.ps.z, part.pdgID, part.status) - idb += 1 - return pd.DataFrame(data=dbuf[:idb], columns=['event', 'px', 'py', 'pz', 'pid', 'status']) +# read from RDataFrame and flatten a given collection, return pandas dataframe +def flatten_collection(rdf, collection, cols=None, event_colname='event'): + if not cols: + cols = [str(c) for c in rdf.GetColumnNames() if str(c).startswith('{}.'.format(collection))] + else: + cols = ['{}.{}'.format(collection, c) for c in cols] + if not cols: + print('cannot find any branch under collection {}'.format(collection)) + return pd.DataFrame() + + data = rdf.AsNumpy(cols) + # flatten the data, add an event id to identify clusters from different events + evns = [] + for i, vec in enumerate(data[cols[0]]): + evns += [i]*vec.size() + for n, vals in data.items(): + # make sure ints are not converted to floats + typename = vals[0].__class__.__name__.lower() + dtype = np.int64 if 'int' in typename or 'long' in typename else np.float32 + # type safe creation + data[n] = np.asarray([v for vec in vals for v in vec], dtype=dtype) + # build data frame + dfp = pd.DataFrame({c: pd.Series(v) for c, v in data.items()}) + dfp.loc[:, event_colname] = evns + return dfp # load root macros, input is an argument string @@ -64,6 +44,15 @@ def load_root_macros(arg_macros): print('\"{}\" does not exist, skip loading it.'.format(path)) +def cartesian_to_polar(x, y, z): + r = np.sqrt(x**2 + y**2 + z**2) + rc = np.sqrt(x**2 + y**2) + theta = np.arccos(z / r) + phi = np.arctan2(y, x) + eta = -np.log(np.tan(theta / 2.)) + return r, theta, phi, rc, eta + + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Visualize the cluster (layer-wise) from analysis') parser.add_argument('file', type=str, help='path to root file') @@ -71,58 +60,111 @@ if __name__ == '__main__': # parser.add_argument('--compact', type=str, default='', dest='compact', help='compact file') parser.add_argument('-m', '--macros', type=str, default='rootlogon.C', dest='macros', help='root macros to load (accept multiple paths separated by \",\")') - parser.add_argument('--topo-size', type=float, default=2.0, dest='topo_size', - help='bin size for projection plot (mrad)') - parser.add_argument('--topo-range', type=float, default=200.0, dest='topo_range', + parser.add_argument('--topo-range', type=float, default=1000.0, dest='topo_range', help='half range for projection plot (mrad)') - parser.add_argument('--shape', type=str, default="20, 20, 3", dest='shape', - help='shape of data (3D), default (20, 20, 3)') + parser.add_argument('--shape', type=str, default='20, 20', dest='shape', + help='shape of data (Nlayer, Nhits), default (20, 20)') + parser.add_argument('--features', type=str, default='energy, eta, phi', + help='features of hit data, default (eta, phi, energy)') + parser.add_argument('--branch', type=str, default='EcalBarrelImagingHitsML', help='name of data branch') + parser.add_argument('--null-columns', type=str, default='', help='make the columns zeros') args = parser.parse_args() - os.makedirs(args.outdir, exist_ok=True) load_root_macros(args.macros) - data = get_ml_data(args.file, shape=[int(i.strip()) for i in args.shape.split(',')]) - dfmcp = get_mcp_simple(args.file, branch='mcparticles') - # dfmcp = dfmcp[dfmcp['status'] == 24578] - dfmcp.loc[:, 'p'] = np.linalg.norm(dfmcp[['px', 'py', 'pz']].values, axis=1) - dfmcp['phi'] = np.arctan2(dfmcp['py'].values, dfmcp['px'].values) - dfmcp['theta'] = np.arccos(dfmcp['pz'].values/dfmcp['p'].values) - dfmcp['eta'] = -np.log(np.tan(dfmcp['theta'].values/2.)) - - tag = np.zeros(shape=(data.shape[0], 7)) - trg = args.topo_range/1000. - - for iev in np.arange(data.shape[0]): - ev = data[iev].transpose(2,0,1).reshape(3,-1) - truth = dfmcp[dfmcp['event'] == iev] - if not truth.shape[0]: - continue - mask = ev[0] > 0. - if not sum(mask): - continue - eta = ev[1][ev[0] > 0.] - phi = ev[2][ev[0] > 0.] - # normalize - # print(ev.T.reshape(20, 20, 3) == data[iev]) - ev[0] /= 10. - # ev[1][mask] = (ev[1][mask] - eta.mean() + trg)/(2.*trg) - # ev[2][mask] = (ev[2][mask] - phi.mean() + trg)/(2.*trg) - ev[1][mask] = (ev[1][mask] - truth['eta'].values + trg)/(2.*trg) - ev[2][mask] = (ev[2][mask] - truth['phi'].values + trg)/(2.*trg) - # ev[1][mask] = (ev[1][mask] + 1.0)/2. - # ev[2][mask] = (ev[2][mask] + np.pi)/2./np.pi - out_range = (ev[1] > 1.) | (ev[1] < 0.) | (ev[2] > 1.) | (ev[2] < 0.) - ev.T[out_range] = (0., 0., 0.) - test = data[iev].transpose(2,0,1).reshape(3,-1) - tag[iev] = np.hstack([truth[['pid', 'eta', 'phi', 'p']].values[0], (eta.mean(), phi.mean(), trg)]) - # print(ev[1]) - # print((eta.shape, phi.shape), (eta.mean(), phi.mean()), truth[['eta', 'phi']].values) - - # momentum - mask = tag.T[3] > 0. - print('training data shape: {}'.format(data[mask].shape)) - print('tagging data shape: {}'.format(tag[mask].shape)) - np.save(os.path.join(args.outdir, 'data.npy'), data[mask]) - np.save(os.path.join(args.outdir, 'tag.npy'), tag[mask]) + + # read data and mcparticles + rdf = ROOT.RDataFrame("events", args.file) + df = flatten_collection(rdf, args.branch, ['layer', 'energy', 'position.x', 'position.y', 'position.z']) + df.rename(columns={c: c.replace(args.branch + '.', '') for c in df.columns}, inplace=True) + + r, theta, phi, rc, eta = cartesian_to_polar(*df[['position.x', 'position.y', 'position.z']].values.T) + df.loc[:, 'r'] = r + df.loc[:, 'theta'] = theta + df.loc[:, 'phi'] = phi + df.loc[:, 'rc'] = rc + df.loc[:, 'eta'] = eta + + dfm = flatten_collection(rdf, 'mcparticles2', ['genStatus', 'pdgID', 'ps.x', 'ps.y', 'ps.z', 'mass']) + dfm.rename(columns={c: c.replace('mcparticles2.', '') for c in dfm.columns}, inplace=True) + # selete incident particles + dfm = dfm[dfm['genStatus'].isin([0, 1])] + # NOTE: assumed single particles + dfm = dfm.groupby('event').first() + p, theta, phi, pT, eta = cartesian_to_polar(*dfm[['ps.x', 'ps.y', 'ps.z']].values.T) + dfm.loc[:, 'p'] = p + dfm.loc[:, 'theta'] = theta + dfm.loc[:, 'phi'] = phi + dfm.loc[:, 'pT'] = pT + dfm.loc[:, 'eta'] = eta + + # energy unit is in GeV, converting to MeV first + df.loc[:, 'eMeV'] = df['energy']*1000 + # scaled by 5 MeV to have most of them in [0, 1] + df.loc[:, 'energy'] = df['eMeV']/5. + # convert eta, phi, edep so they are within [0, 1] + df.loc[:, 'etotal'] = df.groupby('event')['energy'].transform('sum') + df = df[df['etotal'] > 0.] + + event_group = df.groupby('event') + df.loc[:, 'logw'] = 0. + # energy = 0 means no hits + mask = df['energy'] > 0. + df.loc[mask, 'logw'] = np.log(df.loc[mask, 'energy'].values / df.loc[mask, 'etotal'].values) + 6.2 + df.loc[:, 'logw'] = df['logw'].clip(0) + df.loc[:, 'wtotal'] = event_group['logw'].transform('sum') + + df.loc[:, 'etaw'] = df['eta']*df['logw'] + df.loc[:, 'phiw'] = df['phi']*df['logw'] + + df.loc[:, 'eta_mean'] = event_group['etaw'].transform('sum') / df['wtotal'] + df.loc[:, 'phi_mean'] = event_group['phiw'].transform('sum') / df['wtotal'] + + # topo range in rad + trg = args.topo_range / 1000. + df.loc[mask, 'eta'] = (df.loc[mask, 'eta'].values - df.loc[mask, 'eta_mean'].values + trg) / (2.*trg) + df.loc[mask, 'phi'] = ((df.loc[mask, 'phi'].values - df.loc[mask, 'phi_mean'].values)/np.pi + trg) / (2.*trg) + df.loc[~mask, 'eta'] = 0. + df.loc[~mask, 'phi'] = 0. + df.loc[~mask, 'rc'] = 0. + # scale by 2 meter, which keeps barrel region well under 1.0 + df.loc[:, 'rc'] = df['rc'].values / 2000.0 + df.loc[:, 'layer_type'] = np.floor(df['layer']/100.).astype(int) + df.loc[df['layer_type'] == 1, 'eta'] = 0. + + # null specified columns (for scfi which does not have eta info) + nullcols = [c.strip() for c in args.null_columns.split(',')] + for col in nullcols: + df.loc[:, col] = 0. + + gc.collect() + # features and data-shape + featcols = [f.strip() for f in args.features.split(',')] + dshape = [int(i.strip()) for i in args.shape.split(',')] + dshape.append(len(featcols)) + + event_ids = df['event'].unique() + data = df.set_index('event')[featcols].clip(0, 1).values.reshape([len(event_ids)] + dshape) + tags = dfm.loc[event_ids, ['pdgID', 'p', 'pT', 'eta', 'phi', 'mass']] + # also save Edep per layers + # merge the sandwich layers + df['layer'] = df['layer'] % 100 + edeps = df.groupby(['event', 'layer'])['eMeV'].sum().unstack() + + """ TEST + from matplotlib import pyplot as plt + + fig, axs = plt.subplots(3, 1) + axs.flat[0].hist(df.loc[df['edep'] > 0, 'eta'].values, bins=np.linspace(-2, 2, 101), ec='k') + axs.flat[1].hist(df.loc[df['edep'] > 0, 'phi'].values, bins=np.linspace(-2, 2, 101), ec='k') + axs.flat[2].hist(df.loc[df['edep'] > 0, 'edep'].values, bins=np.linspace(-2, 2, 101), ec='k') + fig.savefig('test.png') + """ + print('Training data set shape = {}'.format(data.shape)) + print('Tagging data set shape = {}'.format(tags.shape)) + print('Edep/layer data set shape = {}'.format(edeps.shape)) + + np.save(os.path.join(args.outdir, 'data.npy'), data) + np.save(os.path.join(args.outdir, 'tag.npy'), tags.values) + np.save(os.path.join(args.outdir, 'edep.npy'), edeps.values) diff --git a/benchmarks/imaging_shower_ML/sim_rec_tag.py b/benchmarks/imaging_shower_ML/sim_rec_tag.py index 3a7aaedb..59360629 100755 --- a/benchmarks/imaging_shower_ML/sim_rec_tag.py +++ b/benchmarks/imaging_shower_ML/sim_rec_tag.py @@ -18,12 +18,14 @@ parser.add_argument('-n', '--numberOfEvents', dest='nev', type=int, default=100, parser.add_argument('-t', '--nametag', type=str, default='IMCAL_ML', help='Name tag for output files.') parser.add_argument('--seed', type=int, default=-1, help='Random seed to scripts.') parser.add_argument('--process', type=str, default='sim, rec, tag', help='Processes to be executed (sim, rec, tag).') -parser.add_argument('--numberOfLayers', dest='nlayer', type=int, default=20, help='number of layers in ML data.') -parser.add_argument('--numberOfHits', dest='nhit', type=int, default=20, help='number of hits in ML data.') +parser.add_argument('--nlayers', dest='nlayers', type=int, default=9, help='number of layers in ML data.') +parser.add_argument('--nhits', dest='nhits', type=int, default=20, help='number of hits in ML data.') parser.add_argument('--particles', type=str, default='electron', help='Partcile names, separated by \",\".') parser.add_argument('--pmin', type=float, default=0.5, help='Minimum momentum of particles.') parser.add_argument('--pmax', type=float, default=10, help='Maximum momentum of particles.') parser.add_argument('--compact', type=str, default=default_compact, help='Path to detector compact file.') +parser.add_argument('--combine-method', type=str, default='interlayer', help='Path to detector compact file.') +parser.add_argument('--physics-list', type=str, default='FTFP_BERT', help='Path to detector compact file.') args = parser.parse_args() kwargs = vars(args) @@ -35,7 +37,6 @@ gen_file = os.path.join('gen_data', '{nametag}_{pmin}_{pmax}.hepmc'.format(**kwa sim_file = os.path.join('sim_data', '{nametag}_{pmin}_{pmax}.root'.format(**kwargs)) rec_file = os.path.join('rec_data', '{nametag}_{pmin}_{pmax}.root'.format(**kwargs)) tag_dir = os.path.join('tag_data', '{nametag}_{pmin}_{pmax}'.format(**kwargs)) -os.makedirs(tag_dir, exist_ok=True) procs = [p.strip() for p in args.process.split(',')] sdir = os.path.dirname(os.path.realpath(__file__)) @@ -45,7 +46,8 @@ if 'sim' in procs: gen_cmd = ['python', os.path.join(sdir, 'scripts', 'gen_particles.py'), gen_file, '-n', '{}'.format(args.nev), '-s', '{}'.format(args.seed), - '--angmin', '50', '--angmax', '130', + # '--angmin', '45', '--angmax', '135', + '--angmin', '85', '--angmax', '95', '--pmin', '{}'.format(args.pmin), '--pmax', '{}'.format(args.pmax), '--particles', args.particles] subprocess.run(gen_cmd) @@ -54,6 +56,7 @@ if 'sim' in procs: '--part.minimalKineticEnergy', '1*TeV', '--numberOfEvents', '{}'.format(args.nev), '--runType', 'batch', + # '--physics.list', args.physics_list, '--inputFiles', gen_file, '--outputFile', sim_file, '--compact', args.compact, @@ -70,16 +73,23 @@ if 'sim' in procs: if 'rec' in procs: # export to environment variables (used to pass arguments to the option file) - os.environ['JUGGLER_SIM_FILE'] = sim_file - os.environ['JUGGLER_REC_FILE'] = rec_file - os.environ['JUGGLER_COMPACT_PATH'] = args.compact - os.environ['JUGGLER_N_EVENTS'] = '{}'.format(args.nev) - os.environ['IMCAL_ML_N_LAYERS'] = '{}'.format(args.nlayer) - os.environ['IMCAL_ML_N_HITS'] = '{}'.format(args.nhit) + run_env = os.environ.copy() + run_env.update({ + 'JUGGLER_SIM_FILE': sim_file, + 'JUGGLER_REC_FILE': rec_file, + 'JUGGLER_COMPACT_PATH': args.compact, + 'JUGGLER_N_EVENTS': str(args.nev), + 'IMCAL_ML_IMG_NLAYERS': str(args.nlayers), + 'IMCAL_ML_NHITS': str(args.nhits), + 'IMCAL_ML_COMBINE': str(args.combine_method), + }) juggler_xenv = os.path.join(os.environ.get('JUGGLER_INTALL_PREFIX', '../local'), 'Juggler.xenv') - rec_cmd = ['xenv', '-x', juggler_xenv, 'gaudirun.py', os.path.join(sdir, 'options', 'imaging_ml_data.py')] - return_code = subprocess.run(rec_cmd).returncode + rec_cmd = [ + 'xenv', '-x', juggler_xenv, # v35+ do not need xenv anymore + 'gaudirun.py', os.path.join(sdir, 'options', 'imaging_ml_data.py') + ] + return_code = subprocess.run(rec_cmd, env=run_env).returncode print(return_code) if return_code is not None and return_code < 0: print("ERROR running juggler (reconstruction)!") @@ -88,9 +98,43 @@ if 'rec' in procs: if 'tag' in procs: + os.makedirs(tag_dir, exist_ok=True) + """ + # imaging layers tag_cmd = ['python', os.path.join(sdir, 'scripts', 'prepare_tf_dataset.py'), rec_file, '-o', tag_dir, - '--shape', '{nlayer},{nhit},3'.format(**kwargs)] + '--branch', 'EcalBarrelImagingHitsML', + '--shape', '{nlayers}, {nhits}'.format(**kwargs), + '--features', 'energy, rc, eta, phi', + ] + return_code = subprocess.run(tag_cmd).returncode + print(return_code) + if return_code is not None and return_code < 0: + print("ERROR running ML data tagging!") + exit(return_code) + + # scfi layers + tag_cmd = ['python', os.path.join(sdir, 'scripts', 'prepare_tf_dataset.py'), rec_file, + '-o', tag_dir + '_scfi', + '--branch', 'EcalBarrelScFiHitsML', + '--shape', '{nlayers}, {nhits}'.format(**kwargs), + # '--shape', '{}, {}'.format(20 - kwargs['nlayers'], kwargs['nhits']), + '--features', 'energy, rc, eta, phi', + '--null-columns', 'eta', + ] + return_code = subprocess.run(tag_cmd).returncode + print(return_code) + if return_code is not None and return_code < 0: + print("ERROR running ML data tagging!") + exit(return_code) + """ + # combined + tag_cmd = ['python', os.path.join(sdir, 'scripts', 'prepare_tf_dataset.py'), rec_file, + '-o', tag_dir + '_combined', + '--branch', 'EcalBarrelHitsCombinedML', + '--shape', '{}, {}'.format(20 + kwargs['nlayers'], kwargs['nhits']), + '--features', 'layer_type, energy, rc, eta, phi', + ] return_code = subprocess.run(tag_cmd).returncode print(return_code) if return_code is not None and return_code < 0: -- GitLab