From 74ecae3226953e8b7e90e5c6cfac2f875e5c2cf1 Mon Sep 17 00:00:00 2001 From: Robinpreet Dhillon <rdhill13@student.ubc.ca> Date: Wed, 16 Mar 2022 15:42:07 +0000 Subject: [PATCH] Run ML training on 100 events for every pipeline (manual for 100k) --- .../emcal_barrel_pion_rejection_analysis.cxx | 12 +- benchmarks/imaging_shower_ML/config.yml | 40 +- .../options/imaging_ml_data.py | 14 +- .../imaging_shower_ML/scripts/ml_training.py | 547 +++++++++--------- .../scripts/prepare_tf_dataset.py | 3 +- benchmarks/imaging_shower_ML/sim_rec_tag.py | 10 +- 6 files changed, 332 insertions(+), 294 deletions(-) diff --git a/benchmarks/imaging_ecal/analysis/emcal_barrel_pion_rejection_analysis.cxx b/benchmarks/imaging_ecal/analysis/emcal_barrel_pion_rejection_analysis.cxx index 3b1e2cc5..042655bc 100644 --- a/benchmarks/imaging_ecal/analysis/emcal_barrel_pion_rejection_analysis.cxx +++ b/benchmarks/imaging_ecal/analysis/emcal_barrel_pion_rejection_analysis.cxx @@ -15,8 +15,6 @@ #include "eicd/CalorimeterHitData.h" #include "eicd/ClusterCollection.h" #include "eicd/ClusterData.h" -#include "eicd/ClusterLayerData.h" - #include "eicd/RawCalorimeterHitCollection.h" #include "eicd/RawCalorimeterHitData.h" @@ -229,7 +227,7 @@ void emcal_barrel_pion_rejection_analysis( // EcalBarrelImagingClustersLayers Functions//////////////////////////////////////////////////////////////////////////////// // Number of hits - auto nhitsClusLayerImg = [] (const std::vector<eicd::ClusterLayerData>& evt) { + auto nhitsClusLayerImg = [] (const std::vector<eicd::ClusterData>& evt) { int nhitsTot = 0; for (const auto &i : evt){ nhitsTot += i.nhits; @@ -238,10 +236,10 @@ void emcal_barrel_pion_rejection_analysis( }; // Number of clusters - auto nClusLayerImg = [] (const std::vector<eicd::ClusterLayerData>& evt) {return (int) evt.size(); }; + auto nClusLayerImg = [] (const std::vector<eicd::ClusterData>& evt) {return (int) evt.size(); }; // Energy deposition in cluster [GeV] - auto EClusLayerImg = [](const std::vector<eicd::ClusterLayerData>& evt) { + auto EClusLayerImg = [](const std::vector<eicd::ClusterData>& evt) { double total_edep = 0.0; for (const auto& i: evt){ total_edep += i.energy; @@ -250,7 +248,7 @@ void emcal_barrel_pion_rejection_analysis( }; // Max Energy deposition in cluster [GeV] - auto EClusLayerMaxImg = [](const std::vector<eicd::ClusterLayerData>& evt) { + auto EClusLayerMaxImg = [](const std::vector<eicd::ClusterData>& evt) { double max = 0.0; for (const auto& i: evt){ if (i.energy > max){max = i.energy;} @@ -259,7 +257,7 @@ void emcal_barrel_pion_rejection_analysis( }; // Min Energy deposition in cluster [GeV] - auto EClusLayerMax2Img = [](const std::vector<eicd::ClusterLayerData>& evt) { + auto EClusLayerMax2Img = [](const std::vector<eicd::ClusterData>& evt) { double max1 = 0.0; double max2 = 0.0; for (const auto& i: evt){ diff --git a/benchmarks/imaging_shower_ML/config.yml b/benchmarks/imaging_shower_ML/config.yml index 5f2bd4ff..bb75e6a2 100644 --- a/benchmarks/imaging_shower_ML/config.yml +++ b/benchmarks/imaging_shower_ML/config.yml @@ -1,35 +1,53 @@ -ml_shower:tagging_epi : +ml_shower:tagging_epimuphka_100: + extends: .rec_benchmark + stage: benchmarks1 + script: + - pwd + - ls -l + - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_epimuphka_100 -n 100 --particles "electron,pion-,muon,photon,kaon-" + --pmin 0.5 --pmax 10 + +ml_shower:tagging_epimuphka: extends: .rec_benchmark when: manual stage: benchmarks1 script: - - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_epi -n 
100000 - --particles "electron,pion-,pion-" --pmin 0.5 --pmax 10 + - ls -hal + - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_epimuphka -n 10000 --particles "electron,pion-,muon,photon,kaon-" + --pmin 0.5 --pmax 10 -ml_shower:tagging_e : +ml_shower:tagging_e: extends: .rec_benchmark when: manual stage: benchmarks1 script: - - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_e -n 100000 + - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_e -n 100 --particles "electron" --pmin 0.5 --pmax 10 -ml_shower:tagging_pi : +ml_shower:tagging_pi: extends: .rec_benchmark when: manual stage: benchmarks1 script: - - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_pi -n 100000 + - python3 benchmarks/imaging_shower_ML/sim_rec_tag.py -t imcal_pi -n 100 --particles "pion-" --pmin 0.5 --pmax 10 +ml_shower:training_100: + extends: .rec_benchmark + stage: process + needs: ["ml_shower:tagging_epimuphka_100"]#, "ml_shower:tagging_e", "ml_shower:tagging_pi"] + script: + - pip install tensorflow particle + - python3 benchmarks/imaging_shower_ML/scripts/ml_training.py -t imcal_epimuphka_100 --pmin 0.5 --pmax 10 + ml_shower:training: extends: .rec_benchmark - when: manual stage: process - needs: ["ml_shower:tagging_epi", "ml_shower:tagging_e", "ml_shower:tagging_pi"] + when: manual + needs: ["ml_shower:tagging_epimuphka"]#, "ml_shower:tagging_e", "ml_shower:tagging_pi"] script: - - ls -lrth - # TODO + - pip install tensorflow particle + - python3 benchmarks/imaging_shower_ML/scripts/ml_training.py -t imcal_epimuphka --pmin 0.5 --pmax 10 ml_shower:test: extends: .rec_benchmark diff --git a/benchmarks/imaging_shower_ML/options/imaging_ml_data.py b/benchmarks/imaging_shower_ML/options/imaging_ml_data.py index 12bc3ef4..4b8188c9 100644 --- a/benchmarks/imaging_shower_ML/options/imaging_ml_data.py +++ b/benchmarks/imaging_shower_ML/options/imaging_ml_data.py @@ -60,13 +60,13 @@ becal_img_reco = CalHitReco('becal_img_reco', **becal_img_daq) becal_img_merger = MLDataMerger('becal_img_merger', - inputHitCollection=becal_img_reco.outputHitCollection, - outputHitCollection='EcalBarrelImagingHitsSeg', + inputHits=becal_img_reco.outputHitCollection, + outputHits='EcalBarrelImagingHitsSeg', etaSize=0.001, phiSize=0.001) becal_img_sorter = MLDataSorter('becal_img_sorter', - inputHitCollection=becal_img_merger.outputHitCollection, + inputHitCollection=becal_img_merger.outputHits, outputHitCollection='EcalBarrelImagingHitsML', numberOfLayers=kwargs['img_nlayers'], numberOfHits=kwargs['nhits']) @@ -117,12 +117,14 @@ becal_scfi_sorter = MLDataSorter('becal_scfi_sorter', # combine layers becal_combiner = MLDataCombiner('becal_combiner', - inputHitCollection1=becal_img_sorter.outputHitCollection, - inputHitCollection2=becal_scfi_sorter.outputHitCollection, - outputHitCollection='EcalBarrelHitsCombinedML', + inputHits1=becal_img_sorter.outputHitCollection, + inputHits2=becal_scfi_sorter.outputHitCollection, + outputHits='EcalBarrelHitsCombinedML', layerIncrement=100, rule=kwargs['combine']) + + podout.outputCommands = [ # 'keep *', 'drop *', diff --git a/benchmarks/imaging_shower_ML/scripts/ml_training.py b/benchmarks/imaging_shower_ML/scripts/ml_training.py index f5cd917c..82517e3b 100644 --- a/benchmarks/imaging_shower_ML/scripts/ml_training.py +++ b/benchmarks/imaging_shower_ML/scripts/ml_training.py @@ -1,264 +1,283 @@ -import os -import pandas as pd -import numpy as np -from collections import OrderedDict -from scipy.interpolate import CubicSpline -# import tensorflow as tf -from 
tensorflow import keras -from tensorflow.keras import layers -import matplotlib as mpl -from matplotlib import pyplot as plt -from matplotlib.ticker import MultipleLocator, FixedLocator, MaxNLocator -from matplotlib.colors import LogNorm -from particle import Particle - -# os.environ["CUDA_VISIBLE_DEVICES"] = "-1" -# 5 layers 0-1-2-6-9 -# 6 layers 0-1-2-3-4-6-9 - - -def get_layer_cum_edeps(edep): - return np.tile(np.arange(edep.shape[1]), (edep.shape[0], 1)).flatten(), np.cumsum(edep, axis=1).flatten() - - -def draw_layer_edep(edep, label=''): - layer_num, edep_val = get_layer_cum_edeps(edep) - figure, axis = plt.subplots(figsize=(16, 9), dpi=120) - axis.hist2d(layer_num, edep_val, - bins=(np.arange(0.5, edep.shape[1] + 0.5, step=1.0), 100), norm=mpl.colors.LogNorm()) - axis.grid(linestyle=':') - axis.set_axisbelow(True) - axis.tick_params(labelsize=24, direction='in', which='both') - axis.set_ylabel('Cumulative Edep (MeV)', fontsize=24) - axis.set_xlabel('Layer number', fontsize=24) - axis.xaxis.set_major_locator(MaxNLocator(edep.shape[1], integer=True)) - if label: - axis.set_title(label, fontsize=26) - return figure, axis - - -train_model = True -do_epcut = False -nametag = 'real_imcal' -partag = 'epimuphka' -# partag = 'epi' -prange = (2.0, 2.0) -etag = '{:.1f}_{:.1f}_combined'.format(*prange) -epoch = 20 -# data_shape = (20, 20, 3) -data_shape = (29, 20, 5) -data_dir = r'D:\Data\EIC\ImagingCalorimetry\{name}_{part}_{etag}'.format(name=nametag, part=partag, etag=etag) -out_dir = r'C:\Users\cpeng\OneDrive\EIC\ImagingCalorimeter\AstroPix\hybrid\{name}_{part}_{etag}'\ - .format(name=nametag, part=partag, etag=etag) -raw_data = np.load(os.path.join(data_dir, 'data.npy')) -raw_tags = np.load(os.path.join(data_dir, 'tag.npy')) -edeps = np.load(os.path.join(data_dir, 'edep.npy')) - -# for interlayered data, null the last few layers -astro_nlayers = 9 -astro_nlayers_used = [0, 1, 2, 3, 4] -astro_nlayers_not_used = [2*i for i in np.arange(astro_nlayers) if i not in astro_nlayers_used] -if len(astro_nlayers_used) and len(astro_nlayers_not_used): - raw_data[:, astro_nlayers_not_used] = 0 - -os.makedirs(out_dir, exist_ok=True) -# label converter -labels = OrderedDict([ - (pid, [i, Particle.from_pdgid(pid).name, '']) - for i, pid in enumerate(sorted(np.unique(raw_tags.T[0].astype(int)), key=lambda x: np.abs(x))) -]) - -# add some labels in latex format -name_dict = {'gamma': r'$\gamma$', '-': '$^-$', '+': '$^+$', 'pi': r'$\pi$', 'mu': r'$\mu$'} -for pid, (i, name, _) in labels.items(): - labelstr = name - for str1, str2 in name_dict.items(): - labelstr = labelstr.replace(str1, str2) - labels[pid][2] = labelstr - -part_weights = {pid: 1.0 for pid, _ in labels.items()} -part_weights.update({ - 11: 1.0, - -211: 10.0, -}) - -begin_layer, end_layer = 0, 9 -epcut = 0.085 -if do_epcut: - epi_ids = (11, -211) # (11, -211) - # Eoverp cut in tags data - eovp = np.sum(edeps[:, begin_layer:end_layer], axis=1)*1e-3/raw_tags.T[1] - emask = raw_tags.T[0] == epi_ids[0] - pimask = raw_tags.T[0] == epi_ids[1] - - epmask = tuple([eovp > epcut]) - ep_eff = sum(raw_tags[epmask].T[0] == 11) / sum(emask) - ep_rej = 1. 
- sum(raw_tags[epmask].T[0] == -211) / sum(pimask) - - bins = np.linspace(0, 0.15, 151) - fig, ax = plt.subplots(figsize=(12, 9), dpi=120) - ax.hist(eovp[emask], histtype='step', density=True, bins=bins, label=labels[epi_ids[0]][2]) - ax.hist(eovp[pimask], histtype='step', density=True, bins=bins, label=labels[epi_ids[1]][2]) - ax.axvline(epcut, color='k') - ax.set_xlabel('Edep / p', fontsize=24) - ax.set_ylabel('Normalized Counts', fontsize=24) - ax.tick_params(labelsize=24) - ax.legend(fontsize=24) - ax.text(0.2, 0.9, '$e$ eff. = {:.2f}%\n$\pi$ rej. = {:.2f}%'.format(ep_eff*100., ep_rej*100.), - fontsize=24, transform=ax.transAxes, ha='left', va='top') - ax.set_title('$E/p > {:.2f}$% @ {:d} - {:d} X$_0$'.format(epcut*100., begin_layer, end_layer), fontsize=24) - fig.savefig(os.path.join(out_dir, 'pre_epcut.png')) - - fig, _ = draw_layer_edep(edeps[emask], 'Cumulative Edep over layer for electrons') - fig.savefig(os.path.join(out_dir, 'cum_edep_el.png')) - fig, _ = draw_layer_edep(edeps[pimask], 'Cumulative Edep over layer for pions') - fig.savefig(os.path.join(out_dir, 'cum_edep_pi.png')) - - data = raw_data[epmask] - tags = raw_tags[epmask] - -else: - ep_eff = 1.0 - ep_rej = 0. - data = raw_data - tags = raw_tags - -pid = np.vectorize(lambda x: labels[x][0])(tags.T[0]) -weights = np.vectorize(lambda x: part_weights[x])(tags.T[0]) - -if train_model: - indices = np.arange(len(data)) - np.random.shuffle(indices) - id_train = indices[:int(len(data)*0.8)] - id_valid = indices[int(len(data)*0.8):] - x_train = data[id_train] - y_train = pid[id_train] - w_train = weights[id_train] - - x_valid = data[id_valid] - y_valid = pid[id_valid] - w_valid = weights[id_valid] - - model = keras.Sequential([ - keras.layers.Conv2D(64, (3, 3), padding='same', activation='selu', input_shape=data_shape), - # keras.layers.MaxPooling2D((2, 2), strides=2), - keras.layers.Dropout(0.25), - keras.layers.Conv2D(128, (2, 2), padding='same', activation='selu'), - # keras.layers.MaxPooling2D((2, 2), strides=2), - keras.layers.Dropout(0.3), - keras.layers.Conv2D(64, (2, 2), padding='same', activation='selu'), - # keras.layers.MaxPooling2D((2, 2), strides=2), - - keras.layers.Flatten(), - keras.layers.Dense(128, activation='selu'), - keras.layers.Dropout(0.25), - keras.layers.Dense(32, activation='selu'), - keras.layers.Dense(len(labels), activation='softmax') - ]) - - model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - metrics=['accuracy']) - history = model.fit(x_train, y_train, - epochs=epoch, sample_weight=w_train, validation_data=(x_valid, y_valid, w_valid)) - model.save(os.path.join(out_dir, 'model_epch{:d}'.format(epoch))) -else: - model = keras.models.load_model(os.path.join(out_dir, 'model_epch{:d}'.format(epoch))) -# ---- general ---- -# x_test = data -# y_test = pid -# prediction = model.predict(x_test) -# result = prediction.argmax(axis=1) -# pid_probs = [] -# for _, part in labels.items(): -# mask = (y_test == part[0]) -# res = result[mask] -# pid_probs.append(np.bincount(res, minlength=len(labels))/sum(mask)) -# pid_probs = np.asarray(pid_probs) - -# predictions -x_test = data -y_test = pid -prediction = model.predict(x_test) - -# two labels (pion - electron) -if len(labels) == 2: - # --- pion - electron ---- - pion_prob_cut = 0.5 - pid_probs = np.zeros(shape=(2, 2)) - fig, ax = plt.subplots(1, 1, figsize=(12, 9), dpi=160) - ax.set_yscale('log') - ax.set_ylabel('Counts', fontsize=24) - ax.set_xlabel(r'$P_{\pi}$', 
fontsize=24) - ax.tick_params(direction='in', which='both', labelsize=24) - for i, (_, part) in enumerate(labels.items()): - mask = (y_test == part[0]) - probs = prediction[mask] - ax.hist(probs.T[1], histtype='step', bins=np.linspace(0, 1, 101), label=part[1]) - pid_probs[i][0] = sum(probs.T[1] < pion_prob_cut) / float(len(probs)) - pid_probs[i][1] = 1. - pid_probs[i][0] - ax.axvline(x=pion_prob_cut, lw=2, color='k', ls='--') - ax.text(0.55, 0.9, '$e$ eff. = {:.2f}%\n$\pi$ rej. = {:.2f}%'.format(pid_probs[0][0]*100., pid_probs[1][1]*100.), - fontsize=24, transform=ax.transAxes, ha='left', va='top') - ax.legend(fontsize=24, loc='lower left') - fig.savefig(os.path.join(out_dir, 'pid_tag_hist.png')) - fig.savefig(os.path.join(out_dir, 'pid_tag_hist.pdf')) - # --- pion - electron ---- - result_text = open(os.path.join(out_dir, 'rejection_result.txt'), 'w') - lines = [ - 'E/p cut position: {:.4f}, layer {:d} to {:d}, E/p efficiency: {:.4f}%, rejection power: {:.4f}'\ - .format(epcut, begin_layer, end_layer, ep_eff*100., 1./(1. - ep_rej)), - ] if do_epcut else [] - lines += [ - 'ML training weight: {}, pion cut position: {:.4f}, ML efficiency: {:.4f}%, rejection power: {:.4f}'\ - .format(list(part_weights.values()), pion_prob_cut, pid_probs[0][0]*100., 1./(1. - pid_probs[1][1])), - 'combined efficiency: {:.2f}%, rejection power: {:.1f}'\ - .format(pid_probs[0][0]*ep_eff*100., 1./((1. - ep_rej)*(1. - pid_probs[1][1]))), - ] - result_text.write('\n'.join(lines)) - result_text.close() - print('\n'.join(lines)) - -else: - # --- multi-particles --- - pid_probs = np.zeros(shape=(len(labels), len(labels))) - fig, ax = plt.subplots(1, 1, figsize=(8, 8), dpi=160, gridspec_kw={'left': 0.15, 'right': 0.95}) - ax.set_yscale('log') - ax.set_ylabel('Counts', fontsize=26) - ax.set_xlabel(r'Likelihood of the Corresponding Label', fontsize=26) - ax.tick_params(direction='in', which='both', labelsize=26) - # label with maximum likelihood - for _, (i, part, label) in labels.items(): - mask = (y_test == i) - probs = prediction[mask] - pred_labels = probs.argmax(axis=1) - ax.hist(probs.T[i], histtype='step', bins=np.linspace(0, 1, 101), label=label) - for _, (j, _, _) in labels.items(): - pid_probs[i][j] = np.sum(pred_labels == j)/float(np.sum(mask)) - - ax.legend(fontsize=26, loc='upper center', ncol=3, handletextpad=0.3, columnspacing=0.6) - fig.savefig(os.path.join(out_dir, 'pid_tag_hist.png')) - fig.savefig(os.path.join(out_dir, 'pid_tag_hist.pdf')) - - # --- multi-particles --- -fig, ax = plt.subplots(1, 1, figsize=(8, 8), dpi=160, gridspec_kw={'left': 0.15, 'right': 0.95}) -ax.imshow(pid_probs, cmap='PuBu', norm=LogNorm(vmin=0.01, vmax=1)) -ax.xaxis.set_major_locator(FixedLocator(np.arange(0, pid_probs.shape[0], step=1.0))) -ax.yaxis.set_major_locator(FixedLocator(np.arange(0, pid_probs.shape[1], step=1.0))) -ax.xaxis.set_minor_locator(FixedLocator(np.arange(0.5, pid_probs.shape[0], step=1.0))) -ax.yaxis.set_minor_locator(FixedLocator(np.arange(0.5, pid_probs.shape[1], step=1.0))) -ax.set_xticklabels([x[2] for _, x in labels.items()], fontsize=26, va='bottom') -ax.set_yticklabels([x[2] for _, x in labels.items()], fontsize=26, ha='left') -ax.tick_params(bottom=False, left=False, which='both') -ax.tick_params(pad=30, axis='x') -ax.tick_params(pad=40, axis='y') -ax.set_ylabel('Truth', fontsize=26) -ax.set_xlabel('Classification', fontsize=26) -for i in range(pid_probs.shape[0]): - for j in range(pid_probs.shape[1]): - color = 'k' if pid_probs[i, j] < 0.15 else 'w' - text = ax.text(j, i, 
'{:.2f}%'.format(pid_probs[i, j]*100.), ha='center', va='center', - color=color, fontsize=28 - len(labels)*2) -ax.grid('-', color='k', which='minor') -fig.savefig(os.path.join(out_dir, 'pid_tag.png')) -fig.savefig(os.path.join(out_dir, 'pid_tag.pdf')) +import os +import pandas as pd +import numpy as np +from collections import OrderedDict +from scipy.interpolate import CubicSpline +import uproot as uproot +import matplotlib.pyplot as plt +import logging +import sys +import subprocess +import argparse + +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import matplotlib as mpl +from matplotlib.ticker import MultipleLocator, FixedLocator, MaxNLocator +from matplotlib.colors import LogNorm +from particle import Particle + +print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU'))) + +# os.environ["CUDA_VISIBLE_DEVICES"] = "-1" +# 5 layers 0-1-2-6-9 +# 6 layers 0-1-2-3-4-6-9 + + +def get_layer_cum_edeps(edep): + return np.tile(np.arange(edep.shape[1]), (edep.shape[0], 1)).flatten(), np.cumsum(edep, axis=1).flatten() + + +def draw_layer_edep(edep, label=''): + layer_num, edep_val = get_layer_cum_edeps(edep) + figure, axis = plt.subplots(figsize=(16, 9), dpi=120) + axis.hist2d(layer_num, edep_val, + bins=(np.arange(0.5, edep.shape[1] + 0.5, step=1.0), 100), norm=mpl.colors.LogNorm()) + axis.grid(linestyle=':') + axis.set_axisbelow(True) + axis.tick_params(labelsize=24, direction='in', which='both') + axis.set_ylabel('Cumulative Edep (MeV)', fontsize=24) + axis.set_xlabel('Layer number', fontsize=24) + axis.xaxis.set_major_locator(MaxNLocator(edep.shape[1], integer=True)) + if label: + axis.set_title(label, fontsize=26) + return figure, axis + +parser = argparse.ArgumentParser() +parser.add_argument('-t', '--nametag', type=str, default='IMCAL_ML', help='Name tag for output files.') +parser.add_argument('--pmin', type=float, default=0.5, help='Minimum momentum of particles.') +parser.add_argument('--pmax', type=float, default=10, help='Maximum momentum of particles.') +args = parser.parse_args() +kwargs = vars(args) + +train_model = True +do_epcut = False +#nametag = 'imcal' +#partag = 'epimuphka' +#prange = (0.5, 10.0) +#etag = '{:.1f}_{:.1f}_combined'.format(*prange) +epoch = 20 +data_shape = (29, 20, 5) +#data_dir = r'sim_output/tag_data/{name}_{part}_{etag}'.format(name=nametag, part=partag, etag=etag) +#out_dir = r'sim_output/tag_data/output/{name}_{part}_{etag}'.format(name=nametag, part=partag, etag=etag) +data_dir = os.path.join('sim_output/tag_data/', '{nametag}_{pmin}_{pmax}_combined'.format(**kwargs)) +out_dir = os.path.join('sim_output/tag_data/output/', '{nametag}_{pmin}_{pmax}_combined'.format(**kwargs)) +raw_data = np.load(os.path.join(data_dir, 'data.npy')) +raw_tags = np.load(os.path.join(data_dir, 'tag.npy')) +edeps = np.load(os.path.join(data_dir, 'edep.npy')) + +# for interlayered data, null the last few layers +astro_nlayers = 9 +astro_nlayers_used = [0, 1, 2, 3, 4] +astro_nlayers_not_used = [2*i for i in np.arange(astro_nlayers) if i not in astro_nlayers_used] +if len(astro_nlayers_used) and len(astro_nlayers_not_used): + raw_data[:, astro_nlayers_not_used] = 0 + +os.makedirs(out_dir, exist_ok=True) +# label converter +labels = OrderedDict([ + (pid, [i, Particle.from_pdgid(pid).name, '']) + for i, pid in enumerate(sorted(np.unique(raw_tags.T[0].astype(int)), key=lambda x: np.abs(x))) +]) + +# add some labels in latex format +name_dict = {'gamma': r'$\gamma$', '-': '$^-$', '+': '$^+$', 'pi': r'$\pi$', 'mu': r'$\mu$'} 
+for pid, (i, name, _) in labels.items(): + labelstr = name + for str1, str2 in name_dict.items(): + labelstr = labelstr.replace(str1, str2) + labels[pid][2] = labelstr + +part_weights = {pid: 1.0 for pid, _ in labels.items()} +part_weights.update({ + 11: 1.0, + -211: 10.0, +}) + +begin_layer, end_layer = 0, 9 +epcut = 0.085 +if do_epcut: + epi_ids = (11, -211) # (11, -211) + # Eoverp cut in tags data + eovp = np.sum(edeps[:, begin_layer:end_layer], axis=1)*1e-3/raw_tags.T[1] + emask = raw_tags.T[0] == epi_ids[0] + pimask = raw_tags.T[0] == epi_ids[1] + + epmask = tuple([eovp > epcut]) + ep_eff = sum(raw_tags[epmask].T[0] == 11) / sum(emask) + ep_rej = 1. - sum(raw_tags[epmask].T[0] == -211) / sum(pimask) + + bins = np.linspace(0, 0.15, 151) + fig, ax = plt.subplots(figsize=(12, 9), dpi=120) + ax.hist(eovp[emask], histtype='step', density=True, bins=bins, label=labels[epi_ids[0]][2]) + ax.hist(eovp[pimask], histtype='step', density=True, bins=bins, label=labels[epi_ids[1]][2]) + ax.axvline(epcut, color='k') + ax.set_xlabel('Edep / p', fontsize=24) + ax.set_ylabel('Normalized Counts', fontsize=24) + ax.tick_params(labelsize=24) + ax.legend(fontsize=24) + ax.text(0.2, 0.9, '$e$ eff. = {:.2f}%\n$\pi$ rej. = {:.2f}%'.format(ep_eff*100., ep_rej*100.), + fontsize=24, transform=ax.transAxes, ha='left', va='top') + ax.set_title('$E/p > {:.2f}$% @ {:d} - {:d} X$_0$'.format(epcut*100., begin_layer, end_layer), fontsize=24) + fig.savefig(os.path.join('results/ml/models/emcal/', 'pre_epcut.png')) + + fig, _ = draw_layer_edep(edeps[emask], 'Cumulative Edep over layer for electrons') + fig.savefig(os.path.join('results/ml/models/emcal/', 'cum_edep_el.png')) + fig, _ = draw_layer_edep(edeps[pimask], 'Cumulative Edep over layer for pions') + fig.savefig(os.path.join('results/ml/models/emcal/', 'cum_edep_pi.png')) + + data = raw_data[epmask] + tags = raw_tags[epmask] + +else: + ep_eff = 1.0 + ep_rej = 0. 
+ data = raw_data + tags = raw_tags + +pid = np.vectorize(lambda x: labels[x][0])(tags.T[0]) +weights = np.vectorize(lambda x: part_weights[x])(tags.T[0]) + +if train_model: + indices = np.arange(len(data)) + np.random.shuffle(indices) + id_train = indices[:int(len(data)*0.8)] + id_valid = indices[int(len(data)*0.8):] + x_train = data[id_train] + y_train = pid[id_train] + w_train = weights[id_train] + + x_valid = data[id_valid] + y_valid = pid[id_valid] + w_valid = weights[id_valid] + + model = keras.Sequential([ + keras.layers.Conv2D(64, (3, 3), padding='same', activation='relu', input_shape=data_shape), + # keras.layers.MaxPooling2D((2, 2), strides=2), + keras.layers.Dropout(0.25), + keras.layers.Conv2D(128, (2, 2), padding='same', activation='relu'), + # keras.layers.MaxPooling2D((2, 2), strides=2), + keras.layers.Dropout(0.3), + keras.layers.Conv2D(64, (2, 2), padding='same', activation='relu'), + # keras.layers.MaxPooling2D((2, 2), strides=2), + + keras.layers.Flatten(), + keras.layers.Dense(128, activation='relu'), + keras.layers.Dropout(0.25), + keras.layers.Dense(32, activation='relu'), + keras.layers.Dense(len(labels), activation='softmax') + ]) + + model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), + metrics=['accuracy']) + history = model.fit(x_train, y_train, + epochs=epoch, sample_weight=w_train, validation_data=(x_valid, y_valid, w_valid)) + model.save(os.path.join('results/ml/models/emcal/', 'model_epch{:d}'.format(epoch))) + + converter = tf.lite.TFLiteConverter.from_keras_model(model) + tflite_model = converter.convert() + + with open(os.path.join('results/ml/models/emcal/', 'lite_model_{nametag}.tflite'.format(**kwargs)), 'wb') as f: + f.write(tflite_model) +else: + model = keras.models.load_model(os.path.join('results/ml/models/emcal/', 'model_epch{:d}'.format(epoch))) +# ---- general ---- +# x_test = data +# y_test = pid +# prediction = model.predict(x_test) +# result = prediction.argmax(axis=1) +# pid_probs = [] +# for _, part in labels.items(): +# mask = (y_test == part[0]) +# res = result[mask] +# pid_probs.append(np.bincount(res, minlength=len(labels))/sum(mask)) +# pid_probs = np.asarray(pid_probs) + +# predictions +x_test = data +y_test = pid +prediction = model.predict(x_test) + +# two labels (pion - electron) +if len(labels) == 2: + # --- pion - electron ---- + pion_prob_cut = 0.5 + pid_probs = np.zeros(shape=(2, 2)) + fig, ax = plt.subplots(1, 1, figsize=(8, 6), dpi=160) + ax.set_yscale('log') + ax.set_ylabel('Counts', fontsize=24) + ax.set_xlabel(r'$P_{\pi}$', fontsize=24) + ax.tick_params(direction='in', which='both', labelsize=24) + for i, (_, part) in enumerate(labels.items()): + mask = (y_test == part[0]) + probs = prediction[mask] + ax.hist(probs.T[1], histtype='step', bins=np.linspace(0, 1, 101), label=part[1]) + pid_probs[i][0] = sum(probs.T[1] < pion_prob_cut) / float(len(probs)) + pid_probs[i][1] = 1. - pid_probs[i][0] + ax.axvline(x=pion_prob_cut, lw=2, color='k', ls='--') + ax.text(0.55, 0.9, '$e$ eff. = {:.2f}%\n$\pi$ rej. 
= {:.2f}%'.format(pid_probs[0][0]*100., pid_probs[1][1]*100.), + fontsize=24, transform=ax.transAxes, ha='left', va='top') + ax.legend(fontsize=24, loc='lower left') + fig.savefig(os.path.join('results/ml/models/emcal/', 'pid_tag_hist.png')) + fig.savefig(os.path.join('results/ml/models/emcal/', 'pid_tag_hist.pdf')) + # --- pion - electron ---- + result_text = open(os.path.join(out_dir, 'rejection_result.txt'), 'w') + lines = [ + 'E/p cut position: {:.4f}, layer {:d} to {:d}, E/p efficiency: {:.4f}%, rejection power: {:.4f}'\ + .format(epcut, begin_layer, end_layer, ep_eff*100., 1./(1. - ep_rej)), + ] if do_epcut else [] + lines += [ + 'ML training weight: {}, pion cut position: {:.4f}, ML efficiency: {:.4f}%, rejection power: {:.4f}'\ + .format(list(part_weights.values()), pion_prob_cut, pid_probs[0][0]*100., 1./(1. - pid_probs[1][1])), + 'combined efficiency: {:.2f}%, rejection power: {:.1f}'\ + .format(pid_probs[0][0]*ep_eff*100., 1./((1. - ep_rej)*(1. - pid_probs[1][1]))), + ] + result_text.write('\n'.join(lines)) + result_text.close() + print('\n'.join(lines)) + +else: + # --- multi-particles --- + pid_probs = np.zeros(shape=(len(labels), len(labels))) + fig, ax = plt.subplots(1, 1, figsize=(12, 8), dpi=160, gridspec_kw={'left': 0.15, 'right': 0.95}) + ax.set_yscale('log') + ax.set_ylabel('Counts', fontsize=26) + ax.set_xlabel(r'Likelihood of the Corresponding Label', fontsize=26) + ax.tick_params(direction='in', which='both', labelsize=26) + # label with maximum likelihood + for _, (i, part, label) in labels.items(): + mask = (y_test == i) + probs = prediction[mask] + pred_labels = probs.argmax(axis=1) + ax.hist(probs.T[i], histtype='step', bins=np.linspace(0, 1, 101), label=label) + for _, (j, _, _) in labels.items(): + pid_probs[i][j] = np.sum(pred_labels == j)/float(np.sum(mask)) + + ax.legend(fontsize=26, loc='upper center', ncol=3, handletextpad=0.3, columnspacing=0.6) + fig.savefig(os.path.join('results/ml/models/emcal/', 'pid_tag_hist.png')) + fig.savefig(os.path.join('results/ml/models/emcal/', 'pid_tag_hist.pdf')) + + # --- multi-particles --- +fig, ax = plt.subplots(1, 1, figsize=(8, 8), dpi=160, gridspec_kw={'left': 0.15, 'right': 0.95}) +ax.imshow(pid_probs, cmap='PuBu', norm=LogNorm(vmin=0.01, vmax=1)) +ax.xaxis.set_major_locator(FixedLocator(np.arange(0, pid_probs.shape[0], step=1.0))) +ax.yaxis.set_major_locator(FixedLocator(np.arange(0, pid_probs.shape[1], step=1.0))) +ax.xaxis.set_minor_locator(FixedLocator(np.arange(0.5, pid_probs.shape[0], step=1.0))) +ax.yaxis.set_minor_locator(FixedLocator(np.arange(0.5, pid_probs.shape[1], step=1.0))) +ax.set_xticklabels([x[2] for _, x in labels.items()], fontsize=26, va='bottom') +ax.set_yticklabels([x[2] for _, x in labels.items()], fontsize=26, ha='left') +ax.tick_params(bottom=False, left=False, which='both') +ax.tick_params(pad=30, axis='x') +ax.tick_params(pad=40, axis='y') +ax.set_ylabel('Truth', fontsize=26) +ax.set_xlabel('Classification', fontsize=26) +for i in range(pid_probs.shape[0]): + for j in range(pid_probs.shape[1]): + color = 'k' if pid_probs[i, j] < 0.15 else 'w' + text = ax.text(j, i, '{:.2f}%'.format(pid_probs[i, j]*100.), ha='center', va='center', + color=color, fontsize=28 - len(labels)*2) +ax.grid('-', color='k', which='minor') +fig.savefig(os.path.join('results/ml/models/emcal/', 'pid_tag.png')) +fig.savefig(os.path.join('results/ml/models/emcal/', 'pid_tag.pdf')) diff --git a/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py b/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py index 
75bfb05c..2b7455bf 100644 --- a/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py +++ b/benchmarks/imaging_shower_ML/scripts/prepare_tf_dataset.py @@ -145,7 +145,8 @@ if __name__ == '__main__': event_ids = df['event'].unique() data = df.set_index('event')[featcols].clip(0, 1).values.reshape([len(event_ids)] + dshape) - tags = dfm.loc[event_ids, ['pdgID', 'p', 'pT', 'eta', 'phi', 'mass']] + + tags = dfm.loc[event_ids, ['PDG', 'p', 'pT', 'eta', 'phi', 'mass']] # also save Edep per layers # merge the sandwich layers df['layer'] = df['layer'] % 100 diff --git a/benchmarks/imaging_shower_ML/sim_rec_tag.py b/benchmarks/imaging_shower_ML/sim_rec_tag.py index e7425a61..debc82cc 100755 --- a/benchmarks/imaging_shower_ML/sim_rec_tag.py +++ b/benchmarks/imaging_shower_ML/sim_rec_tag.py @@ -30,13 +30,13 @@ parser.add_argument('--physics-list', type=str, default='FTFP_BERT', help='Path args = parser.parse_args() kwargs = vars(args) -for mdir in ['gen_data', 'sim_data', 'rec_data', 'tag_data']: +for mdir in ['sim_output/gen_data', 'sim_output/sim_data', 'sim_output/rec_data', 'sim_output/tag_data']: os.makedirs(mdir, exist_ok=True) -gen_file = os.path.join('gen_data', '{nametag}_{pmin}_{pmax}.hepmc'.format(**kwargs)) -sim_file = os.path.join('sim_data', '{nametag}_{pmin}_{pmax}.edm4hep.root'.format(**kwargs)) -rec_file = os.path.join('rec_data', '{nametag}_{pmin}_{pmax}.root'.format(**kwargs)) -tag_dir = os.path.join('tag_data', '{nametag}_{pmin}_{pmax}'.format(**kwargs)) +gen_file = os.path.join('sim_output/gen_data', '{nametag}_{pmin}_{pmax}.hepmc'.format(**kwargs)) +sim_file = os.path.join('sim_output/sim_data', '{nametag}_{pmin}_{pmax}.edm4hep.root'.format(**kwargs)) +rec_file = os.path.join('sim_output/rec_data', '{nametag}_{pmin}_{pmax}.root'.format(**kwargs)) +tag_dir = os.path.join('sim_output/tag_data', '{nametag}_{pmin}_{pmax}'.format(**kwargs)) procs = [p.strip() for p in args.process.split(',')] sdir = os.path.dirname(os.path.realpath(__file__)) -- GitLab
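Note (illustrative only, not part of the patch): a minimal sketch of how the TFLite model exported by the updated ml_training.py could be loaded and evaluated offline with tf.lite.Interpreter. The model path assumes the results/ml/models/emcal/ output location used above and the imcal_epimuphka_100 nametag from the 100-event CI job; the (1, 29, 20, 5) input shape follows data_shape in the training script.

    import numpy as np
    import tensorflow as tf

    # Hypothetical path: follows the output convention in ml_training.py above,
    # with the imcal_epimuphka_100 nametag used by the 100-event CI job.
    model_path = 'results/ml/models/emcal/lite_model_imcal_epimuphka_100.tflite'

    interpreter = tf.lite.Interpreter(model_path=model_path)
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    # One dummy event with the (29, 20, 5) data shape used for training.
    sample = np.zeros((1, 29, 20, 5), dtype=np.float32)
    interpreter.set_tensor(input_details[0]['index'], sample)
    interpreter.invoke()
    probs = interpreter.get_tensor(output_details[0]['index'])  # per-class softmax scores
    print(probs)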