diff --git a/benchmarks/imaging_shower_ML/scripts/ml_data_preprocess.py b/benchmarks/imaging_shower_ML/scripts/ml_data_preprocess.py
index 2d1c8ca5eb6eec866b489ff932b63cbcc5192404..d266a5401dada91cc7e85b2ce113c29d87b73c77 100644
--- a/benchmarks/imaging_shower_ML/scripts/ml_data_preprocess.py
+++ b/benchmarks/imaging_shower_ML/scripts/ml_data_preprocess.py
@@ -10,6 +10,8 @@ import os
 import gc
 import sys
 import json
+import glob
+import h5py
 import numpy as np
 import pandas as pd
 import argparse
@@ -20,8 +22,8 @@ pd.set_option('display.max_rows', 500)
 
 # normalizing range for features
 # NOTE: assumed for barrel ecal, using the range of 0.5 - 2 meters to normalize the r0 values
-WIN_ETA = (-0.2, 0.2)  # unitless
-WIN_PHI = (-0.4, 0.4)  # rad
+WIN_ETA = (-0.4, 0.4)  # unitless
+WIN_PHI = (-0.6, 0.6)  # rad
 WIN_R0 = (500, 2000)  # mm
 WIN_EH = (0, 0.05)  # GeV
 
@@ -49,7 +51,7 @@ def lin_norm(vals, window, val_name='', var_name='', warn_win_size=True):
     if warn_win_size and len(vals) > 1000:
         perc = (np.percentile(vals, 5), np.percentile(vals, 95))
         if perc[0] < window[0] or perc[1] > window[1]:
-            warnstr = 'WARNING = Prepare ML data: normalization window {} does not fit {} values {}.'\
+            warnstr = 'WARNING = Prepare ML data: normalization window {} does not cover the 5-95 percentile range of {} values {}.'\
                 .format(window, val_name, perc)
             if var_name:
                 warnstr += ' Try adjust {}'.format(var_name)
@@ -129,7 +131,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Visualize the cluster (layer-wise) from analysis')
     parser.add_argument(
         'files', type=str,
-        help='path to the input files, separated by \",\".'
+        help='path to the input files; wild-cards are supported.'
     )
     parser.add_argument(
         '-o', type=str,
@@ -139,7 +141,7 @@ if __name__ == '__main__':
     )
     parser.add_argument(
         '--batch-size', type=int,
-        default=100000,
+        default=10000,
         help='batch size to process data.'
     )
     parser.add_argument(
@@ -173,28 +175,20 @@ if __name__ == '__main__':
         default=None,
         help='A json file contains epcut information (best will be used).'
     )
-    parser.add_argument(
-        '--append-data', action='store_true',
-        default=False,
-        help='Append data to pre-existing data file.'
-    )
     args = parser.parse_args()
 
+    h5f = h5py.File(args.outpath, 'w')
-    data_store = pd.HDFStore(args.outpath)
-    # print(data_store.keys())
-    # clear data store
-    if not args.append_data:
-        for dkey in [imcal_info.ml_data_key, imcal_info.truth_data_key]:
-            if dkey in data_store.keys():
-                data_store.remove(dkey)
-                print('Prepare ML data: remove pre-existed dataset {}'.format(dkey))
-
-    if args.seed > 0:
-        np.random.seed(args.seed)
+    if args.seed <= 0:
+        args.seed = int.from_bytes(os.urandom(4), byteorder='big', signed=False)
+        print('Using a random seed {:d}.'.format(args.seed))
+    np.random.seed(args.seed)
 
     # read data and MCParticles
-    print([f.strip() for f in args.files.split(',')])
-    rdf = ROOT.RDataFrame("events", [f.strip() for f in args.files.split(',')])
+    # read data and MCParticles
+    files = glob.glob(args.files)
+    print('input data files:')
+    print(files)
+    rdf = ROOT.RDataFrame("events", files)
     # event range
     ntotal = rdf.Count().GetValue()
     ev_bins = np.append(np.arange(0, ntotal, args.batch_size), [ntotal])
@@ -205,6 +199,7 @@
 
     # process events
     # avoid insane amount of memory use with a large dataset
+    first_save = True
     for ev_begin, ev_end in zip(ev_bins[:-1], ev_bins[1:]):
         gc.collect()
         print('Prepare ML data: processing events {:d} - {:d}'.format(ev_begin, ev_end))
@@ -285,10 +280,25 @@
 
         # combine two datasets
         df = pd.concat([dfi.reset_index(), dfs.reset_index()], ignore_index=True)\
            .set_index(['event', 'ltype', 'layer', 'hit']).sort_index()
-        # print(df.head(100))
-        data_store.append(imcal_info.ml_data_key, df, format='t', data_columns=True)
-        data_store.append(imcal_info.truth_data_key, dfm[['PDG', 'P', 'mass']], format='t', data_columns=True)
-
-    data_store.close()
-
+        ml_data = df.values.reshape(len(ievs), imcal_info.nlayers_img + imcal_info.nlayers_scfi, args.nhits, len(df.columns))
+        tag_data = dfm[['PDG', 'P', 'mass']].values
+        if not len(ml_data):
+            continue
+
+        # create datasets
+        if first_save:
+            h5f.create_dataset(imcal_info.ml_data_key, data=ml_data, compression="gzip",
+                               chunks=True, maxshape=(None, *ml_data.shape[1:]))
+            h5f.create_dataset(imcal_info.truth_data_key, data=tag_data, compression="gzip",
+                               chunks=True, maxshape=(None, *tag_data.shape[1:]))
+            first_save = False
+        # Append new data to it
+        else:
+            h5f[imcal_info.ml_data_key].resize((h5f[imcal_info.ml_data_key].shape[0] + ml_data.shape[0]), axis=0)
+            h5f[imcal_info.ml_data_key][-ml_data.shape[0]:] = ml_data
+            h5f[imcal_info.truth_data_key].resize((h5f[imcal_info.truth_data_key].shape[0] + tag_data.shape[0]), axis=0)
+            h5f[imcal_info.truth_data_key][-tag_data.shape[0]:] = tag_data
+        print("saved data {} - {}".format(h5f[imcal_info.ml_data_key].shape, h5f[imcal_info.truth_data_key].shape))
+        # print(df.head(100))
+    h5f.close()
diff --git a/benchmarks/imaging_shower_ML/scripts/ml_imaging_data.py b/benchmarks/imaging_shower_ML/scripts/ml_imaging_data.py
index fb945271b764517b99a4a22ea8b9761d69bd5199..61eee6238fe4fef26e49224159e1b9e6d11fa74f 100644
--- a/benchmarks/imaging_shower_ML/scripts/ml_imaging_data.py
+++ b/benchmarks/imaging_shower_ML/scripts/ml_imaging_data.py
@@ -129,7 +129,7 @@ if __name__ == '__main__':
 
     if args.seed <= 0:
         args.seed = int.from_bytes(os.urandom(4), byteorder='big', signed=False)
-        print('Using a random seed {:d}.'.format(args.seed))
+        print('Using a random seed {:d}.'.format(args.seed))
     np.random.seed(args.seed)
 
     # read data and MCParticles
diff --git a/benchmarks/imaging_shower_ML/scripts/ml_training.py b/benchmarks/imaging_shower_ML/scripts/ml_training.py
index 43561aed119d0f552669dcd1ea37f49e47f94e5f..1ec11c0fc552d8f3bf4b6d5de6ea4737474c57bb 100644
--- a/benchmarks/imaging_shower_ML/scripts/ml_training.py
+++ b/benchmarks/imaging_shower_ML/scripts/ml_training.py
@@ -9,6 +9,7 @@ import os
 import sys
 import json
 import argparse
+import h5py
 import pandas as pd
 import numpy as np
 from collections import OrderedDict
@@ -89,22 +90,22 @@
     )
     parser.add_argument(
         '--epochs', type=int,
-        default=10,
+        default=15,
         help='epochs for ML training.'
     )
     parser.add_argument(
         '--batch-size', type=int,
-        default=512,
+        default=128,
         help='batch size for ML training.'
     )
     parser.add_argument(
         '--optimize-for', type=int,
-        default=11,
+        default=22,
         help='optimize cut for a target particle (PDGCODE).'
     )
     parser.add_argument(
         '--optimize-efficiency', type=float,
-        default=0.98,
+        default=0.95,
         help='used when --optimize-for is specified, try to keep the efficiency with optimized cuts.'
     )
     parser.add_argument(
@@ -133,14 +134,14 @@
         np.random.seed(args.seed)
         tf.random.set_seed(args.seed)
 
-    df = pd.read_hdf(args.data_store, key=imcal_info.ml_data_key)
-    # NOTE: assumed event index is exactly the same as df
-    dft = pd.read_hdf(args.data_store, key=imcal_info.truth_data_key)
-    print('ML training: input sample size.')
-    print(dft['PDG'].value_counts())
+    hf = h5py.File(args.data_store, 'r')
+    # preload all datasets
+    xdata = hf.get(imcal_info.ml_data_key)[...]
+    ml_tags = hf.get(imcal_info.truth_data_key)[...]
+    ml_pids = pd.Series(data=ml_tags[:, 0])
 
     # fill dictionary if not pre-defined
-    data_pid = dft['PDG'].unique()
+    data_pid = ml_pids.unique()
     for pid in data_pid:
         if pid not in par_table.keys():
             print('WARNING = ML training: particle ({}) not found in table, updating it.'.format(pid))
@@ -160,19 +161,17 @@
         pid_weights[pid] = kwargs.get('weight_{}'.format(part.full.replace('-', '_')), 1.0)
         pid_labels[pid] = len(pid_labels)
     pids = list(pid_labels.keys())
-    # print(pid_labels)
+    print('ML training: input sample size {} - {}'.format(xdata.shape, ml_tags.shape))
+    print(pid_labels)
     # print(pid_weights)
 
     # data shape
     # NOTE: number of layers for combined data
     nlayers = imcal_info.nlayers_img + imcal_info.nlayers_scfi
-    # NOTE: index levels should be [event, layer_type, layer, hit], change the code if data structure is changed
-    nhits = df.index.levels[-1].max()
 
     # training datasets
-    xdata = df.values.reshape(dft.shape[0], nlayers, nhits, len(df.columns))
-    ydata = dft['PDG'].map(pid_labels).values
-    wdata = dft['PDG'].map(pid_weights).values
+    ydata = ml_pids.map(pid_labels).values
+    wdata = ml_pids.map(pid_weights).values
 
     # seperate train and test data
     indices = np.arange(len(xdata))
diff --git a/benchmarks/imaging_shower_ML/scripts/ml_training_image.py b/benchmarks/imaging_shower_ML/scripts/ml_training_image.py
deleted file mode 100644
index 1ec11c0fc552d8f3bf4b6d5de6ea4737474c57bb..0000000000000000000000000000000000000000
--- a/benchmarks/imaging_shower_ML/scripts/ml_training_image.py
+++ /dev/null
@@ -1,283 +0,0 @@
-"""
-    Script for supervised training of a simple ML model using Tensorflow 2.
-    ML e/pi separation for imaging barrel ecal.
-
-    Chao Peng (ANL)
-    2022/11/11
-"""
-import os
-import sys
-import json
-import argparse
-import h5py
-import pandas as pd
-import numpy as np
-from collections import OrderedDict
-
-import matplotlib as mpl
-import matplotlib.pyplot as plt
-from matplotlib.ticker import MultipleLocator, FixedLocator, MaxNLocator
-from matplotlib.colors import LogNorm
-
-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-
-from utils import dotdict, imcal_info, par_table, NpEncoder
-
-
-# default color cycle of matplotlib
-prop_cycle = plt.rcParams['axes.prop_cycle']
-colors = prop_cycle.by_key()['color']
-
-
-def count_pid_labels(label_lst, pid_list):
-    res = OrderedDict()
-    res['total'] = len(label_lst)
-    for p, cnt in zip(pid_list, np.bincount(label_lst)):
-        res[par_table.get(p).name] = '{:d}'.format(cnt)
-    return res
-
-
-## Slightly beefier VGG-style CNN
-def build_vgg(input_shape, n_labels=2):
-    my_model = keras.Sequential([
-        keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu',padding='same',input_shape=input_shape),
-        keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu',padding='same'),
-        keras.layers.MaxPooling2D(pool_size=(2, 2),strides=2),
-        keras.layers.Conv2D(128, kernel_size=(3, 3), activation='relu'),
-        keras.layers.Conv2D(128, kernel_size=(3, 3), activation='relu'),
-        keras.layers.Conv2D(128, kernel_size=(3, 3), activation='relu'),
-        keras.layers.MaxPooling2D(pool_size=(2, 2),strides=2),
-        keras.layers.Flatten(),
-        keras.layers.Dense(1024, activation='relu'),
-        keras.layers.Dense(1024, activation='relu'),
-        #keras.layers.Dense(512, activation='relu'),
-        keras.layers.Dense(n_labels, activation='softmax')
-    ])
-
-    return my_model
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        'data_store', type=str,
-        help='path to data store.'
-    )
-    parser.add_argument(
-        '-o', type=str,
-        dest='outdir',
-        default='ml_result',
-        help='output directory.'
-    )
-    parser.add_argument(
-        '-s', '--seed', type=int,
-        default=-1,
-        help='random seed.'
-    )
-    parser.add_argument(
-        '-t', '--name-tag', type=str,
-        dest='ntag',
-        default='imcal_ml',
-        help='name tag for output files/models.'
-    )
-    parser.add_argument(
-        '--test-size', type=float,
-        default=0.2,
-        help='relative size of test samples (between 0-1).'
-    )
-    parser.add_argument(
-        '--epochs', type=int,
-        default=15,
-        help='epochs for ML training.'
-    )
-    parser.add_argument(
-        '--batch-size', type=int,
-        default=128,
-        help='batch size for ML training.'
-    )
-    parser.add_argument(
-        '--optimize-for', type=int,
-        default=22,
-        help='optimize cut for a target particle (PDGCODE).'
-    )
-    parser.add_argument(
-        '--optimize-efficiency', type=float,
-        default=0.95,
-        help='used when --optimize-for is specified, try to keep the efficiency with optimized cuts.'
-    )
-    parser.add_argument(
-        '--read-model', action='store_true',
-        default=False,
-        help='read a trained model'
-    )
-    parser.add_argument(
-        '--read-model-path', type=str,
-        default='',
-        help='only used when --read-model is enabled, if not specified it reads the model from saving path.'
-    )
-    # weights
-    for key, val in par_table.items():
-        parser.add_argument(
-            '--weight-{}'.format(val.full), type=float,
-            default=1.0,
-            help='sample weight for {}.'.format(val.name)
-        )
-    args = parser.parse_args()
-    kwargs = vars(args)
-
-    os.makedirs(args.outdir, exist_ok=True)
-
-    if args.seed > 0:
-        np.random.seed(args.seed)
-        tf.random.set_seed(args.seed)
-
-    hf = h5py.File(args.data_store, 'r')
-    # preload all datasets
-    xdata = hf.get(imcal_info.ml_data_key)[...]
-    ml_tags = hf.get(imcal_info.truth_data_key)[...]
-    ml_pids = pd.Series(data=ml_tags[:, 0])
-
-    # fill dictionary if not pre-defined
-    data_pid = ml_pids.unique()
-    for pid in data_pid:
-        if pid not in par_table.keys():
-            print('WARNING = ML training: particle ({}) not found in table, updating it.'.format(pid))
-            par_table[pid] = dotdict(
-                name='PDG_{:d}'.format(pid),
-                full='PDG-{:d}'.format(pid),
-                latex='PDG_{:d}'.format(pid)
-            )
-
-    # build labels and weights for particles
-    # NOTE: pid follows the dictionary orders
-    pid_labels = OrderedDict()
-    pid_weights = OrderedDict()
-    for pid, part in par_table.items():
-        if pid not in data_pid:
-            continue
-        pid_weights[pid] = kwargs.get('weight_{}'.format(part.full.replace('-', '_')), 1.0)
-        pid_labels[pid] = len(pid_labels)
-    pids = list(pid_labels.keys())
-    print('ML training: input sample size {} - {}'.format(xdata.shape, ml_tags.shape))
-    print(pid_labels)
-    # print(pid_weights)
-
-    # data shape
-    # NOTE: number of layers for combined data
-    nlayers = imcal_info.nlayers_img + imcal_info.nlayers_scfi
-
-    # training datasets
-    ydata = ml_pids.map(pid_labels).values
-    wdata = ml_pids.map(pid_weights).values
-
-    # seperate train and test data
-    indices = np.arange(len(xdata))
-    np.random.shuffle(indices)
-    id_train = indices[int(len(indices)*args.test_size):]
-    id_test = indices[:int(len(indices)*args.test_size)]
-    # test data will be used for benchmarking, no need for sampling weights
-    xtrain, ytrain, wtrain = xdata[id_train], ydata[id_train], wdata[id_train]
-    xtest, ytest = xdata[id_test], ydata[id_test]
-
-    # try to use GPUs
-    # tf.debugging.set_log_device_placement(True)
-    gpus = tf.config.list_logical_devices('GPU')
-    print("ML training: number of GPUs available: ", len(gpus))
-    strategy = tf.distribute.MirroredStrategy(gpus)
-    if args.read_model:
-        model_path = os.path.join(args.outdir, '{}_model'.format(args.ntag))
-        if args.read_model_path:
-            model_path = args.read_model_path
-        model = keras.models.load_model(model_path)
-    else:
-        with strategy.scope():
-            train_dataset = tf.data.Dataset.from_tensor_slices((xtrain, ytrain, wtrain))
-            model = build_vgg(xdata.shape[1:], len(pid_labels))
-            model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4),
-                          loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
-                          metrics=['accuracy'])
-            history = model.fit(train_dataset.batch(args.batch_size), epochs=args.epochs)
-            model.save(os.path.join(args.outdir, '{}_model'.format(args.ntag)))
-
-    # benchmark
-    prediction = model.predict(xtest)
-    pred_weights = np.ones(len(pid_labels))
-
-    # results
-    npar = len(pid_labels)
-    opt_i = 0
-    opt_cut = 0.
-
-    # optimize for target particle's efficiency
-    opt_param = (None, 0, 1.0)
-    if args.optimize_for != 0:
-        opt_pid = args.optimize_for
-        if opt_pid not in pid_labels.keys():
-            print('WARNING = ML training: ' +
-                  'particle ({}) [PDGCODE] not found in samples, skip optimizing.'.format(opt_pid))
-        else:
-            opt_i = pid_labels.get(opt_pid)
-            # find optimal prob weight
-            opt_probs = prediction[ytest == opt_i].T[opt_i]
-            opt_cut = np.percentile(opt_probs, (1. - args.optimize_efficiency)*100.)
-            # NOTE: this weight calculation only works for two labels
-            opt_wt = (1. - opt_cut)/opt_cut
-            pred_weights[opt_i] = opt_wt
-            opt_param = (par_table.get(opt_pid).name, args.optimize_efficiency, opt_wt)
-
-    # pick labels with the highest probability
-    pred_labels = np.argmax(prediction*pred_weights, axis=1)
-    pred_probs = np.zeros(shape=(npar, npar))
-    opt_label = par_table.get(pids[opt_i]).latex
-
-    fig, ax = plt.subplots(figsize=(12, 9), dpi=160)
-    effs = []
-    for pid, ilbl in pid_labels.items():
-        mask = (ytest == ilbl)
-        pred_probs[ilbl] = np.bincount(pred_labels[mask])/float(np.sum(mask))
-        labels = par_table.get(pid)
-
-        ax.hist(prediction[mask].T[opt_i], bins=np.linspace(0, 1, 101), label='${}$'.format(labels.latex),
-                color=colors[ilbl], ec=colors[ilbl], alpha=0.5)
-
-        effs.append((labels, pred_probs[ilbl][opt_i]))
-    if opt_cut > 0.:
-        ax.axvline(x=opt_cut, lw=2, color='k', ls='--')
-        eff_string = '\n'.join([r'$\epsilon_{{{}}} = {:.2f}$%'.format(labels.latex, eff*100.) for labels, eff in effs])
-        data_to_axis = (ax.transAxes + ax.transData.inverted()).inverted()
-        ax.text(data_to_axis.transform((opt_cut, 1))[0] + 0.01, 0.99, eff_string, fontsize=24,
-                transform=ax.transAxes, ha='left', va='top')
-    ax.set_yscale('log')
-    ax.set_ylabel('Counts', fontsize=24)
-    ax.set_xlabel(r'$P_{{{}}}$'.format(opt_label), fontsize=24)
-    ax.tick_params(direction='in', which='both', labelsize=24)
-    ax.legend(fontsize=24, ncol=4, loc='upper center', bbox_to_anchor=(0.5, 1.12),)
-    fig.savefig(os.path.join(args.outdir, '{}_pid_hist.pdf'.format(args.ntag)))
-
-    # save results
-    # all results
-    result = OrderedDict(
-        training= OrderedDict(
-            epochs=args.epochs,
-            batch_size=args.batch_size,
-            sample_size=count_pid_labels(ytrain, pids),
-            sample_weights={par_table.get(pid).name: weight for pid, weight in pid_weights.items()},
-            optimal_cut=OrderedDict(
-                target_particle=opt_param[0],
-                target_efficiency=opt_param[1],
-                target_label_weight=opt_param[2],
-            ),
-        ),
-        test=OrderedDict(
-            sample_size=count_pid_labels(ytest, pids),
-            efficiencies=OrderedDict([(labels.name, eff) for labels, eff in effs]),
-            rejections=OrderedDict([(labels.name, 1./eff) for labels, eff in effs]),
-        ),
-    )
-    res_json = json.dumps(result, indent=4, cls=NpEncoder)
-    with open(os.path.join(args.outdir, '{}_result.json'.format(args.ntag)), 'w') as outfile:
-        outfile.write(res_json)
-
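
For reference on the storage change in ml_data_preprocess.py above: the script now grows chunked, resizable HDF5 datasets with h5py batch by batch instead of appending pandas DataFrames to an HDFStore, and ml_training.py later loads the arrays back with plain slicing. The minimal sketch below shows that append-and-reload pattern in isolation; the file name, dataset keys, and array shapes are illustrative placeholders, not values taken from the benchmark.

import numpy as np
import h5py

OUT_FILE = 'example_ml_data.h5'             # placeholder output path
ML_KEY, TAG_KEY = 'ml_data', 'truth_data'   # placeholder dataset keys

with h5py.File(OUT_FILE, 'w') as h5f:
    first_save = True
    for _ in range(5):  # stand-in for the per-batch event loop
        # fake batches shaped (events, layers, hits, features) and (events, truth columns)
        ml_data = np.random.rand(100, 29, 50, 5).astype('float32')
        tag_data = np.random.rand(100, 3).astype('float32')
        if first_save:
            # create resizable datasets whose first axis can grow without bound
            h5f.create_dataset(ML_KEY, data=ml_data, compression='gzip',
                               chunks=True, maxshape=(None, *ml_data.shape[1:]))
            h5f.create_dataset(TAG_KEY, data=tag_data, compression='gzip',
                               chunks=True, maxshape=(None, *tag_data.shape[1:]))
            first_save = False
        else:
            # grow the first axis and write the new batch into the tail slice
            for key, batch in ((ML_KEY, ml_data), (TAG_KEY, tag_data)):
                h5f[key].resize(h5f[key].shape[0] + batch.shape[0], axis=0)
                h5f[key][-batch.shape[0]:] = batch

# reload everything into memory, as ml_training.py does with [...]
with h5py.File(OUT_FILE, 'r') as h5f:
    xdata = h5f[ML_KEY][...]
    tags = h5f[TAG_KEY][...]
print(xdata.shape, tags.shape)  # expected: (500, 29, 50, 5) (500, 3)

Writing per batch this way keeps only one batch of events in memory at a time; maxshape=(None, ...) together with chunks=True is what permits the resize calls, which is the role of the first_save flag and the resize/assign pair in the patch.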