Commit bcc023f4 authored by Chao Peng

unified the way to prepare data and updated the training script accordingly

parent 529e7373
@@ -10,6 +10,8 @@ import os
import gc
import sys
import json
import glob
import h5py
import numpy as np
import pandas as pd
import argparse
@@ -20,8 +22,8 @@ pd.set_option('display.max_rows', 500)
# normalizing range for features
# NOTE: assumed for the barrel ECal; the range of 0.5 - 2 meters is used to normalize the r0 values
WIN_ETA = (-0.2, 0.2) # unitless
WIN_PHI = (-0.4, 0.4) # rad
WIN_ETA = (-0.4, 0.4) # unitless
WIN_PHI = (-0.6, 0.6) # rad
WIN_R0 = (500, 2000) # mm
WIN_EH = (0, 0.05) # GeV
@@ -49,7 +51,7 @@ def lin_norm(vals, window, val_name='', var_name='', warn_win_size=True):
if warn_win_size and len(vals) > 1000:
perc = (np.percentile(vals, 5), np.percentile(vals, 95))
if perc[0] < window[0] or perc[1] > window[1]:
warnstr = 'WARNING = Prepare ML data: normalization window {} does not fit {} values {}.'\
warnstr = 'WARNING = Prepare ML data: normalization window {} does not cover the 5-95 percentile range of {} values {}.'\
.format(window, val_name, perc)
if var_name:
warnstr += ' Try adjusting {}.'.format(var_name)
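For reference, the window constants above feed a windowed linear normalization; since the body of lin_norm is collapsed in this hunk, the following is only a minimal sketch of the assumed mapping (window onto [0, 1], with clipping):

import numpy as np

def lin_norm_sketch(vals, window):
    # assumed behavior: map the window linearly onto [0, 1] and clip outliers
    lo, hi = window
    return np.clip((np.asarray(vals, dtype=float) - lo) / (hi - lo), 0.0, 1.0)

# e.g. with WIN_PHI = (-0.6, 0.6), a hit at phi = 0.0 maps to 0.5
print(lin_norm_sketch([0.0], (-0.6, 0.6)))  # [0.5]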
@@ -129,7 +131,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Visualize the cluster (layer-wise) from analysis')
parser.add_argument(
'files', type=str,
help='path to the input files, separated by \",\".'
help='path to the input files; wildcards are supported.'
)
parser.add_argument(
'-o', type=str,
@@ -139,7 +141,7 @@ if __name__ == '__main__':
)
parser.add_argument(
'--batch-size', type=int,
default=100000,
default=10000,
help='batch size to process data.'
)
parser.add_argument(
@@ -173,28 +175,20 @@ if __name__ == '__main__':
default=None,
help='A JSON file containing the epcut information (the best cut will be used).'
)
parser.add_argument(
'--append-data', action='store_true',
default=False,
help='Append data to pre-existing data file.'
)
args = parser.parse_args()
h5f = h5py.File(args.outpath, 'w')
data_store = pd.HDFStore(args.outpath)
# print(data_store.keys())
# clear data store
if not args.append_data:
for dkey in [imcal_info.ml_data_key, imcal_info.truth_data_key]:
if dkey in data_store.keys():
data_store.remove(dkey)
print('Prepare ML data: remove pre-existed dataset {}'.format(dkey))
if args.seed > 0:
np.random.seed(args.seed)
if args.seed <= 0:
args.seed = int.from_bytes(os.urandom(4), byteorder='big', signed=False)
print('Using a random seed {:d}.'.format(args.seed))
np.random.seed(args.seed)
# read data and MCParticles
print([f.strip() for f in args.files.split(',')])
rdf = ROOT.RDataFrame("events", [f.strip() for f in args.files.split(',')])
# read data and MCParticles
files = glob.glob(args.files)
print('input data files:')
print(files)
rdf = ROOT.RDataFrame("events", files)
# event range
ntotal = rdf.Count().GetValue()
ev_bins = np.append(np.arange(0, ntotal, args.batch_size), [ntotal])
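As a quick check of the batching logic, with hypothetical values ntotal = 25000 and --batch-size 10000:

import numpy as np

ntotal, batch_size = 25000, 10000  # hypothetical values
ev_bins = np.append(np.arange(0, ntotal, batch_size), [ntotal])
# ev_bins == [0, 10000, 20000, 25000], so zip(ev_bins[:-1], ev_bins[1:])
# yields the event ranges (0, 10000), (10000, 20000) and (20000, 25000)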
@@ -205,6 +199,7 @@ if __name__ == '__main__':
# process events
# avoid insane amount of memory use with a large dataset
first_save = True
for ev_begin, ev_end in zip(ev_bins[:-1], ev_bins[1:]):
gc.collect()
print('Prepare ML data: processing events {:d} - {:d}'.format(ev_begin, ev_end))
@@ -285,10 +280,25 @@ if __name__ == '__main__':
# combine two datasets
df = pd.concat([dfi.reset_index(), dfs.reset_index()], ignore_index=True)\
.set_index(['event', 'ltype', 'layer', 'hit']).sort_index()
# print(df.head(100))
data_store.append(imcal_info.ml_data_key, df, format='t', data_columns=True)
data_store.append(imcal_info.truth_data_key, dfm[['PDG', 'P', 'mass']], format='t', data_columns=True)
data_store.close()
ml_data = df.values.reshape(len(ievs), imcal_info.nlayers_img + imcal_info.nlayers_scfi, args.nhits, len(df.columns))
tag_data = dfm[['PDG', 'P', 'mass']].values
if not len(ml_data):
continue
# create datasets
if first_save:
h5f.create_dataset(imcal_info.ml_data_key, data=ml_data, compression="gzip",
chunks=True, maxshape=(None, *ml_data.shape[1:]))
h5f.create_dataset(imcal_info.truth_data_key, data=tag_data, compression="gzip",
chunks=True, maxshape=(None, *tag_data.shape[1:]))
first_save = False
# append new batches to the existing datasets
else:
h5f[imcal_info.ml_data_key].resize((h5f[imcal_info.ml_data_key].shape[0] + ml_data.shape[0]), axis=0)
h5f[imcal_info.ml_data_key][-ml_data.shape[0]:] = ml_data
h5f[imcal_info.truth_data_key].resize((h5f[imcal_info.truth_data_key].shape[0] + tag_data.shape[0]), axis=0)
h5f[imcal_info.truth_data_key][-tag_data.shape[0]:] = tag_data
print("saved data {} - {}".format(h5f[imcal_info.ml_data_key].shape, h5f[imcal_info.truth_data_key].shape))
# print(df.head(100))
h5f.close()
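The grow-by-resize pattern used above (chunked HDF5 datasets created with an unlimited first axis, then extended batch by batch) can be isolated into a minimal sketch; the file name, dataset key and shapes below are placeholders, not the ones used by the script:

import h5py
import numpy as np

with h5py.File('example.h5', 'w') as f:
    batch = np.zeros((100, 29, 50, 5), dtype='float32')  # stands in for (nevents, nlayers, nhits, nfeatures)
    dset = f.create_dataset('ml_data', data=batch, compression='gzip',
                            chunks=True, maxshape=(None,) + batch.shape[1:])
    # later batches: grow along axis 0, then write into the new tail
    new_batch = np.ones((60, 29, 50, 5), dtype='float32')
    dset.resize(dset.shape[0] + new_batch.shape[0], axis=0)
    dset[-new_batch.shape[0]:] = new_batch
    print(dset.shape)  # (160, 29, 50, 5)

The training script below then reads these datasets back in full with hf.get(key)[...].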
@@ -129,7 +129,7 @@ if __name__ == '__main__':
if args.seed <= 0:
args.seed = int.from_bytes(os.urandom(4), byteorder='big', signed=False)
print('Using a random seed {:d}.'.format(args.seed))
print('Using a random seed {:d}.'.format(args.seed))
np.random.seed(args.seed)
# read data and MCParticles
......
@@ -9,6 +9,7 @@ import os
import sys
import json
import argparse
import h5py
import pandas as pd
import numpy as np
from collections import OrderedDict
@@ -89,22 +90,22 @@ if __name__ == '__main__':
)
parser.add_argument(
'--epochs', type=int,
default=10,
default=15,
help='epochs for ML training.'
)
parser.add_argument(
'--batch-size', type=int,
default=512,
default=128,
help='batch size for ML training.'
)
parser.add_argument(
'--optimize-for', type=int,
default=11,
default=22,
help='optimize cut for a target particle (PDGCODE).'
)
parser.add_argument(
'--optimize-efficiency', type=float,
default=0.98,
default=0.95,
help='used when --optimize-for is specified; try to keep this efficiency for the target particle with the optimized cut.'
)
parser.add_argument(
@@ -133,14 +134,14 @@ if __name__ == '__main__':
np.random.seed(args.seed)
tf.random.set_seed(args.seed)
df = pd.read_hdf(args.data_store, key=imcal_info.ml_data_key)
# NOTE: assumed event index is exactly the same as df
dft = pd.read_hdf(args.data_store, key=imcal_info.truth_data_key)
print('ML training: input sample size.')
print(dft['PDG'].value_counts())
hf = h5py.File(args.data_store, 'r')
# preload all datasets
xdata = hf.get(imcal_info.ml_data_key)[...]
ml_tags = hf.get(imcal_info.truth_data_key)[...]
ml_pids = pd.Series(data=ml_tags[:, 0])
# fill dictionary if not pre-defined
data_pid = dft['PDG'].unique()
data_pid = ml_pids.unique()
for pid in data_pid:
if pid not in par_table.keys():
print('WARNING = ML training: particle ({}) not found in table, updating it.'.format(pid))
@@ -160,19 +161,17 @@ if __name__ == '__main__':
pid_weights[pid] = kwargs.get('weight_{}'.format(part.full.replace('-', '_')), 1.0)
pid_labels[pid] = len(pid_labels)
pids = list(pid_labels.keys())
# print(pid_labels)
print('ML training: input sample size {} - {}'.format(xdata.shape, ml_tags.shape))
print(pid_labels)
# print(pid_weights)
# data shape
# NOTE: number of layers for combined data
nlayers = imcal_info.nlayers_img + imcal_info.nlayers_scfi
# NOTE: index levels should be [event, layer_type, layer, hit], change the code if data structure is changed
nhits = df.index.levels[-1].max()
# training datasets
xdata = df.values.reshape(dft.shape[0], nlayers, nhits, len(df.columns))
ydata = dft['PDG'].map(pid_labels).values
wdata = dft['PDG'].map(pid_weights).values
ydata = ml_pids.map(pid_labels).values
wdata = ml_pids.map(pid_weights).values
# separate train and test data
indices = np.arange(len(xdata))
......
"""
Script for supervised training of a simple ML model using TensorFlow 2.
ML e/pi separation for the imaging barrel ECal.
Chao Peng (ANL)
2022/11/11
"""
import os
import sys
import json
import argparse
import h5py
import pandas as pd
import numpy as np
from collections import OrderedDict
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, FixedLocator, MaxNLocator
from matplotlib.colors import LogNorm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from utils import dotdict, imcal_info, par_table, NpEncoder
# default color cycle of matplotlib
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
def count_pid_labels(label_lst, pid_list):
res = OrderedDict()
res['total'] = len(label_lst)
for p, cnt in zip(pid_list, np.bincount(label_lst)):
res[par_table.get(p).name] = '{:d}'.format(cnt)
return res
## Slightly beefier VGG-style CNN
def build_vgg(input_shape, n_labels=2):
my_model = keras.Sequential([
keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same', input_shape=input_shape),
keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'),
keras.layers.MaxPooling2D(pool_size=(2, 2), strides=2),
keras.layers.Conv2D(128, kernel_size=(3, 3), activation='relu'),
keras.layers.Conv2D(128, kernel_size=(3, 3), activation='relu'),
keras.layers.Conv2D(128, kernel_size=(3, 3), activation='relu'),
keras.layers.MaxPooling2D(pool_size=(2, 2), strides=2),
keras.layers.Flatten(),
keras.layers.Dense(1024, activation='relu'),
keras.layers.Dense(1024, activation='relu'),
#keras.layers.Dense(512, activation='relu'),
keras.layers.Dense(n_labels, activation='softmax')
])
return my_model
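# NOTE: input_shape is expected to be (nlayers, nhits, nfeatures) for a single event,
# matching the array layout written by the data preparation script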
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'data_store', type=str,
help='path to data store.'
)
parser.add_argument(
'-o', type=str,
dest='outdir',
default='ml_result',
help='output directory.'
)
parser.add_argument(
'-s', '--seed', type=int,
default=-1,
help='random seed.'
)
parser.add_argument(
'-t', '--name-tag', type=str,
dest='ntag',
default='imcal_ml',
help='name tag for output files/models.'
)
parser.add_argument(
'--test-size', type=float,
default=0.2,
help='relative size of test samples (between 0-1).'
)
parser.add_argument(
'--epochs', type=int,
default=15,
help='epochs for ML training.'
)
parser.add_argument(
'--batch-size', type=int,
default=128,
help='batch size for ML training.'
)
parser.add_argument(
'--optimize-for', type=int,
default=22,
help='optimize cut for a target particle (PDGCODE).'
)
parser.add_argument(
'--optimize-efficiency', type=float,
default=0.95,
help='used when --optimize-for is specified; try to keep this efficiency for the target particle with the optimized cut.'
)
parser.add_argument(
'--read-model', action='store_true',
default=False,
help='read a trained model'
)
parser.add_argument(
'--read-model-path', type=str,
default='',
help='only used when --read-model is enabled; if not specified, the model is read from the default saving path.'
)
# weights
for key, val in par_table.items():
parser.add_argument(
'--weight-{}'.format(val.full), type=float,
default=1.0,
help='sample weight for {}.'.format(val.name)
)
args = parser.parse_args()
kwargs = vars(args)
os.makedirs(args.outdir, exist_ok=True)
if args.seed > 0:
np.random.seed(args.seed)
tf.random.set_seed(args.seed)
hf = h5py.File(args.data_store, 'r')
# preload all datasets
xdata = hf.get(imcal_info.ml_data_key)[...]
ml_tags = hf.get(imcal_info.truth_data_key)[...]
ml_pids = pd.Series(data=ml_tags[:, 0])
# fill dictionary if not pre-defined
data_pid = ml_pids.unique()
for pid in data_pid:
if pid not in par_table.keys():
print('WARNING = ML training: particle ({}) not found in table, updating it.'.format(pid))
par_table[pid] = dotdict(
name='PDG_{:d}'.format(pid),
full='PDG-{:d}'.format(pid),
latex='PDG_{:d}'.format(pid)
)
# build labels and weights for particles
# NOTE: pid follows the dictionary orders
pid_labels = OrderedDict()
pid_weights = OrderedDict()
for pid, part in par_table.items():
if pid not in data_pid:
continue
pid_weights[pid] = kwargs.get('weight_{}'.format(part.full.replace('-', '_')), 1.0)
pid_labels[pid] = len(pid_labels)
pids = list(pid_labels.keys())
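# pid_labels maps each PDG code present in the samples to a 0-based class index
# (following the par_table order); pid_weights carries the per-class sample weights
# taken from the corresponding --weight-* options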
print('ML training: input sample size {} - {}'.format(xdata.shape, ml_tags.shape))
print(pid_labels)
# print(pid_weights)
# data shape
# NOTE: number of layers for combined data
nlayers = imcal_info.nlayers_img + imcal_info.nlayers_scfi
# training datasets
ydata = ml_pids.map(pid_labels).values
wdata = ml_pids.map(pid_weights).values
# separate train and test data
indices = np.arange(len(xdata))
np.random.shuffle(indices)
id_train = indices[int(len(indices)*args.test_size):]
id_test = indices[:int(len(indices)*args.test_size)]
# test data will be used for benchmarking, no need for sampling weights
xtrain, ytrain, wtrain = xdata[id_train], ydata[id_train], wdata[id_train]
xtest, ytest = xdata[id_test], ydata[id_test]
# try to use GPUs
# tf.debugging.set_log_device_placement(True)
gpus = tf.config.list_logical_devices('GPU')
print("ML training: number of GPUs available: ", len(gpus))
strategy = tf.distribute.MirroredStrategy(gpus)
if args.read_model:
model_path = os.path.join(args.outdir, '{}_model'.format(args.ntag))
if args.read_model_path:
model_path = args.read_model_path
model = keras.models.load_model(model_path)
else:
with strategy.scope():
train_dataset = tf.data.Dataset.from_tensor_slices((xtrain, ytrain, wtrain))
model = build_vgg(xdata.shape[1:], len(pid_labels))
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4),
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
metrics=['accuracy'])
history = model.fit(train_dataset.batch(args.batch_size), epochs=args.epochs)
model.save(os.path.join(args.outdir, '{}_model'.format(args.ntag)))
# benchmark
prediction = model.predict(xtest)
pred_weights = np.ones(len(pid_labels))
# results
npar = len(pid_labels)
opt_i = 0
opt_cut = 0.
# optimize for target particle's efficiency
opt_param = (None, 0, 1.0)
if args.optimize_for != 0:
opt_pid = args.optimize_for
if opt_pid not in pid_labels.keys():
print('WARNING = ML training: ' +
'particle ({}) [PDGCODE] not found in samples, skipping optimization.'.format(opt_pid))
else:
opt_i = pid_labels.get(opt_pid)
# find optimal prob weight
opt_probs = prediction[ytest == opt_i].T[opt_i]
opt_cut = np.percentile(opt_probs, (1. - args.optimize_efficiency)*100.)
# NOTE: this weight calculation only works for two labels
opt_wt = (1. - opt_cut)/opt_cut
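# for two labels, argmax([p0, p1*w]) selects the target iff p1*w > p0 = 1 - p1,
# i.e. iff p1 > 1/(1 + w); with w = (1 - cut)/cut that threshold equals the cut,
# so weighting the target score reproduces the probability cut found above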
pred_weights[opt_i] = opt_wt
opt_param = (par_table.get(opt_pid).name, args.optimize_efficiency, opt_wt)
# pick labels with the highest probability
pred_labels = np.argmax(prediction*pred_weights, axis=1)
pred_probs = np.zeros(shape=(npar, npar))
opt_label = par_table.get(pids[opt_i]).latex
fig, ax = plt.subplots(figsize=(12, 9), dpi=160)
effs = []
for pid, ilbl in pid_labels.items():
mask = (ytest == ilbl)
pred_probs[ilbl] = np.bincount(pred_labels[mask], minlength=npar)/float(np.sum(mask))
labels = par_table.get(pid)
ax.hist(prediction[mask].T[opt_i], bins=np.linspace(0, 1, 101), label='${}$'.format(labels.latex),
color=colors[ilbl], ec=colors[ilbl], alpha=0.5)
effs.append((labels, pred_probs[ilbl][opt_i]))
if opt_cut > 0.:
ax.axvline(x=opt_cut, lw=2, color='k', ls='--')
eff_string = '\n'.join([r'$\epsilon_{{{}}} = {:.2f}$%'.format(labels.latex, eff*100.) for labels, eff in effs])
data_to_axis = (ax.transAxes + ax.transData.inverted()).inverted()
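# data_to_axis converts data coordinates into axes-fraction coordinates,
# so the efficiency summary can be anchored just to the right of the cut line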
ax.text(data_to_axis.transform((opt_cut, 1))[0] + 0.01, 0.99, eff_string, fontsize=24,
transform=ax.transAxes, ha='left', va='top')
ax.set_yscale('log')
ax.set_ylabel('Counts', fontsize=24)
ax.set_xlabel(r'$P_{{{}}}$'.format(opt_label), fontsize=24)
ax.tick_params(direction='in', which='both', labelsize=24)
ax.legend(fontsize=24, ncol=4, loc='upper center', bbox_to_anchor=(0.5, 1.12),)
fig.savefig(os.path.join(args.outdir, '{}_pid_hist.pdf'.format(args.ntag)))
# save results
# all results
result = OrderedDict(
training= OrderedDict(
epochs=args.epochs,
batch_size=args.batch_size,
sample_size=count_pid_labels(ytrain, pids),
sample_weights={par_table.get(pid).name: weight for pid, weight in pid_weights.items()},
optimal_cut=OrderedDict(
target_particle=opt_param[0],
target_efficiency=opt_param[1],
target_label_weight=opt_param[2],
),
),
test=OrderedDict(
sample_size=count_pid_labels(ytest, pids),
efficiencies=OrderedDict([(labels.name, eff) for labels, eff in effs]),
rejections=OrderedDict([(labels.name, 1./eff) for labels, eff in effs]),
),
)
res_json = json.dumps(result, indent=4, cls=NpEncoder)
with open(os.path.join(args.outdir, '{}_result.json'.format(args.ntag)), 'w') as outfile:
outfile.write(res_json)