Skip to content
Snippets Groups Projects
Commit 84f3cb75 authored by Chao Peng's avatar Chao Peng
Browse files

update ML to date

parent 1c377c8f
No related branches found
No related tags found
1 merge request!233update ML to date
...@@ -3,12 +3,16 @@ import ROOT ...@@ -3,12 +3,16 @@ import ROOT
from Gaudi.Configuration import * from Gaudi.Configuration import *
from GaudiKernel import SystemOfUnits as units from GaudiKernel import SystemOfUnits as units
from Configurables import ApplicationMgr, EICDataSvc, PodioOutput, GeoSvc from Configurables import ApplicationMgr, EICDataSvc, PodioInput, PodioOutput, GeoSvc
from GaudiKernel.SystemOfUnits import MeV, GeV, mm, cm, mrad
from Configurables import PodioInput from Configurables import Jug__Base__InputCopier_dd4pod__Geant4ParticleCollection_dd4pod__Geant4ParticleCollection_ as MCCopier
from Configurables import Jug__Digi__CalorimeterHitDigi as CalorimeterHitDigi from Configurables import Jug__Digi__CalorimeterHitDigi as CalHitDigi
from Configurables import Jug__Reco__ImagingPixelReco as ImagingPixelReco from Configurables import Jug__Reco__CalorimeterHitReco as CalHitReco
from Configurables import Jug__Reco__ImagingPixelMerger as ImagingPixelMerger from Configurables import Jug__Reco__CalorimeterHitsMerger as CalHitsMerger
from Configurables import Jug__Reco__ImagingPixelMerger as MLDataMerger
from Configurables import Jug__Reco__ImagingPixelDataSorter as MLDataSorter
from Configurables import Jug__Reco__ImagingPixelDataCombiner as MLDataCombiner
# input arguments through environment variables # input arguments through environment variables
...@@ -17,13 +21,13 @@ kwargs['input'] = os.environ.get('JUGGLER_SIM_FILE', '') ...@@ -17,13 +21,13 @@ kwargs['input'] = os.environ.get('JUGGLER_SIM_FILE', '')
kwargs['output'] = os.environ.get('JUGGLER_REC_FILE', '') kwargs['output'] = os.environ.get('JUGGLER_REC_FILE', '')
kwargs['compact'] = os.environ.get('JUGGLER_COMPACT_PATH', '') kwargs['compact'] = os.environ.get('JUGGLER_COMPACT_PATH', '')
kwargs['nev'] = int(os.environ.get('JUGGLER_N_EVENTS', -1)) kwargs['nev'] = int(os.environ.get('JUGGLER_N_EVENTS', -1))
kwargs['nlayers'] = int(os.environ.get('IMCAL_ML_N_LAYERS', 20)) kwargs['combine'] = os.environ.get('IMCAL_ML_COMBINE', 'concatenate')
kwargs['nhits'] = int(os.environ.get('IMCAL_ML_N_HITS', 20)) kwargs['img_nlayers'] = int(os.environ.get('IMCAL_ML_IMG_NLAYERS', 9))
kwargs['nhits'] = int(os.environ.get('IMCAL_ML_NHITS', 20))
if kwargs['nev'] < 1: if kwargs['nev'] < 1:
f = ROOT.TFile(kwargs['input']) f = ROOT.TFile(kwargs['input'])
kwargs['nev'] = f.events.GetEntries() kwargs['nev'] = f.events.GetEntries()
print(kwargs) print(kwargs)
# get sampling fraction from system environment variable, 1.0 by default # get sampling fraction from system environment variable, 1.0 by default
sf = float(os.environ.get('JUGGLER_SAMP_FRAC', '1.0')) sf = float(os.environ.get('JUGGLER_SAMP_FRAC', '1.0'))
...@@ -31,40 +35,118 @@ sf = float(os.environ.get('JUGGLER_SAMP_FRAC', '1.0')) ...@@ -31,40 +35,118 @@ sf = float(os.environ.get('JUGGLER_SAMP_FRAC', '1.0'))
geo_service = GeoSvc('GeoSvc', detectors=[f.strip() for f in kwargs['compact'].split(',')]) geo_service = GeoSvc('GeoSvc', detectors=[f.strip() for f in kwargs['compact'].split(',')])
podev = EICDataSvc('EventDataSvc', inputs=[f.strip() for f in kwargs['input'].split(',')]) podev = EICDataSvc('EventDataSvc', inputs=[f.strip() for f in kwargs['input'].split(',')])
podin = PodioInput('PodioReader', collections=['mcparticles', 'EcalBarrelHits']) podin = PodioInput('PodioReader', collections=['mcparticles', 'EcalBarrelHits', 'EcalBarrelScFiHits'])
podout = PodioOutput('out', filename=kwargs['output']) podout = PodioOutput('out', filename=kwargs['output'])
imcaldigi = CalorimeterHitDigi('imcal_digi', copier = MCCopier('MCCopier',
OutputLevel=WARNING,
inputCollection='mcparticles',
outputCollection='mcparticles2')
# Central Barrel Ecal (Imaging Cal.)
becal_img_daq = dict(
dynamicRangeADC=3*MeV,
capacityADC=8192,
pedestalMean=400,
pedestalSigma=20) # about 6 keV
becal_img_digi = CalHitDigi('becal_img_digi',
inputHitCollection='EcalBarrelHits', inputHitCollection='EcalBarrelHits',
outputHitCollection='EcalBarrelHitsDigi', outputHitCollection='EcalBarrelImagingHitsDigi',
energyResolutions=[0., 0.02, 0.], energyResolutions=[0., 0.02, 0.], # 2% flat resolution
dynamicRangeADC=3*units.MeV, **becal_img_daq)
pedestalSigma=40)
imcalreco = ImagingPixelReco('imcal_reco', becal_img_reco = CalHitReco('becal_img_reco',
inputHitCollection=imcaldigi.outputHitCollection, inputHitCollection=becal_img_digi.outputHitCollection,
outputHitCollection='EcalBarrelHitsReco', outputHitCollection='EcalBarrelImagingHitsReco',
dynamicRangeADC=3.*units.MeV, thresholdFactor=3, # about 20 keV
pedestalSigma=40, readoutClass='EcalBarrelHits', # readout class
thresholdFactor=3.5, layerField='layer', # field to get layer id
readoutClass='EcalBarrelHits', sectorField='module', # field to get sector id
layerField='layer', **becal_img_daq)
sectorField='module')
imcaldata = ImagingPixelMerger('imcal_merger', becal_img_merger = MLDataMerger('becal_img_merger',
# OutputLevel=DEBUG, inputHitCollection=becal_img_reco.outputHitCollection,
inputHitCollection=imcalreco.outputHitCollection, outputHitCollection='EcalBarrelImagingHitsSeg',
outputHitCollection='EcalBarrelHitsML',
etaSize=0.001, etaSize=0.001,
phiSize=0.001, phiSize=0.001)
numberOfHits=kwargs['nhits'],
numberOfLayers=kwargs['nlayers']) becal_img_sorter = MLDataSorter('becal_img_sorter',
inputHitCollection=becal_img_merger.outputHitCollection,
outputHitCollection='EcalBarrelImagingHitsML',
numberOfLayers=kwargs['img_nlayers'],
numberOfHits=kwargs['nhits'])
#Central ECAL SciFi
# use the same daq_setting for digi/reco pair
becal_scfi_daq = dict(
dynamicRangeADC=50.*MeV,
capacityADC=32768,
pedestalMean=400,
pedestalSigma=10)
# becal_scfi_daq = dict(
# dynamicRangeADC=50.*MeV,
# capacityADC=2**16,
# pedestalMean=100,
# pedestalSigma=0)
becal_scfi_digi = CalHitDigi('becal_scfi_digi',
inputHitCollection='EcalBarrelScFiHits',
outputHitCollection='EcalBarrelScFiHitsDigi',
**becal_scfi_daq)
becal_scfi_reco = CalHitReco('becal_scfi_reco',
inputHitCollection=becal_scfi_digi.outputHitCollection,
outputHitCollection='EcalBarrelScFiHitsReco',
thresholdFactor=5.0,
readoutClass='EcalBarrelScFiHits',
layerField='layer',
sectorField='module',
localDetFields=['system', 'module'], # use local coordinates in each module (stave)
**becal_scfi_daq)
# merge hits in different layer (projection to local x-y plane)
becal_scfi_merger = CalHitsMerger('becal_scfi_merger',
# OutputLevel=DEBUG,
inputHitCollection=becal_scfi_reco.outputHitCollection,
outputHitCollection='EcalBarrelScFiGridReco',
fields=['fiber'],
fieldRefNumbers=[1],
readoutClass='EcalBarrelScFiHits')
becal_scfi_sorter = MLDataSorter('becal_scfi_sorter',
inputHitCollection=becal_scfi_merger.outputHitCollection,
outputHitCollection='EcalBarrelScFiHitsML',
numberOfLayers=20,
numberOfHits=kwargs['nhits'])
# combine layers
becal_combiner = MLDataCombiner('becal_combiner',
inputHitCollection1=becal_img_sorter.outputHitCollection,
inputHitCollection2=becal_scfi_sorter.outputHitCollection,
outputHitCollection='EcalBarrelHitsCombinedML',
layerIncrement=100,
rule=kwargs['combine'])
podout.outputCommands = ['keep *'] podout.outputCommands = [
# 'keep *',
'drop *',
'keep EcalBarrel*Reco',
'keep mcparticles*',
'keep EcalBarrel*ML',
'keep *Corr',
]
ApplicationMgr( ApplicationMgr(
TopAlg=[podin, imcaldigi, imcalreco, imcaldata, podout], TopAlg=[podin, copier,
becal_img_digi, becal_img_reco, becal_img_merger, becal_img_sorter,
becal_scfi_digi, becal_scfi_reco, becal_scfi_merger, becal_scfi_sorter,
becal_combiner,
podout],
EvtSel='NONE', EvtSel='NONE',
EvtMax=kwargs['nev'], EvtMax=kwargs['nev'],
ExtSvc=[podev], ExtSvc=[podev],
OutputLevel=DEBUG OutputLevel=WARNING
) )
...@@ -16,6 +16,7 @@ PARTICLES = { ...@@ -16,6 +16,7 @@ PARTICLES = {
"electron": (11, 0.51099895e-3), # electron "electron": (11, 0.51099895e-3), # electron
"positron": (-11, 0.51099895e-3),# positron "positron": (-11, 0.51099895e-3),# positron
"photon": (22, 0), # photon "photon": (22, 0), # photon
"muon": (13, 105.6583755), # muon
} }
......
import os
import pandas as pd
import numpy as np
from collections import OrderedDict
from scipy.interpolate import CubicSpline
# import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.ticker import MultipleLocator, FixedLocator, MaxNLocator
from matplotlib.colors import LogNorm
from particle import Particle
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# 5 layers 0-1-2-6-9
# 6 layers 0-1-2-3-4-6-9
def get_layer_cum_edeps(edep):
return np.tile(np.arange(edep.shape[1]), (edep.shape[0], 1)).flatten(), np.cumsum(edep, axis=1).flatten()
def draw_layer_edep(edep, label=''):
layer_num, edep_val = get_layer_cum_edeps(edep)
figure, axis = plt.subplots(figsize=(16, 9), dpi=120)
axis.hist2d(layer_num, edep_val,
bins=(np.arange(0.5, edep.shape[1] + 0.5, step=1.0), 100), norm=mpl.colors.LogNorm())
axis.grid(linestyle=':')
axis.set_axisbelow(True)
axis.tick_params(labelsize=24, direction='in', which='both')
axis.set_ylabel('Cumulative Edep (MeV)', fontsize=24)
axis.set_xlabel('Layer number', fontsize=24)
axis.xaxis.set_major_locator(MaxNLocator(edep.shape[1], integer=True))
if label:
axis.set_title(label, fontsize=26)
return figure, axis
train_model = True
do_epcut = False
nametag = 'real_imcal'
partag = 'epimuphka'
# partag = 'epi'
prange = (2.0, 2.0)
etag = '{:.1f}_{:.1f}_combined'.format(*prange)
epoch = 20
# data_shape = (20, 20, 3)
data_shape = (29, 20, 5)
data_dir = r'D:\Data\EIC\ImagingCalorimetry\{name}_{part}_{etag}'.format(name=nametag, part=partag, etag=etag)
out_dir = r'C:\Users\cpeng\OneDrive\EIC\ImagingCalorimeter\AstroPix\hybrid\{name}_{part}_{etag}'\
.format(name=nametag, part=partag, etag=etag)
raw_data = np.load(os.path.join(data_dir, 'data.npy'))
raw_tags = np.load(os.path.join(data_dir, 'tag.npy'))
edeps = np.load(os.path.join(data_dir, 'edep.npy'))
# for interlayered data, null the last few layers
astro_nlayers = 9
astro_nlayers_used = [0, 1, 2, 3, 4]
astro_nlayers_not_used = [2*i for i in np.arange(astro_nlayers) if i not in astro_nlayers_used]
if len(astro_nlayers_used) and len(astro_nlayers_not_used):
raw_data[:, astro_nlayers_not_used] = 0
os.makedirs(out_dir, exist_ok=True)
# label converter
labels = OrderedDict([
(pid, [i, Particle.from_pdgid(pid).name, ''])
for i, pid in enumerate(sorted(np.unique(raw_tags.T[0].astype(int)), key=lambda x: np.abs(x)))
])
# add some labels in latex format
name_dict = {'gamma': r'$\gamma$', '-': '$^-$', '+': '$^+$', 'pi': r'$\pi$', 'mu': r'$\mu$'}
for pid, (i, name, _) in labels.items():
labelstr = name
for str1, str2 in name_dict.items():
labelstr = labelstr.replace(str1, str2)
labels[pid][2] = labelstr
part_weights = {pid: 1.0 for pid, _ in labels.items()}
part_weights.update({
11: 1.0,
-211: 10.0,
})
begin_layer, end_layer = 0, 9
epcut = 0.085
if do_epcut:
epi_ids = (11, -211) # (11, -211)
# Eoverp cut in tags data
eovp = np.sum(edeps[:, begin_layer:end_layer], axis=1)*1e-3/raw_tags.T[1]
emask = raw_tags.T[0] == epi_ids[0]
pimask = raw_tags.T[0] == epi_ids[1]
epmask = tuple([eovp > epcut])
ep_eff = sum(raw_tags[epmask].T[0] == 11) / sum(emask)
ep_rej = 1. - sum(raw_tags[epmask].T[0] == -211) / sum(pimask)
bins = np.linspace(0, 0.15, 151)
fig, ax = plt.subplots(figsize=(12, 9), dpi=120)
ax.hist(eovp[emask], histtype='step', density=True, bins=bins, label=labels[epi_ids[0]][2])
ax.hist(eovp[pimask], histtype='step', density=True, bins=bins, label=labels[epi_ids[1]][2])
ax.axvline(epcut, color='k')
ax.set_xlabel('Edep / p', fontsize=24)
ax.set_ylabel('Normalized Counts', fontsize=24)
ax.tick_params(labelsize=24)
ax.legend(fontsize=24)
ax.text(0.2, 0.9, '$e$ eff. = {:.2f}%\n$\pi$ rej. = {:.2f}%'.format(ep_eff*100., ep_rej*100.),
fontsize=24, transform=ax.transAxes, ha='left', va='top')
ax.set_title('$E/p > {:.2f}$% @ {:d} - {:d} X$_0$'.format(epcut*100., begin_layer, end_layer), fontsize=24)
fig.savefig(os.path.join(out_dir, 'pre_epcut.png'))
fig, _ = draw_layer_edep(edeps[emask], 'Cumulative Edep over layer for electrons')
fig.savefig(os.path.join(out_dir, 'cum_edep_el.png'))
fig, _ = draw_layer_edep(edeps[pimask], 'Cumulative Edep over layer for pions')
fig.savefig(os.path.join(out_dir, 'cum_edep_pi.png'))
data = raw_data[epmask]
tags = raw_tags[epmask]
else:
ep_eff = 1.0
ep_rej = 0.
data = raw_data
tags = raw_tags
pid = np.vectorize(lambda x: labels[x][0])(tags.T[0])
weights = np.vectorize(lambda x: part_weights[x])(tags.T[0])
if train_model:
indices = np.arange(len(data))
np.random.shuffle(indices)
id_train = indices[:int(len(data)*0.8)]
id_valid = indices[int(len(data)*0.8):]
x_train = data[id_train]
y_train = pid[id_train]
w_train = weights[id_train]
x_valid = data[id_valid]
y_valid = pid[id_valid]
w_valid = weights[id_valid]
model = keras.Sequential([
keras.layers.Conv2D(64, (3, 3), padding='same', activation='selu', input_shape=data_shape),
# keras.layers.MaxPooling2D((2, 2), strides=2),
keras.layers.Dropout(0.25),
keras.layers.Conv2D(128, (2, 2), padding='same', activation='selu'),
# keras.layers.MaxPooling2D((2, 2), strides=2),
keras.layers.Dropout(0.3),
keras.layers.Conv2D(64, (2, 2), padding='same', activation='selu'),
# keras.layers.MaxPooling2D((2, 2), strides=2),
keras.layers.Flatten(),
keras.layers.Dense(128, activation='selu'),
keras.layers.Dropout(0.25),
keras.layers.Dense(32, activation='selu'),
keras.layers.Dense(len(labels), activation='softmax')
])
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4),
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
metrics=['accuracy'])
history = model.fit(x_train, y_train,
epochs=epoch, sample_weight=w_train, validation_data=(x_valid, y_valid, w_valid))
model.save(os.path.join(out_dir, 'model_epch{:d}'.format(epoch)))
else:
model = keras.models.load_model(os.path.join(out_dir, 'model_epch{:d}'.format(epoch)))
# ---- general ----
# x_test = data
# y_test = pid
# prediction = model.predict(x_test)
# result = prediction.argmax(axis=1)
# pid_probs = []
# for _, part in labels.items():
# mask = (y_test == part[0])
# res = result[mask]
# pid_probs.append(np.bincount(res, minlength=len(labels))/sum(mask))
# pid_probs = np.asarray(pid_probs)
# predictions
x_test = data
y_test = pid
prediction = model.predict(x_test)
# two labels (pion - electron)
if len(labels) == 2:
# --- pion - electron ----
pion_prob_cut = 0.5
pid_probs = np.zeros(shape=(2, 2))
fig, ax = plt.subplots(1, 1, figsize=(12, 9), dpi=160)
ax.set_yscale('log')
ax.set_ylabel('Counts', fontsize=24)
ax.set_xlabel(r'$P_{\pi}$', fontsize=24)
ax.tick_params(direction='in', which='both', labelsize=24)
for i, (_, part) in enumerate(labels.items()):
mask = (y_test == part[0])
probs = prediction[mask]
ax.hist(probs.T[1], histtype='step', bins=np.linspace(0, 1, 101), label=part[1])
pid_probs[i][0] = sum(probs.T[1] < pion_prob_cut) / float(len(probs))
pid_probs[i][1] = 1. - pid_probs[i][0]
ax.axvline(x=pion_prob_cut, lw=2, color='k', ls='--')
ax.text(0.55, 0.9, '$e$ eff. = {:.2f}%\n$\pi$ rej. = {:.2f}%'.format(pid_probs[0][0]*100., pid_probs[1][1]*100.),
fontsize=24, transform=ax.transAxes, ha='left', va='top')
ax.legend(fontsize=24, loc='lower left')
fig.savefig(os.path.join(out_dir, 'pid_tag_hist.png'))
fig.savefig(os.path.join(out_dir, 'pid_tag_hist.pdf'))
# --- pion - electron ----
result_text = open(os.path.join(out_dir, 'rejection_result.txt'), 'w')
lines = [
'E/p cut position: {:.4f}, layer {:d} to {:d}, E/p efficiency: {:.4f}%, rejection power: {:.4f}'\
.format(epcut, begin_layer, end_layer, ep_eff*100., 1./(1. - ep_rej)),
] if do_epcut else []
lines += [
'ML training weight: {}, pion cut position: {:.4f}, ML efficiency: {:.4f}%, rejection power: {:.4f}'\
.format(list(part_weights.values()), pion_prob_cut, pid_probs[0][0]*100., 1./(1. - pid_probs[1][1])),
'combined efficiency: {:.2f}%, rejection power: {:.1f}'\
.format(pid_probs[0][0]*ep_eff*100., 1./((1. - ep_rej)*(1. - pid_probs[1][1]))),
]
result_text.write('\n'.join(lines))
result_text.close()
print('\n'.join(lines))
else:
# --- multi-particles ---
pid_probs = np.zeros(shape=(len(labels), len(labels)))
fig, ax = plt.subplots(1, 1, figsize=(8, 8), dpi=160, gridspec_kw={'left': 0.15, 'right': 0.95})
ax.set_yscale('log')
ax.set_ylabel('Counts', fontsize=26)
ax.set_xlabel(r'Likelihood of the Corresponding Label', fontsize=26)
ax.tick_params(direction='in', which='both', labelsize=26)
# label with maximum likelihood
for _, (i, part, label) in labels.items():
mask = (y_test == i)
probs = prediction[mask]
pred_labels = probs.argmax(axis=1)
ax.hist(probs.T[i], histtype='step', bins=np.linspace(0, 1, 101), label=label)
for _, (j, _, _) in labels.items():
pid_probs[i][j] = np.sum(pred_labels == j)/float(np.sum(mask))
ax.legend(fontsize=26, loc='upper center', ncol=3, handletextpad=0.3, columnspacing=0.6)
fig.savefig(os.path.join(out_dir, 'pid_tag_hist.png'))
fig.savefig(os.path.join(out_dir, 'pid_tag_hist.pdf'))
# --- multi-particles ---
fig, ax = plt.subplots(1, 1, figsize=(8, 8), dpi=160, gridspec_kw={'left': 0.15, 'right': 0.95})
ax.imshow(pid_probs, cmap='PuBu', norm=LogNorm(vmin=0.01, vmax=1))
ax.xaxis.set_major_locator(FixedLocator(np.arange(0, pid_probs.shape[0], step=1.0)))
ax.yaxis.set_major_locator(FixedLocator(np.arange(0, pid_probs.shape[1], step=1.0)))
ax.xaxis.set_minor_locator(FixedLocator(np.arange(0.5, pid_probs.shape[0], step=1.0)))
ax.yaxis.set_minor_locator(FixedLocator(np.arange(0.5, pid_probs.shape[1], step=1.0)))
ax.set_xticklabels([x[2] for _, x in labels.items()], fontsize=26, va='bottom')
ax.set_yticklabels([x[2] for _, x in labels.items()], fontsize=26, ha='left')
ax.tick_params(bottom=False, left=False, which='both')
ax.tick_params(pad=30, axis='x')
ax.tick_params(pad=40, axis='y')
ax.set_ylabel('Truth', fontsize=26)
ax.set_xlabel('Classification', fontsize=26)
for i in range(pid_probs.shape[0]):
for j in range(pid_probs.shape[1]):
color = 'k' if pid_probs[i, j] < 0.15 else 'w'
text = ax.text(j, i, '{:.2f}%'.format(pid_probs[i, j]*100.), ha='center', va='center',
color=color, fontsize=28 - len(labels)*2)
ax.grid('-', color='k', which='minor')
fig.savefig(os.path.join(out_dir, 'pid_tag.png'))
fig.savefig(os.path.join(out_dir, 'pid_tag.pdf'))
import ROOT import ROOT
import os import os
import gc
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import argparse import argparse
import sys import sys
# read hits data from root file # read from RDataFrame and flatten a given collection, return pandas dataframe
def get_ml_data(path, evnums=None, shape=(20, 20, 3), branch='EcalBarrelHitsML'): def flatten_collection(rdf, collection, cols=None, event_colname='event'):
f = ROOT.TFile(path) if not cols:
events = f.events cols = [str(c) for c in rdf.GetColumnNames() if str(c).startswith('{}.'.format(collection))]
if evnums is None: else:
evnums = np.arange(events.GetEntries()) cols = ['{}.{}'.format(collection, c) for c in cols]
elif isinstance(evnums, int): if not cols:
evnums = [evnums] print('cannot find any branch under collection {}'.format(collection))
return pd.DataFrame()
dbuf = np.zeros(shape=np.hstack([len(evnums), shape]))
idb = 0 data = rdf.AsNumpy(cols)
for iev in evnums: # flatten the data, add an event id to identify clusters from different events
if iev >= events.GetEntries(): evns = []
print('Error: event {:d} is out of range (0 - {:d})'.format(iev, events.GetEntries() - 1)) for i, vec in enumerate(data[cols[0]]):
continue evns += [i]*vec.size()
for n, vals in data.items():
events.GetEntry(iev) # make sure ints are not converted to floats
for hit in getattr(events, branch): typename = vals[0].__class__.__name__.lower()
dbuf[iev][hit.layerID][hit.hitID] = (hit.edep, hit.eta, hit.polar.phi) dtype = np.int64 if 'int' in typename or 'long' in typename else np.float32
# type safe creation
return dbuf data[n] = np.asarray([v for vec in vals for v in vec], dtype=dtype)
# build data frame
dfp = pd.DataFrame({c: pd.Series(v) for c, v in data.items()})
# execute this script dfp.loc[:, event_colname] = evns
# read mc particles from root file return dfp
def get_mcp_simple(path, evnums=None, branch='mcparticles'):
f = ROOT.TFile(path)
events = f.events
if evnums is None:
evnums = np.arange(events.GetEntries())
elif isinstance(evnums, int):
evnums = [evnums]
dbuf = np.zeros(shape=(len(evnums), 6))
idb = 0
for iev in evnums:
if iev >= events.GetEntries():
print('Error: event {:d} is out of range (0 - {:d})'.format(iev, events.GetEntries() - 1))
continue
events.GetEntry(iev)
# extract full mc particle data
part = getattr(events, branch)[2]
dbuf[idb] = (iev, part.ps.x, part.ps.y, part.ps.z, part.pdgID, part.status)
idb += 1
return pd.DataFrame(data=dbuf[:idb], columns=['event', 'px', 'py', 'pz', 'pid', 'status'])
# load root macros, input is an argument string # load root macros, input is an argument string
...@@ -64,6 +44,15 @@ def load_root_macros(arg_macros): ...@@ -64,6 +44,15 @@ def load_root_macros(arg_macros):
print('\"{}\" does not exist, skip loading it.'.format(path)) print('\"{}\" does not exist, skip loading it.'.format(path))
def cartesian_to_polar(x, y, z):
r = np.sqrt(x**2 + y**2 + z**2)
rc = np.sqrt(x**2 + y**2)
theta = np.arccos(z / r)
phi = np.arctan2(y, x)
eta = -np.log(np.tan(theta / 2.))
return r, theta, phi, rc, eta
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Visualize the cluster (layer-wise) from analysis') parser = argparse.ArgumentParser(description='Visualize the cluster (layer-wise) from analysis')
parser.add_argument('file', type=str, help='path to root file') parser.add_argument('file', type=str, help='path to root file')
...@@ -71,58 +60,111 @@ if __name__ == '__main__': ...@@ -71,58 +60,111 @@ if __name__ == '__main__':
# parser.add_argument('--compact', type=str, default='', dest='compact', help='compact file') # parser.add_argument('--compact', type=str, default='', dest='compact', help='compact file')
parser.add_argument('-m', '--macros', type=str, default='rootlogon.C', dest='macros', parser.add_argument('-m', '--macros', type=str, default='rootlogon.C', dest='macros',
help='root macros to load (accept multiple paths separated by \",\")') help='root macros to load (accept multiple paths separated by \",\")')
parser.add_argument('--topo-size', type=float, default=2.0, dest='topo_size', parser.add_argument('--topo-range', type=float, default=1000.0, dest='topo_range',
help='bin size for projection plot (mrad)')
parser.add_argument('--topo-range', type=float, default=200.0, dest='topo_range',
help='half range for projection plot (mrad)') help='half range for projection plot (mrad)')
parser.add_argument('--shape', type=str, default="20, 20, 3", dest='shape', parser.add_argument('--shape', type=str, default='20, 20', dest='shape',
help='shape of data (3D), default (20, 20, 3)') help='shape of data (Nlayer, Nhits), default (20, 20)')
parser.add_argument('--features', type=str, default='energy, eta, phi',
help='features of hit data, default (eta, phi, energy)')
parser.add_argument('--branch', type=str, default='EcalBarrelImagingHitsML', help='name of data branch')
parser.add_argument('--null-columns', type=str, default='', help='make the columns zeros')
args = parser.parse_args() args = parser.parse_args()
os.makedirs(args.outdir, exist_ok=True) os.makedirs(args.outdir, exist_ok=True)
load_root_macros(args.macros) load_root_macros(args.macros)
data = get_ml_data(args.file, shape=[int(i.strip()) for i in args.shape.split(',')])
dfmcp = get_mcp_simple(args.file, branch='mcparticles') # read data and mcparticles
# dfmcp = dfmcp[dfmcp['status'] == 24578] rdf = ROOT.RDataFrame("events", args.file)
dfmcp.loc[:, 'p'] = np.linalg.norm(dfmcp[['px', 'py', 'pz']].values, axis=1) df = flatten_collection(rdf, args.branch, ['layer', 'energy', 'position.x', 'position.y', 'position.z'])
dfmcp['phi'] = np.arctan2(dfmcp['py'].values, dfmcp['px'].values) df.rename(columns={c: c.replace(args.branch + '.', '') for c in df.columns}, inplace=True)
dfmcp['theta'] = np.arccos(dfmcp['pz'].values/dfmcp['p'].values)
dfmcp['eta'] = -np.log(np.tan(dfmcp['theta'].values/2.)) r, theta, phi, rc, eta = cartesian_to_polar(*df[['position.x', 'position.y', 'position.z']].values.T)
df.loc[:, 'r'] = r
tag = np.zeros(shape=(data.shape[0], 7)) df.loc[:, 'theta'] = theta
trg = args.topo_range/1000. df.loc[:, 'phi'] = phi
df.loc[:, 'rc'] = rc
for iev in np.arange(data.shape[0]): df.loc[:, 'eta'] = eta
ev = data[iev].transpose(2,0,1).reshape(3,-1)
truth = dfmcp[dfmcp['event'] == iev] dfm = flatten_collection(rdf, 'mcparticles2', ['genStatus', 'pdgID', 'ps.x', 'ps.y', 'ps.z', 'mass'])
if not truth.shape[0]: dfm.rename(columns={c: c.replace('mcparticles2.', '') for c in dfm.columns}, inplace=True)
continue # selete incident particles
mask = ev[0] > 0. dfm = dfm[dfm['genStatus'].isin([0, 1])]
if not sum(mask): # NOTE: assumed single particles
continue dfm = dfm.groupby('event').first()
eta = ev[1][ev[0] > 0.] p, theta, phi, pT, eta = cartesian_to_polar(*dfm[['ps.x', 'ps.y', 'ps.z']].values.T)
phi = ev[2][ev[0] > 0.] dfm.loc[:, 'p'] = p
# normalize dfm.loc[:, 'theta'] = theta
# print(ev.T.reshape(20, 20, 3) == data[iev]) dfm.loc[:, 'phi'] = phi
ev[0] /= 10. dfm.loc[:, 'pT'] = pT
# ev[1][mask] = (ev[1][mask] - eta.mean() + trg)/(2.*trg) dfm.loc[:, 'eta'] = eta
# ev[2][mask] = (ev[2][mask] - phi.mean() + trg)/(2.*trg)
ev[1][mask] = (ev[1][mask] - truth['eta'].values + trg)/(2.*trg) # energy unit is in GeV, converting to MeV first
ev[2][mask] = (ev[2][mask] - truth['phi'].values + trg)/(2.*trg) df.loc[:, 'eMeV'] = df['energy']*1000
# ev[1][mask] = (ev[1][mask] + 1.0)/2. # scaled by 5 MeV to have most of them in [0, 1]
# ev[2][mask] = (ev[2][mask] + np.pi)/2./np.pi df.loc[:, 'energy'] = df['eMeV']/5.
out_range = (ev[1] > 1.) | (ev[1] < 0.) | (ev[2] > 1.) | (ev[2] < 0.) # convert eta, phi, edep so they are within [0, 1]
ev.T[out_range] = (0., 0., 0.) df.loc[:, 'etotal'] = df.groupby('event')['energy'].transform('sum')
test = data[iev].transpose(2,0,1).reshape(3,-1) df = df[df['etotal'] > 0.]
tag[iev] = np.hstack([truth[['pid', 'eta', 'phi', 'p']].values[0], (eta.mean(), phi.mean(), trg)])
# print(ev[1]) event_group = df.groupby('event')
# print((eta.shape, phi.shape), (eta.mean(), phi.mean()), truth[['eta', 'phi']].values) df.loc[:, 'logw'] = 0.
# energy = 0 means no hits
# momentum mask = df['energy'] > 0.
mask = tag.T[3] > 0. df.loc[mask, 'logw'] = np.log(df.loc[mask, 'energy'].values / df.loc[mask, 'etotal'].values) + 6.2
print('training data shape: {}'.format(data[mask].shape)) df.loc[:, 'logw'] = df['logw'].clip(0)
print('tagging data shape: {}'.format(tag[mask].shape)) df.loc[:, 'wtotal'] = event_group['logw'].transform('sum')
np.save(os.path.join(args.outdir, 'data.npy'), data[mask])
np.save(os.path.join(args.outdir, 'tag.npy'), tag[mask]) df.loc[:, 'etaw'] = df['eta']*df['logw']
df.loc[:, 'phiw'] = df['phi']*df['logw']
df.loc[:, 'eta_mean'] = event_group['etaw'].transform('sum') / df['wtotal']
df.loc[:, 'phi_mean'] = event_group['phiw'].transform('sum') / df['wtotal']
# topo range in rad
trg = args.topo_range / 1000.
df.loc[mask, 'eta'] = (df.loc[mask, 'eta'].values - df.loc[mask, 'eta_mean'].values + trg) / (2.*trg)
df.loc[mask, 'phi'] = ((df.loc[mask, 'phi'].values - df.loc[mask, 'phi_mean'].values)/np.pi + trg) / (2.*trg)
df.loc[~mask, 'eta'] = 0.
df.loc[~mask, 'phi'] = 0.
df.loc[~mask, 'rc'] = 0.
# scale by 2 meter, which keeps barrel region well under 1.0
df.loc[:, 'rc'] = df['rc'].values / 2000.0
df.loc[:, 'layer_type'] = np.floor(df['layer']/100.).astype(int)
df.loc[df['layer_type'] == 1, 'eta'] = 0.
# null specified columns (for scfi which does not have eta info)
nullcols = [c.strip() for c in args.null_columns.split(',')]
for col in nullcols:
df.loc[:, col] = 0.
gc.collect()
# features and data-shape
featcols = [f.strip() for f in args.features.split(',')]
dshape = [int(i.strip()) for i in args.shape.split(',')]
dshape.append(len(featcols))
event_ids = df['event'].unique()
data = df.set_index('event')[featcols].clip(0, 1).values.reshape([len(event_ids)] + dshape)
tags = dfm.loc[event_ids, ['pdgID', 'p', 'pT', 'eta', 'phi', 'mass']]
# also save Edep per layers
# merge the sandwich layers
df['layer'] = df['layer'] % 100
edeps = df.groupby(['event', 'layer'])['eMeV'].sum().unstack()
""" TEST
from matplotlib import pyplot as plt
fig, axs = plt.subplots(3, 1)
axs.flat[0].hist(df.loc[df['edep'] > 0, 'eta'].values, bins=np.linspace(-2, 2, 101), ec='k')
axs.flat[1].hist(df.loc[df['edep'] > 0, 'phi'].values, bins=np.linspace(-2, 2, 101), ec='k')
axs.flat[2].hist(df.loc[df['edep'] > 0, 'edep'].values, bins=np.linspace(-2, 2, 101), ec='k')
fig.savefig('test.png')
"""
print('Training data set shape = {}'.format(data.shape))
print('Tagging data set shape = {}'.format(tags.shape))
print('Edep/layer data set shape = {}'.format(edeps.shape))
np.save(os.path.join(args.outdir, 'data.npy'), data)
np.save(os.path.join(args.outdir, 'tag.npy'), tags.values)
np.save(os.path.join(args.outdir, 'edep.npy'), edeps.values)
...@@ -18,12 +18,14 @@ parser.add_argument('-n', '--numberOfEvents', dest='nev', type=int, default=100, ...@@ -18,12 +18,14 @@ parser.add_argument('-n', '--numberOfEvents', dest='nev', type=int, default=100,
parser.add_argument('-t', '--nametag', type=str, default='IMCAL_ML', help='Name tag for output files.') parser.add_argument('-t', '--nametag', type=str, default='IMCAL_ML', help='Name tag for output files.')
parser.add_argument('--seed', type=int, default=-1, help='Random seed to scripts.') parser.add_argument('--seed', type=int, default=-1, help='Random seed to scripts.')
parser.add_argument('--process', type=str, default='sim, rec, tag', help='Processes to be executed (sim, rec, tag).') parser.add_argument('--process', type=str, default='sim, rec, tag', help='Processes to be executed (sim, rec, tag).')
parser.add_argument('--numberOfLayers', dest='nlayer', type=int, default=20, help='number of layers in ML data.') parser.add_argument('--nlayers', dest='nlayers', type=int, default=9, help='number of layers in ML data.')
parser.add_argument('--numberOfHits', dest='nhit', type=int, default=20, help='number of hits in ML data.') parser.add_argument('--nhits', dest='nhits', type=int, default=20, help='number of hits in ML data.')
parser.add_argument('--particles', type=str, default='electron', help='Partcile names, separated by \",\".') parser.add_argument('--particles', type=str, default='electron', help='Partcile names, separated by \",\".')
parser.add_argument('--pmin', type=float, default=0.5, help='Minimum momentum of particles.') parser.add_argument('--pmin', type=float, default=0.5, help='Minimum momentum of particles.')
parser.add_argument('--pmax', type=float, default=10, help='Maximum momentum of particles.') parser.add_argument('--pmax', type=float, default=10, help='Maximum momentum of particles.')
parser.add_argument('--compact', type=str, default=default_compact, help='Path to detector compact file.') parser.add_argument('--compact', type=str, default=default_compact, help='Path to detector compact file.')
parser.add_argument('--combine-method', type=str, default='interlayer', help='Path to detector compact file.')
parser.add_argument('--physics-list', type=str, default='FTFP_BERT', help='Path to detector compact file.')
args = parser.parse_args() args = parser.parse_args()
kwargs = vars(args) kwargs = vars(args)
...@@ -35,7 +37,6 @@ gen_file = os.path.join('gen_data', '{nametag}_{pmin}_{pmax}.hepmc'.format(**kwa ...@@ -35,7 +37,6 @@ gen_file = os.path.join('gen_data', '{nametag}_{pmin}_{pmax}.hepmc'.format(**kwa
sim_file = os.path.join('sim_data', '{nametag}_{pmin}_{pmax}.root'.format(**kwargs)) sim_file = os.path.join('sim_data', '{nametag}_{pmin}_{pmax}.root'.format(**kwargs))
rec_file = os.path.join('rec_data', '{nametag}_{pmin}_{pmax}.root'.format(**kwargs)) rec_file = os.path.join('rec_data', '{nametag}_{pmin}_{pmax}.root'.format(**kwargs))
tag_dir = os.path.join('tag_data', '{nametag}_{pmin}_{pmax}'.format(**kwargs)) tag_dir = os.path.join('tag_data', '{nametag}_{pmin}_{pmax}'.format(**kwargs))
os.makedirs(tag_dir, exist_ok=True)
procs = [p.strip() for p in args.process.split(',')] procs = [p.strip() for p in args.process.split(',')]
sdir = os.path.dirname(os.path.realpath(__file__)) sdir = os.path.dirname(os.path.realpath(__file__))
...@@ -45,7 +46,8 @@ if 'sim' in procs: ...@@ -45,7 +46,8 @@ if 'sim' in procs:
gen_cmd = ['python', os.path.join(sdir, 'scripts', 'gen_particles.py'), gen_file, gen_cmd = ['python', os.path.join(sdir, 'scripts', 'gen_particles.py'), gen_file,
'-n', '{}'.format(args.nev), '-n', '{}'.format(args.nev),
'-s', '{}'.format(args.seed), '-s', '{}'.format(args.seed),
'--angmin', '50', '--angmax', '130', # '--angmin', '45', '--angmax', '135',
'--angmin', '85', '--angmax', '95',
'--pmin', '{}'.format(args.pmin), '--pmax', '{}'.format(args.pmax), '--pmin', '{}'.format(args.pmin), '--pmax', '{}'.format(args.pmax),
'--particles', args.particles] '--particles', args.particles]
subprocess.run(gen_cmd) subprocess.run(gen_cmd)
...@@ -54,6 +56,7 @@ if 'sim' in procs: ...@@ -54,6 +56,7 @@ if 'sim' in procs:
'--part.minimalKineticEnergy', '1*TeV', '--part.minimalKineticEnergy', '1*TeV',
'--numberOfEvents', '{}'.format(args.nev), '--numberOfEvents', '{}'.format(args.nev),
'--runType', 'batch', '--runType', 'batch',
# '--physics.list', args.physics_list,
'--inputFiles', gen_file, '--inputFiles', gen_file,
'--outputFile', sim_file, '--outputFile', sim_file,
'--compact', args.compact, '--compact', args.compact,
...@@ -70,16 +73,23 @@ if 'sim' in procs: ...@@ -70,16 +73,23 @@ if 'sim' in procs:
if 'rec' in procs: if 'rec' in procs:
# export to environment variables (used to pass arguments to the option file) # export to environment variables (used to pass arguments to the option file)
os.environ['JUGGLER_SIM_FILE'] = sim_file run_env = os.environ.copy()
os.environ['JUGGLER_REC_FILE'] = rec_file run_env.update({
os.environ['JUGGLER_COMPACT_PATH'] = args.compact 'JUGGLER_SIM_FILE': sim_file,
os.environ['JUGGLER_N_EVENTS'] = '{}'.format(args.nev) 'JUGGLER_REC_FILE': rec_file,
os.environ['IMCAL_ML_N_LAYERS'] = '{}'.format(args.nlayer) 'JUGGLER_COMPACT_PATH': args.compact,
os.environ['IMCAL_ML_N_HITS'] = '{}'.format(args.nhit) 'JUGGLER_N_EVENTS': str(args.nev),
'IMCAL_ML_IMG_NLAYERS': str(args.nlayers),
'IMCAL_ML_NHITS': str(args.nhits),
'IMCAL_ML_COMBINE': str(args.combine_method),
})
juggler_xenv = os.path.join(os.environ.get('JUGGLER_INTALL_PREFIX', '../local'), 'Juggler.xenv') juggler_xenv = os.path.join(os.environ.get('JUGGLER_INTALL_PREFIX', '../local'), 'Juggler.xenv')
rec_cmd = ['xenv', '-x', juggler_xenv, 'gaudirun.py', os.path.join(sdir, 'options', 'imaging_ml_data.py')] rec_cmd = [
return_code = subprocess.run(rec_cmd).returncode 'xenv', '-x', juggler_xenv, # v35+ do not need xenv anymore
'gaudirun.py', os.path.join(sdir, 'options', 'imaging_ml_data.py')
]
return_code = subprocess.run(rec_cmd, env=run_env).returncode
print(return_code) print(return_code)
if return_code is not None and return_code < 0: if return_code is not None and return_code < 0:
print("ERROR running juggler (reconstruction)!") print("ERROR running juggler (reconstruction)!")
...@@ -88,9 +98,43 @@ if 'rec' in procs: ...@@ -88,9 +98,43 @@ if 'rec' in procs:
if 'tag' in procs: if 'tag' in procs:
os.makedirs(tag_dir, exist_ok=True)
"""
# imaging layers
tag_cmd = ['python', os.path.join(sdir, 'scripts', 'prepare_tf_dataset.py'), rec_file, tag_cmd = ['python', os.path.join(sdir, 'scripts', 'prepare_tf_dataset.py'), rec_file,
'-o', tag_dir, '-o', tag_dir,
'--shape', '{nlayer},{nhit},3'.format(**kwargs)] '--branch', 'EcalBarrelImagingHitsML',
'--shape', '{nlayers}, {nhits}'.format(**kwargs),
'--features', 'energy, rc, eta, phi',
]
return_code = subprocess.run(tag_cmd).returncode
print(return_code)
if return_code is not None and return_code < 0:
print("ERROR running ML data tagging!")
exit(return_code)
# scfi layers
tag_cmd = ['python', os.path.join(sdir, 'scripts', 'prepare_tf_dataset.py'), rec_file,
'-o', tag_dir + '_scfi',
'--branch', 'EcalBarrelScFiHitsML',
'--shape', '{nlayers}, {nhits}'.format(**kwargs),
# '--shape', '{}, {}'.format(20 - kwargs['nlayers'], kwargs['nhits']),
'--features', 'energy, rc, eta, phi',
'--null-columns', 'eta',
]
return_code = subprocess.run(tag_cmd).returncode
print(return_code)
if return_code is not None and return_code < 0:
print("ERROR running ML data tagging!")
exit(return_code)
"""
# combined
tag_cmd = ['python', os.path.join(sdir, 'scripts', 'prepare_tf_dataset.py'), rec_file,
'-o', tag_dir + '_combined',
'--branch', 'EcalBarrelHitsCombinedML',
'--shape', '{}, {}'.format(20 + kwargs['nlayers'], kwargs['nhits']),
'--features', 'layer_type, energy, rc, eta, phi',
]
return_code = subprocess.run(tag_cmd).returncode return_code = subprocess.run(tag_cmd).returncode
print(return_code) print(return_code)
if return_code is not None and return_code < 0: if return_code is not None and return_code < 0:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment