From a6d784845a085e9c54d0345d697729d7e62d684b Mon Sep 17 00:00:00 2001 From: Shujie Li <shujie@jlab.org> Date: Thu, 14 Oct 2021 12:58:59 -0400 Subject: [PATCH] add new tracking performance plots with gaus fit --- .../tracking/run_tracking_benchmarks.py | 9 + .../scripts/tracking_performance_fit.py | 415 ++++++++++++++++++ 2 files changed, 424 insertions(+) create mode 100644 benchmarks/tracking/scripts/tracking_performance_fit.py diff --git a/benchmarks/tracking/run_tracking_benchmarks.py b/benchmarks/tracking/run_tracking_benchmarks.py index 32a8a149..05cd2e88 100755 --- a/benchmarks/tracking/run_tracking_benchmarks.py +++ b/benchmarks/tracking/run_tracking_benchmarks.py @@ -38,6 +38,7 @@ script_dir = os.path.dirname(os.path.realpath(__file__)) gen_script = os.path.join(script_dir, 'scripts', 'gen_particles.py') option_script = os.path.join(script_dir, 'options', args.options) analysis_script = os.path.join(script_dir, 'scripts', 'tracking_performance.py') +analysis_script_fit = os.path.join(script_dir, 'scripts', 'tracking_performance_fit.py') gen_file = 'gen_{nametag}_{pmin}_{pmax}.hepmc'.format(**kwargs) sim_file = 'sim_{nametag}_{pmin}_{pmax}.root'.format(**kwargs) @@ -99,6 +100,14 @@ if 'ana' in procs: print('ERROR running analysis ({})!'.format(ana_cmd)) exit(1) + ## new performance plot with fit + ana_cmd = ['python', analysis_script_fit, rec_file, + '--mc-collection', 'mcparticles', + '--tracking-collection', 'outputTrackParameters', + '-o', 'results', '-t', args.nametag] + return_code = subprocess.run(ana_cmd).returncode + if return_code is not None and return_code != 0: + print('ERROR running analysis ({})!'.format(ana_cmd)) # upload generated data files if it was small enough (< 10 Mb) for upload in [rec_file]: diff --git a/benchmarks/tracking/scripts/tracking_performance_fit.py b/benchmarks/tracking/scripts/tracking_performance_fit.py new file mode 100644 index 00000000..083e9294 --- /dev/null +++ 
"""Tracking performance benchmark plots with Gaussian fits.

Reads a reconstruction output ROOT file, flattens the MC-truth, fitted-track
and reconstructed-particle collections into pandas DataFrames, and produces
efficiency / resolution plots where each residual distribution is fitted
with a Gaussian (lmfit) to extract sigma.
"""
import os
import argparse

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.ticker as ticker
from matplotlib import pyplot as plt

import ROOT
import awkward as ak
import uproot as ur
from lmfit.models import GaussianModel

print('Uproot version: ' + ur.__version__)

deg2rad = np.pi / 180.0

# pandas display settings for the debugging printouts below
pd.options.display.max_rows = 40
pd.options.display.max_columns = 100

# matplotlib defaults for all benchmark figures
plt.rcParams['figure.figsize'] = [12.0, 8.0]

SMALL_SIZE = 13
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

plt.rc('font', size=SMALL_SIZE)        # default text size
plt.rc('axes', titlesize=SMALL_SIZE)   # axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)  # x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)  # tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)  # tick labels
plt.rc('legend', fontsize=SMALL_SIZE)  # legend
plt.rc('figure', titlesize=BIGGER_SIZE)  # figure title

# (pid, charge, mass [GeV]) lookup table, one row per particle name.
PARTICLES = pd.DataFrame({
    "pion0":    (111,   0, 0.1349766),    # pi0
    "pion+":    (211,   1, 0.13957018),   # pi+
    "pion-":    (-211, -1, 0.13957018),   # pi-
    "kaon0L":   (130,   0, 0.497648),     # K0L
    "kaon0S":   (310,   0, 0.497648),     # K0S
    "kaon0":    (311,   0, 0.497648),     # K0
    "kaon+":    (321,   1, 0.493677),     # K+
    "kaon-":    (-321, -1, 0.493677),     # K-
    "proton":   (2212,  1, 0.938272),     # proton
    "neutron":  (2112,  0, 0.939565),     # neutron
    "electron": (11,   -1, 0.51099895e-3),  # electron
    "positron": (-11,   1, 0.51099895e-3),  # positron
    "photon":   (22,    0, 0),            # photon
}, index=["pid", "charge", "mass"]).transpose()


def add_kin(data, prefix="p"):
    """Append kinematic columns (p, theta, phi, eta, pt) to *data* in place.

    *data* must contain '<prefix>.x/y/z' momentum columns and a 'mass'
    column; four-vectors are built with ROOT.Math.PxPyPzMVector.
    Returns the same DataFrame for convenience.
    """
    get_4vecs = np.vectorize(lambda x, y, z, m: ROOT.Math.PxPyPzMVector(x, y, z, m))
    fourvecs = get_4vecs(*data[[prefix + '.x', prefix + '.y', prefix + '.z', 'mass']].values.T)

    data.loc[:, 'p'] = [v.P() for v in fourvecs]
    data.loc[:, 'theta'] = [v.Theta() for v in fourvecs]
    data.loc[:, 'phi'] = [v.Phi() for v in fourvecs]
    data.loc[:, 'eta'] = [v.Eta() for v in fourvecs]
    data.loc[:, 'pt'] = [v.Pt() for v in fourvecs]
    return data


def hist_gaus(dataset, ax, bins=100, klog=0, header=None):
    """Histogram *dataset* on *ax* and fit a Gaussian; return (sigma, stderr).

    If *bins* is an array, entries outside its range are dropped first.
    The histogram range is halved (up to a fixed number of times) until at
    least a quarter of the bins fall inside mean +/- 2*std, then a two-pass
    lmfit GaussianModel fit is performed: a first fit over all bins, and a
    second restricted to +/- 2 sigma of the first result.

    stderr is NaN when lmfit cannot estimate the uncertainty.
    """
    # select data in range if bins is provided as an array
    if not np.isscalar(bins):
        in_lo = dataset >= bins[0]
        in_hi = dataset <= bins[-1]
        dataset = dataset[in_lo & in_hi]

    # first histogram over the full requested range
    n, bins, patches = ax.hist(dataset, bins, density=False, facecolor='b', alpha=0.3)
    xx = bins[0:-1] + (bins[1] - bins[0]) / 2.0  # bin centers

    ymax = np.max(n)
    std = np.std(dataset)
    mean = np.mean(dataset)
    cond = (xx >= (mean - 2 * std)) & (xx <= (mean + 2 * std))

    # Shrink the range until the +/-2 std window covers enough bins.
    # Cap the iterations: a degenerate (zero-spread) dataset would
    # otherwise loop forever.
    max_shrink = 20
    while len(n[cond]) < len(bins) / 4.0 and max_shrink > 0:
        max_shrink -= 1
        ax.cla()
        diff = (bins[-1] - bins[0]) / 2.0 / 2.0
        n, bins, patches = ax.hist(dataset,
                                   np.linspace(bins[0] + diff, bins[-1] - diff, len(bins)),
                                   density=False, facecolor='b', alpha=0.3)
        xx = bins[0:-1] + (bins[1] - bins[0]) / 2.0
        cond = (xx >= (mean - 2 * std)) & (xx <= (mean + 2 * std))

    # first fit: all bins, initial guesses from the raw sample
    model = GaussianModel()
    params = model.make_params(center=np.median(dataset),
                               amplitude=np.max(n), sigma=np.std(dataset))
    result = model.fit(n, params, x=xx)

    # second fit: restrict to +/- 2 sigma of the first fit
    std = result.params['sigma'].value
    cond = (xx >= (mean - 2 * std)) & (xx <= (mean + 2 * std))
    model = GaussianModel()
    params = model.make_params(center=np.median(dataset),
                               amplitude=np.max(n), sigma=np.std(dataset))
    result = model.fit(n[cond], params, x=xx[cond])
    print(result.success)

    sigma = result.params['sigma'].value
    # stderr is None when the covariance could not be estimated; use NaN
    # so the legend format and callers do not crash.
    stderr = result.params['sigma'].stderr
    if stderr is None:
        stderr = float('nan')

    if len(result.best_fit) > 0:
        ax.plot(xx[cond], result.best_fit, 'r-',
                label='sigma=%g,err=%g' % (sigma, stderr))
    ax.legend(title=header, frameon=False, loc='upper left')
    if klog:
        ax.set_yscale('log')
        ax.set_ylim(1, ymax * 10)
    else:
        ax.set_ylim(0, ymax * 1.3)

    return sigma, stderr


def read_data(args, rows=-1):
    """Open args.rec_file as an RDataFrame over the 'events' tree.

    rows > 0 restricts the frame to the first *rows* entries.
    """
    fname = args.rec_file
    rdf_rec = ROOT.RDataFrame('events', fname)
    if rows > 0:
        rdf_rec = rdf_rec.Range(0, rows)
    print("read in ", rdf_rec.Count().GetValue(), "rows")
    return rdf_rec


def flatten_collection(rdf, collection, cols=None):
    """Flatten the branches of *collection* from *rdf* into a DataFrame.

    An 'event' column identifies which event each flattened row came from;
    the '<collection>.' prefix is stripped from column names. Returns an
    empty DataFrame when no usable branch is found.
    """
    if not cols:
        cols = [str(c) for c in rdf.GetColumnNames()
                if str(c).startswith('{}.'.format(collection))]
    else:
        cols = ['{}.{}'.format(collection, c) for c in cols]
    # Fixed-size array branches cannot be converted to python; drop them.
    # (Filter into a new list: the original list.remove-while-iterating
    # skipped the element after each removed one.)
    cols = [c for c in cols if '[' not in c]
    if not cols:
        print('cannot find any branch under collection {}'.format(collection))
        return pd.DataFrame()

    data = rdf.AsNumpy(cols)
    # flatten the data, add an event id to identify rows from different events
    evns = []
    for i, vec in enumerate(data[cols[0]]):
        evns += [i] * vec.size()
    for n, vals in data.items():
        # make sure ints are not converted to floats
        typename = vals[0].__class__.__name__.lower()
        dtype = np.int64 if 'int' in typename or 'long' in typename else np.float64
        # type safe creation
        data[n] = np.asarray([v for vec in vals for v in vec], dtype=dtype)
    # build data frame
    dfp = pd.DataFrame({c: pd.Series(v) for c, v in data.items()})
    dfp.loc[:, 'event'] = evns

    dfp.rename(columns={c: c.replace(collection + '.', '') for c in dfp.columns},
               inplace=True)
    return dfp


def pre_proc(rdf_rec, pid=0):
    """Flatten and prepare the three collections used by the plots.

    Returns (df_mc, df_rec, df_fit): MC truth (stable particles only),
    reconstructed particles, and fitted track parameters (first track per
    event). pid != 0 restricts MC (and sets the rec mass) to that PDG id.

    NOTE: reads the collection names from the module-level `args`
    (set in the __main__ block).
    """
    df_fit = flatten_collection(rdf_rec, args.fit)
    df_fit = df_fit.groupby('event').first().reset_index()
    # |p| from q/p: take the magnitude, otherwise negatively charged tracks
    # get a negative momentum (cf. charge/qOverP) -- TODO confirm convention
    df_fit["p"] = np.abs(1. / df_fit["qOverP"])
    df_fit["eta"] = -np.log(np.tan(df_fit['direction.theta'].values / 2.))
    df_fit["pt"] = df_fit["p"] * np.sin(df_fit["direction.theta"])
    df_fit["theta"] = df_fit["direction.theta"]
    df_fit["phi"] = df_fit["direction.phi"]
    print(args.fit, df_fit.columns, len(df_fit))

    df_rec = flatten_collection(rdf_rec, args.rec)
    print(args.rec, df_rec.columns, len(df_rec))

    if len(df_rec) > 0:
        df_rec = df_rec.groupby('event').first().reset_index()
        if pid != 0:
            # single-particle benchmark: assign the mass of the requested pid
            mass = PARTICLES.loc[PARTICLES.pid == pid]['mass'].values[0]
            df_rec["mass"] = mass * np.ones(len(df_rec))
            df_rec = add_kin(df_rec, "p")

    df_mc = flatten_collection(rdf_rec, args.mc)
    if pid != 0:
        df_mc = df_mc[df_mc.pdgID == pid]
    # keep only generator-stable particles
    df_mc = df_mc[(df_mc.genStatus == 1)].reset_index(drop=False)
    print(args.mc, df_mc.columns, len(df_mc))
    df_mc = add_kin(df_mc, "ps")

    return df_mc, df_rec, df_fit


# --------tracking performance plots-------------------
def performance_plot(df, dfc, dfm, dirname=""):
    """Draw the six-panel tracking performance figure and save it.

    df  -- fitted track parameters (one row per event)
    dfc -- MC truth rows matched to the fitted tracks (same ordering as df)
    dfm -- all generated MC truth rows

    Saves '<nametag>_performance_fit.png' under the module-level
    args.outdir and returns
    (track_eff_total, sig_p, err_p, sig_pt, err_pt, sig_th, err_th, sig_ph, err_ph).
    """
    dp_lim = 5      # % momentum residual plot range
    th_lim = 0.003  # rad
    ph_lim = 0.003  # rad
    fig, axs = plt.subplots(3, 2, figsize=(20, 14))
    for ax in axs.flat:
        ax.tick_params(direction='in', which='both')
        ax.grid(linestyle=':')
        ax.set_axisbelow(True)

    # ---------------tracking efficiency v.s. eta-----------------
    ax = axs.flat[0]
    eta_bins = np.linspace(-4, 4, 21)

    sim_eta, _ = np.histogram(dfm['eta'].values, bins=eta_bins)
    rec_eta, _ = np.histogram(df['eta'], bins=eta_bins)
    track_eff_total = np.sum(rec_eta) / np.sum(sim_eta)

    eta_centers = (eta_bins[1:] + eta_bins[:-1]) / 2.
    eta_binsize = np.mean(np.diff(eta_centers))
    # cut off not simulated bins
    mask = sim_eta > 0
    sim_eta = sim_eta[mask]
    rec_eta = rec_eta[mask]
    eta_centers = eta_centers[mask]
    track_eff = np.array(rec_eta) / np.array(sim_eta)
    # binomial-style error estimate, p*q/sqrt(N)
    # TODO check the errors
    eff = np.mean(track_eff)
    track_err = eff * (1. - eff) * np.reciprocal(np.sqrt(sim_eta))
    track_eff_lower = track_eff - np.maximum(np.zeros(shape=rec_eta.shape),
                                             track_eff - track_err)
    track_eff_upper = np.minimum(np.ones(shape=rec_eta.shape),
                                 track_eff + track_err) - track_eff

    ax.errorbar(eta_centers, track_eff, xerr=eta_binsize / 2.,
                yerr=[track_eff_lower, track_eff_upper], fmt='o', capsize=3)
    ax.set_ylim(0., 1.1)
    ax.set_xlim(-4.5, 4.5)
    ax.set_ylabel('Tracking Efficiency')
    ax.set_xlabel(r'$\eta$')
    ax.text(-4, 1, "recon/generated events= %d / %d =%.2f"
            % (len(df), len(dfm), len(df) / len(dfm)))

    # ---------------theta distribution------------
    ax = axs.flat[1]
    sim_th_deg = dfm.groupby('event')['theta'].first().values * 180 / np.pi
    rec_th_deg = df.groupby('event')['theta'].first().values * 180 / np.pi

    hval, hbins, _ = ax.hist(sim_th_deg, bins=np.linspace(-0, 180, 61),
                             ec='k', alpha=0.3, label="Generated")
    hval, hbins, _ = ax.hist(rec_th_deg, bins=np.linspace(-0, 180, 61),
                             ec='k', alpha=0.3, label="Reconstructed")
    nbins = hbins.shape[0] - 1
    ax.set_xlabel(r'$\theta$(degree)')

    # ----------momentum resolution--------------
    ax = axs.flat[2]
    rec_p = df['p'].values
    sim_p = dfc['p'].values
    dp_p = (rec_p - sim_p) / sim_p
    nbins = 100
    sig_p, err_p = hist_gaus(dp_p * 100., ax,
                             np.linspace(-dp_lim, dp_lim, nbins + 1),
                             klog=0, header=None)
    ax.set_xlabel(r'$\delta p$ (%)')

    # ----------------pT resolution----------------------
    ax = axs.flat[3]
    rec_pt = df.groupby('event')['pt'].first().values
    sim_pt = dfc.groupby('event')['pt'].first().values
    dp_pt = (rec_pt - sim_pt) / sim_pt * 100
    sig_pt, err_pt = hist_gaus(dp_pt, ax,
                               np.linspace(-dp_lim, dp_lim, nbins + 1),
                               klog=0, header=None)
    ax.set_xlabel(r'$\delta p_T$(%)')

    # ----------------theta resolution----------------------
    nbins = 100
    ax = axs.flat[4]
    rec_th = df.groupby('event')['theta'].first().values
    sim_th = dfc.groupby('event')['theta'].first().values
    dth_th = (rec_th - sim_th)
    sig_th, err_th = hist_gaus(dth_th, ax,
                               np.linspace(-th_lim, th_lim, nbins + 1),
                               klog=0, header=None)
    ax.set_xlabel(r'$d\theta$ (rad)')

    # -----------------phi resolution----------------------
    ax = axs.flat[5]
    rec_th = df.groupby('event')['phi'].first().values
    sim_th = dfc.groupby('event')['phi'].first().values
    dth_th = (rec_th - sim_th)
    sig_ph, err_ph = hist_gaus(dth_th, ax,
                               np.linspace(-ph_lim, ph_lim, nbins + 1),
                               klog=0, header=None)
    ax.set_xlabel(r'$d\phi$ (rad)')

    fig.text(0.5, 0.95, 'Tracking Benchmark (Truth Init.)', fontsize=18, ha='center')
    fig.savefig(os.path.join(args.outdir, '{}_performance_fit.png'.format(args.nametag)))

    return track_eff_total, sig_p, err_p, sig_pt, err_pt, sig_th, err_th, sig_ph, err_ph


def eta_theta(xx, inverse=0):
    """Convert polar angle (rad) to pseudorapidity, or back when inverse==1."""
    if inverse == 1:
        return np.arctan((np.e) ** (-xx)) * 2
    else:
        return -np.log(np.tan(xx / 2.))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('rec_file', help='Path to reconstruction output file.')
    parser.add_argument('-o', dest='outdir', default='.', help='Output directory.')
    parser.add_argument('-t', '--nametag', type=str, default='tracking',
                        help='Name tag for output files.')
    parser.add_argument('-m', '--macros', dest='macros', default='rootlogon.C',
                        help='Macro files to be loaded by root, separated by \",\".')
    parser.add_argument('--mc-collection', dest='mc', default='mcparticles',
                        help='Collection name of MC particles truth info.')
    parser.add_argument('--rec-collection', dest='rec', default='ReconstructedParticles',
                        help='Collection name of reconstructed particles')
    parser.add_argument('--tracking-collection', dest='fit', default='outputTrackParameters',
                        help='Collection name of track info from fitting')
    args = parser.parse_args()

    # multi-threading for RDataFrame
    nthreads = os.cpu_count() // 2
    ROOT.ROOT.EnableImplicitMT(nthreads)
    print('CPU available {}, using {}'.format(os.cpu_count(), nthreads))

    os.makedirs(args.outdir, exist_ok=True)
    rdf = read_data(args, -1)            # -1: read all entries
    df_mc, df_rec, df_fit = pre_proc(rdf, 0)  # 0: no pid selection
    # MC rows matched to fitted tracks by event number (assumes one MC
    # particle per event -- single-particle benchmark)
    dfc = df_mc.loc[df_fit["event"].values].reset_index()
    dfm = df_mc
    df = df_fit

    _ = performance_plot(df_fit, dfc, df_mc)