From a6d784845a085e9c54d0345d697729d7e62d684b Mon Sep 17 00:00:00 2001
From: Shujie Li <shujie@jlab.org>
Date: Thu, 14 Oct 2021 12:58:59 -0400
Subject: [PATCH]  add new tracking performance plots with gaus fit

---
 .../tracking/run_tracking_benchmarks.py       |   9 +
 .../scripts/tracking_performance_fit.py       | 415 ++++++++++++++++++
 2 files changed, 424 insertions(+)
 create mode 100644 benchmarks/tracking/scripts/tracking_performance_fit.py

diff --git a/benchmarks/tracking/run_tracking_benchmarks.py b/benchmarks/tracking/run_tracking_benchmarks.py
index 32a8a149..05cd2e88 100755
--- a/benchmarks/tracking/run_tracking_benchmarks.py
+++ b/benchmarks/tracking/run_tracking_benchmarks.py
@@ -38,6 +38,7 @@ script_dir = os.path.dirname(os.path.realpath(__file__))
 gen_script = os.path.join(script_dir, 'scripts', 'gen_particles.py')
 option_script = os.path.join(script_dir, 'options', args.options)
 analysis_script = os.path.join(script_dir, 'scripts', 'tracking_performance.py')
+analysis_script_fit = os.path.join(script_dir, 'scripts', 'tracking_performance_fit.py')
 
 gen_file = 'gen_{nametag}_{pmin}_{pmax}.hepmc'.format(**kwargs)
 sim_file = 'sim_{nametag}_{pmin}_{pmax}.root'.format(**kwargs)
@@ -99,6 +100,14 @@ if 'ana' in procs:
         print('ERROR running analysis ({})!'.format(ana_cmd))
         exit(1)
 
+    ## new performance plot with fit
+    ana_cmd = ['python', analysis_script_fit, rec_file,
+               '--mc-collection', 'mcparticles',
+               '--tracking-collection', 'outputTrackParameters',
+               '-o', 'results', '-t', args.nametag]
+    return_code = subprocess.run(ana_cmd).returncode
+    if return_code is not None and return_code != 0:
+        print('ERROR running analysis ({})!'.format(ana_cmd))
 
 # upload generated data files if it was small enough (< 10 Mb)
 for upload in [rec_file]:
diff --git a/benchmarks/tracking/scripts/tracking_performance_fit.py b/benchmarks/tracking/scripts/tracking_performance_fit.py
new file mode 100644
index 00000000..083e9294
--- /dev/null
+++ b/benchmarks/tracking/scripts/tracking_performance_fit.py
@@ -0,0 +1,415 @@
+import os
+import ROOT
+import pandas as pd
+import numpy as np
+import argparse
+from matplotlib import pyplot as plt
+import matplotlib.ticker as ticker
+# pd.set_option(20, None, 'display.max_columns', None)
+pd.options.display.max_rows = 40
+pd.options.display.max_columns = 100
+# pd.set_option('display.max_rows', None)#, 'display.max_columns', None)
+import matplotlib.cm as cm
+import matplotlib as mpl
+import matplotlib.pylab as plt
+import awkward as ak
+import uproot as ur
+# import uproot3
+print('Uproot version: ' + ur.__version__)
+deg2rad = np.pi/180.0
+
+plt.rcParams['figure.figsize'] = [12.0, 8.0]
+# plt.rcParams['figure.dpi'] = 80
+            
+SMALL_SIZE = 13
+MEDIUM_SIZE = 14
+BIGGER_SIZE = 16
+
+plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
+plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
+plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
+plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
+plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
+plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
+plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title]
+
+
+# class Inputs:
+#     def __init__(self, worksim, rec_file):
+#         self.worksim  = worksim
+#         self.rec_file = worksim+"/"+rec_file
+#         self.outdir = self.worksim
+#     macros = 'rootlogon.C'
+#     nametag = "tracking"
+#     mc  = "mcparticles"
+#     fit = "outputTrackParameters"
+#     rec = "ReconstructedParticles"
+
+# append kin variables to dataframe, mcparticles "ps", reconstructedparticles "p"
+PARTICLES = pd.DataFrame({
+    "pion0": (111, 0, 0.1349766),       # pi0
+    "pion+": (211, 1, 0.13957018),      # pi+
+    "pion-": (-211,-1,0.13957018),     # pi-
+    "kaon0L": (130, 0, 0.497648),       # K0L
+    "kaon0S": (310, 0, 0.497648),       # K0S
+    "kaon0": (311, 0,  0.497648),        # K0
+    "kaon+": (321, 1,  0.493677),        # K+
+    "kaon-": (-321, -1,  0.493677),       # K-
+    "proton": (2212, 1,  0.938272),      # proton
+    "neutron": (2112, 0, 0.939565),     # neutron
+    "electron": (11, -1,  0.51099895e-3), # electron
+    "positron": (-11, 1, 0.51099895e-3),# positron
+    "photon": (22, 0, 0),               # photon
+},index=["pid","charge","mass"]).transpose()
+
+def add_kin(data,prefix="p"):
+
+    get_4vecs = np.vectorize(lambda x, y, z, m: ROOT.Math.PxPyPzMVector(x, y, z, m))
+    fourvecs = get_4vecs(*data[[prefix+'.x', prefix+'.y', prefix+'.z', 'mass']].values.T)
+
+    data.loc[:, 'p'] = [v.P() for v in fourvecs]
+    data.loc[:, 'theta'] = [v.Theta() for v in fourvecs]
+    data.loc[:, 'phi'] = [v.Phi() for v in fourvecs]
+    data.loc[:, 'eta'] = [v.Eta() for v in fourvecs]
+    data.loc[:, 'pt'] = [v.Pt() for v in fourvecs]
+    return data
+
+
+#1 d histogram and gaus fit
+from lmfit.models import GaussianModel
+def hist_gaus(dataset,ax, bins=100,klog=0,header=None):
+
+    ## select data in range if bins is provided as an array
+    if not np.isscalar(bins):
+        c1 = dataset<=bins[-1]
+        c2 = dataset>=bins[0]
+        dataset=dataset[c1&c2]
+
+
+    ## 1st fit
+    n, bins, patches = ax.hist(dataset, bins,density=False, facecolor='b', alpha=0.3)
+    xx    = bins[0:-1]+(bins[1]-bins[0])/2.0
+
+    ymax  = np.max(n)
+    std   = np.std(dataset)
+    mean  = np.mean(dataset)
+    # #     print(mean,std)
+    c1 = xx<=(mean+2*std)
+    c2 = xx>=(mean-2*std)
+    cond  = c1&c2
+    while len(n[cond])<len(bins)/4.0:
+        ax.cla()
+        diff = (bins[-1]-bins[0])/2.0/2.0
+        n, bins, patches = ax.hist(dataset, np.linspace(bins[0]+diff,bins[-1]-diff,len(bins)),density=False, facecolor='b', alpha=0.3)
+        xx    = bins[0:-1]+(bins[1]-bins[0])/2.0
+        c1 = xx<=(mean+2*std)
+        c2 = xx>=(mean-2*std)
+        cond  = c1&c2
+    model = GaussianModel()
+    # create parameters with initial guesses:
+    params = model.make_params(center=np.median(dataset), amplitude=np.max(n), sigma=np.std(dataset))  
+    result = model.fit(n, params, x=xx)
+    
+    # 2nd fit
+    std = result.params['sigma'].value
+    c1 = xx<=(mean+2*std)
+    c2 = xx>=(mean-2*std)
+    cond = c1&c2
+    model = GaussianModel()
+    params = model.make_params(center=np.median(dataset), amplitude=np.max(n), sigma=np.std(dataset))  
+    result = model.fit(n[cond], params, x=xx[cond])
+    print(result.success)
+    
+    #     print(result.fit_report())
+    #     print (result.best_values)
+    # plt.plot(xx, result.best_fit, 'r-', label='best fit')
+    if len(result.best_fit)>0:
+        ax.plot(xx[cond], result.best_fit, 'r-', label='sigma=%g,err=%g' %(result.params['sigma'].value,result.params['sigma'].stderr))
+    ax.legend(title=header, frameon=False,loc='upper left')
+    if klog:
+        ax.set_yscale('log')
+        ax.set_ylim(1,ymax*10)
+    else:
+        ax.set_ylim(0,ymax*1.3)
+
+    return result.params['sigma'].value,result.params['sigma'].stderr
+
+def read_data(args, rows=-1):
+
+    fname   = args.rec_file
+    rdf_rec = ROOT.RDataFrame('events',fname)
+    if rows>0:
+        rdf_rec=rdf_rec.Range(0,rows)
+    print("read in ",rdf_rec.Count().GetValue(), "rows")
+    return rdf_rec
+
+# read from RDataFrame and flatten a given collection, return pandas dataframe
+def flatten_collection(rdf, collection, cols=None):
+    if not cols:
+        cols = [str(c) for c in rdf.GetColumnNames() if str(c).startswith('{}.'.format(collection))]
+    else:
+        cols = ['{}.{}'.format(collection, c) for c in cols]
+    if not cols:
+        print('cannot find any branch under collection {}'.format(collection))
+        return pd.DataFrame()
+    for cname in cols:
+        if "[" in cname:
+            cols.remove(cname) ## can not convert array to python. drop for now
+#     print(cols)
+
+    data = rdf.AsNumpy(cols)
+    # flatten the data, add an event id to identify clusters from different events
+    evns = []
+    for i, vec in enumerate(data[cols[0]]):
+        evns += [i]*vec.size()
+    for n, vals in data.items():
+        # make sure ints are not converted to floats
+        typename = vals[0].__class__.__name__.lower()
+        dtype = np.int64 if 'int' in typename or 'long' in typename else np.float64
+        # type safe creation
+        data[n] = np.asarray([v for vec in vals for v in vec], dtype=dtype)
+    # build data frame
+    dfp = pd.DataFrame({c: pd.Series(v) for c, v in data.items()})
+    dfp.loc[:, 'event'] = evns
+    
+    dfp.rename(columns={c: c.replace(collection + '.', '') for c in dfp.columns}, inplace=True)
+    return dfp
+
+
+
+def pre_proc(rdf_rec,pid=0):
+    
+    
+    df_fit       = flatten_collection(rdf_rec,args.fit)
+    df_fit       = df_fit.groupby('event').first().reset_index()
+    df_fit["p"]  = 1./df_fit["qOverP"]
+    df_fit["eta"]= -np.log(np.tan(df_fit['direction.theta'].values/2.)) 
+    df_fit["pt"] = df_fit["p"]*np.sin(df_fit["direction.theta"])
+    df_fit["theta"] = df_fit["direction.theta"]
+    df_fit["phi"] = df_fit["direction.phi"]
+
+    #     if len(df_fit)<1:
+    print (args.fit,df_fit.columns,len(df_fit))
+        
+    df_rec   = flatten_collection(rdf_rec, args.rec)
+    print (args.rec,df_rec.columns,len(df_rec))
+
+
+    if len(df_rec)>0:
+    #             df_rec=df_rec[df_rec.pid==pid]
+        df_rec = df_rec.groupby('event').first().reset_index()
+    #         df_rec = df_rec.reset_index()
+    #         if "mass" not in df_rec.columns:
+        if pid!=0:
+    #             df_rec["mass"] = [PARTICLES.loc[PARTICLES.pid==pid]['mass'].values for pid in df_rec.pid]
+            mass = PARTICLES.loc[PARTICLES.pid==pid]['mass'].values[0]
+            df_rec["mass"] = mass*np.ones(len(df_rec))
+            df_rec = add_kin(df_rec,"p")
+
+    df_mc   = flatten_collection(rdf_rec,args.mc)
+    if pid!=0:
+        df_mc = df_mc[df_mc.pdgID==pid]
+    df_mc  = df_mc[(df_mc.genStatus==1)].reset_index(drop=False)
+    #     df_mc  = df_mc.set_index('event').loc[df['event'].values]
+    print (args.mc,df_mc.columns,len(df_mc))
+    df_mc  = add_kin(df_mc,"ps")
+
+    
+    return df_mc, df_rec,df_fit
+
+    
+## --------tracking performance plots-------------------
+def performance_plot(df,dfc,dfm,dirname=""):
+    
+    dp_lim=5 #%
+    th_lim=0.003 #rad
+    ph_lim=0.003
+    fig, axs = plt.subplots(3, 2, figsize=(20, 14))
+    for ax in axs.flat:
+        ax.tick_params(direction='in', which='both')#, labelsize=20)
+        ax.grid(linestyle=':')
+        ax.set_axisbelow(True)
+        
+    # ---------------tracking efficiency v.s. eta-----------------
+    ax = axs.flat[0]
+    eta_bins = np.linspace(-4, 4, 21)
+
+    
+    sim_eta, _ = np.histogram(dfm['eta'].values, bins=eta_bins)
+    ## ?? outputTrackParameters.direction.phi = mcparticles2.theta???
+    rec_eta, _ = np.histogram(df['eta'], bins=eta_bins)
+    # rec_eta, _ = np.histogram(-np.log(np.tan(df['direction.theta'].values/2.)), bins=eta_bins)
+    track_eff_total = np.sum(rec_eta)/np.sum(sim_eta)
+    
+    eta_centers = (eta_bins[1:] + eta_bins[:-1])/2.
+    eta_binsize = np.mean(np.diff(eta_centers))
+    # cut off not simulated bins
+    mask = sim_eta > 0
+    sim_eta = sim_eta[mask]
+    rec_eta = rec_eta[mask]
+    eta_centers = eta_centers[mask]
+    track_eff = np.array(rec_eta)/np.array(sim_eta)
+    # binary distribution, pq*sqrt(N)
+    # TODO check the errors
+    eff = np.mean(track_eff)
+    track_err = eff*(1. - eff)*np.reciprocal(np.sqrt(sim_eta))
+    # rec_err = eff*(1. - eff)*np.sqrt(rec_eta)
+    # track_eff_lower = track_eff - np.maximum(np.zeros(shape=rec_eta.shape), (rec_eta - rec_err)/sim_eta)
+    # track_eff_upper = np.minimum(np.ones(shape=rec_eta.shape), (rec_eta + rec_err)/sim_eta) - track_eff
+    track_eff_lower = track_eff - np.maximum(np.zeros(shape=rec_eta.shape), track_eff - track_err)
+    track_eff_upper = np.minimum(np.ones(shape=rec_eta.shape), track_eff + track_err) - track_eff
+
+
+    ax.errorbar(eta_centers, track_eff, xerr=eta_binsize/2., yerr=[track_eff_lower, track_eff_upper],
+                fmt='o', capsize=3)
+    ax.set_ylim(0., 1.1)
+    ax.set_xlim(-4.5, 4.5)
+    ax.set_ylabel('Tracking Efficiency')#, fontsize=20)
+    ax.set_xlabel(r'$\eta$')#, fontsize=20)
+    ax.text(-4,1,"recon/generated events= %d / %d =%.2f" %(len(df),len(dfm),len(df)/len(dfm)))
+
+
+
+    
+    # ---------------theta distribution------------
+    ax = axs.flat[1]
+    sim_th_deg =  dfm.groupby('event')['theta'].first().values*180/np.pi
+    rec_th_deg = df.groupby('event')['theta'].first().values*180/np.pi
+
+    hval, hbins, _ = ax.hist(sim_th_deg, bins=np.linspace(-0, 180,61),
+                              ec='k',alpha=0.3,label="Generated")
+    hval, hbins, _ = ax.hist(rec_th_deg, bins=np.linspace(-0,180, 61),
+                              ec='k',alpha=0.3, label="Reconstructed")
+    nbins = hbins.shape[0] - 1
+    # ax.set_ylabel('Counts / {:d} Bins'.format(nbins))#, fontsize=20)
+    ax.set_xlabel(r'$\theta$(degree)')#, fontsize=20)
+    
+
+    # ----------momentum resolution--------------
+    ax = axs.flat[2]
+
+#     rec_p = dfc['charge'].values/df['qOverP'].values
+
+    rec_p = df['p'].values
+    sim_p = dfc['p'].values
+
+    dp_p = (rec_p - sim_p)/sim_p
+#     hval, hbins, _ = ax.hist(dp_p*100., bins=np.linspace(-5, 5, 101),
+#                              weights=np.repeat(1./float(rec_p.shape[0]), rec_p.shape[0]), ec='k')
+    nbins = 100
+
+    sig_p,err_p=hist_gaus(dp_p*100.,ax,np.linspace(-dp_lim, dp_lim, nbins+1),klog=0,header=None)
+    
+    # ax.set_ylabel('Normalized Density / {:d} Bins'.format(nbins))#, fontsize=20)
+    ax.set_xlabel(r'$\delta p$ (%)')#, fontsize=20)
+    
+    
+     # ----------------pT resolution----------------------
+        
+    ax = axs.flat[3]
+    rec_pt = df.groupby('event')['pt'].first().values
+    sim_pt = dfc.groupby('event')['pt'].first().values
+    dp_pt = (rec_pt - sim_pt)/sim_pt*100
+    # print(dfm.head(),dfc.head())
+    # print(np.vstack([rec_th, sim_th]).T)
+    sig_pt,err_pt=hist_gaus(dp_pt,ax,np.linspace(-dp_lim, dp_lim, nbins+1),klog=0,header=None)
+
+    # ax.set_ylabel('Normalized Density / {:d} Bins'.format(nbins))#, fontsize=20)
+    ax.set_xlabel(r'$\delta p_T$(%)')#, fontsize=20)
+
+
+    
+    # ----------------theta resolution----------------------
+    nbins = 100
+
+    ax = axs.flat[4]
+    rec_th = df.groupby('event')['theta'].first().values
+    sim_th = dfc.groupby('event')['theta'].first().values
+
+    # print(dfm.head(),dfc.head())
+    dth_th = (rec_th - sim_th)
+    # print(np.vstack([rec_th, sim_th]).T)
+
+    sig_th,err_th=hist_gaus(dth_th,ax,np.linspace(-th_lim, th_lim, nbins+1),klog=0,header=None)
+
+    # ax.set_ylabel('Normalized Density / {:d} Bins'.format(nbins))#, fontsize=20)
+    ax.set_xlabel(r'$d\theta$ (rad)')#, fontsize=20)
+
+    # -----------------phi resolution----------------------
+    ax = axs.flat[5]
+    rec_th = df.groupby('event')['phi'].first().values
+    sim_th = dfc.groupby('event')['phi'].first().values
+    dth_th = (rec_th - sim_th)
+
+    # print(np.vstack([rec_th, sim_th]).T)
+#     hval, hbins, _ = ax.hist(dth_th, bins=np.linspace(-0.002, 0.002, 101),
+#                              weights=np.repeat(1./float(rec_th.shape[0]), rec_th.shape[0]), ec='k')
+    sig_ph,err_ph=hist_gaus(dth_th,ax,np.linspace(-ph_lim, ph_lim, nbins+1),klog=0,header=None)
+    # ax.set_ylabel('Normalized Counts / {:d} Bins'.format(nbins))#, fontsize=20)
+    ax.set_xlabel(r'$d\phi$ (rad)')#, fontsize=20)
+
+    # eta distribution
+#     ax = axs.flat[4]
+#     sim_eta =  dfm.groupby('event')['eta'].first().values
+#     rec_eta =  -np.log(np.tan(df.groupby('event')['direction.theta'].first().values/2.))
+#     hval, hbins, _ = ax.hist(sim_eta, bins=np.linspace(-4, 4, 41), ec='k',alpha=0.3,label="Generated")
+#     hval, hbins, _ = ax.hist(rec_eta, bins=np.linspace(-4, 4, 41), ec='k',alpha=0.3, label="Reconstructed")
+#     nbins = hbins.shape[0] - 1
+#     ax.set_ylabel('Counts / {:d} Bins'.format(nbins), fontsize=20)
+#     ax.set_xlabel(r'$\eta$', fontsize=20)
+#     ax.legend()
+    # -------phi distribution
+#     ax = axs.flat[6]
+#     sim_th_deg =  dfm.groupby('event')['phi'].first().values*180/np.pi
+#     rec_th_deg = df.groupby('event')['direction.phi'].first().values*180/np.pi
+
+#     hval, hbins, _ = ax.hist(sim_th_deg, bins=np.linspace(-180, 180,61),
+#                               ec='k',alpha=0.3,label="Generated")
+#     hval, hbins, _ = ax.hist(rec_th_deg, bins=np.linspace(-180,180, 61),
+#                               ec='k',alpha=0.3, label="Reconstructed")
+#     nbins = hbins.shape[0] - 1
+#     ax.set_ylabel('Counts / {:d} Bins'.format(nbins), fontsize=20)
+#     ax.set_xlabel(r'$\phi$(degree)', fontsize=20)
+    
+    
+    fig.text(0.5, 0.95, 'Tracking Benchmark (Truth Init.)', fontsize=18, ha='center')
+    fig.savefig(os.path.join(args.outdir, '{}_performance_fit.png'.format(args.nametag)))
+
+    return track_eff_total,sig_p,err_p,sig_pt,err_pt,sig_th,err_th,sig_ph,err_ph
+
+
+def eta_theta(xx, inverse=0):
+    if inverse==1:
+        return np.arctan((np.e)**(-xx))*2
+    else:
+        return -np.log(np.tan(xx/2.))
+
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('rec_file', help='Path to reconstruction output file.')
+    parser.add_argument('-o', dest='outdir', default='.', help='Output directory.')
+    parser.add_argument('-t', '--nametag', type=str, default='tracking', help='Name tag for output files.')
+    parser.add_argument('-m', '--macros', dest='macros', default='rootlogon.C',
+            help='Macro files to be loaded by root, separated by \",\".')
+    parser.add_argument('--mc-collection', dest='mc', default='mcparticles',
+            help='Collection name of MC particles truth info.')
+    parser.add_argument('--rec-collection', dest='rec', default='ReconstructedParticles',
+            help='Collection name of reconstructed particles')
+    parser.add_argument('--tracking-collection', dest='fit', default='outputTrackParameters', help='Collection name of track info from fitting')
+    args = parser.parse_args()
+
+    # multi-threading for RDataFrame
+    nthreads = os.cpu_count()//2
+    ROOT.ROOT.EnableImplicitMT(nthreads)
+    print('CPU available {}, using {}'.format(os.cpu_count(), nthreads))
+
+    os.makedirs(args.outdir, exist_ok=True)
+    rdf=read_data(args,-1) ## -1: optional argument for number of entries to read
+    df_mc,df_rec,df_fit=pre_proc(rdf,0) ## 0: optional argument for pid
+    dfc = df_mc.loc[df_fit["event"].values].reset_index()
+    dfm = df_mc
+    df  = df_fit
+
+    _=performance_plot(df_fit,dfc,df_mc) 
-- 
GitLab