From da2fe86d50849181dfa5b127d2433a1356fa7201 Mon Sep 17 00:00:00 2001
From: Klaus Rabbertz <klaus.rabbertz@cern.ch>
Date: Tue, 30 Jan 2024 10:01:06 +0100
Subject: [PATCH] Update nnlojet-combine from version 29.05.2020 to 22.12.2023

---
 tools/nnlojet/nnlojet-combine.py | 152 ++++++++++--------
 tools/nnlojet/nnlojet_algo.py    |  55 ++++++-
 tools/nnlojet/nnlojet_plot.py    | 143 +++++++++++++++++
 tools/nnlojet/nnlojet_util.py    | 268 ++++++++++++++++++++++++-------
 4 files changed, 485 insertions(+), 133 deletions(-)
 create mode 100755 tools/nnlojet/nnlojet_plot.py

diff --git a/tools/nnlojet/nnlojet-combine.py b/tools/nnlojet/nnlojet-combine.py
index dc7a772f..a770c051 100755
--- a/tools/nnlojet/nnlojet-combine.py
+++ b/tools/nnlojet/nnlojet-combine.py
@@ -8,22 +8,22 @@ import glob
 import re
 import ast
 import multiprocessing as mp
-#import nnlojet_plot
 
 import logging
 from logging.handlers import QueueHandler, QueueListener
 
 from nnlojet_util import NNLOJETHistogram, NNLOJETContainer
+import nnlojet_plot
 
 
 class Task_part():
     # merging options and default values
-    _trim_threshold     = 4.0
-    _trim_max_frac      = 0.05
-    _kscan_maxdev_unwgt = None  # needs another safety feature (inactive by default)
-    _kscan_nsteps       = 3
-    _kscan_maxdev_steps = 0.2
-    _merge_weighted     = True
+    _default_trim_threshold     = 3.5
+    _default_trim_max_frac      = 0.01
+    _default_kscan_maxdev_unwgt = None  # needs another safety feature (inactive by default)
+    _default_kscan_nsteps       = 3
+    _default_kscan_maxdev_steps = 0.5
+    _default_merge_weighted     = True
 
     def __init__(self, **kwargs):
         self._nx = kwargs.get('nx', 3)
@@ -32,6 +32,14 @@ class Task_part():
         self._qweights = kwargs.get('weights', False)
         self._columns = kwargs.get('columns', None)
         self._rebin = kwargs.get('rebin', None)
+        self._cumulant = kwargs.get('cumulant', None)
+        # copy to instance variable to preserve between processes...
+        self._trim_threshold     = Task_part._default_trim_threshold
+        self._trim_max_frac      = Task_part._default_trim_max_frac
+        self._kscan_maxdev_unwgt = Task_part._default_kscan_maxdev_unwgt
+        self._kscan_nsteps       = Task_part._default_kscan_nsteps
+        self._kscan_maxdev_steps = Task_part._default_kscan_maxdev_steps
+        self._merge_weighted     = Task_part._default_merge_weighted
 
     def __str__(self):
         return '{} [{} file(s)]'.format(self._outfile, len(self._files))
@@ -42,13 +50,13 @@ class Task_part():
         container = NNLOJETContainer(size=len(self._files), weights=self._qweights)
         for file in self._files:
             try:
-                container.append(NNLOJETHistogram(nx=self._nx, filename=file, columns=self._columns, rebin=self._rebin))
+                container.append(NNLOJETHistogram(nx=self._nx, filename=file, columns=self._columns, rebin=self._rebin, cumulant=self._cumulant))
             except ValueError as e:
                 print(e)
                 print("error reading file:", file)
 
         # start the merge
-        
+
         outfile_unwgt = self._outfile[:-4] + '.UNWGT.dat'
         hist = container.merge()
         hist.write_to_file(outfile_unwgt)
@@ -83,7 +91,7 @@ def process_parts_queue(parts_queue):
         parts_queue.task_done()
 
 
-def read_APPLfast(wgt_file):
+def read_weights(wgt_file):
     with open(wgt_file, 'rt') as file:
         hist_acc = NNLOJETHistogram()
         nx = 0
@@ -104,7 +112,7 @@ def read_APPLfast(wgt_file):
 def obs_generator(obs_list):
     cross_pattern = re.compile(r'.*cross.*')
     for obs_line in obs_list:
-        obs_parse = [ it.strip() for it in obs_line.split('>') ]
+        obs_parse = [it.strip() for it in obs_line.split('>')]
         if len(obs_parse) == 1:
             obs_in = obs_parse[0]
             obs_out = obs_parse[0]
@@ -114,7 +122,19 @@ def obs_generator(obs_list):
         else:
             raise ValueError('invalid observables specification: {}'.format(obs_line))
         nx = 0 if re.match(cross_pattern, obs_in) else 3
-        yield (obs_line, obs_in, obs_out, nx)
+        #> do we have any cumulant labels?
+        obs_cum = None
+        obs_parse = obs_in.split()
+        if (len(obs_parse) == 2) and (nx == 3):
+            if obs_in == obs_out:
+                raise ValueError('cumulants need name override: {}'.format(obs_line))
+            obs_in = obs_parse[0]
+            obs_cum = obs_parse[1]
+            nx = 1
+        elif len(obs_parse) != 1:
+            raise ValueError('invalid observables specification: {}'.format(obs_line))
+        # print([obs_line, obs_in, obs_out, nx, obs_cum])
+        yield (obs_line, obs_in, obs_out, nx, obs_cum)
 
 
 def main():
@@ -126,9 +146,9 @@ def main():
     # multi-process
     parser.add_argument('-j', '--jobs', type=int, nargs='?', action='store', default='1',
                         help='Specifies the number of jobs to run simultaneously.')
-    # read in APPLfast weight table
-    parser.add_argument('--APPLfast', action='store', default=None,
-                        help='Read in an APPLfast weight table and combine.')
+    # read in weights weight table
+    parser.add_argument('--weights', action='store', default=None,
+                        help='Read in a weight table and combine.')
     # Parse the input arguments!
     args = parser.parse_args()
 
@@ -136,10 +156,10 @@ def main():
     config.optionxform = lambda option: option  # do not convert to lower case
     config.read(args.config)
 
-    # read in APPLfast weight table?
-    wgt_file = args.APPLfast
+    # read in weight table?
+    wgt_file = args.weights
     if wgt_file is not None:
-        read_APPLfast(wgt_file)
+        read_weights(wgt_file)
         return
 
     cross_pattern = re.compile(r'.*cross.*')
@@ -157,12 +177,12 @@ def main():
     qrecursive = config.getboolean('Options', 'recursive', fallback=False)
     print("recursive search: {}".format('ON' if qrecursive else 'OFF'))
 
-    # # debug plots?
-    # qplot = config.getboolean('Options', 'plot', fallback=False)
-    # print("plotting: {}".format('ON' if qplot else 'OFF'))
-    # if qplot:
-    #     outdir_plots = config.get('Paths', 'out_dir') + '/Plots'
-    #     os.makedirs(outdir_plots, exist_ok=True)
+    # debug plots
+    qplot = config.getboolean('Options', 'plot', fallback=True)
+    print("plotting: {}".format('ON' if qplot else 'OFF'))
+    if qplot:
+        outdir_plots = config.get('Paths', 'out_dir') + '/Plots'
+        os.makedirs(outdir_plots, exist_ok=True)
 
     # output weight data for the combined files?
     qweights = config.getboolean('Options', 'weights', fallback=False)
@@ -181,15 +201,15 @@ def main():
         trim = ast.literal_eval(trim)
         if type(trim) is bool:
             if not trim:
-                Task_part._trim_threshold = None
-                Task_part._trim_max_frac  = None
+                Task_part._default_trim_threshold = None
+                Task_part._default_trim_max_frac  = None
         elif type(trim) is int or type(trim) is float:
-            Task_part._trim_threshold = trim
-            Task_part._trim_max_frac  = None
+            Task_part._default_trim_threshold = trim
+            Task_part._default_trim_max_frac  = None
         elif type(trim) is list or type(trim) is tuple:
             if len(trim) == 2:
-                Task_part._trim_threshold = trim[0]
-                Task_part._trim_max_frac  = trim[1]
+                Task_part._default_trim_threshold = trim[0]
+                Task_part._default_trim_max_frac  = trim[1]
             else:
                 print("trim list invalid: {}".format(trim))
                 raise
@@ -197,11 +217,11 @@ def main():
             print("trim option invalid: {}".format(trim))
             raise
     # digest settings
-    print("trim: ({},{})".format(Task_part._trim_threshold, Task_part._trim_max_frac))
+    print("trim: ({},{})".format(Task_part._default_trim_threshold, Task_part._default_trim_max_frac))
 
     # read optional flag to control weighted vs. unweighted average
-    Task_part._merge_weighted = config.getboolean('Options', 'weighted', fallback=True)
-    print("weighted: {}".format(Task_part._merge_weighted))
+    Task_part._default_merge_weighted = config.getboolean('Options', 'weighted', fallback=True)
+    print("weighted: {}".format(Task_part._default_merge_weighted))
 
     # read optional k-optimisation settings
     kscan = config.get('Options', 'k-scan', fallback=None)
@@ -209,22 +229,22 @@ def main():
         kscan = ast.literal_eval(kscan)
         if type(kscan) is bool:
             if not kscan:
-                Task_part._kscan_maxdev_unwgt = None
-                Task_part._kscan_nsteps       = None
-                Task_part._kscan_maxdev_steps = None
+                Task_part._default_kscan_maxdev_unwgt = None
+                Task_part._default_kscan_nsteps       = None
+                Task_part._default_kscan_maxdev_steps = None
         elif type(kscan) is int or type(kscan) is float:
-            Task_part._kscan_maxdev_unwgt = kscan
-            Task_part._kscan_nsteps       = None
-            Task_part._kscan_maxdev_steps = None
+            Task_part._default_kscan_maxdev_unwgt = kscan
+            Task_part._default_kscan_nsteps       = None
+            Task_part._default_kscan_maxdev_steps = None
         elif type(kscan) is list or type(kscan) is tuple:
             if len(kscan) == 2:
-                Task_part._kscan_maxdev_unwgt = None
-                Task_part._kscan_nsteps       = kscan[0]
-                Task_part._kscan_maxdev_steps = kscan[1]
+                Task_part._default_kscan_maxdev_unwgt = None
+                Task_part._default_kscan_nsteps       = kscan[0]
+                Task_part._default_kscan_maxdev_steps = kscan[1]
             elif len(kscan) == 3:
-                Task_part._kscan_maxdev_unwgt = kscan[0]
-                Task_part._kscan_nsteps       = kscan[1]
-                Task_part._kscan_maxdev_steps = kscan[2]
+                Task_part._default_kscan_maxdev_unwgt = kscan[0]
+                Task_part._default_kscan_nsteps       = kscan[1]
+                Task_part._default_kscan_maxdev_steps = kscan[2]
             else:
                 print("k-scan list invalid: {}".format(kscan))
                 raise
@@ -232,7 +252,7 @@ def main():
             print("k-scan option invalid: {}".format(kscan))
             raise
     # digest settings
-    print("k-scan: ({},{},{})".format(Task_part._kscan_maxdev_unwgt, Task_part._kscan_nsteps, Task_part._kscan_maxdev_steps))
+    print("k-scan: ({},{},{})".format(Task_part._default_kscan_maxdev_unwgt, Task_part._default_kscan_nsteps, Task_part._default_kscan_maxdev_steps))
 
     # set up the log file
     #@todo: use package `multiprocessing-logging` in the future?
@@ -280,13 +300,13 @@ def main():
 
 
     print("""
-        
+
 ///////////
 // Parts //
 ///////////
 """)
-#    for (obs_line, obs_in, obs_out, nx) in obs_generator(obs_set):
-    for (obs_line, obs_in, obs_out, nx) in obs_generator(obs_list):
+#    for (obs_line, obs_in, obs_out, nx, obs_cum) in obs_generator(obs_set):
+    for (obs_line, obs_in, obs_out, nx, obs_cum) in obs_generator(obs_list):
         print("processing observable {}...".format(obs_out))
 
         # combine the different observables
@@ -306,7 +326,7 @@ def main():
                 #rebin = [ float(i) for i in rebin ]
                 #print('rebin = ', rebin)
 
-            task = Task_part(nx=nx, files=files, outfile=outfile, weights=qweights, columns=columns, rebin=rebin)
+            task = Task_part(nx=nx, files=files, outfile=outfile, weights=qweights, columns=columns, rebin=rebin, cumulant=obs_cum)
             # print(" > submitting {}".format(outfile))
             parts_queue.put(task)
 
@@ -318,12 +338,12 @@ def main():
     parts_queue.join()
 
 #    # now loop over renames & re-binnings
-#    for (obs_line, obs_in, obs_out, nx) in obs_generator(obs_list):
+#    for (obs_line, obs_in, obs_out, nx, obs_cum) in obs_generator(obs_list):
 #
 #        rebin = config.get('Observables', obs_line, fallback=None)
 #
 #        # skip cases which we already dealt with above
-#        if (rebin is None) and (obs_in == obs_out): 
+#        if (rebin is None) and (obs_in == obs_out):
 #            continue
 #
 #        if rebin is not None:
@@ -343,7 +363,7 @@ def main():
 #            hist.write_to_file(outfile)
 
 
-    # merge 
+    # merge
     if config.has_section('Merge'):
         print("""
 
@@ -351,7 +371,7 @@ def main():
 // Merge //
 ///////////
 """)
-        for (obs_line, obs_in, obs_out, nx) in obs_generator(obs_list):
+        for (obs_line, obs_in, obs_out, nx, obs_cum) in obs_generator(obs_list):
             print("processing observable {}...".format(obs_out))
             for mrg in config.options('Merge'):
                 parts = config.get('Merge', mrg)
@@ -375,6 +395,10 @@ def main():
                         ptfile = outdir_parts + '/' + mappt + '.' + obs_out + '.dat'
                         hist = hist + NNLOJETHistogram(nx=nx, filename=ptfile, weights=qweights)
                     hist.write_to_file(outfile)
+                    # debug plots
+                    if qplot:
+                        with open(outdir_plots + '/' + mrg + '.' + obs_out + '.plus.plt', 'wt') as fout:
+                            print(nnlojet_plot.plot_merge_plus(path='../Parts', obs=obs_out, merge=mrg, parts=' '.join(parts)), file=fout)
 
                 elif "|" in parts:
                     parts = list(map(str.strip, parts.split('|')))
@@ -385,7 +409,7 @@ def main():
                         ptfile = outdir_parts + '/' + mappt + '.' + obs_out + '.dat'
                         hist = hist.overwrite(NNLOJETHistogram(nx=nx, filename=ptfile, weights=qweights))
                     hist.write_to_file(outfile)
-    
+
                 elif "&" in parts:
                     parts = list(map(str.strip, parts.split('&')))
                     print(" > Merge[&]: {} = {}...".format(mrg, parts))
@@ -402,12 +426,10 @@ def main():
                     # weighted average
                     hist = container.merge(weighted=True)
                     hist.write_to_file(outfile)
-                    # # debug plots
-                    # if qplot:
-                    #     reldir_parts = '../Parts'
-                    #     with open(outdir_plots + '/' + mrg + '.' + obs_out + '.plt', 'wt') as fout:
-                    #     print(gnuplot_header(mrg + '.' + obs_out + '.pdf'), file=fout)
-                    #     print(gnuplot_footer(), file=fout)
+                    # debug plots
+                    if qplot:
+                        with open(outdir_plots + '/' + mrg + '.' + obs_out + '.ratio.plt', 'wt') as fout:
+                            print(nnlojet_plot.plot_merge_and(path='../Parts', obs=obs_out, merge=mrg, parts=' '.join(parts)), file=fout)
 
                 else:
                     raise ValueError("couldn't find '|', '&' nor '+' in the merge:", parts)
@@ -419,7 +441,7 @@ def main():
 // Final //
 ///////////
 """)
-    for (obs_line, obs_in, obs_out, nx) in obs_generator(obs_list):
+    for (obs_line, obs_in, obs_out, nx, obs_cum) in obs_generator(obs_list):
         print("processing observable {}...".format(obs_out))
         # assemble final histograms
         for fin in config.options('Final'):
@@ -433,9 +455,9 @@ def main():
             hist.write_to_file(outfile)
             # write to the file
             if qweights:
-                with open(outdir_final + '/' + fin + '.' + obs_out + '.APPLfast.txt', 'wt') as fout:
-                    print(hist.to_APPLfast_weight(), file=fout)
-            
+                with open(outdir_final + '/' + fin + '.' + obs_out + '.weights.txt', 'wt') as fout:
+                    print(hist.to_weights(), file=fout)
+
 
 if __name__ == "__main__":
     main()
diff --git a/tools/nnlojet/nnlojet_algo.py b/tools/nnlojet/nnlojet_algo.py
index c9cae8cb..04c3d439 100755
--- a/tools/nnlojet/nnlojet_algo.py
+++ b/tools/nnlojet/nnlojet_algo.py
@@ -1,6 +1,9 @@
 #!/usr/bin/env python3
 
 import numpy as np
+import math
+
+# import matplotlib.pyplot as plt
 
 
 def is_outlier_MAD(points, thresh=3.5):
@@ -17,22 +20,57 @@ def is_outlier_MAD(points, thresh=3.5):
     return modified_z_score > thresh
 
 
-def is_outlier_doubleMAD(points, thresh=3.5):
+def is_outlier_doubleMAD(points, thresh=3.5, qplt=False):
+    #> maybe even want to swith to the "Harrell-Davis quantile estimator" in the future?
 
     median = np.nanmedian(points)
     abs_diff = np.abs(points - median)
-
     left_mad = np.nanmedian(abs_diff[points <= median])
     right_mad = np.nanmedian(abs_diff[points >= median])
-
     if left_mad == 0. or right_mad == 0.:
+        # means all entries are a constant (propbably zero)
         modified_z_score = 0. * abs_diff
-
-    else: 
-        y_mad = left_mad * np.ones(len(points))
+    else:
+        y_mad = np.zeros(len(points))
+        y_mad[points < median] = left_mad
         y_mad[points > median] = right_mad
+        y_mad[points == median] = 0.5* (left_mad+right_mad)
         modified_z_score = 0.6745 * abs_diff / y_mad
-        modified_z_score[points == median] = 0.
+
+    # if qplt:
+    #     print(">>> doubleMAD [ {} | {} | {} ]".format(left_mad,median,right_mad))
+    #     nbins = 50
+    #     range = [ min(points), max(points) ]
+    #     plt.hist(points, range=range, bins=nbins, log=True, label='all')
+    #     plt.hist(points[modified_z_score > thresh], range=range, bins=nbins, log=True, label='trim')
+    #     plt.show()
+
+    return modified_z_score > thresh
+
+
+def is_outlier_dynMAD(points, thresh=3.5, dsigma=0.6745, qplt=False):
+    frac = math.erf(dsigma / math.sqrt(2.))
+    #print(">>> dsigma = {}; frac = {}".format(dsigma, frac))
+    low, med, upp = np.nanquantile(points,
+                                   [0.5 * (1. - frac), 0.5, 0.5 * (1. + frac)],
+                                   method="normal_unbiased")
+    if low == med or upp == med or med == np.nan:
+        # means all entries are a constant (propbably zero)
+        modified_z_score = np.zeros(len(points))
+    else:
+        y_mad = np.zeros(len(points))
+        y_mad[points < med] = abs(med - low)  # left_mad
+        y_mad[points > med] = abs(upp - med)  # right_mad
+        y_mad[points == med] = 0.5 * abs(upp - low)
+        modified_z_score = dsigma * np.abs(points - med) / y_mad
+
+    # if qplt:
+    #     print(">>> dyn MAD [ {} | {} | {} ]".format(left_mad,median,right_mad))
+    #     nbins = 50
+    #     range = [ min(points), max(points) ]
+    #     plt.hist(points, range=range, bins=nbins, log=True, label='all')
+    #     plt.hist(points[modified_z_score > thresh], range=range, bins=nbins, log=True, label='trim')
+    #     plt.show()
 
     return modified_z_score > thresh
 
@@ -52,3 +90,6 @@ def is_outlier_IQR(points, thresh=1.5):
 
     # return (points < lower_fence) | (points > upper_fence)
     return np.array([ (pt < lower_fence) | (pt > upper_fence) if np.isfinite(pt) else False for pt in points ])
+
+
+
diff --git a/tools/nnlojet/nnlojet_plot.py b/tools/nnlojet/nnlojet_plot.py
new file mode 100755
index 00000000..ba19bdcc
--- /dev/null
+++ b/tools/nnlojet/nnlojet_plot.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+
+
+def plot_merge_and(**kwargs):
+    gnuplot = r'''
+set terminal pdfcairo enhanced color size 10cm,10cm dashed  # font "Iosevka,7" fontscale 0.65
+set encoding utf8
+set style fill transparent pattern border
+set pointintervalbox 0.001
+'''
+    #> insert all the info into the template
+    gnuplot += r'''
+PATH  = '{path}'
+OBS   = '{obs}'
+MERGE = '{merge}'
+PARTS = '{parts}'
+'''.format(**kwargs)
+    gnuplot += r'''
+merged = PATH.'/'.MERGE.'.'.OBS.'.dat'
+
+#> parse the merged histogram file to excract some information on the data structure we're dealing with
+eval system("awk 'BEGIN{bin_acc=0.;bin_num=-1;bin_lst=0.;}$1!~/^#/{bin_num++;if(bin_num>0){bin_acc+=($3-$1)/bin_lst;}bin_lst=$3-$1;}$1~/^#labels:/{printf(\"ncol = %d;\", gensub(/.+\\[([0-9]+)\\]$/, \"\\\\1\", \"g\", $(NF)));}$1~/^#nx:/{printf(\"nx = %d;\", $2);}END{printf(\"bin_fac = %e;\",bin_acc/bin_num);}' ".merged)
+#> only plots for distributions
+if (nx != 3) quit;
+
+label(i) = system(sprintf("awk '$1~/^#labels:/{print $(%d)}' ",i).merged)  # comment prefix causes one offset
+
+#> (x-a)/(x+a) & only take into account error from `x`, values in ‰
+ratio(part) = "< paste ".merged." ".part." | ".sprintf("awk 'BEGIN{nx=%d;ncol=%d;}$1!~/^#/&&$1==$(ncol+1)&&$(nx)==$(ncol+nx){for(i=1;i<=nx;i++){printf(\"%e \",$(i))}for(i=nx+1;i<ncol;i+=2){j=ncol+i;x=$(j);a=$(i);if(x==0.&&a==0.){val=0.;err=0.;}else{val=(x-a)/(x+a);err=$(j+1)*2.*a/(x+a)**2};printf(\" %e %e \",val*1e3,err*1e3);}printf(\"\\n\");}' 2> /dev/null", nx,ncol)
+#> (x-a)/Δa & only take into account error from `x`
+dev(part) = "< paste ".merged." ".part." | ".sprintf("awk 'BEGIN{nx=%d;ncol=%d;}$1!~/^#/&&$1==$(ncol+1)&&$(nx)==$(ncol+nx){for(i=1;i<=nx;i++){printf(\"%e \",$(i))}for(i=nx+1;i<ncol;i+=2){j=ncol+i;x=$(j);a=$(i);if(x==0.&&a==0.){val=0.;err=0.;}else{val=(x-a)/$(i+1);err=$(j+1)/$(i+1)};printf(\" %e %e \",val,err);}printf(\"\\n\");}' 2> /dev/null", nx,ncol)
+
+set output MERGE.'.'.OBS.'.pdf'
+
+if (bin_fac > 1.1) {
+  set log x
+  bin_loc(xlow,xupp,pos) = exp(log(xlow) + pos*(log(xupp)-log(xlow)))
+} else {
+  unset log x
+  bin_loc(xlow,xupp,pos) = xlow + pos*(xupp-xlow)
+}
+set format y '%g'
+
+set lmargin 10
+set key top left horizontal
+unset colorbox  # remove palette box
+
+#> @TODO: injection point of custom commands?
+
+do for [t = 1:(ncol-nx)/2] {
+  col = nx+2*t-1;
+  #set title label(col+1) noenhanced
+  set multiplot layout 2,1 title label(col+1) noenhanced
+  set ylabel '(X − ref) / (X + ref)  [‰]' noenhanced
+  set format x ''
+  unset xlabel
+  plot \
+    ratio(PATH.'/'.MERGE.'.'.OBS.'.dat')." | awk '$1!~/^#/{print $0;$1=$3;print $0;}'" u 1:(column(col)+column(col+1)):(column(col)-column(col+1)) w filledc lc rgbcolor 0 fs transparent solid 0.3 t MERGE noenhanced, \
+    for [i=1:words(PARTS)] ratio(PATH.'/'.word(PARTS, i).'.'.OBS.'.dat') u (bin_loc($1,$3,0.25+i*0.5/(words(PARTS)+1.))):col:(column(col+1)) w err lc palette frac i/(words(PARTS)+1.) pt i ps 0.2 t word(PARTS, i) noenhanced
+  set ylabel '(X − ref) / Δref' noenhanced
+  set format x '%g'
+  set xlabel OBS noenhanced
+  plot \
+    dev(PATH.'/'.MERGE.'.'.OBS.'.dat')." | awk '$1!~/^#/{print $0;$1=$3;print $0;}'" u 1:(column(col)+column(col+1)):(column(col)-column(col+1)) w filledc lc rgbcolor 0 fs transparent solid 0.3 notitle, \
+    for [i=1:words(PARTS)] dev(PATH.'/'.word(PARTS, i).'.'.OBS.'.dat') u (bin_loc($1,$3,0.25+i*0.5/(words(PARTS)+1.))):col:(column(col+1)) w err lc palette frac i/(words(PARTS)+1.) pt i ps 0.2 notitle
+  unset multiplot
+}
+
+unset output
+'''
+    return gnuplot
+
+
+def plot_merge_plus(**kwargs):
+    gnuplot = r'''
+set terminal pdfcairo enhanced color size 10cm,10cm dashed  # font "Iosevka,7" fontscale 0.65
+set encoding utf8
+set style fill transparent pattern border
+set pointintervalbox 0.001
+'''
+    #> insert all the info into the template
+    gnuplot += r'''
+PATH  = '{path}'
+OBS   = '{obs}'
+MERGE = '{merge}'
+PARTS = '{parts}'
+'''.format(**kwargs)
+    gnuplot += r'''
+merged = PATH.'/'.MERGE.'.'.OBS.'.dat'
+
+#> parse the merged histogram file to excract some information on the data structure we're dealing with
+eval system("awk 'BEGIN{bin_acc=0.;bin_num=-1;bin_lst=0.;}$1!~/^#/{bin_num++;if(bin_num>0){bin_acc+=($3-$1)/bin_lst;}bin_lst=$3-$1;}$1~/^#labels:/{printf(\"ncol = %d;\", gensub(/.+\\[([0-9]+)\\]$/, \"\\\\1\", \"g\", $(NF)));}$1~/^#nx:/{printf(\"nx = %d;\", $2);}END{printf(\"bin_fac = %e;\",bin_acc/bin_num);}' ".merged)
+#> only plots for distributions
+if (nx != 3) quit;
+
+label(i) = system(sprintf("awk '$1~/^#labels:/{print $(%d)}' ",i).merged)  # comment prefix causes one offset
+
+#> x/a & only take into account error from `x`, values in %
+ratio(part) = "< paste ".merged." ".part." | ".sprintf("awk 'BEGIN{nx=%d;ncol=%d;}$1!~/^#/&&$1==$(ncol+1)&&$(nx)==$(ncol+nx){for(i=1;i<=nx;i++){printf(\"%e \",$(i))}for(i=nx+1;i<ncol;i+=2){j=ncol+i;x=$(j);a=$(i);if(x==0.&&a==0.){val=0.;err=0.;}else{val=x/a;err=$(j+1)/a};printf(\" %e %e \",val*1e2,err*1e2);}printf(\"\\n\");}' 2> /dev/null", nx,ncol)
+
+set output MERGE.'.'.OBS.'.pdf'
+
+if (bin_fac > 1.1) {
+  set log x
+  bin_loc(xlow,xupp,pos) = exp(log(xlow) + pos*(log(xupp)-log(xlow)))
+} else {
+  unset log x
+  bin_loc(xlow,xupp,pos) = xlow + pos*(xupp-xlow)
+}
+set format y '%g'
+
+set lmargin 10
+set key top left horizontal
+unset colorbox  # remove palette box
+
+#> @TODO: injection point of custom commands
+
+do for [t = 1:(ncol-nx)/2] {
+  col = nx+2*t-1;
+  #set title label(col+1) noenhanced
+  set multiplot layout 2,1 title label(col+1) noenhanced
+  set ylabel 'Δ' noenhanced
+  set format x ''
+  unset xlabel
+  set log y
+  plot \
+    "< awk '$1!~/^#/{print $0;$1=$3;print $0;}' ".merged u 1:(column(col+1)) w l lc rgbcolor 0 t MERGE noenhanced, \
+    for [i=1:words(PARTS)] PATH.'/'.word(PARTS, i).'.'.OBS.'.dat' u (bin_loc($1,$3,0.5)):(column(col+1)) w l lc palette frac i/(words(PARTS)+1.) dt i t word(PARTS, i) noenhanced
+  set ylabel 'X / ref  [%]' noenhanced
+  set format x '%g'
+  set xlabel OBS noenhanced
+  unset log y
+  plot \
+    ratio(PATH.'/'.MERGE.'.'.OBS.'.dat')." | awk '$1!~/^#/{print $0;$1=$3;print $0;}'" u 1:(column(col)+column(col+1)):(column(col)-column(col+1)) w filledc lc rgbcolor 0 fs transparent solid 0.3 notitle, \
+    for [i=1:words(PARTS)] ratio(PATH.'/'.word(PARTS, i).'.'.OBS.'.dat') u (bin_loc($1,$3,0.25+i*0.5/(words(PARTS)+1.))):col:(column(col+1)) w err lc palette frac i/(words(PARTS)+1.) pt i ps 0.2 notitle
+  unset multiplot
+}
+
+unset output
+'''
+    return gnuplot
+
+
diff --git a/tools/nnlojet/nnlojet_util.py b/tools/nnlojet/nnlojet_util.py
index 43bfa858..15628784 100755
--- a/tools/nnlojet/nnlojet_util.py
+++ b/tools/nnlojet/nnlojet_util.py
@@ -37,9 +37,10 @@ class NNLOJETHistogram():
             rebin = kwargs.get('rebin', None)
             if rebin is not None and self.nx <= 0:
                 raise ValueError('rebinning requires x-columns!')
-            self._read_dat(columns=columns, rebin=rebin)
+            cumulant = kwargs.get('cumulant', None)
+            self._read_dat(columns=columns, rebin=rebin, cumulant=cumulant)
 
-        # to keep track of weights
+        #> to keep track of weights
         self._files_wgt = None  # list of the files
         self._wgt = None
         if 'weights' in kwargs and kwargs.get('weights', False):
@@ -72,7 +73,7 @@ class NNLOJETHistogram():
         return NNLOJETHistogram.filename_suff(self.filename)
 
 
-    # without the setter, we do not allow setting the value after init
+    #> without the setter, we do not allow setting the value after init
     # @nx.setter
     # def nx(self, nx): self._nx = nx
 
@@ -81,8 +82,9 @@ class NNLOJETHistogram():
 
         columns = kwargs.get('columns', None)
         rebin = kwargs.get('rebin', None)
+        cumulant = kwargs.get('cumulant', None)
 
-        # step 1: parse label line
+        #> step 1: parse label line
         with open(self.filename, 'rt') as histfile:
             lines = histfile.readlines()
 
@@ -92,21 +94,22 @@ class NNLOJETHistogram():
                 self._labels = [ re.sub(r'\[[0-9]+\]', '', lab) for lab in line.split()[1:] ]  # remove "#labels" part
                 # break
             if re.match(r'^\s*' + _comment_prefix + 'nx', line, re.IGNORECASE):
+                #> nx comment line overrides defaults set by filename
                 self._nx = int(line.split()[1]) # remove "#nx" part
                 # print("using nx = {}".format(self._nx))
         # fh.close()
 
-        # warning if no labels line could be found
+        #> warning if no labels line could be found
         if self._labels is None:
             logging.warning("WARNING: couldn't find a labels line {}".format(self.filename))
             return
 
-        # default init
+        #> default init
         self._ioverflow = None
         nrows = 0
         ncols = len(self._labels)  # raw column size
 
-        # build index list (only y-columns; no x-val)
+        #> build index list (only y-columns; no x-val)
         if columns is not None:
             idx_cols = []
             for col in columns:
@@ -123,8 +126,8 @@ class NNLOJETHistogram():
         else:
             idx_cols = range(self.nx, ncols, 2)
 
-        # step 2: read in the data
-        comment_pattern = re.compile(r'^\s*' + _comment_prefix) 
+        #> step 2: read in the data
+        comment_pattern = re.compile(r'^\s*' + _comment_prefix)
 
         # dynamic arrays: python lists so much faster than numpy arrays
         data_xval = [] if self.nx > 0 else None
@@ -135,10 +138,10 @@ class NNLOJETHistogram():
         for line in lines:
             #----- parse comments
             if re.match(comment_pattern, line):
-                # neval
+                #> neval
                 if re.match(r'^\s*' + _comment_prefix + 'neval', line, re.IGNORECASE):
                     self._neval = int(line.split()[1])
-                # overflow
+                #> overflow
                 if re.match(r'^\s*' + _comment_prefix + 'overflow', line, re.IGNORECASE):
                     data = line.split()  # make a list
                     if len(data) != ncols:
@@ -161,13 +164,13 @@ class NNLOJETHistogram():
                 nrows += 1
         # fh.close()
 
-        # set neval to 1 if there was none
+        #> set neval to 1 if there was none
         if self._neval is None:
             raise ValueError("couldn't find the neval information")
             print("couldn't find the neval info: {}".format(self.filename), file=sys.stderr)
             self._neval = 1
-        
-        # step 3: rebin 
+
+        #> step 3: rebin
         if rebin is not None:
             if self.nx != 3:
                 raise ValueError('rebinning can only handle nx=3 for now')
@@ -179,36 +182,42 @@ class NNLOJETHistogram():
             i_rebin = 0
             i_overflow = None
             for i_row in range(nrows):
-                # overflow is added immediately
+                #> overflow is added immediately
                 if i_row == self._ioverflow:
                     rebin_xval.append(data_xval[i_row])
                     rebin_yval.append(data_yval[i_row])
                     rebin_yerr.append(data_yerr[i_row])
                     i_overflow = i_rebin
+                    #> in order to merge with "left-over" bins, need to accumulate things properly
+                    for i_col in range(len(rebin_yval[i_overflow])):
+                        rebin_yval[i_overflow][i_col] =  self._neval * data_yval[i_row][i_col]
+                        rebin_yerr[i_overflow][i_col] = (self._neval * data_yerr[i_row][i_col])**2 + self._neval * (data_yval[i_row][i_col])**2
                     #> do *not* increment! otherwise, we lose a bin
                     #i_rebin += 1
                     continue
                 # # debug
                 # print("data_xval[{}][0] = ".format(i_row), type(data_xval[i_row][0]), data_xval[i_row][0])
                 # print("rebin[i_rebin] = ".format(i_rebin), type(rebin[i_rebin]), rebin[i_rebin])
-                # find the first matching bin
-                if data_xval[i_row][0] < rebin[i_rebin] and data_xval[i_row][2] <= rebin[i_rebin]:
-                    continue
-                # skip trailing bins
-                if i_rebin == len(rebin)-1:
+                # find the first matching bin or deal with the trailing bins
+                if (data_xval[i_row][0] < rebin[i_rebin] and data_xval[i_row][2] <= rebin[i_rebin]) or (i_rebin >= len(rebin)-1):
+                    #> these are "left-over" bins and should be accumulated into the overflow
+                    delta = data_xval[i_row][2] - data_xval[i_row][0]
+                    for i_col in range(len(rebin_yval[i_overflow])):
+                        rebin_yval[i_overflow][i_col] +=  self._neval * delta*data_yval[i_row][i_col]
+                        rebin_yerr[i_overflow][i_col] += (self._neval * delta*data_yerr[i_row][i_col])**2 + self._neval * (delta*data_yval[i_row][i_col])**2
                     continue
-                # init
+                #> init
                 if xlow is None:
                     rebin_row_val = [0.] * len(data_yval[i_row])
                     rebin_row_err = [0.] * len(data_yval[i_row])
                     xlow = data_xval[i_row][0]
-                # accumulate data
+                #> accumulate data
                 if data_xval[i_row][0] >= rebin[i_rebin] and data_xval[i_row][2] <= rebin[i_rebin+1]:
                     delta = data_xval[i_row][2] - data_xval[i_row][0]
                     for i_col in range(len(rebin_row_val)):
                         rebin_row_val[i_col] +=  self._neval * delta*data_yval[i_row][i_col]
                         rebin_row_err[i_col] += (self._neval * delta*data_yerr[i_row][i_col])**2 + self._neval * (delta*data_yval[i_row][i_col])**2
-                # finalise
+                #> finalise row
                 if i_row == nrows-1 or data_xval[i_row+1][0] >= rebin[i_rebin+1]:
                     xupp = data_xval[i_row][2]
                     delta = xupp - xlow
@@ -218,33 +227,84 @@ class NNLOJETHistogram():
                     for i_col in range(len(rebin_row_val)):
                         rebin_row_val[i_col] /= self._neval
                         rebin_row_err[i_col] = math.sqrt( rebin_row_err[i_col] - self._neval * rebin_row_val[i_col]**2 ) / self._neval
-                        # differential:
+                        #> differential:
                         rebin_row_val[i_col] /= delta
                         rebin_row_err[i_col] /= delta
                     rebin_yval.append(rebin_row_val)
                     rebin_yerr.append(rebin_row_err)
                     i_rebin += 1
-                    # reset accumulators
+                    #> reset accumulators
                     rebin_row_val, rebin_row_err = None, None
                     xlow, xupp = None, None
                     continue
 
+            # finalise the overflow bin
+            for i_col in range(len(rebin_yval[i_overflow])):
+                rebin_yval[i_overflow][i_col] /= self._neval
+                rebin_yerr[i_overflow][i_col] = math.sqrt( rebin_yerr[i_overflow][i_col] - self._neval * rebin_yval[i_overflow][i_col]**2 ) / self._neval
             self._ioverflow = i_overflow
             data_xval = rebin_xval
             data_yval = rebin_yval
             data_yerr = rebin_yerr
 
-        # step 4: store to member vars as numpy arrays
+        #> step 4: cumulant option
+        if cumulant is not None:
+            if self.nx != 3:
+                raise ValueError('cumulant can only handle nx=3 for now')
+            if (cumulant == '+') or (cumulant == '++'):
+                cum_range = reversed(range(len(data_yval)))
+                cum_xval = [[data_xval[i_row][0]] for i_row in range(len(data_yval))]
+                if self._labels is not None:
+                    self._labels.pop(1)
+                    self._labels.pop(1)
+            elif (cumulant == '-') or (cumulant == '--'):
+                cum_range = range(len(data_yval))
+                cum_xval = [[data_xval[i_row][2]] for i_row in range(len(data_yval))]
+                if self._labels is not None:
+                    self._labels.pop(0)
+                    self._labels.pop(0)
+            else:
+                raise ValueError('cumulant label invalid: {}'.format(cumulant))
+            #> cumulants only have one x-value
+            self._nx = 1
+            #> the accumulator & new x-values
+            cum_row_val = [0.] * len(data_yval[0])
+            cum_row_err = [0.] * len(data_yval[0])
+            #> deal with the overflow first
+            if (cumulant == '++') or (cumulant == '--'):
+                i_overflow = self._ioverflow
+                for i_col in range(len(data_yval[i_overflow])):
+                    cum_row_val[i_col] +=  self._neval * data_yval[i_overflow][i_col]
+                    cum_row_err[i_col] += (self._neval * data_yerr[i_overflow][i_col])**2 + self._neval * (data_yval[i_overflow][i_col])**2
+                    #> used up => reset
+                    data_yval[i_overflow][i_col] = 0.
+                    data_yerr[i_overflow][i_col] = 0.
+            #> sum up the bins with the order determined above
+            for i_row in cum_range:
+                if i_row == self._ioverflow:
+                    continue
+                delta = data_xval[i_row][2] - data_xval[i_row][0]
+                for i_col in range(len(data_yval[i_row])):
+                    cum_row_val[i_col] +=  self._neval * delta*data_yval[i_row][i_col]
+                    cum_row_err[i_col] += (self._neval * delta*data_yerr[i_row][i_col])**2 + self._neval * (delta*data_yval[i_row][i_col])**2
+                    #> overwrite
+                    data_yval[i_row][i_col] = cum_row_val[i_col] / self._neval
+                    data_yerr[i_row][i_col] = math.sqrt( cum_row_err[i_col] - cum_row_val[i_col]**2 / self._neval ) / self._neval
+            #> cumulants only have one x-value: overwrite
+            data_xval = cum_xval
+            self._nx = 1
+
+        #> step 5: store to member vars as numpy arrays
         if data_xval is not None:
-            self._xval = np.array(data_xval, dtype='float') 
-        self._yval = np.array(data_yval, dtype='float') 
-        self._yerr = np.array(data_yerr, dtype='float') 
+            self._xval = np.array(data_xval, dtype='float')
+        self._yval = np.array(data_yval, dtype='float')
+        self._yerr = np.array(data_yerr, dtype='float')
 
-        # warning if all entries are zero
+        #> warning if all entries are zero
         if (not np.any(self._yval)) and (not np.any(self._yerr)):
             logging.warning("WARNING: all entries zero for {}".format(self.filename))
 
-        # warning if nan or inf
+        #> warning if nan or inf
         if np.any(np.logical_not(np.isfinite(self._yval))) or np.any(np.logical_not(np.isfinite(self._yerr))):
             logging.warning("WARNING: NaN or Inf encountered in {}".format(self.filename))
 
@@ -313,7 +373,7 @@ class NNLOJETHistogram():
         return '\n'.join(lines)
 
 
-    def to_APPLfast_weight(self):
+    def to_weights(self):
         # construct string line by line
         lines = []
 
@@ -494,7 +554,7 @@ class NNLOJETContainer():
         self._mask = None
 
         # it's much more efficient pre-allocating a sufficiently
-        # big array, instead of dynamically growingit
+        # big array, instead of dynamically adjusting it
         self._buffer_size = kwargs.get('size', None)
         self._size = 0
 
@@ -545,7 +605,7 @@ class NNLOJETContainer():
                 self._yerr = np.zeros((ncols, nrows, self._buffer_size), dtype='float')
                 self._mask = np.ones((ncols, nrows, self._buffer_size), dtype='int')
                 self._mask *= 2  # means invalid
-                # copy over 
+                # copy over
                 if self._size != 0:
                     raise ValueError('first append and non-zero size?!')
                 self._yval[:, :, self._size] = hist._yval[:, :]
@@ -579,7 +639,7 @@ class NNLOJETContainer():
                 self._yerr[:, :, self._size] = hist._yerr[:, :]
                 self._mask[:, :, self._size] = np.logical_or(np.logical_not(np.isfinite(hist._yval[:, :])), np.logical_not(np.isfinite(hist._yerr[:, :]))).astype(int) * 2
                 # self._mask[:, :, self._size] = np.isnan(hist._yerr[:, :]).astype(int) * 2
-                self._size += 1                
+                self._size += 1
             else:
                 # the inefficient implementation...
                 self._yval = np.append(self._yval, np.expand_dims(hist._yval, axis=2), axis=2)
@@ -593,7 +653,7 @@ class NNLOJETContainer():
 
 
     def _recursive_k_weights(self, i_row, i_col, wgts_in):
-        n_files = len(wgts_in)        
+        n_files = len(wgts_in)
         wgts_out = [0.] * n_files
 
         leafs = [ [] for i in range(n_files) ]
@@ -642,7 +702,7 @@ class NNLOJETContainer():
 #     def _recursive_k_weights(self, i_row, i_col, wgts_in):
 #         wgts_out = np.zeros(wgts_in.shape)
 #         n_files = len(wgts_in)
-# 
+#
 #         # recursively search where the runs have been merged into
 #         # do this once at the beginning to save time (one inner loop)
 #         targets = - np.ones(wgts_in.shape)
@@ -657,10 +717,10 @@ class NNLOJETContainer():
 #             if self._mask[i_row, i_col, target] > 0:
 #                 raise ValueError('leaf terminating on a trimmed entry?!')
 #             targets[i_file] = target
-# 
+#
 #         for i_file in range(n_files):
 #             # find all the leafs that were merged into current `i_file`
-#             i_leafs = set(np.where(targets == i_file)[0])            
+#             i_leafs = set(np.where(targets == i_file)[0])
 #             # compute ntot needed for the unweighted partial weight
 #             ntot = 0
 #             for i_leaf in i_leafs:
@@ -668,7 +728,7 @@ class NNLOJETContainer():
 #             # take the weight in `i_file` and distribute them
 #             for i_leaf in i_leafs:
 #                 wgts_out[i_leaf] = wgts_in[i_file] * self._neval[i_leaf] / ntot
-# 
+#
 #         return wgts_out
 
 
@@ -694,7 +754,7 @@ class NNLOJETContainer():
 
         if ntot != 0:
             yval /= ntot
-            if self._qwgt and not skip_wgt: 
+            if self._qwgt and not skip_wgt:
                 wgts /= ntot
                 wgts = self._recursive_k_weights(i_row, i_col, wgts)
             yerr = math.sqrt( yerr - ntot * yval**2 ) / ntot
@@ -765,18 +825,18 @@ class NNLOJETContainer():
             for i_col in range(n_cols):
 
                 if weighted:
-                    ( result._yval[i_row, i_col], 
-                      result._yerr[i_row, i_col], 
+                    ( result._yval[i_row, i_col],
+                      result._yerr[i_row, i_col],
                       wgts ) = self._merge_weighted_bin(i_row, i_col)
 
                 else:
-                    ( result._yval[i_row, i_col], 
-                      result._yerr[i_row, i_col], 
+                    ( result._yval[i_row, i_col],
+                      result._yerr[i_row, i_col],
                       wgts ) = self._merge_bin(i_row, i_col)
 
                 if self._qwgt:
                     result._wgt[i_row, i_col, :] = wgts[:]
-        
+
         return result
 
 
@@ -790,7 +850,7 @@ class NNLOJETContainer():
         #     self._buffer_size[:, :, self._size:] = 1
 
 
-    def mask_outliers(self, thresh=4., maxfrac=0.05):
+    def mask_outliers(self, thresh=3.5, maxfrac=0.01):
         if thresh <= 0.:
             raise ValueError("non-positive threshold value: {}".format(thresh))
         if maxfrac <= 0. or maxfrac >= 1.:
@@ -799,40 +859,125 @@ class NNLOJETContainer():
         self.unmask()
 
         # what about running this *after* optimise_k ?! (mask[i,j,k] < 0)
+        # -> probably should avoid it as neval information is not saved after k-opt
 
         (n_rows, n_cols, n_files) = self._yval.shape
         if self._buffer_size is not None:
             n_files = self._size
 
+        #> leave room for one outlier
+        if n_files > 1:
+            maxfrac = max(maxfrac, 1.5/n_files)
+
+        # # for an interval [med-fac*sigma, med+fac*sigma] of a normal distribution
+        # # the fraction of accepted jobs is:  erf(fac/sqrt(2))
+        # # we will match the l-estimator at 1/2 of the threshold (to have something dynamical)
+        # fac = thresh/2.
+        # frac = math.erf(fac/math.sqrt(2.))
+        # # match where we can accommodate an outlier
+        # while (1.-frac < 1./n_files) and (frac > 0.5):
+        #     fac /= 1.1
+        #     frac = math.erf(fac/math.sqrt(2.))
+        # # want to have at least the median version (= double mad)
+        # if frac < 0.5:
+        #     fac = 0.6745
+        #     frac = 0.5
+        # print(">>> fac = {};  frac = {}; 1/n = {}".format(fac, frac, 1./n_files))
+
+        # for an interval [med-fac*sigma, med+fac*sigma] of a normal distribution
+        # the fraction of accepted jobs is:  erf(fac/sqrt(2))
+        fac = thresh
+        frac = math.erf(fac/math.sqrt(2.))
+        # match where we can accommodate outliers
+        while (1.-frac < maxfrac):
+            fac /= 1.05
+            frac = math.erf(fac/math.sqrt(2.))
+        #> done: now match at a lower point but don't go lower than the median
+        fac = max(0.5*fac, 0.6745)
+        frac = math.erf(fac/math.sqrt(2.))
+        # print(">>> fac = {};  frac = {}; 1/N = {}".format(fac, frac, 1./n_files))
+
+        bin_data = np.zeros(n_files, dtype='float')  # single bin data
         for i_row in range(n_rows):
             for i_col in range(n_cols):
+                #> copy data & mask with nan where necessary
+                bin_data = np.where(
+                    np.logical_and(self._yerr[i_row, i_col, :] != 0,
+                                   self._mask[i_row, i_col, :] == 0),
+                    self._yval[i_row, i_col, :], np.nan)
+                if np.all(np.isnan(bin_data)):
+                    continue
+                # print("+++++++++ ({},{})".format(i_row, i_col))
+                # print("mask: {}".format(self._mask[i_row, i_col, :]))
+                # print("yerr: {}".format(self._yerr[i_row, i_col, :]))
+                # print("yval: {}".format(self._yval[i_row, i_col, :]))
+                # print("data: {}".format(bin_data))
+                # print("+++++++++")
                 dyn_thresh = thresh
-                # mask = nnlojet_algo.is_outlier_doubleMAD(self._yval[i_row, i_col, :], dyn_thresh).astype(int)
-                mask = nnlojet_algo.is_outlier_IQR(self._yval[i_row, i_col, :], dyn_thresh).astype(int)
+                dyn_fac = fac
+                # mask = nnlojet_algo.is_outlier_doubleMAD(bin_data, dyn_thresh)
+                # mask = nnlojet_algo.is_outlier_IQR(bin_data, dyn_thresh)
+                mask = nnlojet_algo.is_outlier_dynMAD(bin_data, dyn_thresh, dyn_fac)
                 cutfrac = np.count_nonzero(mask) / n_files
-                # print("#", i_row, i_col, dyn_thresh, cutfrac, maxfrac, file=sys.stderr)
-                while cutfrac > maxfrac:
+                #zerofrac = np.count_nonzero(self._yerr[i_row, i_col, :] == 0.) / n_files
+                #print("#", i_row, i_col, dyn_thresh, cutfrac, maxfrac, zerofrac, file=sys.stderr)
+                while (cutfrac > maxfrac) and (dyn_fac < 2.*fac):
                     dyn_thresh *= 1.1
-                    if dyn_thresh > 100.:
-                        # unmask this bin for safety?
-                        break
-                    # mask = nnlojet_algo.is_outlier_doubleMAD(self._yval[i_row, i_col, :], dyn_thresh).astype(int)
-                    mask = nnlojet_algo.is_outlier_IQR(self._yval[i_row, i_col, :], dyn_thresh).astype(int)
+                    dyn_fac *= 1.05
+                    # mask = nnlojet_algo.is_outlier_doubleMAD(bin_data, dyn_thresh)
+                    # mask = nnlojet_algo.is_outlier_IQR(bin_data, dyn_thresh)
+                    mask = nnlojet_algo.is_outlier_dynMAD(bin_data, dyn_thresh, dyn_fac)
                     cutfrac = np.count_nonzero(mask) / n_files
-                    # print(">", i_row, i_col, dyn_thresh, cutfrac, maxfrac, file=sys.stderr)
-                # print("#", i_row, i_col, dyn_thresh, cutfrac, maxfrac, file=sys.stderr)
+                    # print(">", i_row, i_col, dyn_thresh, dyn_fac, cutfrac, maxfrac, file=sys.stderr)
+                # print("#", i_row, i_col, dyn_thresh, dyn_fac, cutfrac, maxfrac, file=sys.stderr)
+                # mask = nnlojet_algo.is_outlier_dynMAD(bin_data, dyn_thresh, dyn_fac, True)
+                # print("out:  {}".format(mask))
+                # print("+++++++++")
                 self._mask[i_row, i_col, mask == True] = 1
 
 
+#    def mask_outliers(self, thresh=4., maxfrac=0.05):
+#        if thresh <= 0.:
+#            raise ValueError("non-positive threshold value: {}".format(thresh))
+#        if maxfrac <= 0. or maxfrac >= 1.:
+#            raise ValueError("invalid maxfrac range: {}".format(maxfrac))
+#
+#        self.unmask()
+#
+#        # what about running this *after* optimise_k ?! (mask[i,j,k] < 0)
+#
+#        (n_rows, n_cols, n_files) = self._yval.shape
+#        if self._buffer_size is not None:
+#            n_files = self._size
+#
+#        for i_row in range(n_rows):
+#            for i_col in range(n_cols):
+#                dyn_thresh = thresh
+#                mask = nnlojet_algo.is_outlier_doubleMAD(self._yval[i_row, i_col, :], dyn_thresh).astype(int)
+#                # mask = nnlojet_algo.is_outlier_IQR(self._yval[i_row, i_col, :], dyn_thresh).astype(int)
+#                cutfrac = np.count_nonzero(mask) / n_files
+#                # print("#", i_row, i_col, dyn_thresh, cutfrac, maxfrac, file=sys.stderr)
+#                while cutfrac > maxfrac:
+#                    dyn_thresh *= 1.1
+#                    mask = nnlojet_algo.is_outlier_doubleMAD(self._yval[i_row, i_col, :], dyn_thresh).astype(int)
+#                    # mask = nnlojet_algo.is_outlier_IQR(self._yval[i_row, i_col, :], dyn_thresh).astype(int)
+#                    # if dyn_thresh > 100.:
+#                    #     # unmask this bin for safety?
+#                    #     break
+#                    cutfrac = np.count_nonzero(mask) / n_files
+#                    # print(">", i_row, i_col, dyn_thresh, cutfrac, maxfrac, file=sys.stderr)
+#                # print("#", i_row, i_col, dyn_thresh, cutfrac, maxfrac, file=sys.stderr)
+#                self._mask[i_row, i_col, mask == True] = 1
+
     def optimise_k(self, **kwargs):
 
         # parse arguments for the merge settings
         maxdev_unwgt = kwargs.get('maxdev_unwgt', None)
         maxdev_steps = kwargs.get('maxdev_steps', None)
-        nsteps = kwargs.get('nsteps', None)
+        nsteps = kwargs.get('nsteps', 0)
 
         # set up params
-        if nsteps is None: maxdev_steps = None
+        if nsteps == 0: maxdev_steps = None
         if maxdev_unwgt is None and maxdev_steps is None: return
             # raise ValueError("optimise_k termination condition missing")
 
@@ -883,6 +1028,7 @@ class NNLOJETContainer():
                             for jstep in range(istep-1,-nsteps-1,-1):
                                 delta = abs(history[istep][0] - history[jstep][0])
                                 sigma = math.sqrt(history[istep][1]**2 + history[jstep][1]**2)
+                                # sigma = max(abs(history[istep][1]-history[jstep][1]),min(history[istep][1],history[jstep][1]))
                                 term_cond2 = term_cond2 and delta < maxdev_steps * sigma
 
                     # --- termination of merges ---
@@ -917,7 +1063,7 @@ class NNLOJETContainer():
                         # skip invalid upper
                         while iupp > 0:
                             upp = sort_index[iupp]
-                            if (self._mask[i_row, i_col, upp] == 0 and 
+                            if (self._mask[i_row, i_col, upp] == 0 and
                                 neval[upp] != 0):
                                 break
                             iupp -= 1
-- 
GitLab