Work on event rate summary plot

80e8ee4f · Klaus Rabbertz · a026f022 · 80e8ee4f
Commit 80e8ee4f authored 4 years ago by Klaus Rabbertz
--- a/tools/plotting/fastnnlo_runtime.py
+++ b/tools/plotting/fastnnlo_runtime.py
@@ -4,6 +4,8 @@
 import glob
 import argparse
 import glob
+import os
+import re
 import sys
 import matplotlib as mpl
 import matplotlib.gridspec as gridspec
@@ -51,6 +53,14 @@ import matplotlib.pyplot as plt
 # numpy
 import numpy as np

+
+# ScalarFormatter variant that labels every tick with the fixed format '%1.2f'
+class ScalarFormatterForceFormat(ScalarFormatter):
+    # Override the format-selection hook; both arguments are unused and defaulted so the hook also works when Matplotlib calls it without arguments.
+    def _set_format(self, vmin=None, vmax=None):
+        self.format = "%1.2f"  # always two decimals, whatever the tick range
+
+
 # Action class to allow comma-separated (or empty) list in options
 class SplitArgs(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
@@ -62,6 +72,9 @@ class SplitArgs(argparse.Action):
 # Some global definitions
 _debug = False
 _formats = {'eps': 0, 'pdf': 1, 'png': 2, 'svg': 3}
+_channels = ['LO', 'R', 'V', 'RRa', 'RRb', 'RV', 'VV', 'ALL']  # known channel names, in plotting order
+_channel_number = {'LO': 0, 'R': 1, 'V': 2, 'RRa': 3, 'RRb': 4, 'RV': 5, 'VV': 6}  # position of each channel in _channels and _channel_colors; no entry for 'ALL'
+_channel_colors = ['tab:green', 'tab:cyan', 'tab:blue', 'tab:red', 'tab:orange', 'tab:pink', 'tab:purple']  # histogram colour per channel number

 #####################################################################################

@@ -77,7 +90,16 @@ def main():
    args = arguments()

    # extract correct paths for input and outputfiles
-    logfiles = get_files(args['logfiles'])
+    files = get_files(args['logfiles'])
+    logfiles = []
+    runfiles = []
+    for file in files:
+        runfile = re.sub(r'\.log$', '.run', file)  # matching runcard: same path with a literal '.log' suffix replaced by '.run'
+        if not os.path.isfile(runfile):
+            print('[fastnnlo_runtime]: WARNING! No matching runcard found for log file {}, skipped!'.format(file))
+        else:
+            logfiles.append(file)
+            runfiles.append(runfile)
    outputpath = args['output']
    print('[fastnnlo_runtime]: Output path argument is: {}'.format(outputpath))
    outputname = args['filename']
@@ -100,15 +122,17 @@ def main():
    # get all the information from logfiles as dict
    # dict contains: runtime, runtime_unit, channel, events
    loginformation = get_loginformation(logfiles)
+    runinformation = get_runinformation(runfiles)
+    info = {**loginformation, **runinformation}

    # plot all the information
    if args['CPUtime']:
-        plot_elapsed_time(loginformation, outputpath, outputname, formats)
+        plot_elapsed_time(info, outputpath, outputname, formats)
    if args['Events']:
-        plot_events_per_hour(loginformation, outputpath, outputname, formats)
+        plot_events_per_hour(info, outputpath, outputname, formats)
    if not args['CPUtime'] and not args['Events']:
-        plot_elapsed_time(loginformation, outputpath, outputname, formats)
-        plot_events_per_hour(loginformation, outputpath, outputname, formats)
+        plot_elapsed_time(info, outputpath, outputname, formats)
+        plot_events_per_hour(info, outputpath, outputname, formats)

    exit(0)

@@ -143,17 +167,16 @@ def get_files(files):
            print('fastnnlo_runtime: ERROR! Aborted, only one log file found: {}'.format(files[0]))
            exit(3)

+    # sort unsorted glob list
+    files.sort()
    return files

 def get_loginformation(files):

-    run_time = []
-    number_events = []
-    channel = None
+    runtimes = []

    for file in files:
-        event = False
-        run_time_temp = []
+        runtimes_temp = []

        with open(file) as origin:
            for line in origin:
@@ -165,138 +188,182 @@ def get_loginformation(files):
                    seconds = float(line[3])

                    if hours != 0.:
-                        run_time_temp.append(hours + minutes/60 + seconds/360)
+                        runtimes_temp.append(hours + minutes/60 + seconds/3600)  # 3600 seconds per hour
                        unit = 'hours'
                    else:
-                        run_time_temp.append(minutes + seconds/60)
+                        runtimes_temp.append(minutes + seconds/60)
                        unit = 'minutes'

-                # extract channel name
-                if 'Tablename' in line and not channel:
-                    line = line.split()
-                    tablename = line[2].split('.')
-                    channel = tablename[0] + '.' + tablename[1]
-                # extract total events
-                if 'ncalltot=' in line and not event:
-                    line = line.split(',')
-                    number_events.append(float(line[4][10:]))
-                    event = True
+        runtimes.append(runtimes_temp[-1])
+
+    runtimes = np.array(runtimes)
+
+    information = {
+        'runtime': runtimes,
+        'runtime_unit': unit
+    }

-        run_time.append(run_time_temp[-1])
+    return information

+def get_runinformation(files):  # collect event counts and channel names from the given runcard files

-    run_time = np.array(run_time)
-    number_events = np.array(number_events)
+    nevents  = []  # first token of every 'Number of events' line, kept as a string
+    channels = []  # part before the first '-' in the first token of every 'Job name id' line
+
+    for file in files:
+        with open(file) as origin:
+            for line in origin:
                # extract total events
+                if 'Number of events' in line:
+                    line = line.split()
+                    nevents.append(line[0])
+                if 'Job name id' in line:  # extract channel name
+                    line  = line.split()
+                    parts = line[0].split('-')
+                    channels.append(parts[0])
+
+    nevents  = np.array(nevents)
+    channels = np.array(channels)

    information = {
-        'runtime': run_time,
-        'runtime_unit': unit,
-        'channel': channel,
-        'events': number_events
+        'events': nevents,
+        'channels': channels
    }

    return information

-def plot_elapsed_time(informationdict, out_path, out_name, formats):
+def plot_elapsed_time(infodict, out_path, out_name, formats):  # histogram the per-job run times and save the figure once per requested format

-    time = informationdict['runtime']
-    unit = informationdict['runtime_unit']
-    channel = informationdict['channel']
-    basename = 'runtime'
+    times = infodict['runtime']
+    unit = infodict['runtime_unit']
+    channels = infodict['channels']

    # get relevant values
-    mean = np.mean(time)
-    std = np.std(time)
-    median = np.median(time)
-    iqd = np.subtract(*np.percentile(time, [75, 25], interpolation='linear'))/2.
+    mean = np.mean(times)
+    std = np.std(times)
+    median = np.median(times)
+    iqd = np.subtract(*np.percentile(times, [75, 25]))/2.  # half the interquartile range; linear interpolation is the default

-    CPUtime = np.sum(time) / (1 if unit == 'hours' else 60)
+    CPUtime = np.sum(times) / (1 if unit == 'hours' else 60)

    # set figure
    fig = plt.figure(figsize=(16, 12))
    ax = fig.gca()

    # plot histogram
-    n, batches, _ = ax.hist(time, bins=20, color='deepskyblue', edgecolor='black', label='Total CPU time: {0:0.0f} hours'.format(CPUtime))
+    n, batches, _ = ax.hist(times, bins=20, color='deepskyblue', edgecolor='black', label='Total CPU time: {0:0.0f} hours'.format(CPUtime))

    # plot mean and median
    ax.vlines(mean, 0, max(n), colors='red', linestyles='dashed', label=r'Mean: {0:0.1f}$\pm${2:0.1f} {1}'.format(mean, unit, std))
-    ax.vlines(median, 0, max(n), colors='green', linestyles='dashed', label=r'Median: {0:0.1f}$\pm${2:0.1f} {1}'.format(median, unit, iqd))
+    ax.vlines(median, 0, max(n), colors='green', linestyles='dashdot', label=r'Median: {0:0.1f}$\pm${2:0.1f} {1}'.format(median, unit, iqd))

    # finish and save figure
-    chnlabel = channel
+    chnlabel = channels[0]
    if out_name:
        chnlabel = out_name
    ax.set_title('Elapsed time of ' + chnlabel + ' production', fontsize=20)
    ax.set_xlabel('CPU time [' + unit + ']', horizontalalignment='right', x=1.0, verticalalignment='top', y=1.0, fontsize=20)
-    ax.set_ylabel('frequency', horizontalalignment='right', x=1.0, verticalalignment='top', y=1.0, fontsize=20)
+    ax.set_ylabel('# jobs', horizontalalignment='right', x=1.0, verticalalignment='top', y=1.0, fontsize=20, labelpad=20)
    ax.set_yscale('log')
    ax.tick_params(axis='both', which='major', labelsize=20)
+    ax.ticklabel_format(axis='x', style='plain', useOffset=False)

    ax.legend(loc='best', fontsize=20)
    ax.grid()
    ax.set_axisbelow(True)

    # set saving location
+    basename = 'runtime'
    for fmt in formats:
        filename = out_path + ('' if out_path[-1] == '/' else '/')
        if out_name:
            filename += out_name + '.' + basename + '.' + fmt
        else:
-            filename += channel  + '.' + basename + '.' + fmt
+            filename += channels[0]  + '.' + basename + '.' + fmt
        print('[fastnnlo_runtime]: Saving runtime plot {}'.format(filename))
        fig.savefig(filename)

-def plot_events_per_hour(informationdict, out_path, out_name, formats):
-
-    time = informationdict['runtime']
-    unit = informationdict['runtime_unit']
-    channel = informationdict['channel']
-    events = informationdict['events']
-    basename = 'evtrate'
-
+def plot_events_per_hour(infodict, out_path, out_name, formats):  # histogram the per-job event rate and save the figure once per requested format
+
+    # get input
+    channels = infodict['channels']
+    events   = infodict['events']
+    times    = infodict['runtime']
+    unit     = infodict['runtime_unit']
+
+    # prepare input
+    unique_channels = set(channels)
+    if len(unique_channels) == 0:
+        print('fastnnlo_runtime: ERROR! Aborted, no channel info found.')
+        exit(11)
+    eph = []  # event rate of each job in events per hour
+    for i, time in enumerate(times):
        if unit == 'hours':
-        eph = events/time
+            eph.append(float(events[i])/time)
        else:
-        eph = events/(time/60)
+            eph.append(float(events[i])/(time/60))  # run time is in minutes here
+    ephchn = []  # [rate, position in _channels] for every job whose channel is listed in _channels
+    for i, val in enumerate(eph):
+        for j, chn in enumerate(_channels):
+            if channels[i] == chn:
+                ephchn.append([val, j])
+    ephchn = np.array(ephchn)
+    masks = []  # one boolean row selector per entry of _channels
+    for i, chn in enumerate(_channels):
+        masks.append(ephchn[:,1] == i)

    # get relevant values
    mean    = np.mean(eph)
    std     = np.std(eph)
    median  = np.median(eph)
    iqd     = np.subtract(*np.percentile(eph, [75, 25], interpolation='linear'))/2.
+    ephmin  = np.min(eph)
+    ephmax  = np.max(eph)
+    logbins = np.geomspace(ephmin, ephmax, 100)
+    CPUtime = np.sum(times) / (1 if unit == 'hours' else 60)

-    CPUtime = np.sum(time) / (1 if unit == 'hours' else 60)
-
-    # set figure
+    # create figure
    fig = plt.figure(figsize=(16, 12))
    ax = fig.gca()

-    # plot histogram
-    n, batches, _ = ax.hist(eph, bins=20, color='deepskyblue', edgecolor='black', label='Total CPU time: {0:0.0f} hours'.format(CPUtime))
-
+    # plot (multistack-)histogram
+    evrs = []  # one array of event rates per channel that is actually present, in _channels order
+    lastch = 'LO'
+    for chn in _channels:
+        if chn in unique_channels:
+            evrs.append(ephchn[masks[_channel_number[chn]]][:,0])
+            lastch = chn
+    if len(unique_channels) == 1:
+        n, batches, _ = ax.hist(evrs, log=True, bins=20, edgecolor='black', color=_channel_colors[_channel_number[lastch]], label='Total CPU time: {0:0.0f} hours'.format(CPUtime))
        # plot mean and median
        ax.vlines(mean, 0, max(n), colors='red', linestyles='dashed', label=r'Mean: {0:0.1e}$\pm${1:0.1e} events/hour'.format(mean, std))
-    ax.vlines(median, 0, max(n), colors='green', linestyles='dashed', label=r'Median: {0:0.2e}$\pm${1:0.2e} events/hour'.format(median, iqd))
+        ax.vlines(median, 0, max(n), colors='green', linestyles='dashdot', label=r'Median: {0:0.2e}$\pm${1:0.2e} events/hour'.format(median, iqd))
+        ax.ticklabel_format(axis='x', style='plain', useOffset=False)
+    else:  # several channels: pass exactly one colour and one label per data set in evrs
+        n, batches, _ = ax.hist(evrs, histtype='barstacked', log=True, stacked=True, bins=logbins, edgecolor='black', color=[_channel_colors[_channel_number[chn]] for chn in _channels if chn in unique_channels], label=[chn for chn in _channels if chn in unique_channels])
+        ax.set_xlim(0.9*ephmin, 1.1*ephmax)
+        ax.set_xscale('log')

    # finish and save figure
-    ax.set_title('Event rate of ' + channel + ' production', fontsize=20)
+    chnlabel = channels[0]
+    if out_name:
+        chnlabel = out_name
+    ax.set_title('Event rate of ' + chnlabel + ' production', fontsize=20)
    ax.set_xlabel('event rate [1/hour]', horizontalalignment='right', x=1.0, verticalalignment='top', y=1.0, fontsize=20)
-    ax.set_ylabel('frequency', horizontalalignment='right', x=1.0, verticalalignment='top', y=1.0, fontsize=20)
-    ax.set_yscale('log')
+    ax.set_ylabel('# jobs', horizontalalignment='right', x=1.0, verticalalignment='top', y=1.0, fontsize=20, labelpad=20)
    ax.tick_params(axis='both', which='major', labelsize=20)
-
    ax.legend(loc='best', fontsize=20)
    ax.grid()
    ax.set_axisbelow(True)

    # set saving location
+    basename = 'evtrate'
    for fmt in formats:
        filename = out_path + ('' if out_path[-1] == '/' else '/')
        if out_name:
            filename += out_name + '.' + basename + '.' + fmt
        else:
-            filename += channel  + '.' + basename + '.' + fmt
+            filename += channels[0]  + '.' + basename + '.' + fmt
        print('[fastnnlo_runtime]: Saving event rate plot {}'.format(filename))
        fig.savefig(filename)