* Replaced the data rate and total file size calculations with more performant ones

32941fc9 · Christoph Heidecker · a8ef5b14 · 32941fc9
Commit 32941fc9 authored 6 years ago by Christoph Heidecker
--- a/navimon/NaviEvaluate.py
+++ b/navimon/NaviEvaluate.py
@@ -24,9 +24,14 @@ class EvaluateMonitoring:
        self.save_plots = True

        try:
-            self.get_file_size()
+            self.get_data_rate()
        except Exception as e:
-            print("Python-Error: " + str(e) + "\ncould not get size of files -> abort")
+            print("Python-Error: " + str(e) + "\ncould not get data rate of jobs -> abort")
+            exit()
+        try:
+            self.get_total_file_size()
+        except Exception as e:
+            print("Python-Error: " + str(e) + "\ncould not get total file size requested by jobs -> abort")
            exit()
        try:
            self.get_cached_percentage_on_worker_node()
@@ -90,24 +95,20 @@ class EvaluateMonitoring:
                  + str(column))
            pass

-    def get_file_size(self):
-        for index, row in self.df.iterrows():
-            input_files = eval(str(row['input_files']))
-            network_input = float(row['NetworkInputMb'])
-            file_names_for_row = []
-            file_size_for_row = []
-            for value in input_files:
-                file_names_for_row.append(value.rsplit(":", 1)[0])
-                file_size_for_row.append(value.rsplit(":", 1)[1])
-            total_file_size_for_row = float(sum(int(i) for i in file_size_for_row)) / (1024 * 1024)
-            self.df.at[index, 'size_of_all_requested_files'] = total_file_size_for_row
-            self.df.at[index, 'data_rate'] = (
-                # use HTCondor measured network input value:
-                network_input / float(row['Runtime']) if float(row['Runtime']) > 0 and network_input > 0 else np.nan
-                # use total size of requested files:
-                # total_file_size_for_row / float(row['Runtime']) if float(row['Runtime']) > 0 else -1
-                )
-            self.df.at[index, 'total_file_size'] = total_file_size_for_row
+    def get_data_rate(self):
+        try:
+            self.df['data_rate'] = self.df.loc[self.df['Runtime'] > 0.0]['NetworkInputMb'].divide(self.df['Runtime'])
+        except Exception as e:
+            print("Python-Error: " + str(e) + "\nFailed to calculate data rate of jobs!")
+            exit(-1)
+
+    def get_total_file_size(self):
+        try:
+            self.df['total_file_size'] = self.df['input_files'].apply(lambda file_list: sum(map(
+                lambda x: float(x.rsplit(":", 1)[1]) / (1024 * 1024), eval(file_list))))
+        except Exception as e:
+            print("Python-Error: " + str(e) + "\nFailed to calculate total file size requested by jobs!")
+            exit(-1)

    def get_cached_percentage_on_worker_node(self):
        self.df["cached_percentage_on_worker_node"] = 0.0