Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
monitor-NaviX
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ETP-HTC
monitor-NaviX
Commits
d9756db9
Commit
d9756db9
authored
6 years ago
by
Christoph Heidecker
Browse files
Options
Downloads
Patches
Plain Diff
* Changed runtime output and removed other unnecessary outputs
parent
acd18acc
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
NaviXMon.py
+70
-68
70 additions, 68 deletions
NaviXMon.py
navimon/NaviEvaluate.py
+5
-4
5 additions, 4 deletions
navimon/NaviEvaluate.py
navimon/NaviPlotFunctions.py
+7
-7
7 additions, 7 deletions
navimon/NaviPlotFunctions.py
with
82 additions
and
79 deletions
NaviXMon.py
+
70
−
68
View file @
d9756db9
...
...
@@ -14,7 +14,6 @@ c_end = '\033[0m'
def
__init__
():
print
(
''
)
print
(
'
##################################################################
'
)
print
(
'
# \ / _ #
'
)
...
...
@@ -32,13 +31,13 @@ def __init__():
print
(
c_red
+
"
Python-Error:
"
+
str
(
e
)
+
"
\n
ERROR while deleting old plots -> abort
"
+
c_end
)
exit
()
start
=
time
()
monitor_dict
=
dict
()
df_file
=
pd
.
DataFrame
()
#
export_as = '.pdf'
export_as
=
'
.png
'
export_as
=
'
.pdf
'
#
export_as = '.png'
print
(
'
Loading server configuration file:
'
)
known_servers
=
read_server_list
()
if
known_servers
==
-
1
:
exit
()
...
...
@@ -76,14 +75,14 @@ def __init__():
# navix_monitor_file_list = [r'data/copy/02-tuned-SSDs-Raid0/NaviX.mon.180815']
# -> 3. test: bug in update hook leads to percentage shift (60 jobs, 5 files/job)
# navix_monitor_file_list = [r'data/copy/03-tuned-SSDs-Raid0-bugfix/NaviX.mon.Copy.180817']
# ->
2
. test: shift was corrected in data (60 jobs, 5 files/job)
# ->
3
. test: shift was corrected in data (60 jobs, 5 files/job)
# navix_monitor_file_list = [r'data/copy/03-tuned-SSDs-Raid0-bugfix/NaviX.mon.Copy.180817-0%-corr',
# r'data/copy/03-tuned-SSDs-Raid0-bugfix/NaviX.mon.Copy.180817-10%-90%-corr',
# r'data/copy/03-tuned-SSDs-Raid0-bugfix/NaviX.mon.Copy.180817-100%-corr']
# ->
3
. test: multiple runs for more statistics (2 runs, 60 jobs, 10 files/job)
# ->
4
. test: multiple runs for more statistics (2 runs, 60 jobs, 10 files/job)
navix_monitor_file_list
=
[
r
'
data/copy/04-tuned-SSDs-Raid0-bugfix/NaviX.mon.Copy.180818.newlog
'
,
r
'
data/copy/04-tuned-SSDs-Raid0-bugfix/NaviX.mon.Copy.180819.newlog
'
]
# ->
4
. test: new monitoring log maybe with bug in update hook( 60 jobs, 5 or 10 files/job)
# ->
5
. test: new monitoring log maybe with bug in update hook( 60 jobs, 5 or 10 files/job)
# navix_monitor_file_list = [r'data/copy/05-tuned-SSDs-Raid0-new-log/NaviX.mon.debug.copy.small.180823']
# navix_monitor_file_list = [r'data/copy/05-tuned-SSDs-Raid0-new-log/NaviX.mon.Copy.180824']
...
...
@@ -100,17 +99,15 @@ def __init__():
# --------
# navix_monitor_file_list = [r'NaviX.mon']
for
file
in
navix_monitor_file_list
:
print
(
'
Loading monitoring log file:
'
,
file
)
navix_monitor_file
=
open
(
file
)
get_data_from_file
(
monitor_dict
,
navix_monitor_file
)
get_data_from_file
(
monitor_dict
,
navix_monitor_file_list
)
print
(
'
Converting data format to pandas dataframes:
'
)
monitor_df
=
dict_to_pandas
(
monitor_dict
)
start
=
time
()
evaluate_monitor
=
EvaluateMonitoring
(
monitor_df
,
df_file
,
known_servers
)
print
(
"
Runtime pandas:
"
+
str
(
time
()
-
start
))
print
(
"
Runtime for getting and preparing data:
"
+
str
(
time
()
-
start
)
+
'
s
'
)
evaluate_monitor
.
print_some_mean_err
()
start
=
time
()
plot_hist_cache_rate
(
monitor_df
,
export_as
)
...
...
@@ -134,7 +131,7 @@ def __init__():
# plot_kde_file_size_vs_duplicity(df_file)
# plot_errorbar_file_percentage_vs_processing_time(monitor_df)
print
(
"
Runtime plotting and reviewing graphs:
"
+
str
(
time
()
-
start
))
print
(
"
Runtime
for
plotting and reviewing graphs:
"
+
str
(
time
()
-
start
)
+
'
s
'
)
def
read_server_list
():
...
...
@@ -142,10 +139,8 @@ def read_server_list():
# Read config file
# to resolve IP Addresses to HTCondor Server names
##########################################################
start
=
time
()
try
:
known_servers
=
yaml
.
safe_load
(
open
(
'
known_servers.yml
'
))
print
(
"'
known_servers.yml
'
found and opened
"
)
tmp
=
known_servers
.
copy
()
is_empty
=
True
'
#Test given IPs
'
...
...
@@ -172,7 +167,7 @@ def read_server_list():
known_servers
[
IP
][
'
wn_ip
'
]
=
[
new_ip
]
'
#Show given hosts to terminal and logfile
'
print
(
"
Known hosts:
"
)
print
(
'
Loading server configuration file:
\n
'
)
for
IP
,
server
in
iter
(
known_servers
.
items
()):
print
(
'
XRootD cache IP(s):
'
+
str
(
IP
))
print
(
'
=> HTCondor constraint:
'
+
str
(
server
.
get
(
'
wn_constraint
'
))
+
'
, worker node IP(s):
'
+
...
...
@@ -187,13 +182,12 @@ def read_server_list():
return
-
1
except
Exception
as
e
:
print
(
c_red
+
"
Could not read config
\"
known_servers.yml
\"
-> stopping server
\n
Python-ERROR:
"
+
str
(
e
)
+
c_end
)
print
(
str
(
e
))
return
-
1
print
(
"
Time to read from file:
"
+
str
(
time
()
-
start
))
return
known_servers
def
dict_to_pandas
(
monitor_dict
):
print
(
'
\n
Converting data...
'
)
start
=
time
()
monitor_df
=
pd
.
DataFrame
()
try
:
...
...
@@ -202,65 +196,73 @@ def dict_to_pandas(monitor_dict):
df
.
index
.
name
=
'
timestamp
'
df
=
df
.
reset_index
()
monitor_df
=
df
.
sort_values
(
df
.
columns
[
0
],
ascending
=
True
)
print
(
"
Time to export to pandas data frame and sort by start time:
"
+
str
(
time
()
-
start
))
except
Exception
as
e
:
print
(
c_red
+
'
Failed to convert dict to pandas!
\n
Python-ERROR:
'
+
str
(
e
)
+
c_end
)
exit
()
print
(
'
... (Runtime:
'
+
str
(
time
()
-
start
)
+
'
s)
\n
'
)
return
monitor_df
def
get_data_from_file
(
monitor_dict
,
navix_monitor_file
):
def
get_data_from_file
(
monitor_dict
,
navix_monitor_file
_list
):
################################################################################################################
#
# read data from file
#
################################################################################################################
time
()
monitor_dict_tmp
=
dict
()
current_instance
=
'
start
'
line
=
navix_monitor_file
.
readline
()
while
line
:
if
line
[:
1
]
is
'
1
'
:
current_instance
=
float
(
line
[:
-
3
])
if
current_instance
not
in
monitor_dict_tmp
:
monitor_dict_tmp
[
current_instance
]
=
{}
elif
current_instance
==
'
start
'
:
# do nothing
pass
elif
line
[:
5
]
==
'
Rank:
'
:
monitor_dict_tmp
[
current_instance
][
'
rank
'
]
=
eval
(
line
[
6
:
-
1
])
elif
line
[:
6
]
==
'
JobID:
'
:
monitor_dict_tmp
[
current_instance
][
'
JobID
'
]
=
line
[
7
:
-
1
]
elif
line
[:
8
]
==
'
Done at:
'
:
monitor_dict_tmp
[
current_instance
][
'
Finish
'
]
=
float
(
line
[
9
:
-
1
])
elif
line
[:
8
]
==
'
Runtime:
'
:
monitor_dict_tmp
[
current_instance
][
'
Runtime
'
]
=
float
(
line
[
9
:
-
1
])
elif
line
[:
12
]
==
'
RequestCpus:
'
:
monitor_dict_tmp
[
current_instance
][
'
RequestCpus
'
]
=
float
(
line
[
13
:
-
1
])
elif
line
[:
13
]
==
'
RemoteSysCpu:
'
:
monitor_dict_tmp
[
current_instance
][
'
RemoteSysCpu
'
]
=
float
(
line
[
14
:
-
1
])
elif
line
[:
14
]
==
'
RemoteUserCpu:
'
:
monitor_dict_tmp
[
current_instance
][
'
RemoteUserCpu
'
]
=
float
(
line
[
15
:
-
1
])
elif
line
[:
12
]
==
'
Requirement:
'
:
monitor_dict_tmp
[
current_instance
][
'
Requirement
'
]
=
line
[
13
:
-
1
]
elif
line
[:
15
]
==
'
NetworkInputMb:
'
:
monitor_dict_tmp
[
current_instance
][
'
NetworkInputMb
'
]
=
float
(
line
[
16
:
-
1
])
elif
line
[:
15
]
==
'
LastRemoteHost:
'
:
monitor_dict_tmp
[
current_instance
][
'
LastRemoteHost
'
]
=
line
[
16
:
-
1
]
elif
line
[:
13
]
==
'
LastRemoteIP:
'
:
monitor_dict_tmp
[
current_instance
][
'
LastRemoteIP
'
]
=
test_and_transform_ip
(
line
[
14
:
-
1
])
elif
line
[:
16
]
==
'
Requested files:
'
:
monitor_dict_tmp
[
current_instance
][
'
requested_files
'
]
=
line
[
17
:
-
1
]
elif
line
[:
21
]
==
'
File found on server:
'
:
monitor_dict_tmp
[
current_instance
][
'
files_found_on_server
'
]
=
eval
(
line
[
22
:
-
1
])
elif
line
[:
12
]
==
'
Input_Files:
'
:
monitor_dict_tmp
[
current_instance
][
'
input_files
'
]
=
line
[
13
:
-
1
]
elif
line
and
not
line
==
'
\n
'
:
print
(
line
)
print
(
"
Unexpected input!
"
)
line
=
navix_monitor_file
.
readline
()
monitor_dict
.
update
(
monitor_dict_tmp
)
print
(
'
\n
Loading monitoring log files:
\n
'
)
try
:
for
file
in
navix_monitor_file_list
:
navix_monitor_file
=
open
(
file
)
monitor_dict_tmp
=
dict
()
current_instance
=
'
start
'
line
=
navix_monitor_file
.
readline
()
while
line
:
if
line
[:
1
]
is
'
1
'
:
current_instance
=
float
(
line
[:
-
3
])
if
current_instance
not
in
monitor_dict_tmp
:
monitor_dict_tmp
[
current_instance
]
=
{}
elif
current_instance
==
'
start
'
:
# do nothing
pass
elif
line
[:
5
]
==
'
Rank:
'
:
monitor_dict_tmp
[
current_instance
][
'
rank
'
]
=
eval
(
line
[
6
:
-
1
])
elif
line
[:
6
]
==
'
JobID:
'
:
monitor_dict_tmp
[
current_instance
][
'
JobID
'
]
=
line
[
7
:
-
1
]
elif
line
[:
8
]
==
'
Done at:
'
:
monitor_dict_tmp
[
current_instance
][
'
Finish
'
]
=
float
(
line
[
9
:
-
1
])
elif
line
[:
8
]
==
'
Runtime:
'
:
monitor_dict_tmp
[
current_instance
][
'
Runtime
'
]
=
float
(
line
[
9
:
-
1
])
elif
line
[:
12
]
==
'
RequestCpus:
'
:
monitor_dict_tmp
[
current_instance
][
'
RequestCpus
'
]
=
float
(
line
[
13
:
-
1
])
elif
line
[:
13
]
==
'
RemoteSysCpu:
'
:
monitor_dict_tmp
[
current_instance
][
'
RemoteSysCpu
'
]
=
float
(
line
[
14
:
-
1
])
elif
line
[:
14
]
==
'
RemoteUserCpu:
'
:
monitor_dict_tmp
[
current_instance
][
'
RemoteUserCpu
'
]
=
float
(
line
[
15
:
-
1
])
elif
line
[:
12
]
==
'
Requirement:
'
:
monitor_dict_tmp
[
current_instance
][
'
Requirement
'
]
=
line
[
13
:
-
1
]
elif
line
[:
15
]
==
'
NetworkInputMb:
'
:
monitor_dict_tmp
[
current_instance
][
'
NetworkInputMb
'
]
=
float
(
line
[
16
:
-
1
])
elif
line
[:
15
]
==
'
LastRemoteHost:
'
:
monitor_dict_tmp
[
current_instance
][
'
LastRemoteHost
'
]
=
line
[
16
:
-
1
]
elif
line
[:
13
]
==
'
LastRemoteIP:
'
:
monitor_dict_tmp
[
current_instance
][
'
LastRemoteIP
'
]
=
test_and_transform_ip
(
line
[
14
:
-
1
])
elif
line
[:
16
]
==
'
Requested files:
'
:
monitor_dict_tmp
[
current_instance
][
'
requested_files
'
]
=
line
[
17
:
-
1
]
elif
line
[:
21
]
==
'
File found on server:
'
:
monitor_dict_tmp
[
current_instance
][
'
files_found_on_server
'
]
=
eval
(
line
[
22
:
-
1
])
elif
line
[:
12
]
==
'
Input_Files:
'
:
monitor_dict_tmp
[
current_instance
][
'
input_files
'
]
=
line
[
13
:
-
1
]
elif
line
and
not
line
==
'
\n
'
:
print
(
line
)
print
(
"
Unexpected input!
"
)
line
=
navix_monitor_file
.
readline
()
monitor_dict
.
update
(
monitor_dict_tmp
)
print
(
'
=> Successfully loaded
'
,
file
)
except
Exception
as
e
:
print
(
c_red
+
"
Could not read data from
"
+
str
(
file
)
+
"
-> stopping server
\n
Python-ERROR:
"
+
str
(
e
)
+
c_end
)
exit
()
return
...
...
This diff is collapsed.
Click to expand it.
navimon/NaviEvaluate.py
+
5
−
4
View file @
d9756db9
...
...
@@ -13,13 +13,14 @@ class EvaluateMonitoring:
save_plot
=
True
def
__init__
(
self
,
monitor_df
,
df_file
,
known_servers
):
start
=
time
()
print
(
'
\n
Evaluating data...
'
)
self
.
df
=
monitor_df
self
.
df_file
=
df_file
self
.
known_servers
=
known_servers
start
=
time
()
self
.
show_plots
=
False
self
.
save_plots
=
True
...
...
@@ -70,12 +71,12 @@ class EvaluateMonitoring:
print
(
c_red
+
"
Python-Error:
"
+
str
(
e
)
+
"
\n
ERROR while getting file duplicity -> abort
"
+
c_end
)
exit
()
print
(
"
Over all data frame is ready for plots after:
"
+
str
(
time
()
-
start
)
+
"
s
"
)
self
.
df
.
to_csv
(
"
debug.csv
"
,
sep
=
'
;
'
,
encoding
=
'
utf-8
'
)
self
.
df_cached_node
=
self
.
df
[
self
.
df
.
cached_percentage_on_worker_node
!=
0
]
self
.
df_not_cached
=
self
.
df
[
self
.
df
.
cached_percentage_on_worker_node
==
0
]
print
(
'
... (Runtime:
'
+
str
(
time
()
-
start
)
+
'
s)
\n
'
)
def
print_some_mean_err
(
self
):
print
(
"
\n\n
###################################################################################################
"
+
"
\n
Some interesting values:
\n
------------------
\n
"
)
...
...
This diff is collapsed.
Click to expand it.
navimon/NaviPlotFunctions.py
+
7
−
7
View file @
d9756db9
...
...
@@ -54,7 +54,7 @@ def plot_1d_hist(data_frame_list, weights_list, label_list, title='', x_label=''
box
=
ax1
.
get_position
()
ax1
.
set_position
([
box
.
x0
,
box
.
y0
,
box
.
width
*
0.8
,
box
.
height
])
ax1
.
legend
(
loc
=
legend_loc
,
framealpha
=
1.0
,
numpoints
=
1
,
frameon
=
Tru
e
,
bbox_to_anchor
=
(
1
,
0.5
))
ax1
.
legend
(
loc
=
legend_loc
,
framealpha
=
1.0
,
numpoints
=
1
,
frameon
=
Fals
e
,
bbox_to_anchor
=
(
1
.
,
0.5
))
fig
.
suptitle
(
title
,
fontsize
=
12
)
fig
.
set_size_inches
(
8
,
4.5
)
...
...
@@ -102,11 +102,11 @@ def plot_2d_line(x_array_list, y_array_list, label_list, title='', x_label='', y
if
y_log
:
plt
.
yscale
(
'
log
'
)
if
not
label_list
[
0
]
==
''
:
# legend = ax1.legend(loc='best', numpoints=1, frameon=
Tru
e, framealpha=1.0)
# legend = ax1.legend(loc='best', numpoints=1, frameon=
Fals
e, framealpha=1.0)
# legend.get_frame().set_facecolor('white')
box
=
ax1
.
get_position
()
ax1
.
set_position
([
box
.
x0
,
box
.
y0
,
box
.
width
*
0.8
,
box
.
height
])
ax1
.
legend
(
loc
=
'
center left
'
,
framealpha
=
1.0
,
numpoints
=
1
,
frameon
=
Tru
e
,
bbox_to_anchor
=
(
1
,
0.5
))
ax1
.
legend
(
loc
=
'
center left
'
,
framealpha
=
1.0
,
numpoints
=
1
,
frameon
=
Fals
e
,
bbox_to_anchor
=
(
1
,
0.5
))
fig
.
suptitle
(
title
,
fontsize
=
12
)
fig
.
set_size_inches
(
8
,
4.5
)
...
...
@@ -141,7 +141,7 @@ def plot_2d_error(x_array_list, y_array_list, x_std_array_list, y_std_array_list
if
not
label_list
[
0
]
==
''
:
box
=
ax1
.
get_position
()
ax1
.
set_position
([
box
.
x0
,
box
.
y0
,
box
.
width
*
0.8
,
box
.
height
])
legend
=
ax1
.
legend
(
loc
=
'
center left
'
,
framealpha
=
1.0
,
numpoints
=
1
,
frameon
=
Tru
e
,
bbox_to_anchor
=
(
1
,
0.5
))
legend
=
ax1
.
legend
(
loc
=
'
center left
'
,
framealpha
=
1.0
,
numpoints
=
1
,
frameon
=
Fals
e
,
bbox_to_anchor
=
(
1
,
0.5
))
legend
.
get_frame
().
set_facecolor
(
'
white
'
)
fig
.
suptitle
(
title
,
fontsize
=
12
)
...
...
@@ -180,7 +180,7 @@ def plot_2d_scatter_old(data_frame_list, label_list, x='', y='', title='', x_lab
plt
.
xscale
(
'
log
'
)
if
y_log
:
plt
.
yscale
(
'
log
'
)
legend
=
ax1
.
legend
(
loc
=
'
best
'
,
numpoints
=
1
,
frameon
=
Tru
e
,
framealpha
=
1.0
)
legend
=
ax1
.
legend
(
loc
=
'
best
'
,
numpoints
=
1
,
frameon
=
Fals
e
,
framealpha
=
1.0
)
legend
.
get_frame
().
set_facecolor
(
'
white
'
)
fig
.
suptitle
(
title
,
fontsize
=
12
)
...
...
@@ -375,7 +375,7 @@ def plot_2d_scatter(data_frame_list, label_list, x='', y='', title='', x_label='
ax1
.
xscale
(
'
log
'
)
if
y_log
:
ax1
.
yscale
(
'
log
'
)
legend
=
ax1
.
legend
(
loc
=
'
best
'
,
numpoints
=
1
,
frameon
=
Tru
e
,
framealpha
=
1.0
)
legend
=
ax1
.
legend
(
loc
=
'
best
'
,
numpoints
=
1
,
frameon
=
Fals
e
,
framealpha
=
1.0
)
legend
.
get_frame
().
set_facecolor
(
'
white
'
)
# play with this factor to ensure a better design, as well as the distance_between_ax1_ax3
...
...
@@ -387,7 +387,7 @@ def plot_2d_scatter(data_frame_list, label_list, x='', y='', title='', x_label='
ax2
.
set_position
([
box2
.
x0
,
box2
.
y0
,
box2
.
width
*
shrink_factor
,
box2
.
height
])
box3
=
ax3
.
get_position
()
ax3
.
set_position
([
box3
.
x0
-
box1
.
width
*
(
1
-
shrink_factor
),
box3
.
y0
,
box3
.
width
*
shrink_factor
,
box3
.
height
])
legend
=
ax1
.
legend
(
loc
=
'
center left
'
,
framealpha
=
1.0
,
numpoints
=
1
,
frameon
=
Tru
e
,
legend
=
ax1
.
legend
(
loc
=
'
center left
'
,
framealpha
=
1.0
,
numpoints
=
1
,
frameon
=
Fals
e
,
bbox_to_anchor
=
((
distance_between_ax1_ax3
+
box1
.
width
)
*
shrink_factor
,
0.5
))
legend
.
get_frame
().
set_facecolor
(
'
white
'
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment