Skip to content

Commit 371718f

Browse files
committed
Some revisions
Signed-off-by: Ralph Castain <[email protected]>
1 parent c169074 commit 371718f

File tree

2 files changed

+117
-68
lines changed

2 files changed

+117
-68
lines changed

Chap_API_Job_Mgmt.tex

Lines changed: 116 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -701,13 +701,13 @@ \subsection{\code{PMIx_Process_monitor}}
701701
%%%%
702702
\descr
703703

704-
Request that application processes be monitored via several possible methods.
705-
For example, that the server monitor this process for periodic heartbeats as an indication that the process has not become ``wedged''.
704+
Request that application processes and/or files be monitored via several possible methods.
705+
For example, the caller may request that the server monitor a given process for periodic heartbeats as an indication that the process has not become ``wedged''.
706706
When a monitor detects the specified alarm condition, it will generate an event notification using the provided error code and passing along any available relevant information.
707707
It is up to the caller to register a corresponding event handler.
708708

709709
The \refarg{monitor} argument is an attribute indicating the type of monitor being requested.
710-
For example, \refattr{PMIX_MONITOR_FILE} to indicate that the requestor is asking that a file be monitored.
710+
For example, \refattr{PMIX_MONITOR_FILE_CHANGES} indicates that the requestor is asking that a file be monitored.
711711

712712
The \refarg{error} argument is the status code to be used when generating an event notification alerting that the monitor has been triggered.
713713
The range of the notification defaults to \refconst{PMIX_RANGE_NAMESPACE}.
@@ -770,13 +770,13 @@ \subsection{\code{PMIx_Process_monitor_nb}}
770770
\item \refattr{PMIX_MONITOR_HEARTBEAT_DROPS}
771771
\end{itemize}
772772
\item \refattr{PMIX_SEND_HEARTBEAT}
773-
\item \refattr{PMIX_MONITOR_FILE} The associated \refarg{directives} array may include any of the following:
773+
\item \refattr{PMIX_MONITOR_FILE_CHANGES} The associated \refarg{directives} array may include any of the following:
774774
\begin{itemize}
775-
\item \refattr{PMIX_MONITOR_FILE_SIZE}
776-
\item \refattr{PMIX_MONITOR_FILE_ACCESS}
777-
\item \refattr{PMIX_MONITOR_FILE_MODIFY}
778775
\item \refattr{PMIX_MONITOR_FILE_CHECK_TIME}
779776
\item \refattr{PMIX_MONITOR_FILE_DROPS}
777+
\item \refattr{PMIX_MONITOR_TARGET_FILES}
778+
\item \refattr{PMIX_MONITOR_TARGET_NODES}. Monitor the given files on the specified nodes, where present.
779+
\item \refattr{PMIX_MONITOR_TARGET_NODEIDS}. Monitor the given files on the specified nodes, where present.
780780
\end{itemize}
781781
\item \refattr{PMIX_MONITOR_PROC_RESOURCE_USAGE} The associated \refarg{directives} array may include any of the following:
782782
\begin{itemize}
@@ -793,6 +793,22 @@ \subsection{\code{PMIx_Process_monitor_nb}}
793793
\item \refattr{PMIX_MONITOR_TARGET_NODEIDS}
794794
\item \refattr{PMIX_MONITOR_TARGET_PROCS}. Monitor the nodes where the specified processes are located.
795795
\end{itemize}
796+
\item \refattr{PMIX_MONITOR_DISK_RESOURCE_USAGE} The associated \refarg{directives} array may include any of the following:
797+
\begin{itemize}
798+
\item \refattr{PMIX_MONITOR_RESOURCE_RATE}
799+
\item \refattr{PMIX_MONITOR_TARGET_DISKS}
800+
\item \refattr{PMIX_MONITOR_TARGET_NODES}
801+
\item \refattr{PMIX_MONITOR_TARGET_NODEIDS}
802+
\item \refattr{PMIX_MONITOR_TARGET_PROCS}. Monitor the nodes where the specified processes are located.
803+
\end{itemize}
804+
\item \refattr{PMIX_MONITOR_NETWORK_RESOURCE_USAGE} The associated \refarg{directives} array may include any of the following:
805+
\begin{itemize}
806+
\item \refattr{PMIX_MONITOR_RESOURCE_RATE}
807+
\item \refattr{PMIX_MONITOR_TARGET_NETS}
808+
\item \refattr{PMIX_MONITOR_TARGET_NODES}
809+
\item \refattr{PMIX_MONITOR_TARGET_NODEIDS}
810+
\item \refattr{PMIX_MONITOR_TARGET_PROCS}. Monitor the nodes where the specified processes are located.
811+
\end{itemize}
796812
\end{itemize}
797813

798814
In addition to action-specific directives, the \refarg{directives} array may include:
@@ -860,17 +876,18 @@ \subsection{Monitoring Datatypes}
860876
\subsubsection{Node PID Structure}
861877
\declarestructProvisional{pmix_node_pid_t}
862878

863-
The \refstruct{pmix_node_pid_t} structure contains the hostname and pid of a process executing on that host.
879+
The \refstruct{pmix_node_pid_t} structure contains the hostname (or nodeID) and pid of a process executing on that host.
864880
Since a pid is uniquely associated with a given host, this creates a conjugate pair.
865881

866882
\copySignature{pmix_node_pid_t}{6.0}{
867883
typedef struct pmix_node_pid \{ \\
868884
\hspace*{4\sigspace}char *hostname; \\
885+
\hspace*{4\sigspace}uint32_t nodeid; \\
869886
\hspace*{4\sigspace}pid_t pid; \\
870887
\} pmix_node_pid_t;
871888
}
872889

873-
The \refarg{pid} field contains the \code{pid_t} of the process, while the \refarg{hostname} is the name of the node where the process is executing.
890+
The \refarg{pid} field contains the \code{pid_t} of the process, while the \refarg{hostname} and/or \refarg{nodeid} identify the node where the process is executing.
874891

875892
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
876893
\subsubsection{Node PID support functions}
@@ -973,20 +990,34 @@ \subsection{Monitoring attributes}
973990
Number of heartbeats that can be missed before generating the event.
974991
}
975992
%
976-
\declareAttribute{PMIX_MONITOR_FILE}{"pmix.monitor.fmon"}{char*}{
977-
Register to monitor file for signs of life - the value contains the filename to be monitored.
993+
\declareAttributeProvisional{PMIX_MONITOR_FILE_CHANGES}{"pmix.monitor.fchg"}{pmix_data_array_t*}{
994+
Monitor the file characteristics specified in the provided \refstruct{pmix_data_array_t} of \refstruct{pmix_info_t}. If the provided array
995+
is \code{NULL}, then all optional file characteristics shall be monitored. Target filenames \emph{must} be provided in the associated
996+
\refarg{directives} array. If no target nodes are specified, then the target files will be monitored on all nodes in the session where they are present. Note that the values in the provided structures will be
997+
ignored (i.e., only the attribute keys are relevant) except where noted. Optional
998+
attributes include:
999+
1000+
\begin{itemize}
1001+
\item \refattr{PMIX_MONITOR_FILE_SIZE}
1002+
\item \refattr{PMIX_MONITOR_FILE_ACCESS}
1003+
\item \refattr{PMIX_MONITOR_FILE_MODIFY}
1004+
\end{itemize}
1005+
}
1006+
%
1007+
\declareAttributeProvisional{PMIX_MONITOR_TARGET_FILES}{"pmix.monitor.fmon"}{pmix_data_array_t*}{
1008+
Array of string filenames to be monitored for signs of life.
9781009
}
9791010
%
9801011
\declareAttribute{PMIX_MONITOR_FILE_SIZE}{"pmix.monitor.fsize"}{bool}{
981-
Monitor given file is growing in size to determine if the application is running.
1012+
Monitor whether the file is growing in size to determine if the application is running.
9821013
}
9831014
%
9841015
\declareAttribute{PMIX_MONITOR_FILE_ACCESS}{"pmix.monitor.faccess"}{bool}{
985-
Monitor time since last access to determine if the application is running.
1016+
Monitor whether the time of last access has changed to determine if the application is running.
9861017
}
9871018
%
9881019
\declareAttribute{PMIX_MONITOR_FILE_MODIFY}{"pmix.monitor.fmod"}{bool}{
989-
Monitor time since last modified to determine if the application is running.
1020+
Monitor whether the time of last modification has changed to determine if the application is running.
9901021
}
9911022
%
9921023
\declareAttribute{PMIX_MONITOR_FILE_CHECK_TIME}{"pmix.monitor.ftime"}{uint32_t}{
@@ -1013,17 +1044,28 @@ \subsection{Monitoring attributes}
10131044
}
10141045
%
10151046
\declareAttributeProvisional{PMIX_MONITOR_TARGET_NODES}{"pmix.monitor.tgtnode"}{pmix_data_array_t*}{
1016-
Array of host names to be monitored
1047+
Array of string host names to be monitored
10171048
}
10181049
%
10191050
\declareAttributeProvisional{PMIX_MONITOR_TARGET_NODEIDS}{"pmix.monitor.tgtndids"}{pmix_data_array_t*}{
10201051
Array of node IDs (\code{uint32_t}) to be monitored
10211052
}
10221053
%
1023-
\declareAttributeProvisional{PMIX_MONITOR_RESOURCE_RATE}{"pmix.monitor.resrate"}{uint64_t}{
1054+
\declareAttributeProvisional{PMIX_MONITOR_TARGET_DISKS}{"pmix.monitor.tgtdks"}{pmix_data_array_t*}{
1055+
Array of strings representing \refattr{PMIX_DISK_ID}s to be monitored
1056+
}
1057+
%
1058+
\declareAttributeProvisional{PMIX_MONITOR_TARGET_NETS}{"pmix.monitor.tgtnets"}{pmix_data_array_t*}{
1059+
Array of strings representing \refattr{PMIX_NETWORK_ID}s to be monitored
1060+
}
1061+
%
1062+
\declareAttributeProvisional{PMIX_MONITOR_RESOURCE_RATE}{"pmix.monitor.resrate"}{uint32_t}{
10241063
Monitor resource usage every N seconds, where N is the value provided by the attribute.
10251064
}
10261065
%
1066+
\declareAttributeProvisional{PMIX_MONITOR_LOCAL_ONLY}{"pmix.monitor.local"}{bool}{
1067+
Restrict data collection to the local host, regardless of any provided targets}
1068+
%
10271069
\declareAttributeProvisional{PMIX_MONITOR_PROC_RESOURCE_USAGE}{"pmix.monitor.presuse"}{pmix_data_array_t*}{
10281070
Monitor the resources specified in the provided \refstruct{pmix_data_array_t} of \refstruct{pmix_info_t}. If the provided array
10291071
is \code{NULL}, then all resources shall be monitored. If no targets are provided in the associated
@@ -1074,51 +1116,54 @@ \subsection{Monitoring attributes}
10741116
\item \refattr{PMIX_NODE_MEM_SWAP_TOTAL}
10751117
\item \refattr{PMIX_NODE_MEM_SWAP_FREE}
10761118
\item \refattr{PMIX_NODE_MEM_MAPPED}
1077-
\item \refattr{PMIX_DISK_RESOURCE_USAGE}. If the \refstruct{pmix_data_array_t} is empty, then
1078-
all disk resource usage values shall be returned for all disks attached to the node.
1079-
Optionally, the array of \refstruct{pmix_info_t} can specify the disks to be monitored (using the \refattr{PMIX_DISK_ID} attribute), and/or the particular attributes to be reported. Note that the values in the provided structures will be
1080-
ignored (i.e., only the attribute keys are relevant) except where noted, and that the
1081-
\refattr{PMIX_DISK_SAMPLE_TIME} will always be included in the returned data (there is no
1082-
need to include it in the request). Optional
1083-
attributes include:
1084-
\begin{itemize}
1085-
\item \refattr{PMIX_DISK_ID}. Optionally specify the disk to be monitored. If omitted, then all disks
1086-
attached to the node will be monitored.
1087-
\item \refattr{PMIX_DISK_READ_COMPLETED}
1088-
\item \refattr{PMIX_DISK_READ_MERGED}
1089-
\item \refattr{PMIX_DISK_READ_SECTORS}
1090-
\item \refattr{PMIX_DISK_READ_MILLISEC}
1091-
\item \refattr{PMIX_DISK_WRITE_COMPLETED}
1092-
\item \refattr{PMIX_DISK_WRITE_MERGED}
1093-
\item \refattr{PMIX_DISK_WRITE_SECTORS}
1094-
\item \refattr{PMIX_DISK_WRITE_MILLISEC}
1095-
\item \refattr{PMIX_DISK_IO_IN_PROGRESS}
1096-
\item \refattr{PMIX_DISK_IO_MILLISEC}
1097-
\item \refattr{PMIX_DISK_IO_WEIGHTED}
1098-
\item \refattr{PMIX_DISK_SAMPLE_TIME}
1099-
\end{itemize}
1100-
\item \refattr{PMIX_NETWORK_RESOURCE_USAGE}. If the \refstruct{pmix_data_array_t} is empty, then
1101-
all network resource usage values shall be returned for all interfaces on the node.
1102-
Optionally, the array of \refstruct{pmix_info_t} can specify the networks to be monitored (using the \refattr{PMIX_NETWORK_ID} attribute), and/or the particular attributes to be reported. Note that the values in the provided structures will be
1103-
ignored (i.e., only the attribute keys are relevant) except where noted, and that the
1104-
\refattr{PMIX_NET_SAMPLE_TIME} will always be included in the returned data (there is no
1105-
need to include it in the request). Optional
1106-
attributes include:
1107-
\begin{itemize}
1108-
\item \refattr{PMIX_NETWORK_ID}. Optionally specify the interface to be monitored. If omitted, then all
1109-
interfaces on the node will be monitored.
1110-
\item \refattr{PMIX_NET_RECVD_BYTES}
1111-
\item \refattr{PMIX_NET_RECVD_PCKTS}
1112-
\item \refattr{PMIX_NET_RECVD_ERRS}
1113-
\item \refattr{PMIX_NET_SENT_BYTES}
1114-
\item \refattr{PMIX_NET_SENT_PCKTS}
1115-
\item \refattr{PMIX_NET_SENT_ERRS}
1116-
\item \refattr{PMIX_NET_SAMPLE_TIME}.
1117-
\end{itemize}
11181119
\item \refattr{PMIX_NODE_SAMPLE_TIME}.
11191120
\end{itemize}
11201121
}
11211122

1123+
\declareAttributeProvisional{PMIX_MONITOR_DISK_RESOURCE_USAGE}{"pmix.monitor.dkresuse"}{pmix_data_array_t*}{
1124+
Monitor the resources specified in the provided \refstruct{pmix_data_array_t} of \refstruct{pmix_info_t}. If the provided array
1125+
is \code{NULL}, then all disk resources shall be monitored. If no \refattr{PMIX_DISK_ID} targets are provided in the associated
1126+
\refarg{directives} array, then
1127+
all disks on the local node (or on the specified target nodes, if given) will be monitored. Note that the values in the provided structures will be
1128+
ignored (i.e., only the attribute keys are relevant) except where noted, and that the
1129+
\refattr{PMIX_DISK_SAMPLE_TIME} will always be included in the returned data (there is no
1130+
need to include it in the request). Optional attributes include:
1131+
1132+
\begin{itemize}
1133+
\item \refattr{PMIX_DISK_READ_COMPLETED}
1134+
\item \refattr{PMIX_DISK_READ_MERGED}
1135+
\item \refattr{PMIX_DISK_READ_SECTORS}
1136+
\item \refattr{PMIX_DISK_READ_MILLISEC}
1137+
\item \refattr{PMIX_DISK_WRITE_COMPLETED}
1138+
\item \refattr{PMIX_DISK_WRITE_MERGED}
1139+
\item \refattr{PMIX_DISK_WRITE_SECTORS}
1140+
\item \refattr{PMIX_DISK_WRITE_MILLISEC}
1141+
\item \refattr{PMIX_DISK_IO_IN_PROGRESS}
1142+
\item \refattr{PMIX_DISK_IO_MILLISEC}
1143+
\item \refattr{PMIX_DISK_IO_WEIGHTED}
1144+
\item \refattr{PMIX_DISK_SAMPLE_TIME}
1145+
\end{itemize}
}
1146+
1147+
\declareAttributeProvisional{PMIX_MONITOR_NETWORK_RESOURCE_USAGE}{"pmix.monitor.netresuse"}{pmix_data_array_t*}{
1148+
Monitor the resources specified in the provided \refstruct{pmix_data_array_t} of \refstruct{pmix_info_t}. If the provided array
1149+
is \code{NULL}, then all network resources shall be monitored. If no \refattr{PMIX_NETWORK_ID} targets are provided in the associated
1150+
\refarg{directives} array, then
1151+
all network interfaces on the local node (or on the specified target nodes, if given) will be monitored. Note that the values in the provided structures will be
1152+
ignored (i.e., only the attribute keys are relevant) except where noted, and that the
1153+
\refattr{PMIX_NET_SAMPLE_TIME} will always be included in the returned data (there is no
1154+
need to include it in the request). Optional attributes include:
1155+
1156+
\begin{itemize}
1157+
\item \refattr{PMIX_NET_RECVD_BYTES}
1158+
\item \refattr{PMIX_NET_RECVD_PCKTS}
1159+
\item \refattr{PMIX_NET_RECVD_ERRS}
1160+
\item \refattr{PMIX_NET_SENT_BYTES}
1161+
\item \refattr{PMIX_NET_SENT_PCKTS}
1162+
\item \refattr{PMIX_NET_SENT_ERRS}
1163+
\item \refattr{PMIX_NET_SAMPLE_TIME}.
1164+
\end{itemize}
}
1165+
1166+
11221167
%%%%%%%%%%%
11231168
\versionMarkerProvisional{6.0}
11241169
\subsection{Resource usage attributes}
@@ -1136,7 +1181,8 @@ \subsubsection{Process resource usage}
11361181
the first element containing the ID of the process (marked by either the \refattr{PMIX_PROCID}
11371182
or \refattr{PMIX_PROC_PID} key)
11381183
whose usage is reported in the array. The list of included information may vary across
1139-
implementations and \acp{OS}, depending upon availability and access restrictions. Except for
1184+
implementations and \acp{OS}, depending upon availability and access restrictions, and the
1185+
provided list of requested values. Except for
11401186
the process ID as the first element, ordering of information in the array is arbitrary.
11411187
}
11421188

@@ -1179,7 +1225,7 @@ \subsubsection{Process resource usage}
11791225
\item \declareAttributeProvisional{PMIX_PROC_CPU}{"pmix.proc.cpu"}{uint16_t}{
11801226
Processor that process last executed on
11811227
}
1182-
\item \declareAttributeProvisional{PMIX_PROC_SAMPLE_TIME}{"pmix.proc.samptime"}{struct timeval}{
1228+
\item \declareAttributeProvisional{PMIX_PROC_SAMPLE_TIME}{"pmix.proc.samptime"}{time_t}{
11831229
Time when sample was taken
11841230
}
11851231
\end{itemize}
@@ -1195,7 +1241,8 @@ \subsubsection{Disk resource usage}
11951241
An array of \refstruct{pmix_info_t} describing the resource usage of the specified disk, with
11961242
the first element containing the string name of the disk (marked by the \refattr{PMIX_DISK_ID} key)
11971243
whose usage is reported in the array. The list of included information may vary across
1198-
implementations and \acp{OS}, depending upon availability and access restrictions. Except for
1244+
implementations and \acp{OS}, depending upon availability and access restrictions, and the
1245+
provided list of requested values. Except for
11991246
the disk ID as the first element, ordering of information in the array is arbitrary.
12001247
}
12011248

@@ -1236,7 +1283,7 @@ \subsubsection{Disk resource usage}
12361283
Number of IOs in progress times the number of milliseconds spent doing IO since
12371284
last update of the field - indicator of backlog that may be accumulating
12381285
}
1239-
\item \declareAttributeProvisional{PMIX_DISK_SAMPLE_TIME}{"pmix.disk.samptime"}{struct timeval}{
1286+
\item \declareAttributeProvisional{PMIX_DISK_SAMPLE_TIME}{"pmix.disk.samptime"}{time_t}{
12401287
Time when sample was taken
12411288
}
12421289
\end{itemize}
@@ -1251,7 +1298,8 @@ \subsubsection{Network resource usage}
12511298
An array of \refstruct{pmix_info_t} describing the resource usage of the specified network, with
12521299
the first element containing the string name of the interface (marked by the \refattr{PMIX_NETWORK_ID} key)
12531300
whose usage is reported in the array. The list of included information may vary across
1254-
implementations and \acp{OS}, depending upon availability and access restrictions. Except for
1301+
implementations and \acp{OS}, depending upon availability and access restrictions, and the
1302+
provided list of requested values. Except for
12551303
the network ID as the first element, ordering of information in the array is arbitrary.
12561304
}
12571305

@@ -1276,7 +1324,7 @@ \subsubsection{Network resource usage}
12761324
\item \declareAttributeProvisional{PMIX_NET_SENT_ERRS}{"pmix.net.snterr"}{uint64_t}{
12771325
Number of send errors
12781326
}
1279-
\item \declareAttributeProvisional{PMIX_NET_SAMPLE_TIME}{"pmix.net.samptime"}{struct timeval}{
1327+
\item \declareAttributeProvisional{PMIX_NET_SAMPLE_TIME}{"pmix.net.samptime"}{time_t}{
12801328
Time when sample was taken
12811329
}
12821330
\end{itemize}
@@ -1289,7 +1337,8 @@ \subsubsection{Node resource usage}
12891337
with the first element containing
12901338
the ID of the node (marked by the \refattr{PMIX_HOSTNAME} or \refattr{PMIX_NODEID} key) whose usage
12911339
is reported in the array. The list of included information may vary across
1292-
implementations and \acp{OS}, depending upon availability and access restrictions. Except for
1340+
implementations and \acp{OS}, depending upon availability and access restrictions, and the
1341+
provided list of requested values. Except for
12931342
the node ID as the first element, ordering of information in the array is arbitrary.
12941343
}
12951344

@@ -1331,9 +1380,9 @@ \subsubsection{Node resource usage}
13311380
\item \declareAttributeProvisional{PMIX_NODE_MEM_MAPPED}{"pmix.node.mmap"}{float}{
13321381
files which have been mmapped, such as libraries. Note that some kernel configurations might consider all pages part of a larger allocation (e.g., THP) as ``mapped'', as soon as a single page is mapped. In MBytes
13331382
}
1334-
\item \refattr{PMIX_DISK_RESOURCE_USAGE} One for each disk attached to the node.
1335-
\item \refattr{PMIX_NETWORK_RESOURCE_USAGE} One for each network interface on the node.
1336-
\item \declareAttributeProvisional{PMIX_NODE_SAMPLE_TIME}{"pmix.node.samptime"}{struct timeval}{
1383+
\item \refattr{PMIX_DISK_RESOURCE_USAGE} One for each disk attached to the node, if requested.
1384+
\item \refattr{PMIX_NETWORK_RESOURCE_USAGE} One for each network interface on the node, if requested.
1385+
\item \declareAttributeProvisional{PMIX_NODE_SAMPLE_TIME}{"pmix.node.samptime"}{time_t}{
13371386
Time when sample was taken
13381387
}
13391388
\end{itemize}

Chap_API_Struct.tex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2548,7 +2548,7 @@ \section{Generalized Data Types Used for Packing/Unpacking}
25482548
\declareconstitemvalue{PMIX_STOR_ACCESS_TYPE}{69}
25492549
Bitmask specifying different storage system access types. (\refstruct{pmix_storage_access_type_t}).
25502550
%
2551-
\declareconstitemvalueProvisional{PMIX_NODE_PID}{70}
2551+
\declareconstitemvalueProvisional{PMIX_NODE_PID}{73}
25522552
Structure containing the hostname (or nodeID) and pid of a process
25532553
%
25542554
\declareconstitemvalue{PMIX_DATA_TYPE_MAX}{500}

0 commit comments

Comments
 (0)