Example Configuration

Probe Configuration

This is not meant to be used as is. You will need to uncomment and edit as needed.

# Grid Proxy Certificate and VOMS Attributes
# ==========================================

[gridproxy]
# The default VOMS to use.  You can override this for specific probes by
# setting "voms" under the corresponding section.
#default_voms = ops
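
# For example, to use a different VO only for the job submission checks, a
# per-probe override could be placed under the corresponding section.  The VO
# name "dteam" below is purely illustrative:
#
#     [arcce]
#     voms = dteam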

# Alternative 1:  Use an externally generated proxy certificate.  Either
# export X509_USER_PROXY or point to it with the following option:
#user_proxy = /var/cache/nagios/gridproxy.pem

# Alternative 2:  Let the probe generate a proxy certificate on demand from
# a robot certificate.
#user_cert = /etc/grid-security/robotcert.pem
#user_key = /etc/grid-security/robotkey.pem


# Checking Storage Elements
# =========================

[gridstorage]

# Base directory in which to store temporary files and runtime state information.
#arcnagios_spooldir = /var/spool/arc/nagios

# The ARC commands will store some files under $HOME/.arc/.  Since the home
# directory may not be set to something usable, set an appropriate value here
# to instruct the Nagios plugins to override $HOME at startup.
#home_dir = /var/spool/arc/nagios

# The log-level to use for this probe.  The valid values, in order of
# decreasing verbosity, are DEBUG, INFO, WARNING, ERROR, CRITICAL, FATAL.
#loglevel = WARNING


# Checking Compute Elements: Information System
# =============================================

[arcinfosys]
# Same as for [gridstorage].
#arcnagios_spooldir = /var/spool/arc/nagios
#home_dir = /var/spool/arc/nagios

# The log-level for this probe as described under [gridstorage].
# It may be useful to set this to INFO.
#loglevel = WARNING


# The glue2 entry point
# ---------------------
#
# These are also provided as command-line options.

# Use this GLUE2 schema instead of querying the CE.
#glue2_schema =

# Warn if there are no objects of these classes.
#warn_if_missing = GLUE2AdminDomain,GLUE2Service,GLUE2Endpoint

# Report critical status if there are no objects of these classes.
#critical_if_missing =

# A comma-separated list of foreign key attribute types which should be
# reflected in the DIT.
#hierarchical_foreign_keys =
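#
# For example, assuming the GLUE2 LDAP rendering published by the CE uses the
# standard attribute linking endpoints to their service, this might be set to:
#
#     hierarchical_foreign_keys = GLUE2EndpointServiceForeignKey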

# Require that all foreign keys which represent aggregation or composition
# are reflected in the DIT.
#hierarchical_aggregates =


# The aris and egiis entry points
# -------------------------------
#
# Use the command-line options.


# Example tests for the aris entry point
# --------------------------------------

# Usage: --cluster-test=cache_free
[arcinfosys.aris.cache_free]
type = limit
value = float(cache_free)/cache_total
critical.min = 0.01
warning.min = 0.1

# Usage: --cluster-test=topphys
[arcinfosys.aris.topphys]
type = regex
variable = runtimeenvironment
critical.pattern = APPS/HEP/ATLAS-TOPPHYS
critical.message = Missing TOPPHYS.

# Usage: --queue-test=queue-active
[arcinfosys.aris.queue-active]
type = regex
variable = status
critical.pattern = ^active$
critical.message = Inactive queue


# Checking Compute Elements: Job Submission
# =========================================

[arcce]
# Same as for [gridstorage].
#arcnagios_spooldir = /var/spool/arc/nagios
#home_dir = /var/spool/arc/nagios

# The log-level for this probe as described under [gridstorage].
#loglevel = WARNING

[arcce.connection_urls]
# This section can be used to force specific flavours and connection URLs for
# individual CEs.  Each line takes the form
#
#     ce.example.org = FLAVOUR:URL
#
# where the right hand side corresponds to the -c argument of arcsub(1).
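#
# For example (the host name, flavour, and endpoint URL below are purely
# illustrative and must be adapted to the CE in question):
#
#     ce.example.org = ARC1:https://ce.example.org:443/arex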


# Example Scripted Job Tests
# --------------------------
#
# These checks are enabled by passing "--test NAME" to the submit command,
# where NAME is the section name without the "arcce." prefix.  They inject
# pieces of shell code into the remote script and check the output using
# regular expression patterns.

[arcce.python]
jobplugin = scripted
required_programs = python
script_line = python -V >python.out 2>&1
output_file = python.out
output_pattern = Python\s+(?P<version>\S+)
status_ok = Found Python version %(version)s.
status_critical = Python version not found in output.
service_description = ARCCE Python version

[arcce.perl]
jobplugin = scripted
required_programs = perl
script_line = perl -v >perl.out 2>&1
output_file = perl.out
output_pattern = This is perl, v(?P<version>\S+)
status_ok = Found Perl version %(version)s.
status_critical = Perl version not found in output.
service_description = ARCCE Perl version

[arcce.gcc]
jobplugin = scripted
required_programs = gcc
script_line = gcc -v >gcc.out 2>&1
output_file = gcc.out
output_pattern = gcc version (?P<version>\S+)
status_ok = Found GCC version %(version)s.
status_critical = GCC version not found in output.
service_description = ARCCE GCC version

[arcce.csh]
jobplugin = scripted
required_programs = csh
script_line = echo >csh-test.csh '#! /bin/csh'; echo >>csh-test.csh 'env >csh.out'; chmod +x csh-test.csh; ./csh-test.csh
output_file = csh.out
output_pattern = ^PATH=
status_ok = Found working csh.
status_critical = Did not find $PATH in csh environment.
service_description = ARCCE csh usability

# Example Storage Job Checks
# --------------------------
#
# These checks are also enabled by passing the second component of the
# section name to the --test option.  This will add the specified staging to
# the job description.  Input files must exist in advance.  Output files
# will be removed after checking that they exist.

[arcce.stage_srm]
jobplugin = staging
staged_inputs = srm://srm.example.org/somedir/testfile
staged_outputs = srm://srm.example.org/somedir/srm-%(hostname)s-%(epoch_time)s
service_description = ARCCE SRM Result

[arcce.stage_gridftp]
jobplugin = staging
staged_inputs = gsiftp://srm.example.org/somedir/testfile
staged_outputs = gsiftp://srm.example.org/somedir/gsiftp-%(hostname)s-%(epoch_time)s
service_description = ARCCE GridFTP Result

[arcce.stage_lfc]
jobplugin = staging
staged_inputs = lfc://lfc.example.org/lfcdir/testfile-lfc
staged_outputs = lfc://srm://srm.example.org/somedir/lfc-%(hostname)s-%(epoch_time)s@lfc.example.org/lfcdir/lfc-%(hostname)s-%(epoch_time)s
service_description = ARCCE LFC Result

Nagios Configuration for check_arcce_*

This configuration is not meant to be used as is. It is an example which illustrates how to use the entry points of the check_arcce_* probes and define the associated passive services. Other probes are omitted here, as they are configured as independent services similar to commonly available Nagios probes.

# --------------------------------------------------------------------------
# This is an example Nagios configuration for the ARC-CE probes meant for
# documentation purposes.  It cannot be used as-is.
# --------------------------------------------------------------------------


# Contacts and Contact Groups
# ===========================

# You probably already have contacts defined in your Nagios configuration, so
# you can skip these and substitute your own below.
define contactgroup {
	contactgroup_name	nagios-operators
	members			jdoe
}
define contact {
	use			generic-contact
	contact_name		jdoe
	email			jdoe@example.org
}

# Commands Definitions
# ====================

# This is a dummy command for passive services.  You may already have something
# like it in your Nagios configuration.
define command {
	command_name		check_passive
	command_line		/bin/true
}

# This command monitors running jobs and collects those which have terminated,
# reporting passive results.
define command {
	command_name check_arcce_monitor
	command_line $USER1$/check_arcce_monitor -H $HOSTNAME$
}

# A job submission check including sub-tests which are defined in the plugin
# configuration in separate sections.  The results of the sub-tests will be
# passively reported to the service names defined in the same configuration.
define command {
	command_name check_arcce_submit
	command_line $USER1$/check_arcce_submit -H $HOSTNAME$ \
		--test python --test perl --test csh --test gcc
}

# A job submission check with staging.  The arguments to --stage-input options
# must exist.  The arguments to --stage-output options will be overwritten, and
# deleted on termination.  This command is not used below.  To use it, add an
# active service and a passive service named "ARCCE SRM Job Termination"; a
# commented sketch follows this command definition.
define command {
	command_name check_arcce_submit_staging
# Passed explicitly:
#	command_line $USER1$/check_arcce_submit \
#		-H $HOSTNAME$ --job-tag srm \
#		--termination-service 'ARCCE SRM Job Termination' \
#		--stage-input srm.txt=srm://srm.example.org/nagios/readable.txt \
#		--stage-output srm://srm.example.org/nagios/srm-$HOSTNAME$-$TIMET$.txt \
# Using a predefined job-test:
#	command_line $USER1$/check_arcce_submit \
#		-H $HOSTNAME$ --job-tag srm \
#		--termination-service 'ARCCE SRM Job Termination' \
#		--test stage_srm
}
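
# A possible pairing of services for the staging command above.  This is only
# an illustrative sketch (the active service name is made up here); uncomment
# and adapt it together with the command above.
#define service {
#	use			arcce-submission-service
#	service_description	ARCCE SRM Job Submission
#	hostgroup_name		arcce-hosts
#	check_command		check_arcce_submit_staging
#}
#define service {
#	use			arcce-passive-service
#	service_description	ARCCE SRM Job Termination
#	hostgroup_name		arcce-hosts
#}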

# Host Groups and Host Templates
# ==============================

# You need one host definition to which the monitoring service is
# assigned.  This is typically the Nagios host itself, for which you
# probably already have a definition.
define host {
	name			nagios-host
	use			generic-host
	max_check_attempts	10
	contact_groups		nagios-operators
	register		0
}

# The following host group and template will be used for all CEs.
define hostgroup {
	hostgroup_name		arcce-hosts
	alias			ARCCE Hosts
}
define host {
	name			arcce-host
	use			generic-host
	max_check_attempts	10
	contact_groups		nagios-operators
	hostgroups		arcce-hosts
	register		0
}

# Service Groups and Service Templates
# ====================================

define servicegroup {
	servicegroup_name	arcce-services
	alias			ARCCE Services
}
define service {
	name			arcce-service
	use			generic-service
	servicegroups		arcce-services
	check_period		24x7
	max_check_attempts	3
	flap_detection_enabled	0
	contact_groups		nagios-operators
	notifications_enabled	0
	register		0
}
define service {
	name			arcce-monitoring-service
	use			arcce-service
	normal_check_interval	5
	retry_check_interval	5
	register		0
}
define service {
	name			arcce-submission-service
	use			arcce-service
	normal_check_interval	30
	retry_check_interval	30
	register		0
}
define service {
	name			arcce-passive-service
	use			arcce-service
	active_checks_enabled	0
	passive_checks_enabled	1
	check_command		check_passive
	register		0
}

define service {
	use			arcce-monitoring-service
	host_name		localhost
	service_description	ARCCE Monitoring
	check_command		check_arcce_monitor
}

# For each ARC CE, we need one active service for submission and a number of
# passive services to collect the results.  In the following we associate the
# per-CE services with the "arcce-hosts" group, which will add them to all
# members of the group.
define service {
	use			arcce-submission-service
	service_description	ARCCE Job Submission
	hostgroup_name		arcce-hosts
	check_command		check_arcce_submit
}
define service {
	use			arcce-passive-service
	service_description	ARCCE Job Termination
	hostgroup_name		arcce-hosts
}
define service {
	use			arcce-passive-service
	service_description	ARCCE Python version
	hostgroup_name		arcce-hosts
}
define service {
	use			arcce-passive-service
	service_description	ARCCE Perl version
	hostgroup_name		arcce-hosts
}
define service {
	use			arcce-passive-service
	service_description	ARCCE GCC version
	hostgroup_name		arcce-hosts
}
define service {
	use			arcce-passive-service
	service_description	ARCCE csh usability
	hostgroup_name		arcce-hosts
}

# Hosts
# =====

# This provides the monitoring service.
define host {
	use			nagios-host
	host_name		localhost
}
# Any host which uses the arcce-host template will get an active submission
# service, and all the related passive services.
#define host {
#	use			arcce-host
#	host_name		ce-00.example.org
#}