/*****************************************************************************
 *
 * CHECKS.C - Service and host check functions for NetSaint
 *
 * Copyright (c) 1999-2000 Ethan Galstad (netsaint@netsaint.org)
 * Last Modified:   10-30-2000
 *
 * License:
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *****************************************************************************/

#include "../common/config.h"
#include "../common/common.h"
#include "../common/statusdata.h"
#include "netsaint.h"

/*
 * Globals defined elsewhere in the program.  The notes below describe only
 * how each one is used by the code in this file; anything not used in the
 * visible part of the file is marked as an assumption.
 */

/* NOTE(review): not referenced in the visible part of this file - presumably a scratch file name; confirm */
extern char     temp_file[MAX_FILENAME_LENGTH];

/* multiplier applied to a service's check_interval/retry_interval to get a time_t offset for the next check */
extern int      interval_length;

/* IPC pipe: element 0 is the read end and element 1 the write end (both are closed explicitly by the check processes) */
extern int      ipc_pipe[2];

/* when TRUE, the first state seen for a host or service is written to the log */
extern int      log_initial_states;

/* value passed to alarm() before a service plugin is run */
extern int      service_check_timeout;
/* NOTE(review): not referenced in the visible part of this file - presumably the host check equivalent; confirm */
extern int      host_check_timeout;

/* added to the allowed slack time when deciding whether a service check has been orphaned */
extern int      service_check_reaper_interval;

/* when TRUE, hosts that are already down or unreachable are really rechecked (using the host's max_attempts) */
extern int      use_aggressive_host_checking;

/* number of active service checks that have been started but whose results have not been reaped yet */
extern int      currently_running_service_checks;
/* TRUE while a service check that must not be parallelized is outstanding */
extern int      non_parallelized_check_running;

/* when FALSE, all passive service check results are discarded by the reaper */
extern int      accept_passive_service_checks;
/* when TRUE, the obsessive compulsive processor is run after every service check result */
extern int      obsess_over_services;

/* NOTE(review): not referenced in the visible part of this file - presumably the low priority event queue; confirm */
extern timed_event      *event_list_low;
/* head of the linked list of all services (walked via ->next) */
extern service          *service_list;

/* head of the linked list of all hosts (walked via ->next) */
extern host     *host_list;

/* message buffer filled in by run_service_check() and written to the IPC pipe (reap_service_checks() uses its own local of the same name) */
extern service_message svc_msg;


/******************************************************************/
/****************** SERVICE MONITORING FUNCTIONS ******************/
/******************************************************************/

/*
 * Forks a child process to run a service check, but does not wait for the
 * service check result.
 *
 * svc - the service to check.
 *
 * If the check should not be run right now (checks are disabled for the
 * service, the current time is outside the service's check period, or the
 * associated host cannot be found) the service is simply rescheduled and the
 * function returns.
 *
 * Otherwise the process forks twice.  The grandchild runs the plugin with
 * popen(), reads the first line of its output and writes a service_message
 * with the result via write_svc_message().  The intermediate child exits
 * immediately and is waited for here, so the grandchild is never left as a
 * zombie of this process.  The result itself is picked up later by
 * reap_service_checks().
 */
void run_service_check(service *svc){
	char raw_command[MAX_INPUT_BUFFER];
	char processed_command[MAX_INPUT_BUFFER];
	char plugin_output[MAX_PLUGINOUTPUT_LENGTH];
	int check_service=TRUE;
	time_t current_time;
	time_t preferred_time=0L;
	time_t next_valid_time;
	pid_t pid;
	int fork_error=FALSE;
	host *temp_host=NULL;
	FILE *fp;
	int pclose_result=0;
	int time_is_valid=TRUE;


	/* get the current time */
	time(&current_time);

	/* if the service check is currently disabled... */
	if(svc->checks_enabled==FALSE){

		/* don't check the service */
		check_service=FALSE;

		/* reschedule the service check */
		preferred_time=current_time+(svc->check_interval*interval_length);
	}

	/* make sure this is a valid time to check the service */
	if(check_time_against_period((unsigned long)current_time,svc->check_period)==ERROR){

		/* don't check the service */
		check_service=FALSE;

		/* get the next valid time we can run the check */
		preferred_time=current_time;

		/* set the invalid time flag */
		time_is_valid=FALSE;
	}

	/* find the host associated with this service */
	temp_host=find_host(svc->host_name,NULL);

	/* don't check the service if we couldn't find the associated host */
	if(temp_host==NULL){

		/* nothing above picked a reschedule time, so try again at the next regular interval (instead of at time 0) */
		if(check_service==TRUE)
			preferred_time=current_time+(svc->check_interval*interval_length);

		check_service=FALSE;
	}

	/* if we shouldn't check the service, just reschedule it and leave... */
	if(check_service==FALSE){

		/* make sure we rescheduled the next service check at a valid time */
		get_next_valid_time(preferred_time,&next_valid_time,svc->check_period);

		/* the service could not be rescheduled properly - set the next check time for next year, but don't actually reschedule it */
		if(time_is_valid==FALSE && next_valid_time==preferred_time){

			svc->next_check=(time_t)(next_valid_time+(60*60*24*365));
			svc->should_be_scheduled=FALSE;
#ifdef DEBUG1
			printf("Warning: Could not find any valid times to reschedule a check of service '%s' on host '%s'!\n",svc->description,svc->host_name);
#endif
		}

		/* this service could be rescheduled... */
		else{
			svc->next_check=next_valid_time;
			svc->should_be_scheduled=TRUE;
		}

		/* update the status log with the current service */
		update_service_status(svc,FALSE);

		/* reschedule the next service check - unless we couldn't find a valid next check time */
		if(svc->should_be_scheduled==TRUE)
			schedule_service_check(svc,svc->next_check);

		return;
	}


	/**** ELSE RUN THE SERVICE CHECK ****/

#ifdef DEBUG3
	printf("\tChecking service '%s' on host '%s'...\n",svc->description,svc->host_name);
#endif

	/* increment number of parallel service checks currently out there... */
	currently_running_service_checks++;

	/* set a flag if this service check shouldn't be parallelized with others... */
	if(svc->parallelize==FALSE)
		non_parallelized_check_running=TRUE;

	/* set the execution flag */
	svc->is_executing=TRUE;

	/* grab the host and service macro variables */
	clear_volatile_macros();
	grab_host_macros(temp_host);
	grab_service_macros(svc);

	/* get the raw command line */
	get_raw_command_line(svc->service_check_command,raw_command,sizeof(raw_command));
	strip(raw_command);

	/* process any macros contained in the argument */
	process_macros(raw_command,processed_command,sizeof(processed_command));
	strip(processed_command);

	/* save service info (done before forking so that every process that may report the result has it) */
	strncpy(svc_msg.host_name,svc->host_name,sizeof(svc_msg.host_name)-1);
	svc_msg.host_name[sizeof(svc_msg.host_name)-1]='\x0';
	strncpy(svc_msg.description,svc->description,sizeof(svc_msg.description)-1);
	svc_msg.description[sizeof(svc_msg.description)-1]='\x0';
	svc_msg.parallelized=svc->parallelize;
	svc_msg.check_time=current_time;
	svc_msg.finish_time=current_time;

	/* every result written from here on is for an active check - the reaper relies on this to update its bookkeeping */
	svc_msg.check_type=SERVICE_CHECK_ACTIVE;

	/* fork a child process */
	pid=fork();

	/* an error occurred while trying to fork */
	if(pid==-1)
		fork_error=TRUE;

	/* if we are in the child process... */
	else if(pid==0){

		/* free allocated memory */
		free_memory();

		/* fork again... */
		pid=fork();

		/* an error occurred while trying to fork again */
		if(pid==-1)
			exit(STATE_UNKNOWN);

		/* the grandchild process should run the service check... */
		if(pid==0){

			/* reset signal handling */
			reset_sighandler();

			/* become the process group leader */
			setpgid(0,0);

			/* close read end of IPC pipe */
			close(ipc_pipe[0]);

			/* catch plugins that don't finish in a timely manner */
			signal(SIGALRM,service_check_sighandler);
			alarm(service_check_timeout);

			/* run the plugin check command */
			fp=popen(processed_command,"r");
			if(fp==NULL)
				exit(STATE_UNKNOWN);

			/* grab the plugin output and clean it (the buffer stays empty if the plugin printed nothing) */
			plugin_output[0]='\x0';
			if(fgets(plugin_output,sizeof(plugin_output)-1,fp)==NULL)
				plugin_output[0]='\x0';
			strip(plugin_output);

			/* close the process */
			pclose_result=pclose(fp);

			/* reset the alarm */
			alarm(0);

			/* test for execution error, or for a plugin that did not exit normally (e.g. it was killed by a signal) and therefore has no exit status */
			if(pclose_result==-1 || !WIFEXITED(pclose_result)){

				if(pclose_result==-1)
					strncpy(svc_msg.output,"(Error returned by call to pclose() function)",sizeof(svc_msg.output)-1);
				else
					strncpy(svc_msg.output,"(Plugin did not exit normally)",sizeof(svc_msg.output)-1);
				svc_msg.output[sizeof(svc_msg.output)-1]='\x0';
				svc_msg.return_code=STATE_CRITICAL;
				svc_msg.exited_ok=FALSE;
				svc_msg.finish_time=time(NULL);
				write_svc_message(&svc_msg);

				/* close write end of IPC pipe */
				close(ipc_pipe[1]);

				exit(STATE_UNKNOWN);
			}

			/* else write plugin check results to message queue */
			else{
				strncpy(svc_msg.output,plugin_output,sizeof(svc_msg.output)-1);
				svc_msg.output[sizeof(svc_msg.output)-1]='\x0';
				svc_msg.return_code=(int)WEXITSTATUS(pclose_result);
				svc_msg.exited_ok=TRUE;
				svc_msg.finish_time=time(NULL);
				write_svc_message(&svc_msg);
			}

			/* close write end of IPC pipe */
			close(ipc_pipe[1]);

			/* return with plugin exit status - not really necessary... */
			exit(pclose_result);
		}

		/* close write end of IPC pipe */
		close(ipc_pipe[1]);

		/* parent exits immediately - grandchild process is inherited by the INIT process, so we have no zombie problem... */
		exit(STATE_OK);
	}

	/* else the parent should wait for the first child to return... */
	else if(pid>0){

		/* the exit status is deliberately not inspected - doing so was removed 06/28/2000 because it caused problems under AIX */
		waitpid(pid,NULL,0);
	}

	/* see if we could run the check... */
	if(fork_error==TRUE){

		/* write plugin check results to message queue */
		strncpy(svc_msg.output,"(Could not execute service check due to a fork() error)",sizeof(svc_msg.output)-1);
		svc_msg.output[sizeof(svc_msg.output)-1]='\x0';
		svc_msg.return_code=STATE_CRITICAL;
		svc_msg.exited_ok=TRUE;
		svc_msg.finish_time=time(NULL);
		write_svc_message(&svc_msg);
	}

	return;
}



/* reaps service check results */
void reap_service_checks(void){
	service_message svc_msg;
	service *temp_service=NULL;
	host *temp_host=NULL;
	time_t preferred_time;
	time_t next_valid_time;
	char temp_buffer[MAX_INPUT_BUFFER];
	int state_change=FALSE;
	int hard_state_change=FALSE;
	int route_result=HOST_UP;
	int ignore_service_for_now=FALSE;
	time_t current_time;
	int first_check=FALSE;

#ifdef DEBUG0
        printf("reap_service_checks() start\n");
#endif

#ifdef DEBUG3
	printf("Starting to reap service check results...\n");
#endif

	/* read all service checks results that have come in... */
	while(read_svc_message(&svc_msg)!=-1){

		/* make sure we really have something... */
		if(!strcmp(svc_msg.description,"") && !strcmp(svc_msg.host_name,"")){
#ifdef DEBUG1
			printf("Found an empty message in service result pipe!\n");
#endif
			continue;
		        }

		/* get the current time */
		time(&current_time);

		/* skip this service check results if its passive and we aren't accepting passive check results */
		if(accept_passive_service_checks==FALSE && svc_msg.check_type==SERVICE_CHECK_PASSIVE)
			continue;

		/* because of my idiotic idea of having UNKNOWN states be equivalent to -1, I must hack things a bit... */
		if(svc_msg.return_code==255)
			svc_msg.return_code=STATE_UNKNOWN;

		/* find the service */
		temp_service=find_service(svc_msg.host_name,svc_msg.description,NULL);
		if(temp_service==NULL){

			snprintf(temp_buffer,sizeof(temp_buffer),"Warning:  Message queue contained results for service '%s' on host '%s'.  The service could not be found!\n",svc_msg.description,svc_msg.host_name);
			temp_buffer[sizeof(temp_buffer)-1]='\x0';
			write_to_logs_and_console(temp_buffer,NSLOG_RUNTIME_WARNING,TRUE);

			continue;
		        }

		/* update the execution time for this check */
		if(svc_msg.check_time>current_time || svc_msg.finish_time>current_time || (svc_msg.finish_time<svc_msg.check_time))
			temp_service->execution_time=0L;
		else
			temp_service->execution_time=(unsigned long)(svc_msg.finish_time-svc_msg.check_time);

		/* ignore passive service check results if we're not accepting them for this service */
		if(temp_service->accept_passive_service_checks==FALSE && svc_msg.check_type==SERVICE_CHECK_PASSIVE)
			continue;

#ifdef DEBUG3
		printf("\n\tFound check result for service '%s' on host '%s'\n",temp_service->description,temp_service->host_name);
		printf("\t\tCheck Type:    %s\n",(svc_msg.check_type==SERVICE_CHECK_ACTIVE)?"ACTIVE":"PASSIVE");
		printf("\t\tParallelized?: %s\n",(svc_msg.parallelized==TRUE)?"Yes":"No");
		printf("\t\tExited OK?:    %s\n",(svc_msg.exited_ok==TRUE)?"Yes":"No");
		printf("\t\tReturn Status: %d\n",svc_msg.return_code);
		printf("\t\tPlugin Output: '%s'\n",svc_msg.output);
#endif

		/* decrement the number of service checks still out there... */
		if(svc_msg.check_type==SERVICE_CHECK_ACTIVE)
			currently_running_service_checks--;

		/* if this check was not parallelized, clear the flag */
		if(svc_msg.parallelized==FALSE && svc_msg.check_type==SERVICE_CHECK_ACTIVE)
			non_parallelized_check_running=FALSE;

		/* clear the execution flag if this was an active check */
		if(svc_msg.check_type==SERVICE_CHECK_ACTIVE)
			temp_service->is_executing=FALSE;

		/* get the last check time */
		temp_service->last_check=svc_msg.check_time;

		/* was this check passive or active? */
		temp_service->check_type=(svc_msg.check_type==SERVICE_CHECK_ACTIVE)?SERVICE_CHECK_ACTIVE:SERVICE_CHECK_PASSIVE;

		/* INITIALIZE VARIABLES FOR THIS SERVICE */
		state_change=FALSE;
		hard_state_change=FALSE;
		route_result=HOST_UP;
		ignore_service_for_now=FALSE;
		first_check=FALSE;

		/* save the old service status info */
		temp_service->last_state=temp_service->current_state;

		/* initialize plugin output buffer if necessary */
		if(temp_service->plugin_output==NULL)
			temp_service->plugin_output=(char *)malloc(MAX_PLUGINOUTPUT_LENGTH);

		/* if there was some error running the command, just skip it (this shouldn't be happening) */
		if(svc_msg.exited_ok==FALSE){

			snprintf(temp_buffer,sizeof(temp_buffer),"Warning:  Check of service '%s' on host '%s' did not exit properly!\n",temp_service->description,temp_service->host_name);
			temp_buffer[sizeof(temp_buffer)-1]='\x0';
			write_to_logs_and_console(temp_buffer,NSLOG_RUNTIME_WARNING,TRUE);

			snprintf(temp_service->plugin_output,MAX_PLUGINOUTPUT_LENGTH-1,"(Service check did not exit properly)");
			temp_service->plugin_output[MAX_PLUGINOUTPUT_LENGTH-1]='\x0';

			temp_service->current_state=STATE_CRITICAL;
                        }

		/* make sure the return code is within bounds */
		else if(svc_msg.return_code<-1 || svc_msg.return_code>2){

			snprintf(temp_buffer,sizeof(temp_buffer),"Warning: Return code of %d for check of service '%s' on host '%s' was out of bounds.\n",svc_msg.return_code,temp_service->description,temp_service->host_name);
			temp_buffer[sizeof(temp_buffer)-1]='\x0';
			write_to_logs_and_console(temp_buffer,NSLOG_RUNTIME_WARNING,TRUE);

			snprintf(temp_service->plugin_output,MAX_PLUGINOUTPUT_LENGTH-1,"(Return code of %d is out of bounds)",svc_msg.return_code);
			temp_service->plugin_output[MAX_PLUGINOUTPUT_LENGTH-1]='\x0';

			temp_service->current_state=STATE_CRITICAL;
                        }

		/* else the return code is okay... */
		else{

			/* grab the plugin output */
			strip(svc_msg.output);
			if(!strcmp(svc_msg.output,""))
				strncpy(temp_service->plugin_output,"(No output returned from plugin)",MAX_PLUGINOUTPUT_LENGTH-1);
			else
				strncpy(temp_service->plugin_output,svc_msg.output,MAX_PLUGINOUTPUT_LENGTH-1);
			temp_service->plugin_output[MAX_PLUGINOUTPUT_LENGTH-1]='\x0';

			/* grab the return code */
			temp_service->current_state=svc_msg.return_code;
		        }

		/* get the host that this service runs on */
		temp_host=find_host(temp_service->host_name,NULL);

		/* if the service check was okay... */
		if(temp_service->current_state==STATE_OK){

			/* if the host has never been checked before... */
			if(temp_host->last_check==(time_t)0 || temp_host->last_state_change==(time_t)0){

				/* update the last check time */
				temp_host->last_check=temp_service->last_check;

				/* update the last state change time */
				temp_host->last_state_change=temp_service->last_check;

				if(temp_host->plugin_output==NULL)
					temp_host->plugin_output=(char *)malloc(MAX_PLUGINOUTPUT_LENGTH);

				strncpy(temp_host->plugin_output,"(Host assumed to be up)",MAX_PLUGINOUTPUT_LENGTH);
				temp_host->plugin_output[MAX_PLUGINOUTPUT_LENGTH-1]='\x0';

				/* update the status log with the host status */
				update_host_status(temp_host,FALSE);

				/* log the initial state if the user wants */
				if(log_initial_states==TRUE)
					log_host_event(temp_host,HOST_UP,HARD_STATE);
		                }

			/* if the service has never been checked before... */
			if(temp_service->last_state_change==(time_t)0){

				/* update the last state change time */
				temp_service->last_state_change=temp_service->last_check;

				/* log the initial state if the user wants */
				if(log_initial_states==TRUE)
					log_service_event(temp_service,HARD_STATE);
		                }
		        }

		/* check for a state change (either soft or hard) */
		if(temp_service->current_state!=temp_service->last_state){
		
#ifdef DEBUG3
			printf("\t\tService '%s' on host '%s' has changed state since last check!\n",temp_service->description,temp_service->host_name);
#endif

			/* set the state change flag */
			state_change=TRUE;
		        }


		/* checks for a hard state change where host was down at last service check */
		if(temp_service->host_problem_at_last_check==TRUE && temp_service->current_state==STATE_OK){

#ifdef DEBUG3
			printf("\t\tService '%s' on host '%s' has had a HARD STATE CHANGE!!\n",temp_service->description,temp_service->host_name);
#endif

			hard_state_change=TRUE;
	                }


		/* check for a "normal" hard state change where max check attempts is reached */
		else if(temp_service->current_attempt>=temp_service->max_attempts){

			if(temp_service->current_state!=temp_service->last_hard_state){

#ifdef DEBUG3
				printf("\t\tService '%s' on host '%s' has had a HARD STATE CHANGE!!\n",temp_service->description,temp_service->host_name);
#endif

				hard_state_change=TRUE;
			        }
		        }

		/* reset last and next notification times and acknowledgement flag if necessary */
		if(state_change==TRUE || hard_state_change==TRUE){

			temp_service->last_notification=(time_t)0;
			temp_service->next_notification=(time_t)0;
			temp_service->problem_has_been_acknowledged=FALSE;

			/* do NOT reset current notification number!!! */
			/*temp_service->current_notification_number=0;*/
		        }






		/**************************************/
		/******* SERVICE CHECK OK LOGIC *******/
		/**************************************/

		/* if the service is up and running OK... */
		if(temp_service->current_state==STATE_OK){

			/* clear the next notification time (this is not used, since we are now in an OK state) */
			temp_service->next_notification=(time_t)0;

			/* reset the acknowledgement flag (this should already have been done, but just in case...) */
			temp_service->problem_has_been_acknowledged=FALSE;

			/* the service check was okay, so the associated host must be up... */
			if(temp_host->status!=HOST_UP){

				/* verify the route to the host and send out host recovery notifications */
				verify_route_to_host(temp_host);

				/* set the host problem flag (i.e. don't notify about recoveries for this service) */
				temp_service->host_problem_at_last_check=TRUE;
			        }

			/* if a hard service recovery has occurred... */
			if(hard_state_change==TRUE){

				/* set the state type macro */
				temp_service->state_type=HARD_STATE;

				/* update service state times... */
				update_service_state_times(temp_service);

				/* log the service recovery */
				log_service_event(temp_service,HARD_STATE);

				/* notify contacts about the service recovery if the host was not down or unreachable at the last check */
				if(temp_service->no_recovery_notification==FALSE && temp_service->notify_on_recovery  && temp_service->host_problem_at_last_check==FALSE){

					if(temp_service->has_been_unknown==TRUE && temp_service->notify_on_unknown)
						service_notification(temp_service,NULL);
		
					else if(temp_service->has_been_warning==TRUE && temp_service->notify_on_warning)
						service_notification(temp_service,NULL);

					else if(temp_service->has_been_critical==TRUE && temp_service->notify_on_critical)
						service_notification(temp_service,NULL);
				        }

				/* run the service event handler to handle the hard state change */
				handle_service_event(temp_service,HARD_STATE);
			        }

			/* else if a soft service recovery has occurred... */
			else if(state_change==TRUE){

				/* this is a soft recovery */
				temp_service->state_type=SOFT_STATE;

				/* log the soft recovery */
				log_service_event(temp_service,SOFT_STATE);

				/* run the service event handler to handle the soft state change */
				handle_service_event(temp_service,SOFT_STATE);
		                }

			/* else no service state change has occured... */

			/* should we obsessive over service checks? */
			if(obsess_over_services==TRUE)
				obsessive_compulsive_service_check_processor(temp_service,temp_service->state_type);

			/* reset all service variables because its okay now... */
			temp_service->host_problem_at_last_check=FALSE;
			temp_service->no_recovery_notification=FALSE;
			temp_service->current_attempt=1;
			temp_service->state_type=HARD_STATE;
			temp_service->last_hard_state=STATE_OK;
			temp_service->last_notification=(time_t)0;
			temp_service->next_notification=(time_t)0;
			temp_service->current_notification_number=0;
			temp_service->problem_has_been_acknowledged=FALSE;
			temp_service->has_been_unknown=FALSE;
			temp_service->has_been_warning=FALSE;
			temp_service->has_been_critical=FALSE;
			temp_service->next_check=(time_t)(temp_service->last_check+(temp_service->check_interval*interval_length));
		        }


		/*******************************************/
		/******* SERVICE CHECK PROBLEM LOGIC *******/
		/*******************************************/

		/* hey, something's not working quite like it should... */
		else{

			/* check the route to the host if its supposed to be up right now... */
			if(temp_host->status==HOST_UP)
				route_result=verify_route_to_host(temp_host);

			/* else the host is either down or unreachable, so recheck it if necessary */
			else{

				/* we're using agressive host checking, so really do recheck the host... */
				if(use_aggressive_host_checking==TRUE)
					route_result=verify_route_to_host(temp_host);

				/* the service wobbled between non-OK states, so check the host... */
				else if(state_change==TRUE && temp_service->last_hard_state!=STATE_OK)
					route_result=verify_route_to_host(temp_host);

				/* else fake the host check, but (possibly) resend host notifications to contacts... */
				else{

					/* fake the route check result */
					route_result=temp_host->status;

					/* log the initial state if the user wants to and this host hasn't been checked yet */
					if(log_initial_states==TRUE && (unsigned long)temp_service->last_state_change==0L)
						log_host_event(temp_host,temp_host->status,HARD_STATE);

				        /* possibly re-send host notifications... */
					host_notification(temp_host,temp_host->status,NULL);
				        }
			        }

			/* if the host is down or unreachable ... */
			if(route_result!=HOST_UP){

				/* "fake" a hard state change for the service - well, its not really fake, but it didn't get caught earler... */
				if(temp_service->last_hard_state!=temp_service->current_state)
					hard_state_change=TRUE;

				/* update service state times if necessary */
				if(hard_state_change==TRUE)
					update_service_state_times(temp_service);

				/* put service into a hard state without attempting check retries and don't send out notifications about it */
				temp_service->host_problem_at_last_check=TRUE;
				temp_service->state_type=HARD_STATE;
				temp_service->last_hard_state=temp_service->current_state;
				temp_service->current_attempt=1;
				ignore_service_for_now=TRUE;

				/* if the service has never been checked before, update the last state change time */
				if(temp_service->last_state_change==(time_t)0)
					temp_service->last_state_change=temp_service->last_check;
			        }

			/* else the host is up.. */
			else{

				/*  the host recovered since the last time the service was checked... */
				if(temp_service->host_problem_at_last_check==TRUE){

					/* next time the service is checked we shouldn't get into this same case... */
					temp_service->host_problem_at_last_check=FALSE;

					/* reset the current check counter, so we can mabye avoid a false recovery alarm - added 07/28/99 */
					temp_service->current_attempt=1;

					/* don't send out notifications for this service, since the host just came back online... */
					ignore_service_for_now=TRUE;

					/* don't send a recovery notification if the service recovers at the next check */
					temp_service->no_recovery_notification=TRUE;
				        }
			        }

			 /* if we should retry the service check, do so (except it the host is down or unreachable!) */
			if(temp_service->current_attempt < temp_service->max_attempts){

				/* the host is down or unreachable, so don't attempt to retry the service check */
				if(route_result!=HOST_UP){

					/* the host is not up, so reschedule the next service check at regular interval */
					temp_service->next_check=(time_t)(temp_service->last_check+(temp_service->check_interval*interval_length));

					/* log the problem as a hard state if the host just went down (new in 0.0.5) */
					if(hard_state_change==TRUE)
						log_service_event(temp_service,HARD_STATE);
				        }

				/* the host is up, so continue to retry the service check */
				else{

					/* this is a soft state */
					temp_service->state_type=SOFT_STATE;

					/* log the service check retry */
					log_service_event(temp_service,SOFT_STATE);

					/* run the service event handler to handle the soft state */
					handle_service_event(temp_service,SOFT_STATE);

					temp_service->current_attempt=temp_service->current_attempt+1;
					temp_service->next_check=(time_t)(temp_service->last_check+(temp_service->retry_interval*interval_length));
				        }
			        }
			

			/* we've reached the maximum number of service rechecks, so handle the error */
			else{

				/* this is a hard state */
				temp_service->state_type=HARD_STATE;

				/* keep track of this state, in case we "float" amongst other non-OK states before recovering */
				if(temp_service->current_state==STATE_UNKNOWN)
					temp_service->has_been_unknown=TRUE;
				else if(temp_service->current_state==STATE_WARNING)
					temp_service->has_been_warning=TRUE;
				else if(temp_service->current_state==STATE_CRITICAL)
					temp_service->has_been_critical=TRUE;

				/* if we've hard a hard state change... */
				if(hard_state_change==TRUE){

					/* update service state times */
					update_service_state_times(temp_service);

					/* log the service problem (even if host is not up, which is new in 0.0.5) */
					log_service_event(temp_service,HARD_STATE);
				        }

				/* else log the problem (again) if this service is flagged as being volatile */
				else if(temp_service->is_volatile==TRUE)
					log_service_event(temp_service,HARD_STATE);

				/* (re)send notifications out about this service problem if the host is up... */
				if(ignore_service_for_now==FALSE){

					if(temp_service->current_state==STATE_UNKNOWN && temp_service->notify_on_unknown)
						service_notification(temp_service,NULL);

					else if(temp_service->current_state==STATE_WARNING && temp_service->notify_on_warning)
						service_notification(temp_service,NULL);

					else if(temp_service->current_state==STATE_CRITICAL && temp_service->notify_on_critical)
						service_notification(temp_service,NULL);

					/* reset the flag that prevents recovery notification from being sent out */
					temp_service->no_recovery_notification=FALSE;
				        }

				/* run the service event handler if we changed state from the last hard state or if this service is flagged as being volatile */
				if(hard_state_change==TRUE || temp_service->is_volatile==TRUE)
					handle_service_event(temp_service,HARD_STATE);

				/* save the last hard state */
				temp_service->last_hard_state=temp_service->current_state;

				/* reschedule the next check at the regular interval */
				temp_service->next_check=(time_t)(temp_service->last_check+(temp_service->check_interval*interval_length));
			        }



			/* should we obsessive over service checks? */
			if(obsess_over_services==TRUE)
				obsessive_compulsive_service_check_processor(temp_service,temp_service->state_type);
		        }

	        /* make sure we don't get ourselves into too much trouble... */
		if(current_time>temp_service->next_check)
			temp_service->next_check=current_time;

		/* make sure we rescheduled the next service check at a valid time */
		preferred_time=temp_service->next_check;
		get_next_valid_time(preferred_time,&next_valid_time,temp_service->check_period);
		temp_service->next_check=next_valid_time;

		/* update the current service status log */
		update_service_status(temp_service,FALSE);

		/* reschedule the next service check ONLY for active checks */
		if(temp_service->check_type==SERVICE_CHECK_ACTIVE)
			schedule_service_check(temp_service,temp_service->next_check);
	        }

#ifdef DEBUG3
	printf("Finished reaping service check results.\n");
#endif

#ifdef DEBUG0
	printf("reap_service_checks() end\n");
#endif

	return;
        }



/* check for services that never returned from a check... */
void check_for_orphaned_services(void){
	service *temp_service;
	time_t expected_time;
	time_t current_time;
	char buffer[MAX_INPUT_BUFFER];

#ifdef DEBUG0
	printf("check_for_orphaned_services() start\n");
#endif

	time(&current_time);

	/* check all services... */
	for(temp_service=service_list;temp_service!=NULL;temp_service=temp_service->next){

		/* skip services that are not currently executing */
		if(temp_service->is_executing==FALSE)
			continue;

		/* determine the time at which the check results should have come in (allow 10 minutes slack time) */
		expected_time=(time_t)(temp_service->next_check+temp_service->latency+service_check_timeout+service_check_reaper_interval+600);

		/* this service was supposed to have executed a while ago, but for some reason the results haven't come back in... */
		if(expected_time<current_time){

			/* log a warning */
			snprintf(buffer,sizeof(buffer)-1,"Warning: The check of service '%s' on host '%s' looks like it was orphaned (results never came back).  I'm scheduling an immediate check of the service...\n",temp_service->description,temp_service->host_name);
			buffer[sizeof(buffer)-1]='\x0';
			write_to_logs_and_console(buffer,NSLOG_RUNTIME_WARNING,TRUE);

			/* decremement the number of running service checks */
			if(currently_running_service_checks>0)
				currently_running_service_checks--;

			/* disable the executing flag */
			temp_service->is_executing=FALSE;

			/* schedule an immediate check of the service */
			schedule_service_check(temp_service,current_time);
		        }
	        }

#ifdef DEBUG0
	printf("check_for_orphaned_services() end\n");
#endif

	return;
        }




/******************************************************************/
/******************* ROUTE/HOST CHECK FUNCTIONS *******************/
/******************************************************************/


/*
 * Checks whether the given host can be reached.
 *
 * hst - the host whose route should be verified.
 *
 * Runs check_host() on the host, asking it to propagate problems and
 * recoveries both up (to parent hosts) and down (to child hosts) the tree.
 * Returns whatever check_host() returns.
 */
int verify_route_to_host(host *hst){
	const int propagate_both_ways=PROPAGATE_TO_PARENT_HOSTS | PROPAGATE_TO_CHILD_HOSTS;
	int route_status;

#ifdef DEBUG0
	printf("verify_route_to_host() start\n");
#endif

	route_status=check_host(hst,propagate_both_ways);

#ifdef DEBUG0
	printf("verify_route_to_host() end\n");
#endif

	return route_status;
}



/*
 * see if the remote host is alive at all
 *
 * Runs one or more checks of the host through run_host_check(), passes the
 * resulting soft/hard states to handle_host_state() and, depending on
 * propagation_options, recursively checks parent and/or child hosts.
 *
 * hst                 - the host to check
 * propagation_options - bitmask of PROPAGATE_TO_PARENT_HOSTS and/or
 *                       PROPAGATE_TO_CHILD_HOSTS selecting which related
 *                       hosts are checked recursively
 *
 * Returns hst->status as it was on entry, except that:
 *   - HOST_UP is returned when a host that was not up passes a check, and
 *   - HOST_DOWN or HOST_UNREACHABLE is returned when a host that was up
 *     fails its final check attempt.
 */
int check_host(host *hst,int propagation_options){
	int result=HOST_UP;
	int parent_result=HOST_UP;
	host *parent_host=NULL;
	hostsmember *temp_hostsmember=NULL;
	host *child_host=NULL;
	int return_result=HOST_UP;
	int max_check_attempts=1;
	int route_blocked=TRUE;

#ifdef DEBUG0
	printf("check_host() start\n");
#endif

	/* make sure we return the original host state unless it changes... */
	return_result=hst->status;

	/* if the host is already down or unreachable... */
	if(hst->status!=HOST_UP){

		/* how many times should we retry checks for this host? */
		/* (a host that is already in a problem state is only checked once unless aggressive checking is on) */
		if(use_aggressive_host_checking==FALSE)
			max_check_attempts=1;
		else
			max_check_attempts=hst->max_attempts;

		/* retry the host check as many times as necessary or allowed... */
		for(hst->current_attempt=1;hst->current_attempt<=max_check_attempts;hst->current_attempt++){
			
			/* check the host */
			result=run_host_check(hst);

			/* the host recovered from a hard problem... */
			if(result==HOST_UP){

				return_result=HOST_UP;

				/* handle the hard host recovery */
				handle_host_state(hst,HOST_UP,HARD_STATE);

				/* propagate the host recovery upwards (parents should not be down) */
				if(propagation_options & PROPAGATE_TO_PARENT_HOSTS){

					/* propagate to all parent hosts */
					for(temp_hostsmember=hst->parent_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){

						/* find the parent host */
						parent_host=find_host(temp_hostsmember->host_name,NULL);

						/* check the parent host (and propagate upwards) if its not up */
						if(parent_host!=NULL && parent_host->status!=HOST_UP)
							check_host(parent_host,PROPAGATE_TO_PARENT_HOSTS);
					        }
				        }

				/* propagate the host recovery downwards (children may or may not be down) */
				if(propagation_options & PROPAGATE_TO_CHILD_HOSTS){

					/* check all child hosts... */
					for(child_host=host_list;child_host!=NULL;child_host=child_host->next){

						/* if this is a child of the host, check it if it is not marked as UP */
						if(is_host_immediate_child_of_host(hst,child_host)==TRUE && child_host->status!=HOST_UP)
						        check_host(child_host,PROPAGATE_TO_CHILD_HOSTS);
					        }
				        }

				break;
			        }

		        }

		/* if the host isn't up and its currently marked as being unreachable, make absolutely sure it isn't down now. */
		/* to do this we have to check the (saved) status of all parent hosts.  this situation can occur if a host is */
		/* unreachable, its parent recovers, but it does not return to an UP state.  Even though it is not up, the host */
		/* has changed from an unreachable to a down state */
		/* NOTE(review): return_result is not updated on this path, so the caller still gets the status seen on entry - confirm this is intended */

		if(hst->status==HOST_UNREACHABLE && result!=HOST_UP){

			/* check all parent hosts */
			for(temp_hostsmember=hst->parent_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){

				/* find the parent host */
				parent_host=find_host(temp_hostsmember->host_name,NULL);

				/* a parent that cannot be found is skipped rather than dereferenced - it gives us no saved status to look at */
				/* if at least one parent host is up, this host is no longer unreachable - it is now down instead */
				if(parent_host!=NULL && parent_host->status==HOST_UP){
					
					/* handle the hard host state change */
					handle_host_state(hst,HOST_DOWN,HARD_STATE);

					break;
				        }
			        }
		        }

		/* reset the current attempt if we just went over... */
		if(hst->current_attempt>hst->max_attempts)
			hst->current_attempt=hst->max_attempts;
	        }


	/* else the host is supposed to be up right now... */
	else{

		for(hst->current_attempt=1;hst->current_attempt<=hst->max_attempts;hst->current_attempt++){

			/* run the host check */
			result=run_host_check(hst);

			/* if this is the last check and we still haven't had a recovery, check the parents and children and then handle the hard host state */
			if(result!=HOST_UP && (hst->current_attempt==hst->max_attempts)){

				/* check all parent hosts */
				for(temp_hostsmember=hst->parent_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){

					/* find the parent host */
					parent_host=find_host(temp_hostsmember->host_name,NULL);

					/* check the parent host, assume its up if we can't find it, use the parent host's "old" status if we shouldn't propagate */
					if(parent_host==NULL)
						parent_result=HOST_UP;
					else if(propagation_options & PROPAGATE_TO_PARENT_HOSTS)
						parent_result=check_host(parent_host,PROPAGATE_TO_PARENT_HOSTS);
					else
						parent_result=parent_host->status;

					/* if this parent host was up, the route is okay */
					if(parent_result==HOST_UP)
						route_blocked=FALSE;

					/* we could break out of this loop once we've found one parent host that is up, but we won't because we want
					   immediate notification of state changes (i.e. recoveries) for parent hosts */
				        }

			        /* if this host has at least one parent host and the route to this host is blocked, it is unreachable */
				if(route_blocked==TRUE && hst->parent_hosts!=NULL)
					return_result=HOST_UNREACHABLE;

			        /* else the parent host is up (or there isn't a parent host), so this host must be down */
				else
					return_result=HOST_DOWN;

				/* handle the hard host state (whether it is DOWN or UNREACHABLE) */
				handle_host_state(hst,return_result,HARD_STATE);

				/* propagate the host problem to all child hosts (they should be unreachable now unless they have multiple parents) */
				if(propagation_options & PROPAGATE_TO_CHILD_HOSTS){

					/* check all child hosts... */
					for(child_host=host_list;child_host!=NULL;child_host=child_host->next){

						/* if this is a child of the host, check it if it is not marked as UP */
						if(is_host_immediate_child_of_host(hst,child_host)==TRUE && child_host->status!=HOST_UP)
						        check_host(child_host,PROPAGATE_TO_CHILD_HOSTS);
					        }
				        }
			        }

			/* handle any soft states (failed checks before the last attempt, or a recovery on any attempt after the first) */
			else if(result!=HOST_UP || (result==HOST_UP && hst->current_attempt!=1))
				handle_host_state(hst,result,SOFT_STATE);

			/* the host recovered (or it was never down), so break out of the check loop */
			if(result==HOST_UP){

				/* this host hasn't had a state change yet... */
				if(hst->last_state_change==(time_t)0){

					/* initialize the last state change time */
					hst->last_state_change=hst->last_check;

					/* update the status log with the current host info */
					update_host_status(hst,FALSE);
				        }
				break;
			        }
	                }


		/* adjust the current check number if we exceeded the max count */
		if(hst->current_attempt>hst->max_attempts)
			hst->current_attempt=hst->max_attempts;
	        }


#ifdef DEBUG3
	printf("\tHost Check Result: Host '%s' is ",hst->name);
	if(return_result==HOST_UP)
		printf("UP\n");
	else if(return_result==HOST_DOWN)
		printf("DOWN\n");
	else if(return_result==HOST_UNREACHABLE)
		printf("UNREACHABLE\n");
#endif

#ifdef DEBUG0
	printf("check_host() end\n");
#endif
	
	return return_result;
        }




/*
 * run an "alive" check on a host
 *
 * Expands the macros in the host's check command, runs it through my_system()
 * with host_check_timeout, stores the (cleaned up) plugin output in
 * hst->plugin_output and records the time of the check in hst->last_check.
 *
 * hst - the host to check
 *
 * Returns:
 *   hst->status - if checks are disabled for the host, or if the plugin output
 *                 buffer could not be allocated (no check is run in either case)
 *   HOST_UP     - if the host has no check command, if the command cannot be
 *                 found, or if the check returned STATE_OK (or STATE_WARNING
 *                 while aggressive host checking is disabled)
 *   HOST_DOWN   - for any other check result
 */
int run_host_check(host *hst){
	int result=STATE_OK;
	int return_result=HOST_UP;
	char processed_check_command[MAX_INPUT_BUFFER];
	char raw_check_command[MAX_INPUT_BUFFER];
	char temp_buffer[MAX_INPUT_BUFFER];
	command *temp_command;
	time_t current_time;
	char *temp_ptr;
	int early_timeout=FALSE;

		

#ifdef DEBUG0
	printf("run_host_check() start\n");
#endif

	/* if checks are disabled, just return the last host state */
	if(hst->checks_enabled==FALSE)
		return hst->status;

	/* if there is no host check command, just return with no error */
	if(hst->host_check_command==NULL){

#ifdef DEBUG3
		printf("\tNo host check command specified, so no check will be done (host assumed to be up)!\n");
#endif

		return HOST_UP;
	        }

	/* find the command we use to check the host */
	temp_command=find_command(hst->host_check_command,NULL);

	/* if we couldn't find the command, no check can be run, so the host is assumed to be up */
	if(temp_command==NULL){

#ifdef DEBUG3
		printf("\tCouldn't find the host check command!\n");
#endif

		return HOST_UP;
	        }

	/* grab the host macros */
	clear_volatile_macros();
	grab_host_macros(hst);

	/* get the last host check time */
	time(&current_time);
	hst->last_check=current_time;

	/* get the raw command line */
	strncpy(raw_check_command,temp_command->command_line,sizeof(raw_check_command));
	raw_check_command[sizeof(raw_check_command)-1]='\x0';

	/* process any macros in the check command */
	process_macros(raw_check_command,&processed_check_command[0],(int)sizeof(processed_check_command));

			
#ifdef DEBUG3
	printf("\t\tRaw Command: %s\n",hst->host_check_command);
	printf("\t\tProcessed Command: %s\n",processed_check_command);
#endif

	/* allocate new plugin output buffer if it isn't already allocated */
	if(hst->plugin_output==NULL){

		hst->plugin_output=(char *)malloc(MAX_PLUGINOUTPUT_LENGTH);

		/* without an output buffer the string handling below would dereference NULL, so bail out and keep the last known state */
		if(hst->plugin_output==NULL){
			snprintf(temp_buffer,sizeof(temp_buffer)-1,"Warning: Could not allocate memory for the plugin output of host '%s', so the host check was not run\n",hst->name);
			temp_buffer[sizeof(temp_buffer)-1]='\x0';
			write_to_logs_and_console(temp_buffer,NSLOG_RUNTIME_WARNING,TRUE);
			return hst->status;
		        }

		/* start with an empty string so we never read uninitialized memory if the check writes no output */
		hst->plugin_output[0]='\x0';
	        }

	/* run the host check command */
	result=my_system(processed_check_command,host_check_timeout,&early_timeout,hst->plugin_output,MAX_PLUGINOUTPUT_LENGTH-1);

	/* if the check timed out, report an error */
	if(early_timeout==TRUE){
		snprintf(hst->plugin_output,MAX_PLUGINOUTPUT_LENGTH-1,"Host check timed out after %d seconds\n",host_check_timeout);
		hst->plugin_output[MAX_PLUGINOUTPUT_LENGTH-1]='\x0';

		/* log the timeout */
		snprintf(temp_buffer,sizeof(temp_buffer)-1,"Warning: Host check command '%s' for host '%s' timed out after %d seconds\n",hst->host_check_command,hst->name,host_check_timeout);
		temp_buffer[sizeof(temp_buffer)-1]='\x0';
		write_to_logs_and_console(temp_buffer,NSLOG_RUNTIME_WARNING,TRUE);
	        }

	/* make sure the output isn't empty */
	strip(hst->plugin_output);
	if(!strcmp(hst->plugin_output,"")){
		strncpy(hst->plugin_output,"(No Information Returned From Check)",MAX_PLUGINOUTPUT_LENGTH-1);
		hst->plugin_output[MAX_PLUGINOUTPUT_LENGTH-1]='\x0';
	        }

	/* replace semicolons in plugin output with colons */
	temp_ptr=hst->plugin_output;
	while((temp_ptr=strchr(temp_ptr,';')))
	      *temp_ptr=':';


	/* if we're not doing aggressive host checking, let WARNING states indicate the host is up (fake the result to be STATE_OK) */
	if(use_aggressive_host_checking==FALSE && result==STATE_WARNING)
		result=STATE_OK;


	/* anything other than an OK result means the host is considered down */
	if(result==STATE_OK)
		return_result=HOST_UP;
	else
		return_result=HOST_DOWN;


#ifdef DEBUG3
	printf("\tHost Check Result: Host '%s' is ",hst->name);
	if(return_result==HOST_UP)
		printf("UP\n");
	else
		printf("DOWN\n");
#endif

#ifdef DEBUG0
	printf("run_host_check() end\n");
#endif
	
	return return_result;
        }


