This is the mail archive of the cluster-cvs@sourceware.org mailing list for the cluster project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

cluster: STABLE2 - cman: drastically improve startup errors


Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=d036b16a47e93506c80dfa3c48c4f609f5d9338a
Commit:        d036b16a47e93506c80dfa3c48c4f609f5d9338a
Parent:        5a0d236cbce8e0bc280d72215dc3b046dd60cf8b
Author:        Christine Caulfield <ccaulfie@redhat.com>
AuthorDate:    Thu Dec 18 14:40:09 2008 +0000
Committer:     Christine Caulfield <ccaulfie@redhat.com>
CommitterDate: Thu Dec 18 14:40:09 2008 +0000

cman: drastically improve startup errors

cman_tool join has a nasty habit of just exiting with
"corosync failed to start" or some such unhelpful error message.

This patch improves on these by trapping the corosync exit code and
attempting to interpret it for the user.

Signed-off-by: Christine Caulfield <ccaulfie@redhat.com>
---
 cman/cman_tool/join.c |  115 ++++++++++++++++++++++++++++++++++++++-----------
 1 files changed, 90 insertions(+), 25 deletions(-)

diff --git a/cman/cman_tool/join.c b/cman/cman_tool/join.c
index 0218375..9635d4f 100644
--- a/cman/cman_tool/join.c
+++ b/cman/cman_tool/join.c
@@ -38,6 +38,76 @@ static void be_daemon(int close_stderr)
 	setsid();
 }
 
+static char *aisexec_exit_reason(signed char status)
+{
+	static char reason[256];
+	switch (status) {
+	case -2:
+		return "Could not determine UID to run as";
+		break;
+	case -3:
+		return "Could not determine GID to run as";
+		break;
+	case -4:
+		return "Error initialising memory pool";
+		break;
+	case -5:
+		return "Could not fork";
+		break;
+	case -6:
+		return "Could not bind to libais socket";
+		break;
+	case -7:
+		return "Could not bind to network socket";
+		break;
+	case -8:
+		return "Could not read security key for communications";
+		break;
+	case -9:
+		return "Could not read cluster configuration";
+		break;
+	case -10:
+		return "Could not set up logging";
+		break;
+	case -11:
+		return "Could not dynamically load modules";
+		break;
+	case -12:
+		return "Could not load and initialise object database";
+		break;
+	case -13:
+		return "Could not initialise all required services";
+		break;
+	case -14:
+		return "Out of memory";
+		break;
+	default:
+		sprintf(reason, "Error, reason code is %d", status);
+		return reason;
+		break;
+	}
+}
+
+static int check_aisexec_status(pid_t pid)
+{
+	int status;
+	int pidstatus;
+
+	status = waitpid(pid, &pidstatus, WNOHANG);
+	if (status == -1 && errno == ECHILD) {
+
+		return 0;
+	}
+	if ((status == 0 || status == pid) && pidstatus != 0) {
+		if (WIFEXITED(pidstatus))
+			fprintf(stderr, "aisexec died: %s\n", aisexec_exit_reason(WEXITSTATUS(pidstatus)));
+		if (WIFSIGNALED(pidstatus))
+			fprintf(stderr, "aisexec died with signal: %d\n", WTERMSIG(pidstatus));
+		exit(1);
+	}
+	return status;
+}
+
 int join(commandline_t *comline)
 {
 	int i;
@@ -120,9 +190,10 @@ int join(commandline_t *comline)
 	envp[envptr++] = strdup(scratch);
 	envp[envptr++] = NULL;
 
+	/* Always run aisexec -f because we have already forked twice anyway, and
+	   we want to return any exit code that might happen */
 	argv[0] = "aisexec";
-	if (comline->verbose & ~DEBUG_STARTUP_ONLY)
-		argv[++argvptr] = "-f";
+	argv[++argvptr] = "-f";
 	argv[++argvptr] = NULL;
 
 	/* Fork/exec cman */
@@ -144,6 +215,10 @@ int join(commandline_t *comline)
 			}
 		}
 		be_daemon(!(comline->verbose & ~DEBUG_STARTUP_ONLY));
+
+		sprintf(scratch, "FORKED: %d", getpid());
+		write(p[1], scratch, strlen(scratch));
+
 		execve(AISEXECBIN, argv, envp);
 
 		/* exec failed - tell the parent process */
@@ -173,43 +248,31 @@ int join(commandline_t *comline)
 
 		status = select(p[0]+1, &fds, NULL, NULL, &tv);
 
-		/* Did we get an error? */
+		/* Did we get a cman-reported error? */
 		if (status == 1) {
 			int len;
 			if ((len = read(p[0], message, sizeof(message)) > 0)) {
 
+				/* Forked OK - get the real aisexec pid */
+				if (sscanf(message, "FORKED: %d", &aisexec_pid) == 1) {
+					if (comline->verbose & DEBUG_STARTUP_ONLY)
+						fprintf(stderr, "forked process ID is %d\n", aisexec_pid);
+					status = 1;
+					continue;
+				}
 				/* Success! get the new PID of double-forked aisexec */
 				if (sscanf(message, "SUCCESS: %d", &aisexec_pid) == 1) {
 					if (comline->verbose & DEBUG_STARTUP_ONLY)
 						fprintf(stderr, "aisexec running, process ID is %d\n", aisexec_pid);
 					status = 0;
+					break;
 				}
-				else {
-					fprintf(stderr, "cman not started: %s\n", message);
-				}
-				break;
 			}
 			else if (len < 0 && errno == EINTR) {
 				continue;
 			}
 			else { /* Error or EOF - check the child status */
-				int pidstatus;
-				status = waitpid(aisexec_pid, &pidstatus, WNOHANG);
-				if (status == -1 && errno == ECHILD) {
-					fprintf(stderr, "cman not started\n");
-					break;
-				}
-				if (status == 0 && pidstatus != 0) {
-					if (WIFEXITED(pidstatus))
-						fprintf(stderr, "aisexec died with status: %d\n", WEXITSTATUS(pidstatus));
-					if (WIFSIGNALED(pidstatus))
-						fprintf(stderr, "aisexec died with signal: %d\n", WTERMSIG(pidstatus));
-					status = -1;
-					break;
-				}
-				else {
-					status = 0; /* Try to connect */
-				}
+				status = check_aisexec_status(aisexec_pid);
 			}
 		}
 
@@ -221,13 +284,15 @@ int join(commandline_t *comline)
 		do {
 			if (status == 0) {
 				if (kill(aisexec_pid, 0) < 0) {
+					status = check_aisexec_status(aisexec_pid);
 					die("aisexec died during startup\n");
 				}
 
 				h = cman_admin_init(NULL);
 				if (!h && comline->verbose & DEBUG_STARTUP_ONLY)
 				{
-					fprintf(stderr, "waiting for aisexec to start\n");
+					fprintf(stderr, "waiting for cman to start\n");
+					status = check_aisexec_status(aisexec_pid);
 				}
 			}
 			sleep (1);


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]