This is the mail archive of the cluster-cvs@sourceware.org mailing list for the cluster project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

master - fence_tool: new option to delay before join


Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=809e1e9fa79b4bf003fc137b2a8291e709d03b89
Commit:        809e1e9fa79b4bf003fc137b2a8291e709d03b89
Parent:        32849ba0f7e022ca5de30d043de5fe8c8c7ab982
Author:        David Teigland <teigland@redhat.com>
AuthorDate:    Wed Aug 27 14:08:07 2008 -0500
Committer:     David Teigland <teigland@redhat.com>
CommitterDate: Wed Aug 27 14:08:07 2008 -0500

fence_tool: new option to delay before join

bz 460190

Certain network/switch settings cause nodes to form partitioned clusters
when they start up.  Add code to better cope with these initial partitions.
The network partitions are a particular problem for two_node clusters where
a node has quorum when it starts up on its own.

This adds a new fence_tool option -m, e.g. fence_tool join -m <seconds>.
It causes fence_tool to delay the join by up to <seconds> to allow all
nodes in cluster.conf to become cluster members.

This allows openais on all the nodes to see each other before the fence
domain is started. So we join the domain *after* the nodes merge into a
single cluster.  If we joined the domain *before* the cluster partition
merged, then nodes end up being fenced unnecessarily.  (This is a similar
idea to post_join_delay; a delay that gives us time to determine that a
node in an unknown state is actually ok and doesn't require fencing.)

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fence/fence_tool/fence_tool.c |  169 +++++++++++++++++++++++++++++++++-------
 fence/man/fence_tool.8        |    7 +-
 2 files changed, 144 insertions(+), 32 deletions(-)

diff --git a/fence/fence_tool/fence_tool.c b/fence/fence_tool/fence_tool.c
index 95f4ba1..8e4040b 100644
--- a/fence/fence_tool/fence_tool.c
+++ b/fence/fence_tool/fence_tool.c
@@ -27,20 +27,28 @@
 
 #define DEFAULT_WAIT_TIMEOUT		300 /* five minutes */
 
-#define die(fmt, args...) \
-do { \
-	fprintf(stderr, "%s: ", prog_name); \
-	fprintf(stderr, fmt "\n", ##args); \
-	exit(EXIT_FAILURE); \
-} while (0)
+#define MAX_NODES			128
 
+int all_nodeids[MAX_NODES];
+int all_nodeids_count;
+cman_node_t cman_nodes[MAX_NODES];
+int cman_nodes_count;
+struct fenced_node nodes[MAX_NODES];
 char *prog_name;
 int operation;
 int verbose = 0;
 int inquorate_fail = 0;
 int wait_join = 0;			 /* default: don't wait for join */
 int wait_leave = 0;			 /* default: don't wait for leave */
-int wait_timeout = DEFAULT_WAIT_TIMEOUT; /* applies to all waits */
+int wait_members = 0;			 /* default: don't wait for members */
+int wait_timeout = DEFAULT_WAIT_TIMEOUT;
+
+#define die(fmt, args...) \
+do { \
+	fprintf(stderr, "%s: ", prog_name); \
+	fprintf(stderr, fmt "\n", ##args); \
+	exit(EXIT_FAILURE); \
+} while (0)
 
 static int do_write(int fd, void *buf, size_t count)
 {
@@ -116,7 +124,7 @@ static int we_are_in_fence_domain(void)
 	return 0;
 }
 
-static void do_wait(int joining)
+static void wait_domain(int joining)
 {
 	int in, tries = 0;
 
@@ -144,10 +152,65 @@ static void do_wait(int joining)
 	printf("Error %s the fence group.\n", joining ? "joining" : "leaving");
 }
 
-static void wait_quorum(void)
+static void read_ccs_nodeids(int cd)
+{
+	char path[PATH_MAX];
+	char *nodeid_str;
+	int i, error;
+
+	memset(all_nodeids, 0, sizeof(all_nodeids));
+	all_nodeids_count = 0;
+
+	for (i = 1; ; i++) {
+		nodeid_str = NULL;
+		memset(path, 0, sizeof(path));
+		sprintf(path, "/cluster/clusternodes/clusternode[%d]/@nodeid", i);
+
+		error = ccs_get(cd, path, &nodeid_str);
+		if (error || !nodeid_str)
+			break;
+
+		all_nodeids[all_nodeids_count++] = atoi(nodeid_str);
+		free(nodeid_str);
+	}
+}
+
+static int all_nodeids_are_members(cman_handle_t ch)
+{
+	int i, j, rv, found;
+
+	memset(&cman_nodes, 0, sizeof(cman_nodes));
+	cman_nodes_count = 0;
+
+	rv = cman_get_nodes(ch, MAX_NODES, &cman_nodes_count, cman_nodes);
+	if (rv < 0) {
+		printf("cman_get_nodes error %d %d\n", rv, errno);
+		return 0;
+	}
+
+	for (i = 0; i < all_nodeids_count; i++) {
+		found = 0;
+
+		for (j = 0; j < cman_nodes_count; j++) {
+			if (cman_nodes[j].cn_nodeid == all_nodeids[i] &&
+			    cman_nodes[j].cn_member) {
+				found = 1;
+				break;
+			}
+		}
+
+		if (!found)
+			return 0;
+	}
+	return 1;
+}
+
+static void wait_cman(void)
 {
 	cman_handle_t ch;
-	int rv, try_init = 0, try_active = 0, try_quorate = 0;
+	int try_init = 0, try_active = 0, try_quorate = 0;
+	int try_ccs = 0, try_members = 0;
+	int rv, cd;
 
 	while (1) {
 		ch = cman_init(NULL);
@@ -157,8 +220,11 @@ static void wait_quorum(void)
 		if (inquorate_fail)
 			goto fail;
 
-		if (try_init++ >= wait_timeout)
-			goto fail_err;
+		if (try_init++ >= wait_timeout) {
+			printf("%s: timed out waiting for cman init\n",
+			       prog_name);
+			goto fail;
+		}
 
 		if (!(try_init % 10))
 			printf("%s: waiting for cman to start\n", prog_name);
@@ -174,12 +240,14 @@ static void wait_quorum(void)
 		if (inquorate_fail)
 			goto fail;
 
-		if (try_active++ >= wait_timeout)
-			goto fail_err;
+		if (try_active++ >= wait_timeout) {
+			printf("%s: timed out waiting for cman active\n",
+			       prog_name);
+			goto fail;
+		}
 
 		if (!(try_active % 10))
-			printf("%s: waiting for cman to be active\n",prog_name);
-
+			printf("%s: waiting for cman active\n", prog_name);
 		sleep(1);
 	}
 
@@ -191,22 +259,61 @@ static void wait_quorum(void)
 		if (inquorate_fail)
 			goto fail;
 
-		if (try_quorate++ >= wait_timeout)
-			goto fail_err;
+		if (try_quorate++ >= wait_timeout) {
+			printf("%s: timed out waiting for cman quorum\n",
+			       prog_name);
+			goto fail;
+		}
 
 		if (!(try_quorate % 10))
-			printf("%s: waiting for cluster quorum\n", prog_name);
+			printf("%s: waiting for cman quorum\n", prog_name);
+
+		sleep(1);
+	}
+
+	while (1) {
+		cd = ccs_connect();
+		if (cd > 0)
+			break;
+
+		if (try_ccs++ >= wait_timeout) {
+			printf("%s: timed out waiting for ccs connect\n",
+			       prog_name);
+			goto fail;
+		}
+
+		if (!(try_ccs % 10))
+			printf("%s: waiting for ccs connect\n", prog_name);
 
 		sleep(1);
 	}
 
+	if (!wait_members)
+		goto out;
+	read_ccs_nodeids(cd);
+
+	while (1) {
+		rv = all_nodeids_are_members(ch);
+		if (rv)
+			break;
+
+		if (try_members++ >= wait_members)
+			break;
+
+		if (!(try_members % 10))
+			printf("%s: waiting for all %d nodes to be members\n",
+			       prog_name, all_nodeids_count);
+		sleep(1);
+	}
+
+ out:
+	ccs_disconnect(cd);
 	cman_finish(ch);
 	return;
 
- fail_err:
-	printf("%s: Timed out waiting for cluster quorum to form.\n",
-	       prog_name);
  fail:
+	if (ch)
+		cman_finish(ch);
 	exit(EXIT_FAILURE);
 }
 
@@ -214,14 +321,14 @@ static void do_join(int argc, char *argv[])
 {
 	int rv;
 
-	wait_quorum();
+	wait_cman();
 
 	rv = fenced_join();
 	if (rv < 0)
 		die("can't communicate with fenced");
 
 	if (wait_join)
-		do_wait(1);
+		wait_domain(1);
 
 	exit(EXIT_SUCCESS);
 }
@@ -237,7 +344,7 @@ static void do_leave(void)
 		die("can't communicate with fenced");
 
 	if (wait_leave)
-		do_wait(0);
+		wait_domain(0);
 
 	exit(EXIT_SUCCESS);
 }
@@ -264,10 +371,6 @@ static int node_compare(const void *va, const void *vb)
 	return a->nodeid - b->nodeid;
 }
 
-#define MAX_NODES 128
-
-struct fenced_node nodes[MAX_NODES];
-
 static int do_list(void)
 {
 	struct fenced_domain d;
@@ -346,6 +449,8 @@ static void print_usage(void)
 	printf("  dump		   Dump debug buffer from fenced\n");
 	printf("\n");
 	printf("Options:\n");
+	printf("  -m <seconds>     Delay join up to <seconds> for all nodes in cluster.conf\n");
+	printf("                   to be cluster members\n");
 	printf("  -w               Wait for join or leave to complete\n");
 	printf("  -t <seconds>     Maximum time in seconds to wait (default %d)\n", DEFAULT_WAIT_TIMEOUT);
 	printf("  -Q               Fail if cluster is not quorate, don't wait\n");
@@ -354,7 +459,7 @@ static void print_usage(void)
 	printf("\n");
 }
 
-#define OPTION_STRING "vVht:wQ"
+#define OPTION_STRING "vVht:wQm:"
 
 static void decode_arguments(int argc, char *argv[])
 {
@@ -391,6 +496,10 @@ static void decode_arguments(int argc, char *argv[])
 			wait_leave = 1;
 			break;
 
+		case 'm':
+			wait_members = atoi(optarg);
+			break;
+
 		case 't':
 			wait_timeout = get_int_arg(optchar, optarg);
 			break;
diff --git a/fence/man/fence_tool.8 b/fence/man/fence_tool.8
index a83da94..625fbe0 100644
--- a/fence/man/fence_tool.8
+++ b/fence/man/fence_tool.8
@@ -20,6 +20,9 @@ it to stdout.
 
 .SH OPTIONS
 .TP
+\fB-m\fP <n>
+Delay join up to n seconds for all nodes in cluster.conf to be cluster members.
+.TP
 \fB-w\fP
 Wait until the join or leave is completed.
 .TP
@@ -29,8 +32,8 @@ Help.  Print out the usage syntax.
 \fB-V\fP
 Print version information.
 .TP
-\fB-t\fP
-Maximum time in seconds to wait (default: 300 seconds)
+\fB-t\fP <n>
+Maximum time in seconds to wait for quorum or -w (default: 300 seconds)
 .TP
 \fB-Q\fP
 Fail command immediately if the cluster is not quorate, don't wait.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]