This is the mail archive of the cluster-cvs@sourceware.org mailing list for the cluster.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

The tag: gfs_0_1_18 has been created


Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=c91f6ca13c5b67b9b111c50d4455fe0110b52274
Commit:        c91f6ca13c5b67b9b111c50d4455fe0110b52274
Parent:        86753221e88e910d57429e06c4aa3e6295d8d4f2
Author:        David Teigland <teigland@redhat.com>
AuthorDate:    Thu Jul 31 15:55:54 2008 -0500
Committer:     David Teigland <teigland@redhat.com>
CommitterDate: Thu Jul 31 15:55:54 2008 -0500

fence_tool: add domain member checks

using libfenced library.  Also clean up the code and logic for waiting
and timeouts.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fence/fence_tool/fence_tool.c |  260 ++++++++++++++++++++---------------------
 fence/fenced/main.c           |    9 +-
 fence/libfenced/libfenced.h   |    3 +
 3 files changed, 138 insertions(+), 134 deletions(-)

diff --git a/fence/fence_tool/fence_tool.c b/fence/fence_tool/fence_tool.c
index ded7eda..95f4ba1 100644
--- a/fence/fence_tool/fence_tool.c
+++ b/fence/fence_tool/fence_tool.c
@@ -25,23 +25,22 @@
 #define OP_LIST				3
 #define OP_DUMP				4
 
+#define DEFAULT_WAIT_TIMEOUT		300 /* five minutes */
+
 #define die(fmt, args...) \
-do \
-{ \
-  fprintf(stderr, "%s: ", prog_name); \
-  fprintf(stderr, fmt "\n", ##args); \
-  exit(EXIT_FAILURE); \
-} \
-while (0)
+do { \
+	fprintf(stderr, "%s: ", prog_name); \
+	fprintf(stderr, fmt "\n", ##args); \
+	exit(EXIT_FAILURE); \
+} while (0)
 
 char *prog_name;
 int operation;
 int verbose = 0;
-int child_wait = 0;
-int quorum_wait = 1;
-int fenced_start_timeout = 300; /* five minutes */
-int signalled = 0;
-cman_handle_t ch;
+int inquorate_fail = 0;
+int wait_join = 0;			 /* default: don't wait for join */
+int wait_leave = 0;			 /* default: don't wait for leave */
+int wait_timeout = DEFAULT_WAIT_TIMEOUT; /* applies to all waits */
 
 static int do_write(int fd, void *buf, size_t count)
 {
@@ -62,25 +61,6 @@ static int do_write(int fd, void *buf, size_t count)
 	return 0;
 }
 
-#if 0
-static int do_read(int fd, void *buf, size_t count)
-{
-	int rv, off = 0;
-
-	while (off < count) {
-		rv = read(fd, buf + off, count - off);
-		if (rv == 0)
-			return -1;
-		if (rv == -1 && errno == EINTR)
-			continue;
-		if (rv == -1)
-			return -1;
-		off += rv;
-	}
-	return 0;
-}
-#endif
-
 static int get_int_arg(char argopt, char *arg)
 {
 	char *tmp;
@@ -120,119 +100,133 @@ static int check_mounted(void)
 	return 0;
 }
 
-static void sigalarm_handler(int sig)
-{
-	signalled = 1;
-}
-
 static int we_are_in_fence_domain(void)
 {
-#if 0
-	group_data_t gdata;
+	struct fenced_node nodeinfo;
 	int rv;
 
-	memset(&gdata, 0, sizeof(gdata));
-	rv = group_get_group(0, "default", &gdata);
+	memset(&nodeinfo, 0, sizeof(nodeinfo));
 
-	if (rv || strcmp(gdata.client_name, "fence"))
+	rv = fenced_node_info(FENCED_NODEID_US, &nodeinfo);
+	if (rv < 0)
 		return 0;
 
-	return gdata.member;
-#endif
-	printf("FIXME: use libfenced:fenced_domain_members()\n");
-	return 1;
+	if (nodeinfo.member)
+		return 1;
+	return 0;
 }
 
-/*
- * We wait for the cluster to be quorate in this program because it's easy to
- * kill this program if we want to quit waiting.  If we just started fenced
- * without waiting for quorum, fenced's join would then wait for quorum in SM
- * but we can't kill/cancel it at that point -- we have to wait for it to
- * complete.
- *
- * A second reason to wait for quorum is that the unfencing step involves
- * cluster.conf lookups through ccs, but ccsd may wait for the cluster to be
- * quorate before responding to the lookups.  There wouldn't be a problem
- * blocking there per se, but it's cleaner I think to just wait here first.
- *
- * In the case where we're leaving, we want to wait for quorum because if we go
- * ahead and shut down fenced, the fence domain leave will block in SM where it
- * will wait for quorum before the leave can be processed.  We can't
- * kill/cancel the leave at that point, but we can if we're waiting here.
- *
- * Waiting here doesn't guarantee we won't end up blocking in SM on the join or
- * leave, but it avoids it in some common cases which can be helpful.  (Quorum
- * could easily be lost between the time we wait for it here and then begin the
- * join/leave process.)
- */
-
-static int check_quorum(void)
+static void do_wait(int joining)
 {
-	int rv = 0, i = 0;
+	int in, tries = 0;
 
-	while (!signalled) {
-		rv = cman_is_quorate(ch);
-		if (rv)
-			return 1;
-		else if (!quorum_wait)
-			return 0;
+	while (1) {
+		in = we_are_in_fence_domain();
 
-		sleep(1);
+		if (joining && in)
+			break;
 
-		if (!signalled && ++i > 9 && !(i % 10))
-			printf("%s: waiting for cluster quorum\n", prog_name);
+		if (!joining && !in)
+			break;
+
+		if (tries++ >= wait_timeout)
+			goto fail;
+
+		if (!(tries % 5))
+			printf("Waiting for fenced to %s the fence group.\n",
+			       joining ? "join" : "leave");
+
+		sleep(1);
 	}
 
-	errno = ETIMEDOUT;
-	return 0;
+	return;
+ fail:
+	printf("Error %s the fence group.\n", joining ? "joining" : "leaving");
 }
 
-static int do_wait(int joining)
+static void wait_quorum(void)
 {
-	int i;
+	cman_handle_t ch;
+	int rv, try_init = 0, try_active = 0, try_quorate = 0;
+
+	while (1) {
+		ch = cman_init(NULL);
+		if (ch)
+			break;
+
+		if (inquorate_fail)
+			goto fail;
+
+		if (try_init++ >= wait_timeout)
+			goto fail_err;
+
+		if (!(try_init % 10))
+			printf("%s: waiting for cman to start\n", prog_name);
 
-	for (i=0; !fenced_start_timeout || i < fenced_start_timeout; i++) {
-		if (we_are_in_fence_domain() == joining)
-			return 0;
-		if (i && !(i % 5))
-			printf("Waiting for fenced to %s the fence group.\n",
-				   (joining?"join":"leave"));
 		sleep(1);
 	}
-	printf("Error joining the fence group.\n");
-	return -1;
-}
 
-static int do_join(int argc, char *argv[])
-{
-	int rv;
+	while (1) {
+		rv = cman_is_active(ch);
+		if (rv)
+			break;
+
+		if (inquorate_fail)
+			goto fail;
+
+		if (try_active++ >= wait_timeout)
+			goto fail_err;
 
-	ch = cman_init(NULL);
+		if (!(try_active % 10))
+			printf("%s: waiting for cman to be active\n",prog_name);
 
-	if (fenced_start_timeout) {
-		signal(SIGALRM, sigalarm_handler);
-		alarm(fenced_start_timeout);
+		sleep(1);
 	}
 
-	if (!check_quorum()) {
-		if (errno == ETIMEDOUT)
-			printf("%s: Timed out waiting for cluster "
-			       "quorum to form.\n", prog_name);
-		cman_finish(ch);
-		return EXIT_FAILURE;
+	while (1) {
+		rv = cman_is_quorate(ch);
+		if (rv)
+			break;
+
+		if (inquorate_fail)
+			goto fail;
+
+		if (try_quorate++ >= wait_timeout)
+			goto fail_err;
+
+		if (!(try_quorate % 10))
+			printf("%s: waiting for cluster quorum\n", prog_name);
+
+		sleep(1);
 	}
+
 	cman_finish(ch);
+	return;
+
+ fail_err:
+	printf("%s: Timed out waiting for cluster quorum to form.\n",
+	       prog_name);
+ fail:
+	exit(EXIT_FAILURE);
+}
+
+static void do_join(int argc, char *argv[])
+{
+	int rv;
+
+	wait_quorum();
 
 	rv = fenced_join();
 	if (rv < 0)
 		die("can't communicate with fenced");
 
-	if (child_wait)
+	if (wait_join)
 		do_wait(1);
-	return EXIT_SUCCESS;
+
+	exit(EXIT_SUCCESS);
 }
 
-static int do_leave(void)
+static void do_leave(void)
 {
 	int rv;
 
@@ -242,12 +236,13 @@ static int do_leave(void)
 	if (rv < 0)
 		die("can't communicate with fenced");
 
-	if (child_wait)
+	if (wait_leave)
 		do_wait(0);
-	return EXIT_SUCCESS;
+
+	exit(EXIT_SUCCESS);
 }
 
-static int do_dump(void)
+static void do_dump(void)
 {
 	char buf[FENCED_DUMP_SIZE];
 	int rv;
@@ -258,7 +253,7 @@ static int do_dump(void)
 
 	do_write(STDOUT_FILENO, buf, sizeof(buf));
 
-	return 0;
+	exit(EXIT_SUCCESS);
 }
 
 static int node_compare(const void *va, const void *vb)
@@ -282,7 +277,7 @@ static int do_list(void)
 
 	rv = fenced_domain_info(&d);
 	if (rv < 0)
-		goto out;
+		goto fail;
 
 	printf("fence domain \"default\"\n");
 	printf("member_count %d master_nodeid %d victim_count %d current_victim %d state %d\n",
@@ -294,7 +289,7 @@ static int do_list(void)
 	rv = fenced_domain_nodes(FENCED_NODES_MEMBERS, MAX_NODES,
 				 &node_count, nodes);
 	if (rv < 0)
-		goto out;
+		goto fail;
 
 	qsort(&nodes, node_count, sizeof(struct fenced_node), node_compare);
 
@@ -308,7 +303,7 @@ static int do_list(void)
 	printf("\n");
 
 	if (!verbose)
-		return 0;
+		exit(EXIT_SUCCESS);
 
 	node_count = 0;
 	memset(&nodes, 0, sizeof(nodes));
@@ -316,7 +311,7 @@ static int do_list(void)
 	rv = fenced_domain_nodes(FENCED_NODES_ALL, MAX_NODES,
 				 &node_count, nodes);
 	if (rv < 0)
-		goto out;
+		goto fail;
 
 	qsort(&nodes, node_count, sizeof(struct fenced_node), node_compare);
 
@@ -332,10 +327,10 @@ static int do_list(void)
 				np->last_fenced_how);
 		np++;
 	}
-	return 0;
- out:
+	exit(EXIT_SUCCESS);
+ fail:
 	fprintf(stderr, "fenced query error %d\n", rv);
-	return rv;
+	exit(EXIT_FAILURE);
 }
 
 static void print_usage(void)
@@ -351,11 +346,11 @@ static void print_usage(void)
 	printf("  dump		   Dump debug buffer from fenced\n");
 	printf("\n");
 	printf("Options:\n");
-	printf("  -w               Wait for join to complete\n");
+	printf("  -w               Wait for join or leave to complete\n");
+	printf("  -t <seconds>     Maximum time in seconds to wait (default %d)\n", DEFAULT_WAIT_TIMEOUT);
+	printf("  -Q               Fail if cluster is not quorate, don't wait\n");
 	printf("  -V               Print program version information, then exit\n");
 	printf("  -h               Print this help, then exit\n");
-	printf("  -t               Maximum time in seconds to wait\n");
-	printf("  -Q               Fail if cluster is not quorate, don't wait\n");
 	printf("\n");
 }
 
@@ -388,11 +383,16 @@ static void decode_arguments(int argc, char *argv[])
 			break;
 
 		case 'Q':
-			quorum_wait = 0;
+			inquorate_fail = 1;
 			break;
 
 		case 'w':
-			child_wait = 1;
+			wait_join = 1;
+			wait_leave = 1;
+			break;
+
+		case 't':
+			wait_timeout = get_int_arg(optchar, optarg);
 			break;
 
 		case ':':
@@ -405,10 +405,6 @@ static void decode_arguments(int argc, char *argv[])
 			cont = 0;
 			break;
 
-		case 't':
-			fenced_start_timeout = get_int_arg(optchar, optarg);
-			break;
-
 		default:
 			die("unknown option: %c\n", optchar);
 			break;
@@ -441,13 +437,13 @@ int main(int argc, char *argv[])
 
 	switch (operation) {
 	case OP_JOIN:
-		return do_join(argc, argv);
+		do_join(argc, argv);
 	case OP_LEAVE:
-		return do_leave();
+		do_leave();
 	case OP_DUMP:
-		return do_dump();
+		do_dump();
 	case OP_LIST:
-		return do_list();
+		do_list();
 	}
 
 	return EXIT_FAILURE;
diff --git a/fence/fenced/main.c b/fence/fenced/main.c
index 3c88197..6bfbb0e 100644
--- a/fence/fenced/main.c
+++ b/fence/fenced/main.c
@@ -316,11 +316,11 @@ static void query_dump_debug(int f)
 	do_write(f, dump_buf, len);
 }
 
-static void query_node_info(int f, int nodeid)
+static void query_node_info(int f, int data_nodeid)
 {
 	struct fd *fd;
 	struct fenced_node node;
-	int rv;
+	int nodeid, rv;
 
 	fd = find_fd("default");
 	if (!fd) {
@@ -328,6 +328,11 @@ static void query_node_info(int f, int nodeid)
 		goto out;
 	}
 
+	if (data_nodeid == FENCED_NODEID_US)
+		nodeid = our_nodeid;
+	else
+		nodeid = data_nodeid;
+
 	if (group_mode == GROUP_LIBGROUP)
 		rv = set_node_info_group(fd, nodeid, &node);
 	else
diff --git a/fence/libfenced/libfenced.h b/fence/libfenced/libfenced.h
index 056e4f2..30ea210 100644
--- a/fence/libfenced/libfenced.h
+++ b/fence/libfenced/libfenced.h
@@ -3,6 +3,9 @@
 
 #define FENCED_DUMP_SIZE (1024 * 1024)
 
+/* for querying local node info */
+#define FENCED_NODEID_US 0
+
 struct fenced_node {
 	int nodeid;
 	int member;


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]