This is the mail archive of the cluster-cvs@sourceware.org mailing list for the cluster.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
RHEL5 - clogd: Fix one cause of bug 464550 - copy percentage can get stuck during

From: Jonathan Brassow <jbrassow at fedoraproject dot org>
To: cluster-cvs-relay at redhat dot com
Date: Wed, 8 Oct 2008 21:13:10 +0000 (UTC)
Subject: RHEL5 - clogd: Fix one cause of bug 464550 - copy percentage can get stuck during
Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=027ab643fa43ab1b97724b5702f55bcbe619b12e
Commit:        027ab643fa43ab1b97724b5702f55bcbe619b12e
Parent:        b89bf045f70c2ce4c2534012a50e691190cbe50d
Author:        Jonathan Brassow <jbrassow@redhat.com>
AuthorDate:    Wed Oct 8 16:11:46 2008 -0500
Committer:     Jonathan Brassow <jbrassow@redhat.com>
CommitterDate: Wed Oct 8 16:11:46 2008 -0500

clogd:  Fix one cause of bug 464550 - copy percentage can get stuck during

If the cluster log gets a request from the kernel to shut down immediately
after it was created, then it must perform some clean-up.  This covers the
case where it is told to shut down after it has requested a checkpoint, but
before it has read/unlinked it.
---
 cmirror/src/cluster.c |  138 +++++++++++++++++++++++++++----------------------
 1 files changed, 77 insertions(+), 61 deletions(-)

diff --git a/cmirror/src/cluster.c b/cmirror/src/cluster.c
index 3ac4f87..8c49dee 100644
--- a/cmirror/src/cluster.c
+++ b/cmirror/src/cluster.c
@@ -59,13 +59,6 @@ static SaCkptHandleT ckpt_handle = 0;
 static SaCkptCallbacksT callbacks = { 0, 0 };
 static SaVersionT version = { 'B', 1, 1 };
 
-#define DEBUGGING_HISTORY 100
-static char debugging[DEBUGGING_HISTORY][128];
-static int idx = 0;
-static int memberz = 0;
-static int doit = 0;
-
-
 struct checkpoint_data {
 	uint32_t requester;
 	char uuid[CPG_MAX_NAME_LENGTH];
@@ -162,7 +155,7 @@ static int clog_tfr_cmp(struct clog_tfr *a, struct clog_tfr *b)
 	return r;
 }
 
-static int handle_cluster_request(struct clog_tfr *tfr, int server, int printz)
+static int handle_cluster_request(struct clog_tfr *tfr, int server)
 {
 	int r = 0;
 
@@ -184,11 +177,6 @@ static int handle_cluster_request(struct clog_tfr *tfr, int server, int printz)
 		/*
 		 * Errors from previous functions are in the tfr struct.
 		 */
-		if (printz)
-			LOG_DBG("[%s] Sending response to %u on cluster: [%s/%u]",
-				SHORT_UUID(tfr->uuid), tfr->originator,
-				RQ_TYPE(tfr->request_type & ~DM_CLOG_RESPONSE),
-				tfr->seq);
 		r = cluster_send(tfr);
 		if (r)
 			LOG_ERROR("cluster_send failed: %s", strerror(-r));
@@ -803,7 +791,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 							RQ_TYPE(startup_tfr->request_type));
 						i_was_server = (startup_tfr->error == my_cluster_id) ? 1 : 0;
 						startup_tfr->error = 0;
-						r = handle_cluster_request(startup_tfr, i_was_server, 1);
+						r = handle_cluster_request(startup_tfr, i_was_server);
 
 						if (r) {
 							LOG_ERROR("Error while processing delayed CPG message");
@@ -879,8 +867,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 			goto out;
 		}
 
-		r = handle_cluster_request(tfr, i_am_server,
-					   ((memberz != 4) || (--doit > 0)));
+		r = handle_cluster_request(tfr, i_am_server);
 	}
 
 out:
@@ -896,28 +883,6 @@ out:
 		if (response)
 			LOG_ERROR("[%s]    Responder : %u",
 				  SHORT_UUID(tfr->uuid), nodeid);
-		LOG_ERROR("HISTORY::");
-
-		for (i = 0; i < DEBUGGING_HISTORY; i++) {
-			idx++;
-			idx = idx % DEBUGGING_HISTORY;
-			if (debugging[idx][0] == '\0')
-				continue;
-			LOG_ERROR("%d:%d) %s", i, idx, debugging[idx]);
-		}
-	} else if (!(tfr->request_type & DM_CLOG_RESPONSE) ||
-		   (tfr->originator == my_cluster_id)) {
-		int len;
-		idx++;
-		idx = idx % DEBUGGING_HISTORY;
-		len = sprintf(debugging[idx],
-			      "SEQ#=%u, UUID=%s, TYPE=%s, ORIG=%u, RESP=%s",
-			      tfr->seq,
-			      SHORT_UUID(tfr->uuid),
-			      RQ_TYPE(tfr->request_type),
-			      tfr->originator, (response) ? "YES" : "NO");
-		if (response)
-			sprintf(debugging[idx] + len, ", RSPR=%u", nodeid);
 	}
 }
 
@@ -1076,18 +1041,22 @@ static void cpg_leave_callback(struct clog_cpg *match,
 		list_for_each_safe(p, n, &cluster_queue->list) {
 			tfr = (struct clog_tfr *)p;
 
-			/*
-			 * Don't resend DM_CLOG_POSTSUSPEND request, it will
-			 * be handled when we get our own config leave
-			 */
-			if (!strcmp(match->name.value, tfr->uuid) &&
-			    (tfr->request_type != DM_CLOG_POSTSUSPEND)){
-				LOG_PRINT("[%s] Resending %s due to new server(%u -> %u)",
-					  SHORT_UUID(match->name.value),
-					  RQ_TYPE(tfr->request_type),
-					  lowest, match->lowest_id);
-				if (cluster_send(tfr))
-					LOG_ERROR("Failed resend");
+			if (!strcmp(match->name.value, tfr->uuid)) {
+				switch (tfr->request_type) {
+				case DM_CLOG_POSTSUSPEND:
+					/*
+					 * Don't resend DM_CLOG_POSTSUSPEND request, it will
+					 * be handled when we get our own config leave
+					 */
+					break;
+				default:
+					LOG_PRINT("[%s] Resending %s due to new server(%u -> %u)",
+						  SHORT_UUID(match->name.value),
+						  RQ_TYPE(tfr->request_type),
+						  lowest, match->lowest_id);
+					if (cluster_send(tfr))
+						LOG_ERROR("Failed resend");
+				}
 			}
 		}
 	} else
@@ -1107,8 +1076,6 @@ static void cpg_config_callback(cpg_handle_t handle, struct cpg_name *gname,
 	struct clog_cpg *match, *tmp;
 	int found = 0;
 
-	memberz = member_list_entries;
-
 	list_for_each_entry_safe(match, tmp, &clog_cpg_list, list)
 		if (match->handle == handle) {
 			found = 1;
@@ -1130,9 +1097,6 @@ static void cpg_config_callback(cpg_handle_t handle, struct cpg_name *gname,
 	else
 		cpg_leave_callback(match, left_list,
 				  member_list, member_list_entries);
-
-	if (joined_list_entries && (joined_list[0].nodeid == my_cluster_id))
-		doit = 25;
 }
 
 cpg_callbacks_t cpg_callbacks = {
@@ -1201,6 +1165,61 @@ int create_cluster_cpg(char *str)
 	return 0;
 }
 
+/*
+ * abort_startup - tear down a log CPG (@del) whose checkpoint was never read:
+ * recycle its queued startup requests and unlink bitmaps_<uuid>_<my id>.
+ */
+static void abort_startup(struct clog_cpg *del)
+{
+	SaNameT name;
+	SaAisErrorT rv;
+	SaCkptCheckpointHandleT h;
+	struct clog_tfr *startup_tfr = NULL;
+
+	LOG_ERROR("[%s]  CPG teardown before checkpoint received",
+		  SHORT_UUID(del->name.value));
+
+	while ((startup_tfr = queue_remove(del->startup_queue))) {
+		LOG_ERROR("[%s]  Ignoring request from %u: %s",
+			  SHORT_UUID(del->name.value), startup_tfr->originator,
+			  RQ_TYPE(startup_tfr->request_type));
+		queue_add(startup_tfr, free_queue);
+	}
+
+	name.length = snprintf((char *)(name.value), SA_MAX_NAME_LENGTH, "bitmaps_%s_%u",
+			       SHORT_UUID(del->name.value), my_cluster_id);
+
+open_retry:
+	rv = saCkptCheckpointOpen(ckpt_handle, &name, NULL,
+				  SA_CKPT_CHECKPOINT_READ, 0, &h);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		LOG_ERROR("abort_startup: ckpt open retry");
+		usleep(1000);
+		goto open_retry;
+	}
+
+	if (rv != SA_AIS_OK) {
+		LOG_ERROR("[%s] Failed to open checkpoint: %s",
+			  SHORT_UUID(del->name.value), str_ais_error(rv));
+		return;
+	}
+
+	LOG_ERROR("[%s]  Removing checkpoint", SHORT_UUID(del->name.value));
+unlink_retry:
+	rv = saCkptCheckpointUnlink(ckpt_handle, &name);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		LOG_ERROR("abort_startup: ckpt unlink retry");
+		usleep(1000);
+		goto unlink_retry;
+	}
+	if (rv != SA_AIS_OK)
+		LOG_ERROR("[%s] Failed to unlink checkpoint: %s",
+			  SHORT_UUID(del->name.value), str_ais_error(rv));
+
+	/* Close on every path past a successful open so the handle is not leaked. */
+	saCkptCheckpointClose(h);
+}
+
 int destroy_cluster_cpg(char *str)
 {
 	int r;
@@ -1210,6 +1229,9 @@ int destroy_cluster_cpg(char *str)
 		if (!strncmp(del->name.value, str, CPG_MAX_NAME_LENGTH)) {
 			del->cpg_state = INVALID;
 			del->state = LEAVING;
+			if (!queue_empty(del->startup_queue))
+				abort_startup(del);
+
 			r = cpg_leave(del->handle, &del->name);
 			if (r != CPG_OK)
 				LOG_ERROR("Error leaving CPG!");
@@ -1223,12 +1245,6 @@ int init_cluster(void)
 {
 	SaAisErrorT rv;
 
-	{
-		int i;
-		for(i = 0; i < DEBUGGING_HISTORY; i++)
-			debugging[i][0] = '\0';
-	}
-
 	INIT_LIST_HEAD(&clog_cpg_list);
 	rv = saCkptInitialize(&ckpt_handle, &callbacks, &version);
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]