This is the mail archive of the cluster-cvs@sourceware.org mailing list for the cluster project.


cluster: RHEL53 - clogd: Fixes for bug 474179 - more checkpoint issues


Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=19193c063c6c1831d9ce84aff2987dd706152023
Commit:        19193c063c6c1831d9ce84aff2987dd706152023
Parent:        6ae48f308f488af819ae0e8b1607e81d9e9f24a7
Author:        Jonathan Brassow <jbrassow@redhat.com>
AuthorDate:    Mon Dec 8 09:24:02 2008 -0600
Committer:     Jonathan Brassow <jbrassow@redhat.com>
CommitterDate: Mon Dec 8 15:48:52 2008 -0600

clogd: Fixes for bug 474179 - more checkpoint issues

I've now added intelligence that detects whether the last node
that had the current log state has left.  If this has occurred
before that node could provide a checkpoint of that state, we
need to find the first node that entered the new phase and allow
it to behave like the 'first joiner'.  IOW, give it the power to
read the disk log.
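
A minimal, hypothetical C sketch of that membership test (simplified
stand-in types and names, not the code from this patch; the real check
lives in cpg_leave_callback() in the diff below):

enum request_type { MEMBER_JOIN, OTHER_REQUEST };

struct startup_entry {
	enum request_type request_type;
	struct startup_entry *next;
};

/*
 * Illustrative sketch only.  If every remaining CPG member, including
 * me, is still waiting on a checkpoint -- i.e. they all appear in my
 * startup_list as join requests -- then the last node that could have
 * supplied the log state is gone, and I must claim 'first joiner'
 * rights and read the disk log myself.
 */
static int should_act_as_first_joiner(const struct startup_entry *startup_list,
				      int member_list_entries)
{
	int waiting = 1;	/* count myself; I have no join entry of my own */
	const struct startup_entry *e;

	for (e = startup_list; e; e = e->next)
		if (e->request_type == MEMBER_JOIN)
			waiting++;

	return waiting == member_list_entries;
}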

I've also added data to the checkpoint that specifies who is
recovering if recovery is in progress.  I believe this is partially
to blame for bug 464550.
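
For illustration, a small sketch of the extended checkpoint payload;
struct recovery_state, pack_recovery_state() and unpack_recovery_state()
are hypothetical names, but the "%llu %u" text layout mirrors the
push_state()/pull_state() changes in the patch below:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct recovery_state {
	uint64_t recovering_region;
	uint32_t recoverer;
};

/* Serialize both fields for a checkpoint section; caller frees *buf. */
static int pack_recovery_state(const struct recovery_state *s, char **buf)
{
	*buf = malloc(64);	/* comfortably holds the two numbers */
	if (!*buf)
		return -1;
	sprintf(*buf, "%llu %u",
		(unsigned long long)s->recovering_region,
		(unsigned int)s->recoverer);
	return 64;
}

/* Restore both fields from a received checkpoint section. */
static void unpack_recovery_state(const char *buf, struct recovery_state *s)
{
	unsigned long long region = 0;
	unsigned int who = 0;

	sscanf(buf, "%llu %u", &region, &who);
	s->recovering_region = region;
	s->recoverer = who;
}
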
---
 cmirror/src/cluster.c   |  138 ++++++++++++++++++++++++++++++++--------------
 cmirror/src/functions.c |  128 ++++++++++++++++++++++++++++---------------
 2 files changed, 179 insertions(+), 87 deletions(-)

diff --git a/cmirror/src/cluster.c b/cmirror/src/cluster.c
index cfc6d3c..d077382 100644
--- a/cmirror/src/cluster.c
+++ b/cmirror/src/cluster.c
@@ -21,6 +21,7 @@
 #include "common.h"
 #include "logging.h"
 #include "link_mon.h"
+#include "cluster.h"
 
 /* Open AIS error codes */
 #define str_ais_error(x) \
@@ -146,8 +147,22 @@ int cluster_send(struct clog_tfr *tfr)
 
 	do {
 		r = cpg_mcast_joined(entry->handle, CPG_TYPE_AGREED, &iov, 1);
-		if ((r != SA_AIS_ERR_TRY_AGAIN) || (count++ > 5))
+		if (r != SA_AIS_ERR_TRY_AGAIN)
 			break;
+		count++;
+		if (count < 10)
+			LOG_PRINT("[%s]  Retry #%d of cpg_mcast_joined: %s",
+				  SHORT_UUID(tfr->uuid), count, str_ais_error(r));
+		else if ((count < 100) && !(count % 10))
+			LOG_ERROR("[%s]  Retry #%d of cpg_mcast_joined: %s",
+				  SHORT_UUID(tfr->uuid), count, str_ais_error(r));
+		else if ((count < 1000) && !(count % 100))
+			LOG_ERROR("[%s]  Retry #%d of cpg_mcast_joined: %s",
+				  SHORT_UUID(tfr->uuid), count, str_ais_error(r));
+		else if ((count < 10000) && !(count % 1000))
+			LOG_ERROR("[%s]  Retry #%d of cpg_mcast_joined: %s - "
+				  "OpenAIS not handling the load?",
+				  SHORT_UUID(tfr->uuid), count, str_ais_error(r));
 		usleep(1000);
 	} while (1);
 
@@ -818,6 +833,54 @@ static int do_cluster_work(void *data)
 	return (r == SA_AIS_OK) ? 0 : -1;  /* FIXME: good error number? */
 }
 
+static int flush_startup_list(struct clog_cpg *entry)
+{
+	int r = 0;
+	int i_was_server;
+	struct list_head *p, *n;
+	struct clog_tfr *tfr = NULL;
+	struct checkpoint_data *new;
+
+	list_for_each_safe(p, n, &entry->startup_list) {
+		list_del_init(p);
+		tfr = (struct clog_tfr *)p;
+		if (tfr->request_type == DM_CLOG_MEMBER_JOIN) {
+			new = prepare_checkpoint(entry, tfr->originator);
+			if (!new) {
+				/*
+				 * FIXME: Need better error handling.  Other nodes
+				 * will be trying to send the checkpoint too, and we
+				 * must continue processing the list; so report error
+				 * but continue.
+				 */
+				LOG_ERROR("Failed to prepare checkpoint for %u!!!",
+					  tfr->originator);
+				free(tfr);
+				continue;
+			}
+			LOG_COND(log_checkpoint, "[%s] Checkpoint prepared for %u",
+				 SHORT_UUID(entry->name.value), tfr->originator);
+			new->next = entry->checkpoint_list;
+			entry->checkpoint_list = new;
+		} else {
+			LOG_DBG("[%s] Processing delayed request: %s",
+				SHORT_UUID(tfr->uuid), _RQ_TYPE(tfr->request_type));
+			i_was_server = (tfr->error == my_cluster_id) ? 1 : 0;
+			tfr->error = 0;
+			r = handle_cluster_request(entry, tfr, i_was_server);
+
+			if (r)
+				/*
+				 * FIXME: If we error out here, we will never get
+				 * another opportunity to retry these requests
+				 */
+				LOG_ERROR("Error while processing delayed CPG message");
+		}
+		free(tfr);
+	}
+	return 0;
+}
+
 static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 				 uint32_t nodeid, uint32_t pid,
 				 void *msg, int msg_len)
@@ -825,9 +888,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 	int i;
 	int r = 0;
 	int i_am_server;
-	int i_was_server;
 	int response = 0;
-	struct list_head *p, *n;
 	struct clog_tfr *tfr = msg;
 	struct clog_tfr *tmp_tfr = NULL;
 	struct clog_cpg *match;
@@ -919,44 +980,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 					 SHORT_UUID(match->name.value), nodeid);
 				match->state = VALID;
 
-				list_for_each_safe(p, n, &match->startup_list) {
-					list_del_init(p);
-					tmp_tfr = (struct clog_tfr *)p;
-					if (tmp_tfr->request_type == DM_CLOG_MEMBER_JOIN) {
-						struct checkpoint_data *new;
-
-						new = prepare_checkpoint(match, tmp_tfr->originator);
-						if (!new) {
-							/* FIXME: Need better error handling */
-							LOG_ERROR("Failed to prepare checkpoint for %u!!!",
-								  tmp_tfr->originator);
-							free(tmp_tfr);
-							goto out;
-						}
-						LOG_COND(log_checkpoint, "[%s] Checkpoint prepared for %u",
-							 SHORT_UUID(tfr->uuid), tmp_tfr->originator);
-						new->next = match->checkpoint_list;
-						match->checkpoint_list = new;
-					} else {
-						LOG_DBG("[%s] Processing delayed request: %s",
-							SHORT_UUID(tmp_tfr->uuid),
-							_RQ_TYPE(tmp_tfr->request_type));
-						i_was_server = (tmp_tfr->error == my_cluster_id) ? 1 : 0;
-						tmp_tfr->error = 0;
-						r = handle_cluster_request(match, tmp_tfr, i_was_server);
-
-						if (r) {
-							LOG_ERROR("Error while processing delayed CPG message");
-							/*
-							 * FIXME: If we error out here, we will never get
-							 * another opportunity to retry these requests
-							 */
-							free(tmp_tfr);
-							goto out;
-						}
-					}
-					free(tmp_tfr);
-				}
+				flush_startup_list(match);
 			}
 		}
 		goto out;
@@ -1018,6 +1042,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 	}
 
 out:
+	/* nothing happens after this point.  It is just for debugging */
 	if (r) {
 		LOG_ERROR("[%s] Error while processing CPG message, %s: %s",
 			  SHORT_UUID(tfr->uuid),
@@ -1179,7 +1204,8 @@ static void cpg_leave_callback(struct clog_cpg *match,
 			match->resend_requests = 0;
 		LOG_COND(log_resend_requests, "[%s] %u has left, delay = %d%s",
 			 SHORT_UUID(match->name.value), left->nodeid,
-			 match->delay, (list_empty(&match->working_list)) ? " -- working_list empty": "");
+			 match->delay, (list_empty(&match->working_list)) ?
+			 " -- working_list empty": "");
 	}
 
 	/* Find the lowest_id, i.e. the server */
@@ -1205,6 +1231,34 @@ static void cpg_leave_callback(struct clog_cpg *match,
 		LOG_COND(log_membership_change, "[%s]  Server unchanged at %u (%u left)",
 			 SHORT_UUID(match->name.value), lowest, left->nodeid);
 
+	if ((match->state == INVALID) && !match->free_me) {
+		/*
+		 * If all CPG members are waiting for checkpoints and they
+		 * are all present in my startup_list, then I was the first to
+		 * join and I must assume control.
+		 *
+		 * We do not normally end up here, but if there was a quick
+		 * 'resume -> suspend -> resume' across the cluster, we may
+		 * have initially thought we were not the first to join because
+		 * of the presence of out-going (and unable to respond) members.
+		 */
+
+		i = 1; /* We do not have a DM_CLOG_MEMBER_JOIN entry */
+		list_for_each_safe(p, n, &match->startup_list) {
+			tfr = (struct clog_tfr *)p;
+			if (tfr->request_type == DM_CLOG_MEMBER_JOIN)
+				i++;
+		}
+
+		if (i == member_list_entries) {
+			/* 
+			 * Last node who could have given me a checkpoint just left.
+			 * Setting log state to VALID and acting as 'first join'.
+			 */
+			match->state = VALID;
+			flush_startup_list(match);
+		}
+	}
 }
 
 static void cpg_config_callback(cpg_handle_t handle, struct cpg_name *gname,
diff --git a/cmirror/src/functions.c b/cmirror/src/functions.c
index a76318a..4db14ad 100644
--- a/cmirror/src/functions.c
+++ b/cmirror/src/functions.c
@@ -29,6 +29,15 @@
 #define MIRROR_DISK_VERSION 2
 #define LOG_OFFSET 2
 
+#define RESYNC_HISTORY 50
+static char resync_history[RESYNC_HISTORY][128];
+static int idx = 0;
+#define LOG_SPRINT(f, arg...) do {\
+		idx++; \
+		idx = idx % RESYNC_HISTORY; \
+		sprintf(resync_history[idx], f, ## arg); \
+	} while (0)
+
 struct log_header {
         uint32_t magic;
         uint32_t version;
@@ -1132,19 +1141,25 @@ static int clog_get_resync_work(struct clog_tfr *tfr)
 		 * FIXME: handle intermittent errors during recovery
 		 * by resetting sync_search... but not to many times.
 		 */
-		LOG_DBG("[%s]  Recovery has finished", SHORT_UUID(lc->uuid));
+		LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+			   "Recovery finished",
+			   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator);
 		return 0;
 	}
 
 	if (lc->recovering_region != (uint64_t)-1) {
-		LOG_DBG("[%s] Someone is already recovering region %llu",
-			SHORT_UUID(lc->uuid),
-			(unsigned long long)lc->recovering_region);
 		if (lc->recoverer == tfr->originator) {
-			LOG_PRINT("[%s] %u is re-requesting resync work, seq=%u",
-				  SHORT_UUID(lc->uuid), tfr->originator, tfr->seq);
+			LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+				   "Re-requesting work (%llu)",
+				   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
+				   (unsigned long long)lc->recovering_region);
 			pkg->r = lc->recovering_region;
 			pkg->i = 1;
+		} else {
+			LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+				   "Someone already recovering (%llu)",
+				   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
+				   (unsigned long long)lc->recovering_region);
 		}
 
 		return 0;
@@ -1160,16 +1175,10 @@ static int clog_get_resync_work(struct clog_tfr *tfr)
 		free(del);
 
 		if (!log_test_bit(lc->sync_bits, pkg->r)) {
-			LOG_DBG("[%s] Assigning priority resync work to %u: %llu",
-				SHORT_UUID(lc->uuid), tfr->originator,
-				(unsigned long long)pkg->r);
-#ifdef DEBUG
-			LOG_DBG("[%s] Priority work remaining:",
-				SHORT_UUID(lc->uuid));
-			for (del = lc->recovery_request_list; del; del = del->next)
-				LOG_DBG("[%s]  %llu", SHORT_UUID(lc->uuid),
-					(unsigned long long)del->region);
-#endif			
+			LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+				   "Assigning priority resync work (%llu)",
+				   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
+				   (unsigned long long)pkg->r);
 			pkg->i = 1;
 			lc->recovering_region = pkg->r;
 			lc->recoverer = tfr->originator;
@@ -1181,14 +1190,19 @@ static int clog_get_resync_work(struct clog_tfr *tfr)
 				    lc->region_count,
 				    lc->sync_search);
 
-	if (pkg->r >= lc->region_count)
+	if (pkg->r >= lc->region_count) {
+		LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+			   "Resync work complete.",
+			   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator);
 		return 0;
+	}
 
 	lc->sync_search = pkg->r + 1;
 
-	LOG_DBG("[%s] Assigning resync work to %u: region = %llu\n",
-		SHORT_UUID(lc->uuid), tfr->originator, 
-		(unsigned long long)pkg->r);
+	LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+		   "Assigning resync work (%llu)",
+		   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
+		   (unsigned long long)pkg->r);
 	pkg->i = 1;
 	lc->recovering_region = pkg->r;
 	lc->recoverer = tfr->originator;
@@ -1212,39 +1226,41 @@ static int clog_set_region_sync(struct clog_tfr *tfr)
 
 	if (pkg->in_sync) {
 		if (log_test_bit(lc->sync_bits, pkg->region)) {
-			LOG_ERROR("[%s]  Region already in-sync: region=%llu, seq=%u, who=%u",
-				  SHORT_UUID(lc->uuid),
-				  (unsigned long long)pkg->region,
-				  tfr->seq,
-				  tfr->originator);
+			LOG_SPRINT("SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+				   "Region already set (%llu)",
+				   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
+				   (unsigned long long)pkg->region);
 		} else {
 			log_set_bit(lc, lc->sync_bits, pkg->region);
 			lc->sync_count++;
-			LOG_DBG("[%s] sync_count=%llu, Region %llu marked in-sync by %u, seq=%u",
-				SHORT_UUID(lc->uuid), (unsigned long long)lc->sync_count,
-				(unsigned long long)pkg->region, tfr->originator,
-				tfr->seq);
+			LOG_SPRINT("SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+				   "Setting region (%llu)",
+				   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
+				   (unsigned long long)pkg->region);
 		}
 	} else if (log_test_bit(lc->sync_bits, pkg->region)) {
 		lc->sync_count--;
 		log_clear_bit(lc, lc->sync_bits, pkg->region);
-		LOG_DBG("[%s] sync_count = %llu, Region %llu marked not in-sync by %u",
-			SHORT_UUID(lc->uuid), (unsigned long long)lc->sync_count,
-			(unsigned long long)pkg->region, tfr->originator);
+		LOG_SPRINT("SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+			   "Unsetting region (%llu)",
+			   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
+			   (unsigned long long)pkg->region);
 	}
 
 	if (lc->sync_count != count_bits32(lc->sync_bits, lc->bitset_uint32_count)) {
 		unsigned long long reset = count_bits32(lc->sync_bits, lc->bitset_uint32_count);
 
-		LOG_ERROR("[%s]  sync_count(%llu) does not match bitmap count(%llu)",
-			  SHORT_UUID(lc->uuid), (unsigned long long)lc->sync_count, reset);
-		LOG_ERROR("[%s]  Resetting sync_count = %llu", SHORT_UUID(lc->uuid), reset);
+		LOG_SPRINT("SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+			   "sync_count(%llu) != bitmap count(%llu)",
+			   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator,
+			   (unsigned long long)lc->sync_count, reset);
 		lc->sync_count = reset;
 	}
 
 	if (lc->sync_count > lc->region_count)
-		LOG_ERROR("[%s]  clog_resume: (lc->sync_count > lc->region_count)",
-			  SHORT_UUID(lc->uuid));
+		LOG_SPRINT("SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
+			   "(lc->sync_count > lc->region_count) - this is bad",
+			   tfr->seq, SHORT_UUID(lc->uuid), tfr->originator);
 
 	tfr->data_size = 0;
 	return 0;
@@ -1592,12 +1608,17 @@ int push_state(const char *uuid, const char *which, char **buf)
 	}
 
 	if (!strcmp(which, "recovering_region")) {
-		*buf = malloc(32); /* easily covers largest 64-bit int */
+		*buf = malloc(64); /* easily handles the 2 written numbers */
 		if (!*buf)
 			return -ENOMEM;
-		sprintf(*buf, "%llu", (unsigned long long)lc->recovering_region);
-
-		return 32;
+		sprintf(*buf, "%llu %u", (unsigned long long)lc->recovering_region,
+			lc->recoverer);
+
+		LOG_SPRINT("CKPT SEND - SEQ#=X, UUID=%s, nodeid = X:: "
+			   "recovering_region=%llu, recoverer=%u",
+			   SHORT_UUID(lc->uuid),
+			   (unsigned long long)lc->recovering_region, lc->recoverer);
+		return 64;
 	}
 
 	bitset_size = lc->bitset_uint32_count * sizeof(*lc->clean_bits);
@@ -1639,10 +1660,12 @@ int pull_state(const char *uuid, const char *which, char *buf, int size)
 	}
 
 	if (!strncmp(which, "recovering_region", 17)) {
-		sscanf(buf, "%llu", (unsigned long long *)&lc->recovering_region);
-		LOG_DBG("[%s] recovering_region set to %llu",
-			SHORT_UUID(uuid),
-			(unsigned long long)lc->recovering_region);
+		sscanf(buf, "%llu %u", (unsigned long long *)&lc->recovering_region,
+		       &lc->recoverer);
+		LOG_SPRINT("CKPT INIT - SEQ#=X, UUID=%s, nodeid = X:: "
+			   "recovering_region=%llu, recoverer=%u",
+			   SHORT_UUID(lc->uuid),
+			   (unsigned long long)lc->recovering_region, lc->recoverer);
 		return 0;
 	}
 
@@ -1704,6 +1727,7 @@ void log_debug(void)
 	struct list_head *l;
 	struct log_c *lc;
 	uint64_t r;
+	int i;
 
 	LOG_ERROR("");
 	LOG_ERROR("LOG COMPONENT DEBUGGING::");
@@ -1711,6 +1735,11 @@ void log_debug(void)
 	__list_for_each(l, &log_list) {
 		lc = list_entry(l, struct log_c, list);
 		LOG_ERROR("%s", lc->uuid);
+		LOG_ERROR("  recoverer        : %u", lc->recoverer);
+		LOG_ERROR("  recovering_region: %llu",
+			  (unsigned long long)lc->recovering_region);
+		LOG_ERROR("  recovery_halted  : %s", (lc->recovery_halted) ?
+			  "YES" : "NO");
 		LOG_ERROR("sync_bits:");
 		print_bits((char *)lc->sync_bits,
 			   lc->bitset_uint32_count * sizeof(*lc->sync_bits), 1);
@@ -1747,4 +1776,13 @@ void log_debug(void)
 			lc->sync_count = lc->region_count;
 		}
 	}
+
+	LOG_ERROR("Resync request history:");
+	for (i = 0; i < RESYNC_HISTORY; i++) {
+		idx++;
+		idx = idx % RESYNC_HISTORY;
+		if (resync_history[idx][0] == '\0')
+			continue;
+		LOG_ERROR("%d:%d) %s", i, idx, resync_history[idx]);
+	}
 }
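
As a footnote on the LOG_SPRINT history added in functions.c: it is a
fixed-size ring buffer, and the dump in log_debug() walks the slots
oldest-first.  A standalone sketch of the same pattern follows
(history_record() and history_dump() are assumed names, not the
committed macro):

#include <stdio.h>
#include <stdarg.h>

#define HISTORY_SIZE 50
#define ENTRY_LEN    128

static char history[HISTORY_SIZE][ENTRY_LEN];
static int history_idx;

/* Newest message overwrites the oldest slot once the buffer wraps. */
static void history_record(const char *fmt, ...)
{
	va_list ap;

	history_idx = (history_idx + 1) % HISTORY_SIZE;
	va_start(ap, fmt);
	vsnprintf(history[history_idx], ENTRY_LEN, fmt, ap);
	va_end(ap);
}

/* Walk the ring starting just past the newest entry: oldest-first. */
static void history_dump(void)
{
	int i, idx = history_idx;

	for (i = 0; i < HISTORY_SIZE; i++) {
		idx = (idx + 1) % HISTORY_SIZE;
		if (history[idx][0] == '\0')
			continue;	/* slot never written */
		printf("%d) %s\n", i, history[idx]);
	}
}

int main(void)
{
	int r;

	for (r = 0; r < 3; r++)
		history_record("GET - region %d assigned", r);
	history_dump();
	return 0;
}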

