This is the mail archive of the
cluster-cvs@sourceware.org
mailing list for the cluster.
RHEL5 - clogd: Fix one cause of bug 464550 - copy percentage can get stuck during
- From: Jonathan Brassow <jbrassow at fedoraproject dot org>
- To: cluster-cvs-relay at redhat dot com
- Date: Wed, 8 Oct 2008 21:13:10 +0000 (UTC)
- Subject: RHEL5 - clogd: Fix one cause of bug 464550 - copy percentage can get stuck during
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=027ab643fa43ab1b97724b5702f55bcbe619b12e
Commit: 027ab643fa43ab1b97724b5702f55bcbe619b12e
Parent: b89bf045f70c2ce4c2534012a50e691190cbe50d
Author: Jonathan Brassow <jbrassow@redhat.com>
AuthorDate: Wed Oct 8 16:11:46 2008 -0500
Committer: Jonathan Brassow <jbrassow@redhat.com>
CommitterDate: Wed Oct 8 16:11:46 2008 -0500
clogd: Fix one cause of bug 464550 - copy percentage can get stuck during
If the cluster log gets a request from the kernel to shut down immediately
after it was created, then it must perform some clean-up. This covers the
case where it is told to shut down after it has requested a checkpoint, but
before it has read/unlinked it.
---
cmirror/src/cluster.c | 138 +++++++++++++++++++++++++++----------------------
1 files changed, 77 insertions(+), 61 deletions(-)
diff --git a/cmirror/src/cluster.c b/cmirror/src/cluster.c
index 3ac4f87..8c49dee 100644
--- a/cmirror/src/cluster.c
+++ b/cmirror/src/cluster.c
@@ -59,13 +59,6 @@ static SaCkptHandleT ckpt_handle = 0;
static SaCkptCallbacksT callbacks = { 0, 0 };
static SaVersionT version = { 'B', 1, 1 };
-#define DEBUGGING_HISTORY 100
-static char debugging[DEBUGGING_HISTORY][128];
-static int idx = 0;
-static int memberz = 0;
-static int doit = 0;
-
-
struct checkpoint_data {
uint32_t requester;
char uuid[CPG_MAX_NAME_LENGTH];
@@ -162,7 +155,7 @@ static int clog_tfr_cmp(struct clog_tfr *a, struct clog_tfr *b)
return r;
}
-static int handle_cluster_request(struct clog_tfr *tfr, int server, int printz)
+static int handle_cluster_request(struct clog_tfr *tfr, int server)
{
int r = 0;
@@ -184,11 +177,6 @@ static int handle_cluster_request(struct clog_tfr *tfr, int server, int printz)
/*
* Errors from previous functions are in the tfr struct.
*/
- if (printz)
- LOG_DBG("[%s] Sending response to %u on cluster: [%s/%u]",
- SHORT_UUID(tfr->uuid), tfr->originator,
- RQ_TYPE(tfr->request_type & ~DM_CLOG_RESPONSE),
- tfr->seq);
r = cluster_send(tfr);
if (r)
LOG_ERROR("cluster_send failed: %s", strerror(-r));
@@ -803,7 +791,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
RQ_TYPE(startup_tfr->request_type));
i_was_server = (startup_tfr->error == my_cluster_id) ? 1 : 0;
startup_tfr->error = 0;
- r = handle_cluster_request(startup_tfr, i_was_server, 1);
+ r = handle_cluster_request(startup_tfr, i_was_server);
if (r) {
LOG_ERROR("Error while processing delayed CPG message");
@@ -879,8 +867,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
goto out;
}
- r = handle_cluster_request(tfr, i_am_server,
- ((memberz != 4) || (--doit > 0)));
+ r = handle_cluster_request(tfr, i_am_server);
}
out:
@@ -896,28 +883,6 @@ out:
if (response)
LOG_ERROR("[%s] Responder : %u",
SHORT_UUID(tfr->uuid), nodeid);
- LOG_ERROR("HISTORY::");
-
- for (i = 0; i < DEBUGGING_HISTORY; i++) {
- idx++;
- idx = idx % DEBUGGING_HISTORY;
- if (debugging[idx][0] == '\0')
- continue;
- LOG_ERROR("%d:%d) %s", i, idx, debugging[idx]);
- }
- } else if (!(tfr->request_type & DM_CLOG_RESPONSE) ||
- (tfr->originator == my_cluster_id)) {
- int len;
- idx++;
- idx = idx % DEBUGGING_HISTORY;
- len = sprintf(debugging[idx],
- "SEQ#=%u, UUID=%s, TYPE=%s, ORIG=%u, RESP=%s",
- tfr->seq,
- SHORT_UUID(tfr->uuid),
- RQ_TYPE(tfr->request_type),
- tfr->originator, (response) ? "YES" : "NO");
- if (response)
- sprintf(debugging[idx] + len, ", RSPR=%u", nodeid);
}
}
@@ -1076,18 +1041,22 @@ static void cpg_leave_callback(struct clog_cpg *match,
list_for_each_safe(p, n, &cluster_queue->list) {
tfr = (struct clog_tfr *)p;
- /*
- * Don't resend DM_CLOG_POSTSUSPEND request, it will
- * be handled when we get our own config leave
- */
- if (!strcmp(match->name.value, tfr->uuid) &&
- (tfr->request_type != DM_CLOG_POSTSUSPEND)){
- LOG_PRINT("[%s] Resending %s due to new server(%u -> %u)",
- SHORT_UUID(match->name.value),
- RQ_TYPE(tfr->request_type),
- lowest, match->lowest_id);
- if (cluster_send(tfr))
- LOG_ERROR("Failed resend");
+ if (!strcmp(match->name.value, tfr->uuid)) {
+ switch (tfr->request_type) {
+ case DM_CLOG_POSTSUSPEND:
+ /*
+ * Don't resend DM_CLOG_POSTSUSPEND request, it will
+ * be handled when we get our own config leave
+ */
+ break;
+ default:
+ LOG_PRINT("[%s] Resending %s due to new server(%u -> %u)",
+ SHORT_UUID(match->name.value),
+ RQ_TYPE(tfr->request_type),
+ lowest, match->lowest_id);
+ if (cluster_send(tfr))
+ LOG_ERROR("Failed resend");
+ }
}
}
} else
@@ -1107,8 +1076,6 @@ static void cpg_config_callback(cpg_handle_t handle, struct cpg_name *gname,
struct clog_cpg *match, *tmp;
int found = 0;
- memberz = member_list_entries;
-
list_for_each_entry_safe(match, tmp, &clog_cpg_list, list)
if (match->handle == handle) {
found = 1;
@@ -1130,9 +1097,6 @@ static void cpg_config_callback(cpg_handle_t handle, struct cpg_name *gname,
else
cpg_leave_callback(match, left_list,
member_list, member_list_entries);
-
- if (joined_list_entries && (joined_list[0].nodeid == my_cluster_id))
- doit = 25;
}
cpg_callbacks_t cpg_callbacks = {
@@ -1201,6 +1165,61 @@ int create_cluster_cpg(char *str)
return 0;
}
+static void abort_startup(struct clog_cpg *del)
+{
+ int len;
+ SaNameT name;
+ SaAisErrorT rv;
+ SaCkptCheckpointHandleT h;
+ struct clog_tfr *startup_tfr = NULL;
+
+ LOG_ERROR("[%s] CPG teardown before checkpoint received",
+ SHORT_UUID(del->name.value));
+
+ while ((startup_tfr = queue_remove(del->startup_queue))) {
+ LOG_ERROR("[%s] Ignoring request from %u: %s",
+ SHORT_UUID(del->name.value), startup_tfr->originator,
+ RQ_TYPE(startup_tfr->request_type));
+ queue_add(startup_tfr, free_queue);
+ }
+
+ len = snprintf((char *)(name.value), SA_MAX_NAME_LENGTH, "bitmaps_%s_%u",
+ SHORT_UUID(del->name.value), my_cluster_id);
+ name.length = len;
+
+open_retry:
+ rv = saCkptCheckpointOpen(ckpt_handle, &name, NULL,
+ SA_CKPT_CHECKPOINT_READ, 0, &h);
+ if (rv == SA_AIS_ERR_TRY_AGAIN) {
+ LOG_ERROR("abort_startup: ckpt open retry");
+ usleep(1000);
+ goto open_retry;
+ }
+
+ if (rv != SA_AIS_OK) {
+ LOG_ERROR("[%s] Failed to open checkpoint: %s",
+ SHORT_UUID(del->name.value), str_ais_error(rv));
+ return;
+ }
+
+ LOG_ERROR("[%s] Removing checkpoint", SHORT_UUID(del->name.value));
+unlink_retry:
+ rv = saCkptCheckpointUnlink(ckpt_handle, &name);
+ if (rv == SA_AIS_ERR_TRY_AGAIN) {
+ LOG_ERROR("abort_startup: ckpt unlink retry");
+ usleep(1000);
+ goto unlink_retry;
+ }
+
+ if (rv != SA_AIS_OK) {
+ LOG_ERROR("[%s] Failed to unlink checkpoint: %s",
+ SHORT_UUID(del->name.value), str_ais_error(rv));
+ return;
+ }
+
+ saCkptCheckpointClose(h);
+}
+
int destroy_cluster_cpg(char *str)
{
int r;
@@ -1210,6 +1229,9 @@ int destroy_cluster_cpg(char *str)
if (!strncmp(del->name.value, str, CPG_MAX_NAME_LENGTH)) {
del->cpg_state = INVALID;
del->state = LEAVING;
+ if (!queue_empty(del->startup_queue))
+ abort_startup(del);
+
r = cpg_leave(del->handle, &del->name);
if (r != CPG_OK)
LOG_ERROR("Error leaving CPG!");
@@ -1223,12 +1245,6 @@ int init_cluster(void)
{
SaAisErrorT rv;
- {
- int i;
- for(i = 0; i < DEBUGGING_HISTORY; i++)
- debugging[i][0] = '\0';
- }
-
INIT_LIST_HEAD(&clog_cpg_list);
rv = saCkptInitialize(&ckpt_handle, &callbacks, &version);