This is the mail archive of the
cluster-cvs@sourceware.org
mailing list for the cluster.
STABLE2 - cman: Don't let qdiskd update cman if the disk is unavailable
- From: Lon Hohberger <lon at fedoraproject dot org>
- To: cluster-cvs-relay at redhat dot com
- Date: Mon, 22 Sep 2008 15:47:32 +0000 (UTC)
- Subject: STABLE2 - cman: Don't let qdiskd update cman if the disk is unavailable
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=839c697d39dc4698da5de301ceb21e0b594d87f9
Commit: 839c697d39dc4698da5de301ceb21e0b594d87f9
Parent: 62e48ab2112089fa79aa686a6bed923530c6a3d6
Author: Lon Hohberger <lhh@redhat.com>
AuthorDate: Fri Sep 5 10:50:19 2008 -0400
Committer: Lon Hohberger <lhh@redhat.com>
CommitterDate: Mon Sep 22 11:33:56 2008 -0400
cman: Don't let qdiskd update cman if the disk is unavailable
rhbz#460937
---
cman/man/qdisk.5 | 16 ++++++++++++++++
cman/qdisk/disk.h | 2 ++
cman/qdisk/main.c | 49 +++++++++++++++++++++++++++++++++++++++++--------
3 files changed, 59 insertions(+), 8 deletions(-)
diff --git a/cman/man/qdisk.5 b/cman/man/qdisk.5
index 569563c..dcf0908 100644
--- a/cman/man/qdisk.5
+++ b/cman/man/qdisk.5
@@ -338,6 +338,22 @@ daemon will read /proc/partitions and check for qdisk signatures
on every block device found, comparing the label against the specified
label. This is useful in configurations where the block device name
differs on a per-node basis.
+
+.in 9
+\fIcman_label\fP\fB="\fPmylabel\fB"/>\fP
+.in 12
+This overrides the label advertised to CMAN if present. If specified,
+the quorum daemon will register with this name instead of the actual
+device name.
+
+.in 9
+\fImax_error_cycles\fP\fB="\fP0\fB"/>\fP
+.in 12
+If we receive an I/O error during a cycle, we do not poll CMAN and tell
+it we are alive. If specified, this value will cause qdiskd to exit
+after the specified number of consecutive cycles during which I/O errors
+occur. The default is 0 (no maximum).
+
.in 8
\fB/>\fP
.in 0
diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h
index 0062c86..27a4db4 100644
--- a/cman/qdisk/disk.h
+++ b/cman/qdisk/disk.h
@@ -241,6 +241,8 @@ typedef struct {
int qc_scoremin;
int qc_sched;
int qc_sched_prio;
+ int qc_max_error_cycles;
+ int qc_pad;
disk_node_state_t qc_disk_status;
disk_node_state_t qc_status;
int qc_master; /* Master?! */
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 3301d45..0e27cb7 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -122,10 +122,10 @@ check_self(qd_ctx *ctx, status_block_t *sb)
or has not updated their timestamp recently. See check_transitions as
well.
*/
-static void
+int
read_node_blocks(qd_ctx *ctx, node_info_t *ni, int max)
{
- int x;
+ int x, errors = 0;
status_block_t *sb;
for (x = 0; x < max; x++) {
@@ -137,6 +137,7 @@ read_node_blocks(qd_ctx *ctx, node_info_t *ni, int max)
sb, sizeof(*sb)) < 0) {
clulog(LOG_WARNING,"Error reading node ID block %d\n",
x+1);
+ ++errors;
continue;
}
swab_status_block_t(sb);
@@ -172,6 +173,8 @@ read_node_blocks(qd_ctx *ctx, node_info_t *ni, int max)
ni[x].ni_seen++;
ni[x].ni_last_seen = sb->ps_timestamp;
}
+
+ return errors;
}
@@ -789,7 +792,7 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
{
disk_msg_t msg = {0, 0, 0};
int low_id, bid_pending = 0, score, score_max, score_req,
- upgrade = 0, count;
+ upgrade = 0, count, errors, error_cycles = 0;
memb_mask_t mask, master_mask;
struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval;
@@ -814,7 +817,7 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
get_time(&oldtime, (ctx->qc_flags&RF_UPTIME));
/* Read everyone else's status */
- read_node_blocks(ctx, ni, max);
+ errors = read_node_blocks(ctx, ni, max);
/* Check for node transitions */
check_transitions(ctx, ni, max, mask);
@@ -851,7 +854,7 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
if (ctx->qc_flags & RF_REBOOT)
reboot(RB_AUTOBOOT);
}
- } else {
+ } else {
set_bit(mask, (ctx->qc_my_id-1), sizeof(mask));
if (ctx->qc_status == S_NONE) {
clulog(LOG_NOTICE,
@@ -958,7 +961,8 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
return -1;
}
check_cman(ctx, mask, master_mask);
- cman_poll_quorum_device(ctx->qc_ch, 1);
+ if (!errors)
+ cman_poll_quorum_device(ctx->qc_ch, 1);
} else if (ctx->qc_status == S_RUN && ctx->qc_master &&
ctx->qc_master != ctx->qc_my_id) {
@@ -977,7 +981,8 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
"Halting qdisk operations\n");
return -1;
}
- cman_poll_quorum_device(ctx->qc_ch, 1);
+ if (!errors)
+ cman_poll_quorum_device(ctx->qc_ch, 1);
}
}
@@ -985,6 +990,8 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
if (qd_write_status(ctx, ctx->qc_my_id, ctx->qc_status,
&msg, mask, master_mask) != 0) {
clulog(LOG_ERR, "Error writing to quorum disk\n");
+ errors++; /* this value isn't really used
+ at this point */
}
/* write out our local status */
@@ -1025,13 +1032,24 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
(int)diff.tv_sec, (int)diff.tv_usec);
memcpy(&sleeptime, &interval, sizeof(sleeptime));
}
+
+ if (errors && ctx->qc_max_error_cycles) {
+ ++error_cycles;
+ if (error_cycles >= ctx->qc_max_error_cycles) {
+ clulog(LOG_ALERT,
+ "Too many I/O errors; giving up.\n");
+ _running = 0;
+ }
+ } else {
+ error_cycles = 0;
+ }
/* Could hit a watchdog timer here if we wanted to */
if (_running)
select(0, NULL, NULL, NULL, &sleeptime);
}
- return 0;
+ return !!errors;
}
@@ -1079,6 +1097,7 @@ get_config_data(qd_ctx *ctx, struct h_data *h, int maxh,
ctx->qc_flags |= RF_DEBUG;
ctx->qc_sched = SCHED_RR;
ctx->qc_sched_prio = 1;
+ ctx->qc_max_error_cycles = 0;
/* Get log log_facility */
snprintf(query, sizeof(query), "/cluster/quorumd/@log_facility");
@@ -1299,6 +1318,20 @@ get_config_data(qd_ctx *ctx, struct h_data *h, int maxh,
free(val);
}
+
+ /*
+ * How many consecutive error cycles do we allow before
+ * giving up?
+ */
+ /* default = no max */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@max_error_cycles");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ ctx->qc_max_error_cycles = atoi(val);
+ if (ctx->qc_max_error_cycles <= 0)
+ ctx->qc_max_error_cycles = 0;
+ free(val);
+ }
+
*cfh = configure_heuristics(ccsfd, h, maxh);
clulog(LOG_DEBUG,