This is the mail archive of the cluster-cvs@sourceware.org mailing list for the cluster project.


cluster: RHEL48 - rgmanager: Detect restricted failover domain crash


Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=4a62d37ed15229356fde8945fa3b5798fb85b663
Commit:        4a62d37ed15229356fde8945fa3b5798fb85b663
Parent:        da9f72456bdda7833f8360de92807b0f66cb334a
Author:        Lon Hohberger <lhh@redhat.com>
AuthorDate:    Mon Aug 25 15:54:44 2008 -0400
Committer:     Lon Hohberger <lhh@redhat.com>
CommitterDate: Fri Apr 3 10:12:12 2009 -0400

rgmanager: Detect restricted failover domain crash

Mark a service as 'stopped' when it is still recorded as 'running'
but its owner node is down.  rhbz #428108
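
A minimal sketch of the rule this change enforces, with stand-in
names rather than rgmanager's real types and helpers: a service whose
failover domain is restricted, and whose domain shares no members with
the set of online nodes, cannot be recovered anywhere, so its state is
flipped from 'running' to 'stopped'.

    /* Sketch only: stand-in types and helpers, not rgmanager's API. */
    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t node_id_t;

    /* 1 if any member of the domain is currently online. */
    static int domain_has_online_member(const node_id_t *domain, size_t dlen,
                                        const node_id_t *online, size_t olen)
    {
        size_t i, j;

        for (i = 0; i < dlen; i++)
            for (j = 0; j < olen; j++)
                if (domain[i] == online[j])
                    return 1;
        return 0;
    }

    /* A 'running' service whose restricted domain has no online member
       can never be restarted anywhere; it should be marked stopped. */
    static int must_mark_stopped(int restricted,
                                 const node_id_t *domain, size_t dlen,
                                 const node_id_t *online, size_t olen)
    {
        return restricted &&
               !domain_has_online_member(domain, dlen, online, olen);
    }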
---
 rgmanager/include/reslist.h                     |    5 +-
 rgmanager/src/daemons/fo_domain.c               |   17 ++++-
 rgmanager/src/daemons/groups.c                  |   80 +++++++++++++++++------
 rgmanager/src/daemons/members.c                 |   30 +++++++++
 rgmanager/src/daemons/rg_state.c                |   27 ++++++--
 rgmanager/src/daemons/service_op.c              |   15 ++++-
 rgmanager/src/daemons/slang_event.c             |   23 ++-----
 rgmanager/src/resources/default_event_script.sl |    3 +-
 8 files changed, 151 insertions(+), 49 deletions(-)

diff --git a/rgmanager/include/reslist.h b/rgmanager/include/reslist.h
index f78288f..4d3feea 100644
--- a/rgmanager/include/reslist.h
+++ b/rgmanager/include/reslist.h
@@ -23,6 +23,7 @@
 #include <libxml/parser.h>
 #include <libxml/xmlmemory.h>
 #include <libxml/xpath.h>
+#include <sets.h>
 
 
 #define RA_PRIMARY	(1<<0)	/** Primary key */
@@ -205,8 +206,8 @@ void deconstruct_domains(fod_t **domains);
 void print_domains(fod_t **domains);
 int node_should_start(uint64_t nodeid, cluster_member_list_t *membership,
 		      char *rg_name, fod_t **domains);
-int node_domain_set(fod_t *domain, uint64_t **ret, int *retlen);
-int node_domain_set_safe(char *domainname, uint64_t **ret, int *retlen, int *flags);
+int node_domain_set(fod_t **domains, char *name, set_type_t **ret, int *retlen, int *flags);
+int node_domain_set_safe(char *domainname, set_type_t **ret, int *retlen, int *flags);
 
 
 /*
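
The node_domain_set() prototype changes in two ways: it now takes the
whole failover-domain list plus a domain name and performs the by-name
lookup itself, and it reports the domain's FOD_* flags through the new
'flags' out-parameter (the node set is returned as set_type_t rather
than uint64_t).  The lookup it absorbs, reduced to a stand-in singly
linked list purely for illustration; the real code walks the fod_t
list with rgmanager's list_for() macro:

    #include <stddef.h>
    #include <strings.h>              /* strcasecmp() */

    struct domain {                   /* stand-in for fod_t */
        const char *name;
        int flags;
        struct domain *next;
    };

    /* Case-insensitive lookup by name; NULL when the domain does not
       exist, mirroring node_domain_set()'s new "not found" return. */
    static struct domain *find_domain(struct domain *head, const char *name)
    {
        struct domain *d;

        for (d = head; d != NULL; d = d->next)
            if (strcasecmp(d->name, name) == 0)
                return d;
        return NULL;
    }
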
diff --git a/rgmanager/src/daemons/fo_domain.c b/rgmanager/src/daemons/fo_domain.c
index 9019a10..be1918d 100644
--- a/rgmanager/src/daemons/fo_domain.c
+++ b/rgmanager/src/daemons/fo_domain.c
@@ -349,13 +349,24 @@ node_in_domain(char *nodename, fod_t *domain,
 
 
 int
-node_domain_set(fod_t *domain, uint64_t **ret, int *retlen)
+node_domain_set(fod_t **domains, char *name, set_type_t **ret, int *retlen, int *flags)
 {
 	int x, i, j;
 	set_type_t *tmpset;
 	int ts_count;
-
 	fod_node_t *fodn;
+	fod_t *domain;
+	int found = 0;
+
+	list_for(domains, domain, x) {
+		if (!strcasecmp(domain->fd_name, name)) {
+			found = 1;
+			break;
+		}
+	} // while (!list_done(&_domains, fod));
+
+	if (!found)
+		return -1;
 
 	/* Count domain length */
 	list_for(&domain->fd_nodes, fodn, x) { }
@@ -368,6 +379,8 @@ node_domain_set(fod_t *domain, uint64_t **ret, int *retlen)
 	if (!(*tmpset))
 		return -1;
 
+	*flags = domain->fd_flags;
+
 	if (domain->fd_flags & FOD_ORDERED) {
 		for (i = 1; i <= 100; i++) {
 			
diff --git a/rgmanager/src/daemons/groups.c b/rgmanager/src/daemons/groups.c
index d1fe3db..a0816d5 100644
--- a/rgmanager/src/daemons/groups.c
+++ b/rgmanager/src/daemons/groups.c
@@ -18,9 +18,10 @@
   MA 02139, USA.
 */
 //#define DEBUG
+#include <sets.h>
 #include <platform.h>
-#include <resgroup.h>
 #include <restart_counter.h>
+#include <resgroup.h>
 #include <reslist.h>
 #include <vf.h>
 #include <magma.h>
@@ -60,6 +61,8 @@ pthread_rwlock_t resource_lock = PTHREAD_RWLOCK_INITIALIZER;
 
 void res_build_name(char *, size_t, resource_t *);
 int group_migratory(char *groupname, int lock);
+int group_property(char *groupname, char *property, char *ret, size_t len);
+int member_online_set(set_type_t **nodes, int *nodecount);
 
 
 struct status_arg {
@@ -88,25 +91,11 @@ node_should_start_safe(uint64_t nodeid, cluster_member_list_t *membership,
 
 
 int
-node_domain_set_safe(char *domainname, uint64_t **ret, int *retlen, int *flags)
+node_domain_set_safe(char *domainname, set_type_t **ret, int *retlen, int *flags)
 {
-	fod_t *fod;
-	int rv = -1, found = 0, x = 0;
-
+	int rv = 0;
 	pthread_rwlock_rdlock(&resource_lock);
-
-	list_for(&_domains, fod, x) {
-		if (!strcasecmp(fod->fd_name, domainname)) {
-			found = 1;
-			break;
-		}
-	} // while (!list_done(&_domains, fod));
-
-	if (found) {
-		rv = node_domain_set(fod, ret, retlen);
-		*flags = fod->fd_flags;
-	}
-
+	rv = node_domain_set(&_domains, domainname, ret, retlen, flags);
 	pthread_rwlock_unlock(&resource_lock);
 
 	return rv;
@@ -440,6 +429,52 @@ check_depend_safe(char *rg_name)
 }
 
 
+int
+check_rdomain_crash(char *svcName)
+{
+	set_type_t *nodes = NULL;
+	set_type_t *fd_nodes = NULL;
+	set_type_t *isect = NULL;
+	int nodecount;
+	int fd_nodecount, fl;
+	int icount;
+	char fd_name[256];
+
+	if (group_property(svcName, "domain", fd_name, sizeof(fd_name)) != 0)
+		goto out_free;
+
+	member_online_set(&nodes, &nodecount);
+
+	if (node_domain_set(&_domains, fd_name, &fd_nodes,
+			    &fd_nodecount, &fl) != 0)
+		goto out_free;
+
+	if (!(fl & FOD_RESTRICTED))
+		goto out_free;
+	
+	if (s_intersection(fd_nodes, fd_nodecount, nodes, nodecount, 
+		    &isect, &icount) < 0)
+		goto out_free;
+
+	if (icount == 0) {
+		clulog(LOG_DEBUG, "Marking %s as stopped: "
+		       "Restricted domain unavailable\n", svcName);
+		rt_enqueue_request(svcName, RG_STOP, -1, 0, 0,
+				   0, 0);
+	}
+
+out_free:
+	if (fd_nodes)
+		free(fd_nodes);
+	if (nodes)
+		free(nodes);
+	if (isect)
+		free(isect);
+
+	return 0;
+}
+
+
 /**
   Start or failback a resource group: if it's not running, start it.
   If it is running and we're a better member to run it, then ask for
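
check_rdomain_crash() above builds three sets: the service's failover
domain (looked up by name through the reworked node_domain_set()), the
currently online members, and their intersection via s_intersection()
from sets.h, which is not part of this patch.  If the domain is
restricted and the intersection is empty, an RG_STOP request is queued
so the service is recorded as stopped rather than left 'running' on a
dead node.  Assuming set_type_t is an integer node-id type and that
the helper allocates the result for the caller to free (as the
out_free cleanup above suggests), a plausible sketch of that shape:

    #include <stdlib.h>

    typedef unsigned long long set_type_t;   /* assumption; see sets.h */

    /* Hypothetical stand-in with the calling shape used above: *ret
       receives a malloc()ed intersection of a and b, *retlen its
       length, and the caller frees the result. */
    static int intersection_sketch(const set_type_t *a, int alen,
                                   const set_type_t *b, int blen,
                                   set_type_t **ret, int *retlen)
    {
        int i, j, n = 0;

        *ret = NULL;
        *retlen = 0;
        if (alen <= 0 || blen <= 0)
            return 0;                            /* empty intersection */

        *ret = malloc(sizeof(set_type_t) * alen);    /* upper bound */
        if (!*ret)
            return -1;

        for (i = 0; i < alen; i++)
            for (j = 0; j < blen; j++)
                if (a[i] == b[j]) {
                    (*ret)[n++] = a[i];
                    break;
                }

        *retlen = n;
        return 0;
    }
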
@@ -453,6 +488,7 @@ consider_start(resource_node_t *node, char *svcName, rg_state_t *svcStatus,
 	cluster_member_t *mp;
 	int autostart, exclusive;
 	void *lockp = NULL;
+	int fod_ret;
 
 	mp = memb_id_to_p(membership, my_id());
 	assert(mp);
@@ -545,10 +581,13 @@ consider_start(resource_node_t *node, char *svcName, rg_state_t *svcStatus,
 	 * Start any stopped services, or started services
 	 * that are owned by a down node.
 	 */
-	if (node_should_start(mp->cm_id, membership, svcName, &_domains) ==
-	    FOD_BEST)
+	fod_ret = node_should_start(mp->cm_id, membership,
+				    svcName, &_domains);
+	if (fod_ret == FOD_BEST)
 		rt_enqueue_request(svcName, RG_START, -1, 0, mp->cm_id,
 				   0, 0);
+	else if (fod_ret == FOD_ILLEGAL)
+		check_rdomain_crash(svcName);
 }
 
 
@@ -979,7 +1018,6 @@ group_property_unlocked(char *groupname, char *property, char *ret,
 }
 
 
-
 /**
   Send the state of a resource group to a given file descriptor.
 
diff --git a/rgmanager/src/daemons/members.c b/rgmanager/src/daemons/members.c
index 910d174..6fc1327 100644
--- a/rgmanager/src/daemons/members.c
+++ b/rgmanager/src/daemons/members.c
@@ -16,6 +16,7 @@
   Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
   MA 02139, USA.
 */
+#include <sets.h>
 #include <pthread.h>
 #include <magma.h>
 #include <magmamsg.h>
@@ -94,6 +95,35 @@ member_list(void)
 }
 
 
+int
+member_online_set(set_type_t **nodes, int *nodecount)
+{
+	int ret = 1, i;
+
+	pthread_rwlock_rdlock(&memblock);
+	if (!membership)
+		goto out_unlock;
+
+	*nodes = malloc(sizeof(set_type_t) * membership->cml_count);
+	if (!*nodes)
+		goto out_unlock;
+
+	*nodecount = 0;
+	for (i = 0; i < membership->cml_count; i++) {
+		if (membership->cml_members[i].cm_state &&
+		    membership->cml_members[i].cm_id > 0) {
+			(*nodes)[*nodecount] = membership->cml_members[i].cm_id;
+			++(*nodecount);
+		}
+	}
+
+	ret = 0;
+out_unlock:
+	pthread_rwlock_unlock(&memblock);
+	return ret;
+}
+
+
 char *
 member_name(uint64_t id, char *buf, int buflen)
 {
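
member_online_set() allocates the array of online node ids and hands
ownership to the caller; both check_rdomain_crash() and the reworked
sl_nodes_online() free it when finished.  A hypothetical caller, with
the prototype copied from this patch and set_type_t assumed to be an
integer id type:

    #include <stdlib.h>

    typedef unsigned long long set_type_t;   /* assumption; see sets.h */

    int member_online_set(set_type_t **nodes, int *nodecount);

    /* Illustration of the calling convention only: returns the number
       of online members, or -1 when membership is unavailable. */
    static int count_online_members(void)
    {
        set_type_t *nodes = NULL;
        int nodecount = 0;

        if (member_online_set(&nodes, &nodecount) != 0 || !nodes)
            return -1;

        free(nodes);                  /* the caller owns the array */
        return nodecount;
    }
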
diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c
index c717ecc..386d569 100644
--- a/rgmanager/src/daemons/rg_state.c
+++ b/rgmanager/src/daemons/rg_state.c
@@ -17,6 +17,7 @@
   MA 02139, USA.
 */
 //#define DEBUG
+#include <sets.h>
 #include <assert.h>
 #include <platform.h>
 #include <magma.h>
@@ -30,6 +31,7 @@
 #include <ccs.h>
 #include <rg_queue.h>
 #include <msgsimple.h>
+#include <event.h>
 
 #define cm_svccount cm_pad[0] /* Theses are uint8_t size */
 #define cm_svcexcl  cm_pad[1]
@@ -46,6 +48,7 @@ int get_rg_state(char *servicename, rg_state_t *svcblk);
 void get_recovery_policy(char *rg_name, char *buf, size_t buflen);
 int have_exclusive_resources(void);
 int check_exclusive_resources(cluster_member_list_t *membership, char *svcName);
+int count_resource_groups_local(cluster_member_t *mp);
 
 
 pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER;
@@ -433,9 +436,12 @@ get_rg_state_local(char *rgname, rg_state_t *svcblk)
  * @param req		Specify request to perform
  * @return		0 = DO RG_NOT stop service, return RG_EFAIL
  *			1 = STOP service - return whatever it returns.
- *			2 = DO RG_NOT stop service, return 0 (success)
- *                      3 = DO RG_NOT stop service, return RG_EFORWARD
- *			4 = DO RG_NOT stop service, return RG_EAGAIN
+ *			2 = DO NOT stop service, return 0 (success)
+ *                      3 = DO NOT stop service, return RG_EFORWARD
+ *			4 = DO NOT stop service, return RG_EAGAIN
+ *			5 = DO NOT stop service, return RG_EFROZEN
+ *			6 = DO NOT stop service, mark stopped and return
+ *			    RG_SUCCESS (0)
  */
 int
 svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req)
@@ -494,9 +500,10 @@ svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req)
 
 		/*
 		   Service is marked as running but node is down.
-		   Doesn't make much sense to stop it.
+		   Doesn't make much sense to stop it - but we need
+		   to mark it stopped
 		 */
-		ret = 2;
+		ret = 6;
 		break;
 
 	case RG_STATE_ERROR:
@@ -929,6 +936,16 @@ _svc_stop(char *svcName, int req, int recover, uint32_t newstate)
 		clulog(LOG_DEBUG, "Unable to stop %s in %s state\n",
 		       svcName, rg_state_str(svcStatus.rs_state));
 		return RG_EFAIL;
+	case 6:
+		/* Mark stopped, but do not do anything */
+		svcStatus.rs_last_owner = svcStatus.rs_owner;
+		svcStatus.rs_owner = 0;
+		svcStatus.rs_state = RG_STATE_STOPPED;
+		if (set_rg_state(svcName, &svcStatus) != 0) {
+			rg_unlock(svcName, lockp);
+			return RG_EFAIL;
+		}
+		/* FALLTHROUGH */
 	case 2:
 		rg_unlock(svcName, lockp);
 		return RG_ESUCCESS;
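
svc_advise_stop() communicates with _svc_stop() through small integer
advice codes, and this patch adds code 6: do not run the stop
operation, just record the stopped state and report success.  The enum
below merely names the table documented in the comment above; these
identifiers do not exist in the source:

    enum stop_advice {
        ADVISE_FAIL         = 0,  /* do not stop; return RG_EFAIL    */
        ADVISE_STOP         = 1,  /* stop the service                */
        ADVISE_SKIP_OK      = 2,  /* do not stop; return success     */
        ADVISE_FORWARD      = 3,  /* do not stop; return RG_EFORWARD */
        ADVISE_AGAIN        = 4,  /* do not stop; return RG_EAGAIN   */
        ADVISE_FROZEN       = 5,  /* do not stop; return RG_EFROZEN  */
        ADVISE_MARK_STOPPED = 6   /* new: record 'stopped', return 0 */
    };
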
diff --git a/rgmanager/src/daemons/service_op.c b/rgmanager/src/daemons/service_op.c
index 3c02688..48fbd2d 100644
--- a/rgmanager/src/daemons/service_op.c
+++ b/rgmanager/src/daemons/service_op.c
@@ -21,6 +21,7 @@
 #include <magmamsg.h>
 #include <stdio.h>
 #include <string.h>
+#include <sets.h>
 #include <resgroup.h>
 #include <clulog.h>
 #include <rg_locks.h>
@@ -153,8 +154,18 @@ service_op_stop(char *svcName, int do_disable, int event_type)
 
 	if (get_service_state_internal(svcName, &svcStatus) < 0)
 		return RG_EFAIL;
-	if (svcStatus.rs_owner != NODE_ID_NONE)
-		msgtarget = svcStatus.rs_owner;
+
+	if (svcStatus.rs_owner != NODE_ID_NONE) {
+		if (member_online(svcStatus.rs_owner)) {
+			msgtarget = svcStatus.rs_owner;
+		} else {
+			/* If the owner is not online, 
+			   mark the service as 'stopped' but
+			   otherwise, do nothing.
+			 */
+			return svc_stop(svcName, RG_STOP);
+		}
+	}
 
 	if ((fd = msg_open(msgtarget, RG_PORT, RG_PURPOSE, 2)) < 0) {
 		clulog(LOG_ERR,
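
The service_op_stop() change asks one extra question before a stop
request is forwarded: is the recorded owner still an online member?
If not, forwarding would go nowhere, so the request is handled locally
with svc_stop(), which (through advice code 6 above) simply records
the 'stopped' state.  A stand-in summary of that routing decision, not
rgmanager code:

    enum stop_route {
        ROUTE_FORWARD,      /* send the stop over the cluster message
                               port (to the owner when it is online) */
        ROUTE_LOCAL_MARK    /* owner gone: call svc_stop() locally,
                               which now just records 'stopped'      */
    };

    static enum stop_route route_stop(int owner_recorded, int owner_online)
    {
        if (owner_recorded && !owner_online)
            return ROUTE_LOCAL_MARK;
        return ROUTE_FORWARD;
    }
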
diff --git a/rgmanager/src/daemons/slang_event.c b/rgmanager/src/daemons/slang_event.c
index d3a522b..6e17db0 100644
--- a/rgmanager/src/daemons/slang_event.c
+++ b/rgmanager/src/daemons/slang_event.c
@@ -19,10 +19,12 @@
   @file S/Lang event handling & intrinsic functions + vars
  */
 #include <platform.h>
+#include <sets.h>
 #include <resgroup.h>
 #include <list.h>
 #include <restart_counter.h>
 #include <reslist.h>
+#include <resgroup.h>
 #include <clulog.h>
 #include <magma.h>
 #include <magmamsg.h>
@@ -35,7 +37,6 @@
 #include <sys/syslog.h>
 #include <malloc.h>
 #include <clulog.h>
-#include <sets.h>
 #include <signal.h>
 
 static int __sl_initialized = 0;
@@ -46,6 +47,8 @@ static int _service_list_len = 0;
 char **get_service_names(int *len); /* from groups.c */
 int get_service_property(char *rg_name, char *prop, char *buf, size_t buflen);
 void push_int_array(set_type_t *stuff, int len);
+int member_online_set(set_type_t **nodes, int *nodecount);
+
 
 
 /* ================================================================
@@ -604,25 +607,13 @@ push_int_array(set_type_t *stuff, int len)
 void
 sl_nodes_online(void)
 {
-	int i, nodecount = 0;
 	set_type_t *nodes;
+	int nodecount = 0, x = 0;
 
-	cluster_member_list_t *membership = member_list();
-	if (!membership)
-		return;
-	nodes = malloc(sizeof(set_type_t) * membership->cml_count);
-	if (!nodes)
+	x = member_online_set(&nodes, &nodecount);
+	if (x < 0 || !nodes || !nodecount)
 		return;
 
-	nodecount = 0;
-	for (i = 0; i < membership->cml_count; i++) {
-		if (membership->cml_members[i].cm_state &&
-		    membership->cml_members[i].cm_id != 0) {
-			nodes[nodecount] = membership->cml_members[i].cm_id;
-			++nodecount;
-		}
-	}
-	cml_free(membership);
 	push_int_array(nodes, nodecount);
 	free(nodes);
 }
diff --git a/rgmanager/src/resources/default_event_script.sl b/rgmanager/src/resources/default_event_script.sl
index e961266..cecc1f6 100644
--- a/rgmanager/src/resources/default_event_script.sl
+++ b/rgmanager/src/resources/default_event_script.sl
@@ -31,7 +31,8 @@ define move_or_start(service, node_list)
 
 	len = length(node_list);
 	if (len == 0) {
-		debug(service, " is not runnable");
+		notice(service, " is not runnable - restricted domain offline");
+		()=service_stop(service);
 		return ERR_DOMAIN;
 	}
 

