pacemaker  1.1.18-36d2962a86
Scalable High-Availability cluster resource manager
failcounts.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2008-2017 Andrew Beekhof <andrew@beekhof.net>
3  *
4  * This source code is licensed under the GNU Lesser General Public License
5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
6  */
7 
8 #include <crm_internal.h>
9 
10 #include <sys/types.h>
11 #include <regex.h>
12 #include <glib.h>
13 
14 #include <crm/crm.h>
15 #include <crm/msg_xml.h>
16 #include <crm/common/xml.h>
17 #include <crm/common/util.h>
18 #include <crm/pengine/internal.h>
19 
20 int
21 get_failcount(node_t *node, resource_t *rsc, time_t *last_failure,
22  pe_working_set_t *data_set)
23 {
24  return get_failcount_full(node, rsc, last_failure, TRUE, NULL, data_set);
25 }
26 
27 static gboolean
28 is_matched_failure(const char *rsc_id, xmlNode *conf_op_xml,
29  xmlNode *lrm_op_xml)
30 {
31  gboolean matched = FALSE;
32  const char *conf_op_name = NULL;
33  int conf_op_interval = 0;
34  const char *lrm_op_task = NULL;
35  int lrm_op_interval = 0;
36  const char *lrm_op_id = NULL;
37  char *last_failure_key = NULL;
38 
39  if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
40  return FALSE;
41  }
42 
43  conf_op_name = crm_element_value(conf_op_xml, "name");
44  conf_op_interval = crm_get_msec(crm_element_value(conf_op_xml, "interval"));
45  lrm_op_task = crm_element_value(lrm_op_xml, XML_LRM_ATTR_TASK);
46  crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_INTERVAL, &lrm_op_interval);
47 
48  if (safe_str_eq(conf_op_name, lrm_op_task) == FALSE
49  || conf_op_interval != lrm_op_interval) {
50  return FALSE;
51  }
52 
53  lrm_op_id = ID(lrm_op_xml);
54  last_failure_key = generate_op_key(rsc_id, "last_failure", 0);
55 
56  if (safe_str_eq(last_failure_key, lrm_op_id)) {
57  matched = TRUE;
58 
59  } else {
60  char *expected_op_key = generate_op_key(rsc_id, conf_op_name,
61  conf_op_interval);
62 
63  if (safe_str_eq(expected_op_key, lrm_op_id)) {
64  int rc = 0;
65  int target_rc = get_target_rc(lrm_op_xml);
66 
67  crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_RC, &rc);
68  if (rc != target_rc) {
69  matched = TRUE;
70  }
71  }
72  free(expected_op_key);
73  }
74 
75  free(last_failure_key);
76  return matched;
77 }
78 
79 static gboolean
80 block_failure(node_t *node, resource_t *rsc, xmlNode *xml_op,
81  pe_working_set_t *data_set)
82 {
83  char *xml_name = clone_strip(rsc->id);
84  char *xpath = crm_strdup_printf("//primitive[@id='%s']//op[@on-fail='block']",
85  xml_name);
86  xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath);
87  gboolean should_block = FALSE;
88 
89  free(xpath);
90 
91 #if 0
92  /* A good idea? */
93  if (rsc->container == NULL && is_not_set(data_set->flags, pe_flag_stonith_enabled)) {
94  /* In this case, stop on-fail defaults to block in unpack_operation() */
95  return TRUE;
96  }
97 #endif
98 
99  if (xpathObj) {
100  int max = numXpathResults(xpathObj);
101  int lpc = 0;
102 
103  for (lpc = 0; lpc < max; lpc++) {
104  xmlNode *pref = getXpathResult(xpathObj, lpc);
105 
106  if (xml_op) {
107  should_block = is_matched_failure(xml_name, pref, xml_op);
108  if (should_block) {
109  break;
110  }
111 
112  } else {
113  const char *conf_op_name = NULL;
114  int conf_op_interval = 0;
115  char *lrm_op_xpath = NULL;
116  xmlXPathObject *lrm_op_xpathObj = NULL;
117 
118  conf_op_name = crm_element_value(pref, "name");
119  conf_op_interval = crm_get_msec(crm_element_value(pref, "interval"));
120 
121  lrm_op_xpath = crm_strdup_printf("//node_state[@uname='%s']"
122  "//lrm_resource[@id='%s']"
123  "/lrm_rsc_op[@operation='%s'][@interval='%d']",
124  node->details->uname, xml_name,
125  conf_op_name, conf_op_interval);
126  lrm_op_xpathObj = xpath_search(data_set->input, lrm_op_xpath);
127 
128  free(lrm_op_xpath);
129 
130  if (lrm_op_xpathObj) {
131  int max2 = numXpathResults(lrm_op_xpathObj);
132  int lpc2 = 0;
133 
134  for (lpc2 = 0; lpc2 < max2; lpc2++) {
135  xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
136  lpc2);
137 
138  should_block = is_matched_failure(xml_name, pref,
139  lrm_op_xml);
140  if (should_block) {
141  break;
142  }
143  }
144  }
145  freeXpathObject(lrm_op_xpathObj);
146 
147  if (should_block) {
148  break;
149  }
150  }
151  }
152  }
153 
154  free(xml_name);
155  freeXpathObject(xpathObj);
156 
157  return should_block;
158 }
159 
169 static inline char *
170 rsc_fail_name(resource_t *rsc)
171 {
172  const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
173 
174  return is_set(rsc->flags, pe_rsc_unique)? strdup(name) : clone_strip(name);
175 }
176 
190 static void
191 generate_fail_regex(const char *prefix, const char *rsc_name,
192  gboolean is_legacy, gboolean is_unique, regex_t *re)
193 {
194  char *pattern;
195 
196  /* @COMPAT DC < 1.1.17: Fail counts used to be per-resource rather than
197  * per-operation.
198  */
199  const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+");
200 
201  /* Ignore instance numbers for anything other than globally unique clones.
202  * Anonymous clone fail counts could contain an instance number if the
203  * clone was initially unique, failed, then was converted to anonymous.
204  * @COMPAT Also, before 1.1.8, anonymous clone fail counts always contained
205  * clone instance numbers.
206  */
207  const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
208 
209  pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
210  instance_pattern, op_pattern);
211  CRM_LOG_ASSERT(regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) == 0);
212  free(pattern);
213 }
214 
226 static void
227 generate_fail_regexes(resource_t *rsc, pe_working_set_t *data_set,
228  regex_t *failcount_re, regex_t *lastfailure_re)
229 {
230  char *rsc_name = rsc_fail_name(rsc);
231  const char *version = crm_element_value(data_set->input, XML_ATTR_CRM_VERSION);
232  gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
233 
234  generate_fail_regex(CRM_FAIL_COUNT_PREFIX, rsc_name, is_legacy,
235  is_set(rsc->flags, pe_rsc_unique), failcount_re);
236 
237  generate_fail_regex(CRM_LAST_FAILURE_PREFIX, rsc_name, is_legacy,
238  is_set(rsc->flags, pe_rsc_unique), lastfailure_re);
239 
240  free(rsc_name);
241 }
242 
243 int
244 get_failcount_full(node_t *node, resource_t *rsc, time_t *last_failure,
245  bool effective, xmlNode *xml_op, pe_working_set_t *data_set)
246 {
247  char *key = NULL;
248  const char *value = NULL;
249  regex_t failcount_re, lastfailure_re;
250  int failcount = 0;
251  time_t last = 0;
252  GHashTableIter iter;
253 
254  generate_fail_regexes(rsc, data_set, &failcount_re, &lastfailure_re);
255 
256  /* Resource fail count is sum of all matching operation fail counts */
257  g_hash_table_iter_init(&iter, node->details->attrs);
258  while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
259  if (regexec(&failcount_re, key, 0, NULL, 0) == 0) {
260  failcount = merge_weights(failcount, char2score(value));
261  } else if (regexec(&lastfailure_re, key, 0, NULL, 0) == 0) {
262  last = QB_MAX(last, crm_int_helper(value, NULL));
263  }
264  }
265 
266  regfree(&failcount_re);
267  regfree(&lastfailure_re);
268 
269  if ((failcount > 0) && (last > 0) && (last_failure != NULL)) {
270  *last_failure = last;
271  }
272 
273  /* If failure blocks the resource, disregard any failure timeout */
274  if ((failcount > 0) && rsc->failure_timeout
275  && block_failure(node, rsc, xml_op, data_set)) {
276 
277  pe_warn("Ignoring failure timeout %d for %s because it conflicts with on-fail=block",
278  rsc->id, rsc->failure_timeout);
279  rsc->failure_timeout = 0;
280  }
281 
282  /* If all failures have expired, ignore fail count */
283  if (effective && (failcount > 0) && (last > 0) && rsc->failure_timeout) {
284  time_t now = get_effective_time(data_set);
285 
286  if (now > (last + rsc->failure_timeout)) {
287  crm_debug("Failcount for %s on %s expired after %ds",
288  rsc->id, node->details->uname, rsc->failure_timeout);
289  failcount = 0;
290  }
291  }
292 
293  if (failcount > 0) {
294  char *score = score2char(failcount);
295 
296  crm_info("%s has failed %s times on %s",
297  rsc->id, score, node->details->uname);
298  free(score);
299  }
300 
301  return failcount;
302 }
303 
304 /* If it's a resource container, get its failcount plus all the failcounts of
305  * the resources within it
306  */
307 int
308 get_failcount_all(node_t *node, resource_t *rsc, time_t *last_failure,
309  pe_working_set_t *data_set)
310 {
311  int failcount_all = 0;
312 
313  failcount_all = get_failcount(node, rsc, last_failure, data_set);
314 
315  if (rsc->fillers) {
316  GListPtr gIter = NULL;
317 
318  for (gIter = rsc->fillers; gIter != NULL; gIter = gIter->next) {
319  resource_t *filler = (resource_t *) gIter->data;
320  time_t filler_last_failure = 0;
321 
322  failcount_all += get_failcount(node, filler, &filler_last_failure,
323  data_set);
324 
325  if (last_failure && filler_last_failure > *last_failure) {
326  *last_failure = filler_last_failure;
327  }
328  }
329 
330  if (failcount_all != 0) {
331  char *score = score2char(failcount_all);
332 
333  crm_info("Container %s and the resources within it have failed %s times on %s",
334  rsc->id, score, node->details->uname);
335  free(score);
336  }
337  }
338 
339  return failcount_all;
340 }
const char * uname
Definition: status.h:137
int get_failcount_all(node_t *node, resource_t *rsc, time_t *last_failure, pe_working_set_t *data_set)
Definition: failcounts.c:308
A dumping ground.
xmlNode * xml
Definition: status.h:256
#define CRM_LAST_FAILURE_PREFIX
Definition: internal.h:88
long long crm_int_helper(const char *text, char **end_text)
Definition: strings.c:80
int char2score(const char *score)
Definition: utils.c:230
long long crm_get_msec(const char *input)
Definition: utils.c:598
#define XML_LRM_ATTR_INTERVAL
Definition: msg_xml.h:283
time_t get_effective_time(pe_working_set_t *data_set)
Definition: utils.c:1629
#define CRM_LOG_ASSERT(expr)
Definition: logging.h:150
char * clone_name
Definition: status.h:255
int get_target_rc(xmlNode *xml_op)
Definition: unpack.c:2943
char version[256]
Definition: plugin.c:84
char * id
Definition: status.h:254
#define XML_LRM_ATTR_TASK
Definition: msg_xml.h:284
#define pe_warn(fmt...)
Definition: internal.h:28
struct node_shared_s * details
Definition: status.h:176
#define crm_debug(fmt, args...)
Definition: logging.h:253
#define CRM_FAIL_COUNT_PREFIX
Definition: internal.h:87
Utility functions.
char * clone_strip(const char *last_rsc_id)
Definition: unpack.c:1537
resource_t * container
Definition: status.h:304
Wrappers for and extensions to libxml2.
int crm_element_value_int(xmlNode *data, const char *name, int *dest)
Definition: xml.c:3844
const char * crm_element_value(xmlNode *data, const char *name)
Definition: xml.c:5165
unsigned long long flags
Definition: status.h:278
xmlNode * input
Definition: status.h:84
GListPtr fillers
Definition: status.h:305
int failure_timeout
Definition: status.h:272
#define pe_rsc_unique
Definition: status.h:188
xmlXPathObjectPtr xpath_search(xmlNode *xml_top, const char *path)
Definition: xpath.c:145
GHashTable * attrs
Definition: status.h:156
int get_failcount(node_t *node, resource_t *rsc, time_t *last_failure, pe_working_set_t *data_set)
Definition: failcounts.c:21
xmlNode * getXpathResult(xmlXPathObjectPtr xpathObj, int index)
Definition: xpath.c:64
int compare_version(const char *version1, const char *version2)
Definition: utils.c:486
int merge_weights(int w1, int w2)
Definition: common.c:386
int get_failcount_full(node_t *node, resource_t *rsc, time_t *last_failure, bool effective, xmlNode *xml_op, pe_working_set_t *data_set)
Definition: failcounts.c:244
#define XML_ATTR_CRM_VERSION
Definition: msg_xml.h:84
#define XML_LRM_ATTR_RC
Definition: msg_xml.h:295
Definition: status.h:172
unsigned long long flags
Definition: status.h:93
#define ID(x)
Definition: msg_xml.h:446
char * generate_op_key(const char *rsc_id, const char *op_type, int interval)
Generate an operation key.
Definition: operations.c:37
#define safe_str_eq(a, b)
Definition: util.h:72
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
void freeXpathObject(xmlXPathObjectPtr xpathObj)
Definition: xpath.c:45
GList * GListPtr
Definition: crm.h:218
#define crm_info(fmt, args...)
Definition: logging.h:251
#define pe_flag_stonith_enabled
Definition: status.h:63
char * score2char(int score)
Definition: utils.c:282