pacemaker  2.0.1-15814c6c0d
Scalable High-Availability cluster resource manager
election.c
Go to the documentation of this file.
1 /*
2  * Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
3  *
4  * This source code is licensed under the GNU Lesser General Public License
5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
6  */
7 
8 #include <crm_internal.h>
9 
10 #include <sys/time.h>
11 #include <sys/resource.h>
12 
13 #include <crm/msg_xml.h>
14 #include <crm/common/xml.h>
15 
16 #include <crm/common/mainloop.h>
17 #include <crm/cluster/internal.h>
18 #include <crm/cluster/election.h>
19 #include <crm/crm.h>
20 
21 #define STORM_INTERVAL 2 /* in seconds */
22 
23 struct election_s {
24  enum election_result state;
25  guint count; // How many times local node has voted
26  char *name; // Descriptive name for this election
27  char *uname; // Local node's name
28  GSourceFunc cb; // Function to call if election is won
29  GHashTable *voted; // Key = node name, value = how node voted
30  mainloop_timer_t *timeout; // When to abort if all votes not received
31 };
32 
33 static void election_complete(election_t *e)
34 {
35  e->state = election_won;
36 
37  if(e->cb) {
38  e->cb(e);
39  }
40 
41  election_reset(e);
42 }
43 
44 static gboolean election_timer_cb(gpointer user_data)
45 {
46  election_t *e = user_data;
47 
48  crm_info("%s timed out, declaring local node as winner", e->name);
49  election_complete(e);
50  return FALSE;
51 }
52 
53 enum election_result
55 {
56  if(e) {
57  return e->state;
58  }
59  return election_error;
60 }
61 
78 election_t *
79 election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb)
80 {
81  election_t *e = NULL;
82 
83  static guint count = 0;
84 
85  CRM_CHECK(uname != NULL, return NULL);
86 
87  e = calloc(1, sizeof(election_t));
88  if (e == NULL) {
89  crm_perror(LOG_CRIT, "Cannot create election");
90  return NULL;
91  }
92 
93  e->uname = strdup(uname);
94  if (e->uname == NULL) {
95  crm_perror(LOG_CRIT, "Cannot create election");
96  free(e);
97  return NULL;
98  }
99 
100  e->name = name? crm_strdup_printf("election-%s", name)
101  : crm_strdup_printf("election-%u", count++);
102  e->cb = cb;
103  e->timeout = mainloop_timer_add(e->name, period_ms, FALSE,
104  election_timer_cb, e);
105  crm_trace("Created %s", e->name);
106  return e;
107 }
108 
118 void
120 {
121  if(e && uname && e->voted) {
122  crm_trace("Discarding %s (no-)vote from lost peer %s", e->name, uname);
123  g_hash_table_remove(e->voted, uname);
124  }
125 }
126 
132 void
134 {
135  if (e != NULL) {
136  crm_trace("Resetting election %s", e->name);
137  mainloop_timer_stop(e->timeout);
138  if (e->voted) {
139  crm_trace("Destroying voted cache with %d members", g_hash_table_size(e->voted));
140  g_hash_table_destroy(e->voted);
141  e->voted = NULL;
142  }
143  }
144 }
145 
154 void
156 {
157  if(e) {
158  election_reset(e);
159  crm_trace("Destroying %s", e->name);
160  mainloop_timer_del(e->timeout);
161  free(e->uname);
162  free(e->name);
163  free(e);
164  }
165 }
166 
167 static void
168 election_timeout_start(election_t *e)
169 {
170  if(e) {
171  mainloop_timer_start(e->timeout);
172  }
173 }
174 
180 void
182 {
183  if(e) {
184  mainloop_timer_stop(e->timeout);
185  }
186 }
187 
194 void
196 {
197  if(e) {
198  mainloop_timer_set_period(e->timeout, period);
199  } else {
200  crm_err("No election defined");
201  }
202 }
203 
204 static int
205 crm_uptime(struct timeval *output)
206 {
207  static time_t expires = 0;
208  static struct rusage info;
209 
210  time_t tm_now = time(NULL);
211 
212  if (expires < tm_now) {
213  int rc = 0;
214 
215  info.ru_utime.tv_sec = 0;
216  info.ru_utime.tv_usec = 0;
217  rc = getrusage(RUSAGE_SELF, &info);
218 
219  output->tv_sec = 0;
220  output->tv_usec = 0;
221 
222  if (rc < 0) {
223  crm_perror(LOG_ERR, "Could not calculate the current uptime");
224  expires = 0;
225  return -1;
226  }
227 
228  crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec,
229  (long)info.ru_utime.tv_usec);
230  }
231 
232  expires = tm_now + STORM_INTERVAL; /* N seconds after the last _access_ */
233  output->tv_sec = info.ru_utime.tv_sec;
234  output->tv_usec = info.ru_utime.tv_usec;
235 
236  return 1;
237 }
238 
239 static int
240 crm_compare_age(struct timeval your_age)
241 {
242  struct timeval our_age;
243 
244  crm_uptime(&our_age); /* If an error occurred, our_age will be compared as {0,0} */
245 
246  if (our_age.tv_sec > your_age.tv_sec) {
247  crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
248  return 1;
249  } else if (our_age.tv_sec < your_age.tv_sec) {
250  crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
251  return -1;
252  } else if (our_age.tv_usec > your_age.tv_usec) {
253  crm_debug("Win: %ld.%06ld vs %ld.%06ld (usec)",
254  (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
255  return 1;
256  } else if (our_age.tv_usec < your_age.tv_usec) {
257  crm_debug("Lose: %ld.%06ld vs %ld.%06ld (usec)",
258  (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
259  return -1;
260  }
261 
262  return 0;
263 }
264 
278 void
280 {
281  struct timeval age;
282  xmlNode *vote = NULL;
283  crm_node_t *our_node;
284 
285  if (e == NULL) {
286  crm_trace("Election vote requested, but no election available");
287  return;
288  }
289 
290  our_node = crm_get_peer(0, e->uname);
291  if ((our_node == NULL) || (crm_is_peer_active(our_node) == FALSE)) {
292  crm_trace("Cannot vote in %s yet: local node not connected to cluster",
293  e->name);
294  return;
295  }
296 
297  election_reset(e);
298  e->state = election_in_progress;
299  vote = create_request(CRM_OP_VOTE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
300 
301  e->count++;
302  crm_xml_add(vote, F_CRM_ELECTION_OWNER, our_node->uuid);
303  crm_xml_add_int(vote, F_CRM_ELECTION_ID, e->count);
304 
305  crm_uptime(&age);
306  crm_xml_add_int(vote, F_CRM_ELECTION_AGE_S, age.tv_sec);
307  crm_xml_add_int(vote, F_CRM_ELECTION_AGE_US, age.tv_usec);
308 
309  send_cluster_message(NULL, crm_msg_crmd, vote, TRUE);
310  free_xml(vote);
311 
312  crm_debug("Started %s round %d", e->name, e->count);
313  election_timeout_start(e);
314  return;
315 }
316 
332 bool
334 {
335  int voted_size = 0;
336  int num_members = 0;
337 
338  if(e == NULL) {
339  crm_trace("Election check requested, but no election available");
340  return FALSE;
341  }
342  if (e->voted == NULL) {
343  crm_trace("%s check requested, but no votes received yet", e->name);
344  return FALSE;
345  }
346 
347  voted_size = g_hash_table_size(e->voted);
348  num_members = crm_active_peers();
349 
350  /* in the case of #voted > #members, it is better to
351  * wait for the timeout and give the cluster time to
352  * stabilize
353  */
354  if (voted_size >= num_members) {
355  /* we won and everyone has voted */
357  if (voted_size > num_members) {
358  GHashTableIter gIter;
359  const crm_node_t *node;
360  char *key = NULL;
361 
362  crm_warn("Received too many votes in %s", e->name);
363  g_hash_table_iter_init(&gIter, crm_peer_cache);
364  while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
365  if (crm_is_peer_active(node)) {
366  crm_warn("* expected vote: %s", node->uname);
367  }
368  }
369 
370  g_hash_table_iter_init(&gIter, e->voted);
371  while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
372  crm_warn("* actual vote: %s", key);
373  }
374 
375  }
376 
377  crm_info("%s won by local node", e->name);
378  election_complete(e);
379  return TRUE;
380 
381  } else {
382  crm_debug("%s still waiting on %d of %d votes",
383  e->name, num_members - voted_size, num_members);
384  }
385 
386  return FALSE;
387 }
388 
389 #define LOSS_DAMPEN 2 /* in seconds */
390 
391 struct vote {
392  const char *op;
393  const char *from;
394  const char *version;
395  const char *election_owner;
396  int election_id;
397  struct timeval age;
398 };
399 
411 static bool
412 parse_election_message(election_t *e, xmlNode *message, struct vote *vote)
413 {
414  CRM_CHECK(message && vote, return FALSE);
415 
416  vote->election_id = -1;
417  vote->age.tv_sec = -1;
418  vote->age.tv_usec = -1;
419 
420  vote->op = crm_element_value(message, F_CRM_TASK);
421  vote->from = crm_element_value(message, F_CRM_HOST_FROM);
422  vote->version = crm_element_value(message, F_CRM_VERSION);
423  vote->election_owner = crm_element_value(message, F_CRM_ELECTION_OWNER);
424 
425  crm_element_value_int(message, F_CRM_ELECTION_ID, &(vote->election_id));
426 
427  if ((vote->op == NULL) || (vote->from == NULL) || (vote->version == NULL)
428  || (vote->election_owner == NULL) || (vote->election_id < 0)) {
429 
430  crm_warn("Invalid %s message from %s in %s ",
431  (vote->op? vote->op : "election"),
432  (vote->from? vote->from : "unspecified node"),
433  (e? e->name : "election"));
434  return FALSE;
435  }
436 
437  // Op-specific validation
438 
439  if (crm_str_eq(vote->op, CRM_OP_VOTE, TRUE)) {
440  // Only vote ops have uptime
442  F_CRM_ELECTION_AGE_US, &(vote->age));
443  if ((vote->age.tv_sec < 0) || (vote->age.tv_usec < 0)) {
444  crm_warn("Cannot count %s %s from %s because it is missing uptime",
445  (e? e->name : "election"), vote->op, vote->from);
446  return FALSE;
447  }
448 
449  } else if (!crm_str_eq(vote->op, CRM_OP_NOVOTE, TRUE)) {
450  crm_info("Cannot process %s message from %s because %s is not a known election op",
451  (e? e->name : "election"), vote->from, vote->op);
452  return FALSE;
453  }
454 
455  // Election validation
456 
457  if (e == NULL) {
458  crm_info("Cannot count %s from %s because no election available",
459  vote->op, vote->from);
460  return FALSE;
461  }
462 
463  /* If the membership cache is NULL, we REALLY shouldn't be voting --
464  * the question is how we managed to get here.
465  */
466  if (crm_peer_cache == NULL) {
467  crm_info("Cannot count %s %s from %s because no peer information available",
468  e->name, vote->op, vote->from);
469  return FALSE;
470  }
471  return TRUE;
472 }
473 
474 static void
475 record_vote(election_t *e, struct vote *vote)
476 {
477  char *voter_copy = NULL;
478  char *vote_copy = NULL;
479 
480  CRM_ASSERT(e && vote && vote->from && vote->op);
481  if (e->voted == NULL) {
482  e->voted = crm_str_table_new();
483  }
484 
485  voter_copy = strdup(vote->from);
486  vote_copy = strdup(vote->op);
487  CRM_ASSERT(voter_copy && vote_copy);
488 
489  g_hash_table_replace(e->voted, voter_copy, vote_copy);
490 }
491 
492 static void
493 send_no_vote(crm_node_t *peer, struct vote *vote)
494 {
495  // @TODO probably shouldn't hardcode CRM_SYSTEM_CRMD and crm_msg_crmd
496 
497  xmlNode *novote = create_request(CRM_OP_NOVOTE, NULL, vote->from,
499 
500  crm_xml_add(novote, F_CRM_ELECTION_OWNER, vote->election_owner);
501  crm_xml_add_int(novote, F_CRM_ELECTION_ID, vote->election_id);
502 
503  send_cluster_message(peer, crm_msg_crmd, novote, TRUE);
504  free_xml(novote);
505 }
506 
522 enum election_result
523 election_count_vote(election_t *e, xmlNode *message, bool can_win)
524 {
525  int log_level = LOG_INFO;
526  gboolean done = FALSE;
527  gboolean we_lose = FALSE;
528  const char *reason = "unknown";
529  bool we_are_owner = FALSE;
530  crm_node_t *our_node = NULL, *your_node = NULL;
531  time_t tm_now = time(NULL);
532  struct vote vote;
533 
534  // @TODO these should be in election_t
535  static int election_wins = 0;
536  static time_t expires = 0;
537  static time_t last_election_loss = 0;
538 
539  CRM_CHECK(message != NULL, return election_error);
540  if (parse_election_message(e, message, &vote) == FALSE) {
541  return election_error;
542  }
543 
544  your_node = crm_get_peer(0, vote.from);
545  our_node = crm_get_peer(0, e->uname);
546  we_are_owner = (our_node != NULL)
547  && crm_str_eq(our_node->uuid, vote.election_owner, TRUE);
548 
549  if(can_win == FALSE) {
550  reason = "Not eligible";
551  we_lose = TRUE;
552 
553  } else if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) {
554  reason = "We are not part of the cluster";
555  log_level = LOG_ERR;
556  we_lose = TRUE;
557 
558  } else if (we_are_owner && (vote.election_id != e->count)) {
559  log_level = LOG_TRACE;
560  reason = "Superseded";
561  done = TRUE;
562 
563  } else if (your_node == NULL || crm_is_peer_active(your_node) == FALSE) {
564  /* Possibly we cached the message in the FSA queue at a point that it wasn't */
565  reason = "Peer is not part of our cluster";
566  log_level = LOG_WARNING;
567  done = TRUE;
568 
569  } else if (crm_str_eq(vote.op, CRM_OP_NOVOTE, TRUE)
570  || crm_str_eq(vote.from, e->uname, TRUE)) {
571  /* Receiving our own broadcast vote, or a no-vote from peer, is a vote
572  * for us to win
573  */
574  if (!we_are_owner) {
575  crm_warn("Cannot count %s round %d %s from %s because we are not election owner (%s)",
576  e->name, vote.election_id, vote.op, vote.from,
577  vote.election_owner);
578  return election_error;
579  }
580  if (e->state != election_in_progress) {
581  // Should only happen if we already lost
582  crm_debug("Not counting %s round %d %s from %s because no election in progress",
583  e->name, vote.election_id, vote.op, vote.from);
584  return e->state;
585  }
586  record_vote(e, &vote);
587  reason = "Recorded";
588  done = TRUE;
589 
590  } else {
591  // A peer vote requires a comparison to determine which node is better
592  int age_result = crm_compare_age(vote.age);
593  int version_result = compare_version(vote.version, CRM_FEATURE_SET);
594 
595  if (version_result < 0) {
596  reason = "Version";
597  we_lose = TRUE;
598 
599  } else if (version_result > 0) {
600  reason = "Version";
601 
602  } else if (age_result < 0) {
603  reason = "Uptime";
604  we_lose = TRUE;
605 
606  } else if (age_result > 0) {
607  reason = "Uptime";
608 
609  } else if (strcasecmp(e->uname, vote.from) > 0) {
610  reason = "Host name";
611  we_lose = TRUE;
612 
613  } else {
614  reason = "Host name";
615  }
616  }
617 
618  if (expires < tm_now) {
619  election_wins = 0;
620  expires = tm_now + STORM_INTERVAL;
621 
622  } else if (done == FALSE && we_lose == FALSE) {
623  int peers = 1 + g_hash_table_size(crm_peer_cache);
624  static bool wrote_blackbox = FALSE; // @TODO move to election_t
625 
626  /* If every node has to vote down every other node, thats N*(N-1) total elections
627  * Allow some leeway before _really_ complaining
628  */
629  election_wins++;
630  if (election_wins > (peers * peers)) {
631  crm_warn("%s election storm detected: %d wins in %d seconds",
632  e->name, election_wins, STORM_INTERVAL);
633  election_wins = 0;
634  expires = tm_now + STORM_INTERVAL;
635  if (wrote_blackbox == FALSE) {
636  /* It's questionable whether a black box (from every node in the
637  * cluster) would be truly helpful in diagnosing an election
638  * storm. It's also highly doubtful a production environment
639  * would get multiple election storms from distinct causes, so
640  * saving one blackbox per process lifetime should be
641  * sufficient. Alternatives would be to save a timestamp of the
642  * last blackbox write instead of a boolean, and write a new one
643  * if some amount of time has passed; or to save a storm count,
644  * write a blackbox on every Nth occurrence.
645  */
646  crm_write_blackbox(0, NULL);
647  }
648  }
649  }
650 
651  if (done) {
652  do_crm_log(log_level + 1,
653  "Processed %s round %d %s (current round %d) from %s (%s)",
654  e->name, vote.election_id, vote.op, e->count, vote.from,
655  reason);
656  return e->state;
657 
658  } else if (we_lose == FALSE) {
659  if (last_election_loss == 0
660  || tm_now - last_election_loss > (time_t) LOSS_DAMPEN) {
661 
662  do_crm_log(log_level, "%s round %d (owner node ID %s) pass: %s from %s (%s)",
663  e->name, vote.election_id, vote.election_owner, vote.op,
664  vote.from, reason);
665 
666  last_election_loss = 0;
668 
669  /* Start a new election by voting down this, and other, peers */
670  e->state = election_start;
671  return e->state;
672  } else {
673  char *loss_time = ctime(&last_election_loss);
674 
675  if (loss_time) {
676  // Show only HH:MM:SS
677  loss_time += 11;
678  loss_time[8] = '\0';
679  }
680  crm_info("Ignoring %s round %d (owner node ID %s) pass vs %s because we lost less than %ds ago at %s",
681  e->name, vote.election_id, vote.election_owner, vote.from,
682  LOSS_DAMPEN, (loss_time? loss_time : "unknown"));
683  }
684  }
685 
686  last_election_loss = tm_now;
687 
688  do_crm_log(log_level, "%s round %d (owner node ID %s) lost: %s from %s (%s)",
689  e->name, vote.election_id, vote.election_owner, vote.op,
690  vote.from, reason);
691 
692  election_reset(e);
693  send_no_vote(your_node, &vote);
694  e->state = election_lost;
695  return e->state;
696 }
#define F_CRM_TASK
Definition: msg_xml.h:50
#define LOG_TRACE
Definition: logging.h:35
#define CRM_CHECK(expr, failure_action)
Definition: logging.h:165
char uname[MAX_NAME]
Definition: internal.h:82
void crm_write_blackbox(int nsig, struct qb_log_callsite *callsite)
Definition: logging.c:432
A dumping ground.
void mainloop_timer_start(mainloop_timer_t *t)
Definition: mainloop.c:1178
guint mainloop_timer_set_period(mainloop_timer_t *t, guint period_ms)
Definition: mainloop.c:1196
void mainloop_timer_del(mainloop_timer_t *t)
Definition: mainloop.c:1234
gboolean crm_is_peer_active(const crm_node_t *node)
Definition: membership.c:272
char * uuid
Definition: cluster.h:53
#define STORM_INTERVAL
Definition: election.c:21
#define CRM_FEATURE_SET
Definition: crm.h:30
#define F_CRM_HOST_FROM
Definition: msg_xml.h:55
struct mainloop_timer_s mainloop_timer_t
Definition: mainloop.h:31
crm_node_t * crm_get_peer(unsigned int id, const char *uname)
Definition: membership.c:653
void election_timeout_stop(election_t *e)
Stop an election&#39;s timer, if running.
Definition: election.c:181
#define CRM_OP_NOVOTE
Definition: crm.h:110
guint crm_active_peers(void)
Definition: membership.c:365
void mainloop_timer_stop(mainloop_timer_t *t)
Definition: mainloop.c:1187
#define F_CRM_ELECTION_AGE_S
Definition: msg_xml.h:63
Wrappers for and extensions to glib mainloop.
struct election_s election_t
Definition: election.h:56
#define crm_warn(fmt, args...)
Definition: logging.h:250
#define crm_debug(fmt, args...)
Definition: logging.h:254
election_result
Definition: election.h:59
void election_vote(election_t *e)
Start a new election by offering local node&#39;s candidacy.
Definition: election.c:279
#define crm_trace(fmt, args...)
Definition: logging.h:255
#define do_crm_log(level, fmt, args...)
Log a message.
Definition: logging.h:130
enum election_result election_count_vote(election_t *e, xmlNode *message, bool can_win)
Process an election message (vote or no-vote) from a peer.
Definition: election.c:523
Wrappers for and extensions to libxml2.
void free_xml(xmlNode *child)
Definition: xml.c:2139
gboolean crm_str_eq(const char *a, const char *b, gboolean use_case)
Definition: strings.c:204
void election_timeout_set_period(election_t *e, guint period)
Change an election&#39;s timeout (restarting timer if running)
Definition: election.c:195
const char * crm_element_value(const xmlNode *data, const char *name)
Definition: xml.c:4641
election_t * election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb)
Create a new election object.
Definition: election.c:79
int crm_element_value_timeval(const xmlNode *data, const char *name_sec, const char *name_usec, struct timeval *dest)
Parse a time value from XML.
Definition: xml.c:3346
void election_fini(election_t *e)
Free an election object.
Definition: election.c:155
#define CRM_SYSTEM_CRMD
Definition: crm.h:78
#define CRM_OP_VOTE
Definition: crm.h:109
const char * crm_xml_add(xmlNode *node, const char *name, const char *value)
Definition: xml.c:1907
const char * crm_xml_add_int(xmlNode *node, const char *name, int value)
Definition: xml.c:1995
#define F_CRM_ELECTION_AGE_US
Definition: msg_xml.h:64
#define crm_perror(level, fmt, args...)
Log a system error message.
Definition: logging.h:227
void election_reset(election_t *e)
Stop election timer and disregard all votes.
Definition: election.c:133
#define crm_err(fmt, args...)
Definition: logging.h:249
#define CRM_ASSERT(expr)
Definition: results.h:20
int compare_version(const char *version1, const char *version2)
Definition: utils.c:453
int crm_element_value_int(const xmlNode *data, const char *name, int *dest)
Definition: xml.c:3310
mainloop_timer_t * mainloop_timer_add(const char *name, guint period_ms, bool repeat, GSourceFunc cb, void *userdata)
Definition: mainloop.c:1213
bool election_check(election_t *e)
Check whether local node has won an election.
Definition: election.c:333
#define F_CRM_ELECTION_ID
Definition: msg_xml.h:62
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
gboolean send_cluster_message(crm_node_t *node, enum crm_ais_msg_types service, xmlNode *data, gboolean ordered)
Definition: cluster.c:102
#define create_request(task, xml_data, host_to, sys_to, sys_from, uuid_from)
Definition: ipc.h:30
GHashTable * crm_peer_cache
Definition: membership.c:34
#define crm_info(fmt, args...)
Definition: logging.h:252
#define F_CRM_VERSION
Definition: msg_xml.h:57
uint32_t version
Definition: remote.c:146
#define LOSS_DAMPEN
Definition: election.c:389
void election_remove(election_t *e, const char *uname)
Disregard any previous vote by specified peer.
Definition: election.c:119
enum election_result election_state(election_t *e)
Definition: election.c:54
Functions for conducting elections.
#define F_CRM_ELECTION_OWNER
Definition: msg_xml.h:65