pacemaker  2.0.3-4b1f869f0f
Scalable High-Availability cluster resource manager
watchdog.c
Go to the documentation of this file.
1 /*
2  * Copyright 2013-2019 the Pacemaker project contributors
3  *
4  * The version control history for this file may have further details.
5  *
6  * This source code is licensed under the GNU Lesser General Public License
7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8  */
9 
10 #include <crm_internal.h>
11 
12 #include <sched.h>
13 #include <sys/ioctl.h>
14 #include <sys/reboot.h>
15 
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <unistd.h>
19 #include <ctype.h>
20 #include <dirent.h>
21 #include <signal.h>
22 
23 #ifdef _POSIX_MEMLOCK
24 # include <sys/mman.h>
25 #endif
26 
/*
 * Cached PID of the SBD daemon. It starts at 0, is filled in by
 * pcmk_locate_sbd(), and is reset to 0 there when the lookup yields a
 * negative value. The panic helpers below read it to decide whom to signal.
 */
27 static int sbd_pid = 0;
28 
/*
 * NOTE(review): this is the body of an enum whose header (listing line 29)
 * and enumerators (listing lines 31-34) are not visible in this view.
 * The enumerator values cannot be confirmed from here.
 */
30 {
35 };
36 
/*
 * Send a single command character to the kernel via /proc/sysrq-trigger.
 *
 * t: the command character to write (callers in this file pass 'b' or 'c').
 *
 * The whole body is compiled only when SUPPORT_PROCFS is set; otherwise the
 * function does nothing. Failure to open the trigger file is logged as a
 * warning and otherwise ignored. The results of the write and the close are
 * not checked.
 */
static void
sysrq_trigger(char t)
{
#if SUPPORT_PROCFS
    // Root can always write here, regardless of kernel.sysrq value
    FILE *trigger = fopen("/proc/sysrq-trigger", "a");

    if (trigger != NULL) {
        crm_info("sysrq-trigger: %c", t);
        fprintf(trigger, "%c\n", t);
        fclose(trigger);
    } else {
        crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
    }
#endif // SUPPORT_PROCFS
}
55 
56 
/*
 * Panic the local node without help from SBD.
 *
 * What happens depends on who is calling:
 *  - non-root with a parent PID > 1: log that the parent is being signaled,
 *    then leave (see the review note on the missing listing line below);
 *  - non-root with no such parent: when SUPPORT_PROCFS is set, look up
 *    "pacemakerd" via crm_procfs_pid_of() and send it SIGQUIT, then leave;
 *  - root: write a sysrq command ('c' when PCMK_panic_action is "crash",
 *    otherwise 'b'), call reboot(RB_AUTOBOOT), and if that returns, log the
 *    errno and exit with CRM_EX_PANIC (parent PID > 1) or CRM_EX_FATAL.
 */
57 static void
58 pcmk_panic_local(void)
59 {
60  int rc = pcmk_ok;
61  uid_t uid = geteuid();
62  pid_t ppid = getppid();
63 
64  if(uid != 0 && ppid > 1) {
65  /* We're a non-root pacemaker daemon (pacemaker-based,
66  * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
67  * the original pacemakerd parent.
68  *
69  * Of these, only the controller is likely to be initiating resets.
70  */
71  do_crm_log_always(LOG_EMERG, "Signaling parent %d to panic", ppid);
/* NOTE(review): listing line 72 is not visible in this view. The file's
 * cross-reference index lists crm_exit(), which appears in no visible line,
 * so this is presumably an exit call -- confirm against the real source. */
73  return;
74 
75  } else if (uid != 0) {
76 #if SUPPORT_PROCFS
77  /*
78  * No permissions, and no pacemakerd parent to escalate to.
79  * Track down the new pacemakerd process and send a signal instead.
80  */
81  union sigval signal_value;
82 
83  memset(&signal_value, 0, sizeof(signal_value));
84  ppid = crm_procfs_pid_of("pacemakerd");
85  do_crm_log_always(LOG_EMERG, "Signaling pacemakerd(%d) to panic", ppid);
86 
/* The signal is only attempted when the lookup produced a PID > 1 */
87  if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
88  crm_perror(LOG_EMERG, "Cannot signal pacemakerd(%d) to panic", ppid);
89  }
90 #endif // SUPPORT_PROCFS
91 
92  /* The best we can do now is die */
/* NOTE(review): listing line 93 is not visible in this view; presumably an
 * exit call, as for listing line 72 -- confirm against the real source. */
94  return;
95  }
96 
97  /* We're either pacemakerd, or a pacemaker daemon running as root */
98 
/* PCMK_panic_action=crash selects sysrq 'c'; any other value selects 'b'.
 * (safe_str_eq() is presumably NULL-safe for the unset case -- confirm.) */
99  if (safe_str_eq("crash", getenv("PCMK_panic_action"))) {
100  sysrq_trigger('c');
101  } else {
102  sysrq_trigger('b');
103  }
104  /* reboot(RB_HALT_SYSTEM); rc = errno; */
/* Still running after the sysrq request, so ask for a reboot directly */
105  reboot(RB_AUTOBOOT);
/* Save errno immediately: the log call below treats a return as failure */
106  rc = errno;
107 
108  do_crm_log_always(LOG_EMERG, "Reboot failed, escalating to %d: %s (%d)", ppid, pcmk_strerror(rc), rc);
109 
110  if(ppid > 1) {
111  /* child daemon */
112  exit(CRM_EX_PANIC);
113  } else {
114  /* pacemakerd or orphan child */
115  exit(CRM_EX_FATAL);
116  }
117 }
118 
119 static void
120 pcmk_panic_sbd(void)
121 {
122  union sigval signal_value;
123  pid_t ppid = getppid();
124 
125  do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic", sbd_pid);
126 
127  memset(&signal_value, 0, sizeof(signal_value));
128  /* TODO: Arrange for a slightly less brutal option? */
129  if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
130  crm_perror(LOG_EMERG, "Cannot signal SBD(%d) to terminate", sbd_pid);
131  pcmk_panic_local();
132  }
133 
134  if(ppid > 1) {
135  /* child daemon */
136  exit(CRM_EX_PANIC);
137  } else {
138  /* pacemakerd or orphan child */
139  exit(CRM_EX_FATAL);
140  }
141 }
142 
/*
 * Entry point for panicking the local node.
 *
 * origin: caller-supplied string that is included in every log message
 *         emitted here.
 *
 * The function first refreshes sbd_pid via pcmk_locate_sbd(). If the
 * "panic-delay" log callsite has any targets set, it only logs that it is
 * shutting down instead of panicking and leaves. Otherwise it hands off to
 * pcmk_panic_sbd() when an SBD PID > 1 is known, or to pcmk_panic_local()
 * when it is not.
 */
143 void
144 pcmk_panic(const char *origin)
145 {
/* Looked up once and kept for the lifetime of the process */
146  static struct qb_log_callsite *panic_cs = NULL;
147 
148  if (panic_cs == NULL) {
149  panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay", LOG_TRACE, __LINE__, crm_trace_nonlog);
150  }
151 
152  /* Ensure sbd_pid is set */
153  (void)pcmk_locate_sbd();
154 
/* A non-zero targets field on the callsite switches off the real panic */
155  if (panic_cs && panic_cs->targets) {
156  /* getppid() == 1 means our original parent no longer exists */
157  do_crm_log_always(LOG_EMERG,
158  "Shutting down instead of panicking the node: origin=%s, sbd=%d, parent=%d",
159  origin, sbd_pid, getppid());
/* NOTE(review): listing line 160 is not visible in this view. The file's
 * cross-reference index lists crm_exit(), which appears in no visible line,
 * so this is presumably an exit call -- confirm against the real source. */
161  return;
162  }
163 
164  if(sbd_pid > 1) {
165  do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic the system: %s", sbd_pid, origin);
166  pcmk_panic_sbd();
167 
168  } else {
169  do_crm_log_always(LOG_EMERG, "Panicking the system directly: %s", origin);
170  pcmk_panic_local();
171  }
172 }
173 
174 pid_t
176 {
177  char *pidfile = NULL;
178  char *sbd_path = NULL;
179 
180  if(sbd_pid > 1) {
181  return sbd_pid;
182  }
183 
184  /* Look for the pid file */
185  pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
186  sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
187 
188  /* Read the pid file */
189  CRM_ASSERT(pidfile);
190 
191  sbd_pid = crm_pidfile_inuse(pidfile, 0, sbd_path);
192  if(sbd_pid > 0) {
193  crm_trace("SBD detected at pid=%d (file)", sbd_pid);
194 
195 #if SUPPORT_PROCFS
196  } else {
197  /* Fall back to /proc for systems that support it */
198  sbd_pid = crm_procfs_pid_of("sbd");
199  crm_trace("SBD detected at pid=%d (proc)", sbd_pid);
200 #endif // SUPPORT_PROCFS
201  }
202 
203  if(sbd_pid < 0) {
204  sbd_pid = 0;
205  crm_trace("SBD not detected");
206  }
207 
208  free(pidfile);
209  free(sbd_path);
210 
211  return sbd_pid;
212 }
213 
/*
 * Return SBD's watchdog timeout as given by the SBD_WATCHDOG_TIMEOUT
 * environment variable.
 *
 * The variable is parsed with crm_get_msec() on the first call only. The
 * result is kept in a function-local static (-2 marks "not read yet") and
 * returned unchanged by every later call, so changes to the environment
 * after the first call are not picked up.
 *
 * Returns the parsed value; callers in this file report it in milliseconds.
 * What crm_get_msec() yields for an unset or malformed variable is not
 * visible here -- callers in this file treat values <= 0 as "no timeout".
 */
long
crm_get_sbd_timeout(void)
{
    static long sbd_timeout = -2;

    if (sbd_timeout == -2) {
        /* crm_get_msec() returns long long; the narrowing is intentional */
        sbd_timeout = (long) crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
    }
    return sbd_timeout;
}
224 
/*
 * Derive a default stonith-watchdog-timeout from SBD's watchdog timeout.
 *
 * Returns twice the value of crm_get_sbd_timeout(), or 0 when that value is
 * zero or negative.
 */
long
crm_auto_watchdog_timeout(void)
{
    long sbd_timeout = crm_get_sbd_timeout();

    return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
}
232 
/*
 * Check a stonith-watchdog-timeout setting against the local SBD setup.
 *
 * value: the configured setting as text, parsed with crm_get_msec();
 *        NULL is treated as 0.
 *
 * A negative parsed value is replaced by crm_auto_watchdog_timeout().
 * Then:
 *  - 0 means the feature is disabled: a debug message is logged and the
 *    result is TRUE;
 *  - non-zero while pcmk_locate_sbd() reports no SBD: logged at LOG_EMERG,
 *    result FALSE;
 *  - non-zero but smaller than crm_get_sbd_timeout(): logged at LOG_EMERG,
 *    result FALSE;
 *  - otherwise an info message is logged and the result is TRUE.
 */
233 gboolean
234 check_sbd_timeout(const char *value)
235 {
236  long st_timeout = value? crm_get_msec(value) : 0;
237 
/* Negative means "work it out from SBD's own timeout" */
238  if (st_timeout < 0) {
239  st_timeout = crm_auto_watchdog_timeout();
240  crm_debug("Using calculated value %ld for stonith-watchdog-timeout (%s)",
241  st_timeout, value);
242  }
243 
244  if (st_timeout == 0) {
245  crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
246  value? value : "default");
247 
248  } else if (pcmk_locate_sbd() == 0) {
249  do_crm_log_always(LOG_EMERG,
250  "Shutting down: stonith-watchdog-timeout configured (%s) but SBD not active",
251  (value? value : "auto"));
/* NOTE(review): listing line 252 is not visible in this view. The file's
 * cross-reference index lists crm_exit(), which appears in no visible line,
 * so this is presumably an exit call -- confirm against the real source. */
253  return FALSE;
254 
255  } else {
256  long sbd_timeout = crm_get_sbd_timeout();
257 
/* NOTE(review): the message below says "must be >", yet a value equal to
 * sbd_timeout passes this check -- confirm which one is intended. */
258  if (st_timeout < sbd_timeout) {
259  do_crm_log_always(LOG_EMERG,
260  "Shutting down: stonith-watchdog-timeout (%s) too short (must be >%ldms)",
261  value, sbd_timeout);
/* NOTE(review): listing line 262 is not visible in this view; presumably an
 * exit call, as for listing line 252 -- confirm against the real source. */
263  return FALSE;
264  }
265  crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
266  value, sbd_timeout);
267  }
268  return TRUE;
269 }
pcmk_panic_delay
Definition: watchdog.c:32
check_sbd_timeout
gboolean check_sbd_timeout(const char *value)
Definition: watchdog.c:234
do_crm_log_always
#define do_crm_log_always(level, fmt, args...)
Log a message using constant severity.
Definition: logging.h:206
LOG_TRACE
#define LOG_TRACE
Definition: logging.h:26
pcmk_panic_kdump
Definition: watchdog.c:33
pcmk_panic_shutdown
Definition: watchdog.c:34
pcmk_strerror
const char * pcmk_strerror(int rc)
Definition: results.c:188
crm_trace
#define crm_trace(fmt, args...)
Definition: logging.h:247
safe_str_eq
#define safe_str_eq(a, b)
Definition: util.h:61
pcmk_panic_flags
pcmk_panic_flags
Definition: watchdog.c:29
crm_trace_nonlog
unsigned int crm_trace_nonlog
Definition: logging.c:39
SBIN_DIR
#define SBIN_DIR
Definition: config.h:556
crm_procfs_pid_of
int crm_procfs_pid_of(const char *name)
Definition: procfs.c:110
CRM_EX_PANIC
Definition: results.h:141
pcmk_panic
void pcmk_panic(const char *origin)
Definition: watchdog.c:144
crm_info
#define crm_info(fmt, args...)
Definition: logging.h:244
crm_auto_watchdog_timeout
long crm_auto_watchdog_timeout()
Definition: watchdog.c:226
pcmk_locate_sbd
pid_t pcmk_locate_sbd(void)
Definition: watchdog.c:175
pcmk_panic_none
Definition: watchdog.c:31
crm_strdup_printf
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
crm_get_sbd_timeout
long crm_get_sbd_timeout(void)
Definition: watchdog.c:215
crm_debug
#define crm_debug(fmt, args...)
Definition: logging.h:246
crm_pidfile_inuse
long crm_pidfile_inuse(const char *filename, long mypid, const char *daemon)
Definition: pid.c:141
crm_perror
#define crm_perror(level, fmt, args...)
Log a system error message.
Definition: logging.h:219
crm_get_msec
long long crm_get_msec(const char *input)
Definition: utils.c:572
CRM_ASSERT
#define CRM_ASSERT(expr)
Definition: results.h:42
PCMK_RUN_DIR
#define PCMK_RUN_DIR
Definition: config.h:544
crm_exit
_Noreturn crm_exit_t crm_exit(crm_exit_t rc)
Definition: results.c:478
crm_internal.h
pcmk_ok
#define pcmk_ok
Definition: results.h:57
CRM_EX_FATAL
Definition: results.h:140