summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorWilliam A. Kennington III <wak@google.com>2018-02-27 18:47:05 -0800
committerBrad Bishop <bradleyb@fuzziesquirrel.com>2018-03-07 11:23:10 +0000
commitd13310864134b8e5a237397a9e08607cb2a01013 (patch)
tree499cfcbbf369689d7ae5a97c5a50b4f67c77164a
parent825f4981ef91031f3a9a549ff011ed9cc943a9b1 (diff)
downloadphosphor-watchdog-d13310864134b8e5a237397a9e08607cb2a01013.tar.gz
phosphor-watchdog-d13310864134b8e5a237397a9e08607cb2a01013.zip
Implement a fallback watchdog option
Sometimes our initial watchdog action is not enough to recover the host from the state it transitioned into. However, always using a more power form of power cycle is not desirable as we can lose useful CPU crash state. It is desirable in this case to have two levels of watchog timers. This patch implements the ability for the service to specify a fallback watchdog action and interval. After the initial watchdog timeout is encountered, the watchdog will be re-armed with the new parameters. Once the watchdog times out again it will execute the fallback action. Attempts to update the timeRemaining will reset the fallback just in case something is still alive. Change-Id: I69f4422c7e3963f02200815f3cef620af9e6cf8b Signed-off-by: William A. Kennington III <wak@google.com>
-rw-r--r--argument.cpp23
-rw-r--r--mainapp.cpp46
-rw-r--r--test/watchdog_test.cpp135
-rw-r--r--watchdog.cpp43
-rw-r--r--watchdog.hpp21
5 files changed, 250 insertions, 18 deletions
diff --git a/argument.cpp b/argument.cpp
index d212261..c2d99ce 100644
--- a/argument.cpp
+++ b/argument.cpp
@@ -28,15 +28,17 @@ using namespace std::string_literals;
const std::vector<std::string> emptyArg;
const std::string ArgumentParser::trueString = "true"s;
-const char* ArgumentParser::optionStr = "p:s:t:a:ch";
+const char* ArgumentParser::optionStr = "p:s:t:a:f:i:ch";
const option ArgumentParser::options[] =
{
- { "path", required_argument, nullptr, 'p' },
- { "service", required_argument, nullptr, 's' },
- { "target", required_argument, nullptr, 't' },
- { "action_target", required_argument, nullptr, 'a' },
- { "continue", no_argument, nullptr, 'c' },
- { "help", no_argument, nullptr, 'h' },
+ { "path", required_argument, nullptr, 'p' },
+ { "service", required_argument, nullptr, 's' },
+ { "target", required_argument, nullptr, 't' },
+ { "action_target", required_argument, nullptr, 'a' },
+ { "fallback_action", required_argument, nullptr, 'f' },
+ { "fallback_interval", required_argument, nullptr, 'i' },
+ { "continue", no_argument, nullptr, 'c' },
+ { "help", no_argument, nullptr, 'h' },
{ 0, 0, 0, 0},
};
@@ -102,6 +104,13 @@ void ArgumentParser::usage(char * const argv[])
std::cerr << " [--action_target=<action>=<systemd unit>] Map of action to "
"systemd unit to be called on timeout if that action is "
"set for ExpireAction when the timer expires.\n";
+ std::cerr << " [--fallback_action=<action>] Enables the "
+ "watchdog even when disabled via the dbus interface. "
+ "Perform this action when the fallback expires.\n";
+ std::cerr << " [--fallback_interval=<interval in ms>] Enables the "
+ "watchdog even when disabled via the dbus interface. "
+ "Waits for this interval before performing the fallback "
+ "action.\n";
std::cerr << " [--continue] Continue daemon "
"after watchdog timeout.\n";
}
diff --git a/mainapp.cpp b/mainapp.cpp
index d935bfb..365fd50 100644
--- a/mainapp.cpp
+++ b/mainapp.cpp
@@ -15,6 +15,7 @@
*/
#include <iostream>
+#include <experimental/optional>
#include <phosphor-logging/log.hpp>
#include <phosphor-logging/elog.hpp>
#include <phosphor-logging/elog-errors.hpp>
@@ -133,6 +134,47 @@ int main(int argc, char** argv)
}
printActionTargets(actionTargets);
+ // Parse out the fallback settings for the watchdog. Note that we require
+ // both of the fallback arguments to do anything here, but having a fallback
+ // is entirely optional.
+ auto fallbackActionParam = (options)["fallback_action"];
+ auto fallbackIntervalParam = (options)["fallback_interval"];
+ if (fallbackActionParam.empty() ^ fallbackIntervalParam.empty())
+ {
+ exitWithError("Only one of the fallback options was specified.", argv);
+ }
+ if (fallbackActionParam.size() > 1 || fallbackIntervalParam.size() > 1)
+ {
+ exitWithError("Multiple fallbacks specified.", argv);
+ }
+ std::experimental::optional<Watchdog::Fallback> fallback;
+ if (!fallbackActionParam.empty())
+ {
+ Watchdog::Action action;
+ try
+ {
+ action = Watchdog::convertActionFromString(
+ fallbackActionParam.back());
+ }
+ catch (const sdbusplus::exception::InvalidEnumString &)
+ {
+ exitWithError("Bad action specified.", argv);
+ }
+ uint64_t interval;
+ try
+ {
+ interval = std::stoull(fallbackIntervalParam.back());
+ }
+ catch (const std::logic_error &)
+ {
+ exitWithError("Failed to convert fallback interval to integer.", argv);
+ }
+ fallback = Watchdog::Fallback{
+ .action = action,
+ .interval = interval,
+ };
+ }
+
sd_event* event = nullptr;
auto r = sd_event_default(&event);
if (r < 0)
@@ -155,7 +197,9 @@ int main(int argc, char** argv)
try
{
// Create a watchdog object
- Watchdog watchdog(bus, path.c_str(), eventP, std::move(actionTargets));
+ Watchdog watchdog(bus, path.c_str(), eventP, std::move(actionTargets),
+ std::move(fallback));
+
// Claim the bus
bus.request_name(service.c_str());
diff --git a/test/watchdog_test.cpp b/test/watchdog_test.cpp
index a749435..af7c3ed 100644
--- a/test/watchdog_test.cpp
+++ b/test/watchdog_test.cpp
@@ -1,4 +1,6 @@
#include <chrono>
+#include <memory>
+#include <utility>
#include "watchdog_test.hpp"
@@ -170,3 +172,136 @@ TEST_F(WdogTest, enableWdogAndWaitTillEnd)
EXPECT_TRUE(wdog->timerExpired());
EXPECT_FALSE(wdog->timerEnabled());
}
+
+/** @brief Make sure the watchdog is started and enabled with a fallback
+ * Wait through the initial trip and ensure the fallback is observed
+ * Make sure that fallback runs to completion and ensure the watchdog
+ * is disabled
+ */
+TEST_F(WdogTest, enableWdogWithFallbackTillEnd)
+{
+ auto primaryInterval = 5s;
+ auto primaryIntervalMs = milliseconds(primaryInterval).count();
+ auto fallbackInterval = primaryInterval * 2;
+ auto fallbackIntervalMs = milliseconds(fallbackInterval).count();
+
+ // We need to make a wdog with the right fallback options
+ // The interval is set to be noticeably different from the default
+ // so we can always tell the difference
+ Watchdog::Fallback fallback{
+ .action = Watchdog::Action::PowerOff,
+ .interval = static_cast<uint64_t>(fallbackIntervalMs),
+ };
+ std::map<Watchdog::Action, Watchdog::TargetName> emptyActionTargets;
+ wdog = std::make_unique<Watchdog>(bus, TEST_PATH, eventP,
+ std::move(emptyActionTargets), std::move(fallback));
+ EXPECT_EQ(primaryInterval, milliseconds(wdog->interval(primaryIntervalMs)));
+ EXPECT_FALSE(wdog->enabled());
+ EXPECT_EQ(0, wdog->timeRemaining());
+
+ // Enable and then verify
+ EXPECT_TRUE(wdog->enabled(true));
+
+ // Waiting default expiration
+ EXPECT_EQ(primaryInterval - 1s, waitForWatchdog(primaryInterval));
+
+ // We should now have entered the fallback once the primary expires
+ EXPECT_FALSE(wdog->enabled());
+ auto remaining = milliseconds(wdog->timeRemaining());
+ EXPECT_GE(fallbackInterval, remaining);
+ EXPECT_LT(primaryInterval, remaining);
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_TRUE(wdog->timerEnabled());
+
+ // We should still be ticking in fallback when setting action or interval
+ auto newInterval = primaryInterval - 1s;
+ auto newIntervalMs = milliseconds(newInterval).count();
+ EXPECT_EQ(newInterval, milliseconds(wdog->interval(newIntervalMs)));
+ EXPECT_EQ(Watchdog::Action::None,
+ wdog->expireAction(Watchdog::Action::None));
+
+ EXPECT_FALSE(wdog->enabled());
+ EXPECT_GE(remaining, milliseconds(wdog->timeRemaining()));
+ EXPECT_LT(primaryInterval, milliseconds(wdog->timeRemaining()));
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_TRUE(wdog->timerEnabled());
+
+ // Test that setting the timeRemaining always resets the timer to the
+ // fallback interval
+ EXPECT_EQ(fallback.interval, wdog->timeRemaining(primaryInterval.count()));
+ EXPECT_FALSE(wdog->enabled());
+
+ remaining = milliseconds(wdog->timeRemaining());
+ EXPECT_GE(fallbackInterval, remaining);
+ EXPECT_LE(fallbackInterval - defaultDrift, remaining);
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_TRUE(wdog->timerEnabled());
+
+ // Waiting fallback expiration
+ EXPECT_EQ(fallbackInterval - 1s, waitForWatchdog(fallbackInterval));
+
+ // We should now have disabled the watchdog after the fallback expires
+ EXPECT_FALSE(wdog->enabled());
+ EXPECT_EQ(0, wdog->timeRemaining());
+ EXPECT_TRUE(wdog->timerExpired());
+ EXPECT_FALSE(wdog->timerEnabled());
+
+ // Make sure enabling the watchdog again works
+ EXPECT_TRUE(wdog->enabled(true));
+
+ // We should have re-entered the primary
+ EXPECT_TRUE(wdog->enabled());
+ EXPECT_GE(primaryInterval, milliseconds(wdog->timeRemaining()));
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_TRUE(wdog->timerEnabled());
+}
+
+/** @brief Make sure the watchdog is started and enabled with a fallback
+ * Wait through the initial trip and ensure the fallback is observed
+ * Make sure that we can re-enable the watchdog during fallback
+ */
+TEST_F(WdogTest, enableWdogWithFallbackReEnable)
+{
+ auto primaryInterval = 5s;
+ auto primaryIntervalMs = milliseconds(primaryInterval).count();
+ auto fallbackInterval = primaryInterval * 2;
+ auto fallbackIntervalMs = milliseconds(fallbackInterval).count();
+
+ // We need to make a wdog with the right fallback options
+ // The interval is set to be noticeably different from the default
+ // so we can always tell the difference
+ Watchdog::Fallback fallback{
+ .action = Watchdog::Action::PowerOff,
+ .interval = static_cast<uint64_t>(fallbackIntervalMs),
+ };
+ std::map<Watchdog::Action, Watchdog::TargetName> emptyActionTargets;
+ wdog = std::make_unique<Watchdog>(bus, TEST_PATH, eventP,
+ std::move(emptyActionTargets), std::move(fallback));
+ EXPECT_EQ(primaryInterval, milliseconds(wdog->interval(primaryIntervalMs)));
+ EXPECT_FALSE(wdog->enabled());
+ EXPECT_EQ(0, wdog->timeRemaining());
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_FALSE(wdog->timerEnabled());
+
+ // Enable and then verify
+ EXPECT_TRUE(wdog->enabled(true));
+
+ // Waiting default expiration
+ EXPECT_EQ(primaryInterval - 1s, waitForWatchdog(primaryInterval));
+
+ // We should now have entered the fallback once the primary expires
+ EXPECT_FALSE(wdog->enabled());
+ auto remaining = milliseconds(wdog->timeRemaining());
+ EXPECT_GE(fallbackInterval, remaining);
+ EXPECT_LT(primaryInterval, remaining);
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_TRUE(wdog->timerEnabled());
+
+ EXPECT_TRUE(wdog->enabled(true));
+
+ // We should have re-entered the primary
+ EXPECT_TRUE(wdog->enabled());
+ EXPECT_GE(primaryInterval, milliseconds(wdog->timeRemaining()));
+ EXPECT_FALSE(wdog->timerExpired());
+ EXPECT_TRUE(wdog->timerEnabled());
+}
diff --git a/watchdog.cpp b/watchdog.cpp
index 24e5416..b0c7b6f 100644
--- a/watchdog.cpp
+++ b/watchdog.cpp
@@ -19,8 +19,12 @@ bool Watchdog::enabled(bool value)
{
if (!value)
{
- // Attempt to disable our timer if needed
- tryDisable();
+ // Make sure we accurately reflect our enabled state to the
+ // tryFallbackOrDisable() call
+ WatchdogInherits::enabled(value);
+
+ // Attempt to fallback or disable our timer if needed
+ tryFallbackOrDisable();
return false;
}
@@ -78,6 +82,13 @@ uint64_t Watchdog::timeRemaining(uint64_t value)
return 0;
}
+ if (!this->enabled())
+ {
+ // Having a timer but not displaying an enabled value means we
+ // are inside of the fallback
+ value = fallback->interval;
+ }
+
// Update new expiration
auto usec = duration_cast<microseconds>(milliseconds(value));
timer.start(usec);
@@ -89,9 +100,13 @@ uint64_t Watchdog::timeRemaining(uint64_t value)
// Optional callback function on timer expiration
void Watchdog::timeOutHandler()
{
- auto action = expireAction();
- auto target = actionTargets.find(action);
+ Action action = expireAction();
+ if (!this->enabled())
+ {
+ action = fallback->action;
+ }
+ auto target = actionTargets.find(action);
if (target == actionTargets.end())
{
log<level::INFO>("watchdog: Timed out with no target",
@@ -112,12 +127,26 @@ void Watchdog::timeOutHandler()
bus.call_noreply(method);
}
- tryDisable();
+ tryFallbackOrDisable();
}
-void Watchdog::tryDisable()
+void Watchdog::tryFallbackOrDisable()
{
- if (timerEnabled())
+ // We only re-arm the watchdog if we were already enabled and have
+ // a possible fallback
+ if (fallback && this->enabled())
+ {
+ auto interval_ms = fallback->interval;
+ auto interval_us = duration_cast<microseconds>(milliseconds(interval_ms));
+
+ timer.clearExpired();
+ timer.start(interval_us);
+ timer.setEnabled<std::true_type>();
+
+ log<level::INFO>("watchdog: falling back",
+ entry("INTERVAL=%llu", interval_ms));
+ }
+ else if (timerEnabled())
{
timer.setEnabled<std::false_type>();
diff --git a/watchdog.hpp b/watchdog.hpp
index ee5e65d..60bad4e 100644
--- a/watchdog.hpp
+++ b/watchdog.hpp
@@ -1,6 +1,7 @@
#pragma once
#include <systemd/sd-event.h>
+#include <experimental/optional>
#include <sdbusplus/bus.hpp>
#include <sdbusplus/server/object.hpp>
#include <xyz/openbmc_project/State/Watchdog/server.hpp>
@@ -31,21 +32,32 @@ class Watchdog : public WatchdogInherits
*/
using TargetName = std::string;
+ /** @brief Type used to specify the parameters of a fallback watchdog
+ */
+ struct Fallback {
+ Action action;
+ uint64_t interval;
+ };
+
/** @brief Constructs the Watchdog object
*
* @param[in] bus - DBus bus to attach to.
* @param[in] objPath - Object path to attach to.
* @param[in] event - reference to sd_event unique pointer
* @param[in] actionTargets - map of systemd targets called on timeout
+ * @param[in] fallback
*/
Watchdog(sdbusplus::bus::bus& bus,
const char* objPath,
EventPtr& event,
std::map<Action, TargetName>&& actionTargets =
- std::map<Action, TargetName>()) :
+ std::map<Action, TargetName>(),
+ std::experimental::optional<Fallback>&&
+ fallback = std::experimental::nullopt) :
WatchdogInherits(bus, objPath),
bus(bus),
actionTargets(std::move(actionTargets)),
+ fallback(std::move(fallback)),
timer(event, std::bind(&Watchdog::timeOutHandler, this))
{
// Nothing
@@ -106,14 +118,17 @@ class Watchdog : public WatchdogInherits
/** @brief Map of systemd units to be started when the timer expires */
std::map<Action, TargetName> actionTargets;
+ /** @brief Fallback timer options */
+ std::experimental::optional<Fallback> fallback;
+
/** @brief Contained timer object */
Timer timer;
/** @brief Optional Callback handler on timer expirartion */
void timeOutHandler();
- /** @brief Attempt to disable the watchdog if needed */
- void tryDisable();
+ /** @brief Attempt to enter the fallback watchdog or disables it */
+ void tryFallbackOrDisable();
};
} // namespace watchdog
OpenPOWER on IntegriCloud