Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a delay between killing teamd processes #3325

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
e5683d5
Add a delay between killing teamd processes
saiarcot895 Oct 13, 2024
f4fd3ab
Update LAG removal code to use the same logic as cleaning up all LAGs
saiarcot895 Oct 22, 2024
7b6fc53
Update tests to test LAG cleanup and to test with the new code
saiarcot895 Oct 22, 2024
27f6d3c
Merge remote-tracking branch 'origin/master' into teamd-delay-kill
saiarcot895 Oct 22, 2024
bdd47c7
Merge remote-tracking branch 'origin/master' into teamd-delay-kill
saiarcot895 Oct 22, 2024
c5d84cf
Add more tests to cover more cases
saiarcot895 Oct 23, 2024
1dd20a0
Merge branch 'master' into teamd-delay-kill
dgsudharsan Oct 28, 2024
8f71480
Merge branch 'master' into teamd-delay-kill
dgsudharsan Nov 4, 2024
f39d60f
Merge branch 'master' into teamd-delay-kill
dgsudharsan Nov 11, 2024
e7ce08d
Wait 200ms instead of 100ms, and fix teamd wait code
saiarcot895 Nov 13, 2024
ce1fdae
Merge remote-tracking branch 'refs/remotes/personal/teamd-delay-kill'…
saiarcot895 Nov 13, 2024
726f800
Merge remote-tracking branch 'origin/master' into teamd-delay-kill
saiarcot895 Nov 13, 2024
d2ccbb3
Merge remote-tracking branch 'origin/master' into teamd-delay-kill
saiarcot895 Nov 25, 2024
181bdb9
Merge branch 'master' into teamd-delay-kill
saiarcot895 Nov 26, 2024
4d99bf4
Merge branch 'master' into teamd-delay-kill
saiarcot895 Dec 2, 2024
6472d33
Merge branch 'master' into teamd-delay-kill
dgsudharsan Dec 3, 2024
fff96a3
Merge branch 'master' into teamd-delay-kill
saiarcot895 Dec 10, 2024
cd38a76
Merge branch 'master' into teamd-delay-kill
judyjoseph Dec 15, 2024
9f6ab64
Try 10ms sleep
saiarcot895 Jan 3, 2025
9aaf399
Merge remote-tracking branch 'origin/master' into teamd-delay-kill
saiarcot895 Jan 6, 2025
702ffda
Fix code style
saiarcot895 Jan 8, 2025
a4a5d83
Merge remote-tracking branch 'origin/master' into teamd-delay-kill
saiarcot895 Jan 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 36 additions & 65 deletions cfgmgr/teammgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <signal.h>


Expand Down Expand Up @@ -171,18 +173,30 @@ void TeamMgr::cleanTeamProcesses()
SWSS_LOG_ENTER();
SWSS_LOG_NOTICE("Cleaning up LAGs during shutdown...");

std::unordered_map<std::string, pid_t> aliasPidMap;
std::unordered_map<std::string, int> aliasPidMap;
uint32_t sleepCounter = 0;

for (const auto& alias: m_lagList)
{
std::string res;
pid_t pid;
if (++sleepCounter % 10 == 0) {
// Sleep for 100 milliseconds so as to not overwhelm the netlink
judyjoseph marked this conversation as resolved.
Show resolved Hide resolved
// socket buffers with events about interfaces going down
std::this_thread::sleep_for(std::chrono::milliseconds(200));
sleepCounter = 0;
}

try
{
std::stringstream cmd;
cmd << "cat " << shellquote("/var/run/teamd/" + alias + ".pid");
EXEC_WITH_ERROR_THROW(cmd.str(), res);
ifstream pidFile("/var/run/teamd/" + alias + ".pid");
if (pidFile.is_open()) {
pidFile >> pid;
aliasPidMap[alias] = pid;
SWSS_LOG_INFO("Read port channel %s pid %d", alias.c_str(), pid);
} else {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

{ -> new line

SWSS_LOG_NOTICE("Unable to read pid file for %s, skipping...", alias.c_str());
continue;
}
}
catch (const std::exception &e)
{
Expand All @@ -191,31 +205,11 @@ void TeamMgr::cleanTeamProcesses()
continue;
}

try
{
pid = static_cast<pid_t>(std::stoul(res, nullptr, 10));
aliasPidMap[alias] = pid;

SWSS_LOG_INFO("Read port channel %s pid %d", alias.c_str(), pid);
}
catch (const std::exception &e)
{
SWSS_LOG_ERROR("Failed to read port channel %s pid: %s", alias.c_str(), e.what());
continue;
}

try
{
std::stringstream cmd;
cmd << "kill -TERM " << pid;
EXEC_WITH_ERROR_THROW(cmd.str(), res);

SWSS_LOG_NOTICE("Sent SIGTERM to port channel %s pid %d", alias.c_str(), pid);
}
catch (const std::exception &e)
{
SWSS_LOG_ERROR("Failed to send SIGTERM to port channel %s pid %d: %s", alias.c_str(), pid, e.what());
if (kill(pid, SIGTERM)) {
SWSS_LOG_ERROR("Failed to send SIGTERM to port channel %s pid %d: %s", alias.c_str(), pid, strerror(errno));
aliasPidMap.erase(alias);
} else {
SWSS_LOG_NOTICE("Sent SIGTERM to port channel %s pid %d", alias.c_str(), pid);
}
}

Expand All @@ -224,13 +218,11 @@ void TeamMgr::cleanTeamProcesses()
const auto &alias = cit.first;
const auto &pid = cit.second;

std::stringstream cmd;
std::string res;

SWSS_LOG_NOTICE("Waiting for port channel %s pid %d to stop...", alias.c_str(), pid);

cmd << "tail -f --pid=" << pid << " /dev/null";
EXEC_WITH_ERROR_THROW(cmd.str(), res);
while (!kill(pid, 0)) {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}

SWSS_LOG_NOTICE("LAGs cleanup is done");
Expand Down Expand Up @@ -658,42 +650,21 @@ bool TeamMgr::removeLag(const string &alias)
{
SWSS_LOG_ENTER();

stringstream cmd;
string res;
pid_t pid;

try
{
std::stringstream cmd;
cmd << "cat " << shellquote("/var/run/teamd/" + alias + ".pid");
EXEC_WITH_ERROR_THROW(cmd.str(), res);
}
catch (const std::exception &e)
{
SWSS_LOG_NOTICE("Failed to remove non-existent port channel %s pid...", alias.c_str());
return false;
}

try
{
pid = static_cast<pid_t>(std::stoul(res, nullptr, 10));
SWSS_LOG_INFO("Read port channel %s pid %d", alias.c_str(), pid);
}
catch (const std::exception &e)
{
SWSS_LOG_ERROR("Failed to read port channel %s pid: %s", alias.c_str(), e.what());
return false;
ifstream pidfile("/var/run/teamd/" + alias + ".pid");
if (pidfile.is_open()) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@saiarcot895 , can you please move all { -> new line to be aligned with the coding style

pidfile >> pid;
SWSS_LOG_INFO("Read port channel %s pid %d", alias.c_str(), pid);
} else {
SWSS_LOG_NOTICE("Failed to remove non-existent port channel %s pid...", alias.c_str());
return false;
}
}

try
{
std::stringstream cmd;
cmd << "kill -TERM " << pid;
EXEC_WITH_ERROR_THROW(cmd.str(), res);
}
catch (const std::exception &e)
{
SWSS_LOG_ERROR("Failed to send SIGTERM to port channel %s pid %d: %s", alias.c_str(), pid, e.what());
if (kill(pid, SIGTERM)) {
SWSS_LOG_ERROR("Failed to send SIGTERM to port channel %s pid %d: %s", alias.c_str(), pid, strerror(errno));
return false;
}

Expand Down
4 changes: 2 additions & 2 deletions tests/mock_tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -227,8 +227,8 @@ tests_teammgrd_SOURCES = teammgrd/teammgr_ut.cpp \
tests_teammgrd_INCLUDES = $(tests_INCLUDES) -I$(top_srcdir)/cfgmgr -I$(top_srcdir)/lib
tests_teammgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_GTEST) $(CFLAGS_SAI)
tests_teammgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_GTEST) $(CFLAGS_SAI) $(tests_teammgrd_INCLUDES)
tests_teammgrd_LDADD = $(LDADD_GTEST) $(LDADD_SAI) -lnl-genl-3 -lhiredis -lhiredis \
-lswsscommon -lswsscommon -lgtest -lgtest_main -lzmq -lnl-3 -lnl-route-3 -lpthread -lgmock -lgmock_main
tests_teammgrd_LDADD = $(LDADD_GTEST) $(LDADD_SAI) -ldl -lhiredis \
-lswsscommon -lgtest -lgtest_main -lzmq -lpthread -lgmock -lgmock_main

## fpmsyncd unit tests

Expand Down
198 changes: 187 additions & 11 deletions tests/mock_tests/teammgrd/teammgr_ut.cpp
Original file line number Diff line number Diff line change
@@ -1,22 +1,115 @@
#include "gtest/gtest.h"
#include "../mock_table.h"
#include "teammgr.h"
#include <dlfcn.h>

extern int (*callback)(const std::string &cmd, std::string &stdout);
extern std::vector<std::string> mockCallArgs;
static std::vector< std::pair<pid_t, int> > mockKillCommands;
static std::map<std::string, std::FILE*> pidFiles;

static int (*callback_kill)(pid_t pid, int sig) = NULL;
static std::pair<bool, FILE*> (*callback_fopen)(const char *pathname, const char *mode) = NULL;

static int cb_kill(pid_t pid, int sig)
{
mockKillCommands.push_back(std::make_pair(pid, sig));
if (!sig) {
errno = ESRCH;
return -1;
} else {
return 0;
}
}

int kill(pid_t pid, int sig)
{
if (callback_kill) {
return callback_kill(pid, sig);
}
int (*realfunc)(pid_t, int) =
(int(*)(pid_t, int))(dlsym (RTLD_NEXT, "kill"));
return realfunc(pid, sig);
}

static std::pair<bool, FILE*> cb_fopen(const char *pathname, const char *mode)
{
auto pidFileSearch = pidFiles.find(pathname);
if (pidFileSearch != pidFiles.end()) {
if (!pidFileSearch->second) {
errno = ENOENT;
}
return std::make_pair(true, pidFileSearch->second);
} else {
return std::make_pair(false, (FILE*)NULL);
}
}

FILE* fopen(const char *pathname, const char *mode)
{
if (callback_fopen) {
std::pair<bool, FILE*> callback_fd = callback_fopen(pathname, mode);
if (callback_fd.first) {
return callback_fd.second;
}
}
FILE* (*realfunc)(const char *, const char *) =
(FILE* (*)(const char *, const char *))(dlsym (RTLD_NEXT, "fopen"));
return realfunc(pathname, mode);
}

FILE* fopen64(const char *pathname, const char *mode)
{
if (callback_fopen) {
std::pair<bool, FILE*> callback_fd = callback_fopen(pathname, mode);
if (callback_fd.first) {
return callback_fd.second;
}
}
FILE* (*realfunc)(const char *, const char *) =
(FILE* (*)(const char *, const char *))(dlsym (RTLD_NEXT, "fopen64"));
return realfunc(pathname, mode);
}

int cb(const std::string &cmd, std::string &stdout)
{
mockCallArgs.push_back(cmd);
if (cmd.find("/usr/bin/teamd -r -t PortChannel1") != std::string::npos)
if (cmd.find("/usr/bin/teamd -r -t PortChannel382") != std::string::npos)
{
mkdir("/var/run/teamd", 0755);
std::FILE* pidFile = std::tmpfile();
std::fputs("1234", pidFile);
std::rewind(pidFile);
pidFiles["/var/run/teamd/PortChannel382.pid"] = pidFile;
return 1;
}
else if (cmd.find("cat \"/var/run/teamd/PortChannel1.pid\"") != std::string::npos)
else if (cmd.find("/usr/bin/teamd -r -t PortChannel812") != std::string::npos)
{
stdout = "1234";
pidFiles["/var/run/teamd/PortChannel812.pid"] = NULL;
return 1;
}
else if (cmd.find("/usr/bin/teamd -r -t PortChannel495") != std::string::npos)
{
mkdir("/var/run/teamd", 0755);
std::FILE* pidFile = std::tmpfile();
std::fputs("5678", pidFile);
std::rewind(pidFile);
pidFiles["/var/run/teamd/PortChannel495.pid"] = pidFile;
return 0;
}
else if (cmd.find("/usr/bin/teamd -r -t PortChannel198") != std::string::npos)
{
pidFiles["/var/run/teamd/PortChannel198.pid"] = NULL;
}
else {
for (int i = 600; i < 620; i++)
{
if (cmd.find(std::string("/usr/bin/teamd -r -t PortChannel") + std::to_string(i)) != std::string::npos)
{
pidFiles[std::string("/var/run/teamd/PortChannel") + std::to_string(i) + std::string(".pid")] = NULL;
}
}
}
return 0;
}

Expand Down Expand Up @@ -53,26 +146,109 @@ namespace teammgr_ut

cfg_lag_tables = tables;
mockCallArgs.clear();
mockKillCommands.clear();
pidFiles.clear();
callback = cb;
callback_kill = cb_kill;
callback_fopen = cb_fopen;
}

virtual void TearDown() override
{
callback = NULL;
callback_kill = NULL;
callback_fopen = NULL;
}
};

TEST_F(TeamMgrTest, testProcessKilledAfterAddLagFailure)
{
swss::TeamMgr teammgr(m_config_db.get(), m_app_db.get(), m_state_db.get(), cfg_lag_tables);
swss::Table cfg_lag_table = swss::Table(m_config_db.get(), CFG_LAG_TABLE_NAME);
cfg_lag_table.set("PortChannel1", { { "admin_status", "up" },
cfg_lag_table.set("PortChannel382", { { "admin_status", "up" },
{ "mtu", "9100" },
{ "lacp_key", "auto" },
{ "min_links", "2" } });
teammgr.addExistingData(&cfg_lag_table);
teammgr.doTask();
int kill_cmd_called = 0;
for (auto cmd : mockCallArgs){
if (cmd.find("kill -TERM 1234") != std::string::npos){
kill_cmd_called++;
}
ASSERT_NE(mockCallArgs.size(), 0);
EXPECT_NE(mockCallArgs.front().find("/usr/bin/teamd -r -t PortChannel382"), std::string::npos);
EXPECT_EQ(mockCallArgs.size(), 1);
EXPECT_EQ(mockKillCommands.size(), 1);
EXPECT_EQ(mockKillCommands.front().first, 1234);
EXPECT_EQ(mockKillCommands.front().second, SIGTERM);
}

TEST_F(TeamMgrTest, testProcessPidFileMissingAfterAddLagFailure)
{
swss::TeamMgr teammgr(m_config_db.get(), m_app_db.get(), m_state_db.get(), cfg_lag_tables);
swss::Table cfg_lag_table = swss::Table(m_config_db.get(), CFG_LAG_TABLE_NAME);
cfg_lag_table.set("PortChannel812", { { "admin_status", "up" },
{ "mtu", "9100" },
{ "fallback", "true" },
{ "lacp_key", "auto" },
{ "min_links", "1" } });
teammgr.addExistingData(&cfg_lag_table);
teammgr.doTask();
ASSERT_NE(mockCallArgs.size(), 0);
EXPECT_NE(mockCallArgs.front().find("/usr/bin/teamd -r -t PortChannel812"), std::string::npos);
EXPECT_EQ(mockCallArgs.size(), 1);
EXPECT_EQ(mockKillCommands.size(), 0);
}

TEST_F(TeamMgrTest, testProcessCleanupAfterAddLag)
{
swss::TeamMgr teammgr(m_config_db.get(), m_app_db.get(), m_state_db.get(), cfg_lag_tables);
swss::Table cfg_lag_table = swss::Table(m_config_db.get(), CFG_LAG_TABLE_NAME);
cfg_lag_table.set("PortChannel495", { { "admin_status", "up" },
{ "mtu", "9100" },
{ "lacp_key", "auto" },
{ "min_links", "2" } });
teammgr.addExistingData(&cfg_lag_table);
teammgr.doTask();
ASSERT_EQ(mockCallArgs.size(), 3);
ASSERT_NE(mockCallArgs.front().find("/usr/bin/teamd -r -t PortChannel495"), std::string::npos);
teammgr.cleanTeamProcesses();
EXPECT_EQ(mockKillCommands.size(), 2);
EXPECT_EQ(mockKillCommands.front().first, 5678);
EXPECT_EQ(mockKillCommands.front().second, SIGTERM);
}

TEST_F(TeamMgrTest, testProcessPidFileMissingDuringCleanup)
{
swss::TeamMgr teammgr(m_config_db.get(), m_app_db.get(), m_state_db.get(), cfg_lag_tables);
swss::Table cfg_lag_table = swss::Table(m_config_db.get(), CFG_LAG_TABLE_NAME);
cfg_lag_table.set("PortChannel198", { { "admin_status", "up" },
{ "mtu", "9100" },
{ "fallback", "true" },
{ "lacp_key", "auto" },
{ "min_links", "1" } });
teammgr.addExistingData(&cfg_lag_table);
teammgr.doTask();
ASSERT_NE(mockCallArgs.size(), 0);
EXPECT_NE(mockCallArgs.front().find("/usr/bin/teamd -r -t PortChannel198"), std::string::npos);
EXPECT_EQ(mockCallArgs.size(), 3);
teammgr.cleanTeamProcesses();
EXPECT_EQ(mockKillCommands.size(), 0);
}

TEST_F(TeamMgrTest, testSleepDuringCleanup)
{
swss::TeamMgr teammgr(m_config_db.get(), m_app_db.get(), m_state_db.get(), cfg_lag_tables);
swss::Table cfg_lag_table = swss::Table(m_config_db.get(), CFG_LAG_TABLE_NAME);
for (int i = 600; i < 620; i++)
{
cfg_lag_table.set(std::string("PortChannel") + std::to_string(i), { { "admin_status", "up" },
{ "mtu", "9100" },
{ "lacp_key", "auto" } });
}
ASSERT_EQ(kill_cmd_called, 1);
teammgr.addExistingData(&cfg_lag_table);
teammgr.doTask();
ASSERT_EQ(mockCallArgs.size(), 60);
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
teammgr.cleanTeamProcesses();
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
EXPECT_EQ(mockKillCommands.size(), 0);
EXPECT_GE(std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count(), 400);
}
}
}
Loading