From 48d0f60683222e16bf2fdcdfa88a2b8601eef1ce Mon Sep 17 00:00:00 2001 From: mike-dubrovsky Date: Thu, 12 Mar 2026 00:44:29 -0700 Subject: [PATCH] Skip APPL_STATE_DB writes when FIB suppression is disabled Route download speed degraded after BGP prefix suppression was introduced. RouteOrch::publishRouteState() unconditionally writes to APPL_STATE_DB and sends a pub/sub notification on every route add/remove, even when FIB suppression is disabled and no consumer needs the data. Changes: Config changes take effect after a full config reload or device reboot, not dynamically at runtime. This eliminates multi-process race conditions between orchagent, fpmsyncd, and FRR. - orchagent: add -F command-line flag (gEnableFibSuppress) to control FIB suppression at startup; guard publishRouteState() to return early when suppression is disabled - fpmsyncd: read suppress-fib-pending from CONFIG_DB at startup; remove dynamic CONFIG_DB subscription (SubscriberStateTable) since config changes now require a config reload or device reboot - fpmsyncd: conditionally create routeResponseChannel and add it to the select loop only when suppression is enabled - tests: update mock tests to set gEnableFibSuppress before exercising publishRouteState; update integration test to restart swss/fpmsyncd after toggling suppress-fib-pending --- fpmsyncd/fpmsyncd.cpp | 60 +----------------------- orchagent/main.cpp | 9 +++- orchagent/routeorch.cpp | 6 +++ tests/mock_tests/mock_orchagent_main.cpp | 1 + tests/mock_tests/routeorch_ut.cpp | 6 +++ tests/test_route.py | 15 +++++- 6 files changed, 34 insertions(+), 63 deletions(-) diff --git a/fpmsyncd/fpmsyncd.cpp b/fpmsyncd/fpmsyncd.cpp index a2c00bd3b03..e529589380b 100644 --- a/fpmsyncd/fpmsyncd.cpp +++ b/fpmsyncd/fpmsyncd.cpp @@ -8,7 +8,6 @@ #include "netdispatcher.h" #include "netlink.h" #include "notificationconsumer.h" -#include "subscriberstatetable.h" #include "warmRestartHelper.h" #include "fpmsyncd/fpmlink.h" #include "fpmsyncd/fpmsyncd.h" @@ -79,7 +78,6 @@ int main(int argc, char **argv) DBConnector db("APPL_DB", 0); DBConnector cfgDb("CONFIG_DB", 0); - SubscriberStateTable deviceMetadataTableSubscriber(&cfgDb, CFG_DEVICE_METADATA_TABLE_NAME); Table deviceMetadataTable(&cfgDb, CFG_DEVICE_METADATA_TABLE_NAME); DBConnector applStateDb("APPL_STATE_DB", 0); std::unique_ptr routeResponseChannel; @@ -116,6 +114,7 @@ int main(int argc, char **argv) routeResponseChannel = std::make_unique(&applStateDb, routeResponseChannelName); sync.setSuppressionEnabled(true); } + SWSS_LOG_NOTICE("FIB suppression state: %s", suppressionEnabledStr.c_str()); while (true) { @@ -142,8 +141,6 @@ int main(int argc, char **argv) s.addSelectable(&fpm); s.addSelectable(&netlink); - s.addSelectable(&deviceMetadataTableSubscriber); - if (sync.isSuppressionEnabled()) { s.addSelectable(routeResponseChannel.get()); @@ -249,61 +246,6 @@ int main(int argc, char **argv) s.removeSelectable(&eoiuCheckTimer); } } - else if (temps == &deviceMetadataTableSubscriber) - { - std::deque keyOpFvsQueue; - deviceMetadataTableSubscriber.pops(keyOpFvsQueue); - - for (const auto& keyOpFvs: keyOpFvsQueue) - { - const auto& key = kfvKey(keyOpFvs); - const auto& op = kfvOp(keyOpFvs); - const auto& fvs = kfvFieldsValues(keyOpFvs); - - if (op != SET_COMMAND) - { - continue; - } - - if (key != "localhost") - { - continue; - } - - for (const auto& fv: fvs) - { - const auto& field = fvField(fv); - const auto& value = fvValue(fv); - - if (field != "suppress-fib-pending") - { - continue; - } - - bool shouldEnable = (value == "enabled"); - - if (shouldEnable && !sync.isSuppressionEnabled()) - { - routeResponseChannel = std::make_unique(&applStateDb, routeResponseChannelName); - sync.setSuppressionEnabled(true); - s.addSelectable(routeResponseChannel.get()); - } - else if (!shouldEnable && sync.isSuppressionEnabled()) - { - /* When disabling suppression we mark all existing routes offloaded in zebra - * as there could be some transient routes which are pending response from - * orchagent, thus such updates might be missing. Since we are disabling suppression - * we no longer care about real HW offload status and can mark all routes as offloaded - * to avoid routes stuck in suppressed state after transition. */ - sync.markRoutesOffloaded(db); - - sync.setSuppressionEnabled(false); - s.removeSelectable(routeResponseChannel.get()); - routeResponseChannel.reset(); - } - } // end for fvs - } // end for keyOpFvsQueue - } else if (routeResponseChannel && (temps == routeResponseChannel.get())) { std::deque notifications; diff --git a/orchagent/main.cpp b/orchagent/main.cpp index 702064a5f37..4b4b47dfe0f 100644 --- a/orchagent/main.cpp +++ b/orchagent/main.cpp @@ -61,6 +61,7 @@ extern int gBatchSize; bool gRingMode = false; bool gSyncMode = false; +bool gEnableFibSuppress = false; sai_redis_communication_mode_t gRedisCommunicationMode = SAI_REDIS_COMMUNICATION_MODE_REDIS_ASYNC; string gAsicInstance; @@ -92,7 +93,7 @@ bool isChassisDbInUse() void usage() { - cout << "usage: orchagent [-h] [-r record_type] [-A] [-d record_location] [-f swss_rec_filename] [-j sairedis_rec_filename] [-b batch_size] [-m MAC] [-i INST_ID] [-s] [-z mode] [-k bulk_size] [-q zmq_server_address] [-c mode] [-t create_switch_timeout] [-v VRF] [-I heart_beat_interval] [-R] [-M]" << endl; + cout << "usage: orchagent [-h] [-r record_type] [-A] [-d record_location] [-f swss_rec_filename] [-j sairedis_rec_filename] [-b batch_size] [-m MAC] [-i INST_ID] [-s] [-z mode] [-k bulk_size] [-q zmq_server_address] [-c mode] [-t create_switch_timeout] [-v VRF] [-I heart_beat_interval] [-R] [-M] [-F]" << endl; cout << " -h: display this message" << endl; cout << " -r record_type: record orchagent logs with type (default 3)" << endl; cout << " Bit 0: sairedis.rec, Bit 1: swss.rec, Bit 2: responsepublisher.rec. For example:" << endl; @@ -118,6 +119,7 @@ void usage() cout << " -I heart_beat_interval: Heart beat interval in millisecond (default 10)" << endl; cout << " -R enable the ring thread feature" << endl; cout << " -M enable SAI MACSec POST" << endl; + cout << " -F enable BGP FIB suppression" << endl; } void sighup_handler(int signo) @@ -470,7 +472,7 @@ int main(int argc, char **argv) // Disable SAI MACSec POST by default. Use option -M to enable it. bool macsec_post_enabled = false; - while ((opt = getopt(argc, argv, "b:m:r:Af:j:d:i:hsz:k:q:c:t:v:I:R:M")) != -1) + while ((opt = getopt(argc, argv, "b:m:r:Af:j:d:i:hsz:k:q:c:t:v:I:R:MF")) != -1) { switch (opt) { @@ -595,6 +597,9 @@ int main(int argc, char **argv) case 'M': macsec_post_enabled = true; break; + case 'F': + gEnableFibSuppress = true; + break; default: /* '?' */ exit(EXIT_FAILURE); } diff --git a/orchagent/routeorch.cpp b/orchagent/routeorch.cpp index b8fba72469f..8603ff46b20 100644 --- a/orchagent/routeorch.cpp +++ b/orchagent/routeorch.cpp @@ -32,6 +32,7 @@ extern TunnelDecapOrch *gTunneldecapOrch; extern size_t gMaxBulkSize; extern string gMySwitchType; +extern bool gEnableFibSuppress; /* Default maximum number of next hop groups */ #define DEFAULT_NUMBER_OF_ECMP_GROUPS 128 @@ -3186,6 +3187,11 @@ void RouteOrch::publishRouteState(const RouteBulkContext& ctx, const ReturnCode& { SWSS_LOG_ENTER(); + if (!gEnableFibSuppress) + { + return; + } + std::vector fvs; /* Leave the fvs empty if the operation type is "DEL". diff --git a/tests/mock_tests/mock_orchagent_main.cpp b/tests/mock_tests/mock_orchagent_main.cpp index 1e44533f3ad..dd8c3b1ac10 100644 --- a/tests/mock_tests/mock_orchagent_main.cpp +++ b/tests/mock_tests/mock_orchagent_main.cpp @@ -19,6 +19,7 @@ string gMyHostName = "Linecard1"; string gMyAsicName = "Asic0"; bool gTraditionalFlexCounter = false; bool gSyncMode = false; +bool gEnableFibSuppress = false; sai_redis_communication_mode_t gRedisCommunicationMode = SAI_REDIS_COMMUNICATION_MODE_REDIS_ASYNC; bool gOrchUnhealthy = false; string gSaiErrorString; diff --git a/tests/mock_tests/routeorch_ut.cpp b/tests/mock_tests/routeorch_ut.cpp index 103dd888f92..ff364a2ce16 100644 --- a/tests/mock_tests/routeorch_ut.cpp +++ b/tests/mock_tests/routeorch_ut.cpp @@ -12,6 +12,7 @@ #include "bulker.h" extern string gMySwitchType; +extern bool gEnableFibSuppress; extern std::unique_ptr gMockResponsePublisher; @@ -388,6 +389,8 @@ namespace routeorch_test void TearDown() override { + gEnableFibSuppress = false; + RestoreSaiApis(); DEINIT_SAI_API_MOCK(route); @@ -718,6 +721,7 @@ namespace routeorch_test TEST_F(RouteOrchTest, RouteOrchTestSetDelResponse) { + gEnableFibSuppress = true; gMockResponsePublisher = std::make_unique(); std::deque entries; @@ -754,6 +758,7 @@ namespace routeorch_test TEST_F(RouteOrchTest, RouteOrchSetFullMaskSubnetPrefix) { + gEnableFibSuppress = true; gMockResponsePublisher = std::make_unique(); std::deque entries; @@ -772,6 +777,7 @@ namespace routeorch_test TEST_F(RouteOrchTest, RouteOrchLoopbackRoute) { + gEnableFibSuppress = true; gMockResponsePublisher = std::make_unique(); std::deque entries; diff --git a/tests/test_route.py b/tests/test_route.py index 647e7f05a47..ec6b67ea482 100644 --- a/tests/test_route.py +++ b/tests/test_route.py @@ -1077,15 +1077,22 @@ def is_offloaded(self, dvs, route): route_entry = json.loads(output) return bool(route_entry[route][0].get('offloaded')) - @pytest.mark.xfail(reason="BGP suppress FIB disabled on master/202405 - https://github.com/sonic-net/sonic-buildimage/issues/19092") @pytest.mark.parametrize("suppress_state", ["enabled", "disabled"]) def test_offload(self, suppress_state, setup, dvs): route = "1.1.1.0/24" - # enable route suppression + # configure suppress-fib-pending rc, _ = dvs.runcmd(f"config suppress-fib-pending {suppress_state}") assert rc == 0, "Failed to configure suppress-fib-pending" + # Container restart is used here as a quick substitute for a full config + # reload or device reboot, which is the only supported way to apply + # suppress-fib-pending config changes. This is intentionally unsupported + # in production but sufficient for DVS unit test purposes. + dvs.stop_fpmsyncd() + dvs.stop_swss() + dvs.start_swss() + dvs.start_fpmsyncd() time.sleep(5) try: @@ -1116,6 +1123,10 @@ def check_offloaded(): # make sure route suppression is disabled dvs.runcmd("config suppress-fib-pending disabled") + dvs.stop_fpmsyncd() + dvs.stop_swss() + dvs.start_swss() + dvs.start_fpmsyncd() class TestSubnetDecapVipRoute(TestRouteBase):