Merge commit '082905a8' into HEAD

2025-04-20 22:14:38 +00:00 · 2022-08-03 15:04:42 +02:00 · 2022-08-03 15:04:42 +02:00 · bc4ad83dac
commit bc4ad83dac
parent 73abd91ac6 082905a833
11 changed files with 342 additions and 263 deletions
--- a/doc/bird.sgml
+++ b/doc/bird.sgml
@ -2377,6 +2377,7 @@ avoid routing loops.
 <item> <rfc id="8203"> - BGP Administrative Shutdown Communication
 <item> <rfc id="8212"> - Default EBGP Route Propagation Behavior without Policies
 <item> <rfc id="9117"> - Revised Validation Procedure for BGP Flow Specifications
+<item> <rfc id="9234"> - Route Leak Prevention and Detection Using Roles
 </itemize>

 <sect1>Route selection rules
@ -2817,6 +2818,29 @@ using the following configuration parameters:
 	protocol itself (for example, if a route is received through eBGP and
 	therefore does not have such attribute). Default: 100 (0 in pre-1.2.0
 	versions of BIRD).
+
+	<tag><label id="bgp-local-role">local role <m/role-name/</tag>
+	BGP roles are a mechanism for route leak prevention and automatic route
+	filtering based on common BGP topology relationships. They are defined
+	in <rfc id="9234">. Instead of manually configuring filters and
+	communities, automatic filtering is done with the help of the OTC
+	attribute - a flag for routes that should be sent only to customers.
+	The same attribute is also used to automatically detect and filter route
+	leaks created by third parties.
+
+	This option is valid for EBGP sessions, but it is not recommended to be
+	used within AS confederations (which would require manual filtering of
+	<cf/bgp_otc/ attribute on confederation boundaries).
+
+	Possible <cf><m/role-name/</cf> values are: <cf/provider/,
+	<cf/rs_server/, <cf/rs_client/, <cf/customer/ and <cf/peer/.
+	Default: No local role assigned.
+
+	<tag><label id="bgp-require-roles">require roles <m/switch/</tag>
+	If this option is set, the BGP roles must be defined on both sides,
+	otherwise the session will not be established. This behavior is defined
+	in <rfc id="9234"> as "strict mode" and is used to enforce corresponding
+	configuration on your counterpart's side. Default: disabled.
 </descrip>

 <sect1>Channel configuration
@ -3124,6 +3148,11 @@ some of them (marked with `<tt/O/') are optional.
 	This attribute contains accumulated IGP metric, which is a total
 	distance to the destination through multiple autonomous systems.
 	Currently, the attribute is not accessible from filters.
+
+	<tag><label id="bgp-otc">int bgp_otc [O]</tag>
+	This attribute is defined in <rfc id="9234">. OTC is a flag that marks
+	routes that should be sent only to customers. If <ref id="bgp-local-role"
+	name="local role"> is configured, it is set automatically.
 </descrip>

 <sect1>Example
--- a/proto/bgp/attrs.c
+++ b/proto/bgp/attrs.c
@ -903,6 +903,18 @@ bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint fla
  bgp_set_attr_ptr(to, s->pool, BA_LARGE_COMMUNITY, flags, ad);
 }

+
+static void
+bgp_decode_otc(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to)
+{
+  if (len != 4)
+    WITHDRAW(BAD_LENGTH, "OTC", len);
+
+  u32 val = get_u32(data);
+  bgp_set_attr_u32(to, s->pool, BA_ONLY_TO_CUSTOMER, flags, val);
+}
+
+
 static void
 bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a)
 {
@ -1116,6 +1128,13 @@ static const struct bgp_attr_desc bgp_attr_table[] = {
    .encode = bgp_encode_u32s,
    .decode = bgp_decode_large_community,
  },
+  [BA_ONLY_TO_CUSTOMER] = {
+    .name = "otc",
+    .type = EAF_TYPE_INT,
+    .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
+    .encode = bgp_encode_u32,
+    .decode = bgp_decode_otc,
+  },
  [BA_MPLS_LABEL_STACK] = {
    .name = "mpls_label_stack",
    .type = EAF_TYPE_INT_SET,
@ -1452,6 +1471,29 @@ bgp_finish_attrs(struct bgp_parse_state *s, rta *a)
    REPORT("Discarding AIGP attribute received on non-AIGP session");
    bgp_unset_attr(&a->eattrs, s->pool, BA_AIGP);
  }
+
+  /* Handle OTC ingress procedure, RFC 9234 */
+  if (bgp_channel_is_role_applicable(s->channel))
+  {
+    struct bgp_proto *p = s->proto;
+    eattr *e = bgp_find_attr(a->eattrs, BA_ONLY_TO_CUSTOMER);
+
+    /* Reject routes from downstream if they are leaked */
+    if (e && (p->cf->local_role == BGP_ROLE_PROVIDER ||
+	      p->cf->local_role == BGP_ROLE_RS_SERVER))
+      WITHDRAW("Route leak detected - OTC attribute from downstream");
+
+    /* Reject routes from peers if they are leaked */
+    if (e && (p->cf->local_role == BGP_ROLE_PEER) && (e->u.data != p->cf->remote_as))
+      WITHDRAW("Route leak detected - OTC attribute with mismatched ASN (%u)",
+	       (uint) e->u.data);
+
+    /* Mark routes from upstream if it did not happen before */
+    if (!e && (p->cf->local_role == BGP_ROLE_CUSTOMER ||
+	       p->cf->local_role == BGP_ROLE_PEER ||
+	       p->cf->local_role == BGP_ROLE_RS_CLIENT))
+      bgp_set_attr_u32(&a->eattrs, s->pool, BA_ONLY_TO_CUSTOMER, 0, p->cf->remote_as);
+  }
 }


@ -1681,6 +1723,7 @@ bgp_preexport(struct channel *C, rte *e)
  struct proto *SRC = e->src->proto;
  struct bgp_proto *p = (struct bgp_proto *) C->proto;
  struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (struct bgp_proto *) SRC : NULL;
+  struct bgp_channel *c = (struct bgp_channel *) C;

  /* Reject our routes */
  if (src == p)
@ -1708,11 +1751,11 @@ bgp_preexport(struct channel *C, rte *e)
  }

  /* Handle well-known communities, RFC 1997 */
-  struct eattr *c;
+  struct eattr *a;
  if (p->cf->interpret_communities &&
-      (c = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY))))
+      (a = bgp_find_attr(e->attrs->eattrs, BA_COMMUNITY)))
  {
-    const struct adata *d = c->u.ptr;
+    const struct adata *d = a->u.ptr;

    /* Do not export anywhere */
    if (int_set_contains(d, BGP_COMM_NO_ADVERTISE))
@ -1731,6 +1774,16 @@ bgp_preexport(struct channel *C, rte *e)
      return -1;
  }

+  /* Do not export routes marked with OTC to upstream, RFC 9234 */
+  if (bgp_channel_is_role_applicable(c))
+  {
+    a = bgp_find_attr(e->attrs->eattrs, BA_ONLY_TO_CUSTOMER);
+    if (a && (p->cf->local_role==BGP_ROLE_CUSTOMER ||
+	      p->cf->local_role==BGP_ROLE_PEER ||
+	      p->cf->local_role==BGP_ROLE_RS_CLIENT))
+      return -1;
+  }
+
  return 0;
 }

@ -1840,6 +1893,16 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at
    }
  }

+  /* Mark routes for downstream with OTC, RFC 9234 */
+  if (bgp_channel_is_role_applicable(c))
+  {
+    a = bgp_find_attr(attrs, BA_ONLY_TO_CUSTOMER);
+    if (!a && (p->cf->local_role == BGP_ROLE_PROVIDER ||
+	       p->cf->local_role == BGP_ROLE_PEER ||
+	       p->cf->local_role == BGP_ROLE_RS_SERVER))
+      bgp_set_attr_u32(&attrs, pool, BA_ONLY_TO_CUSTOMER, 0, p->public_as);
+  }
+
  /*
   * Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above
   * conditions. Presence and validity of quasi-mandatory NEXT_HOP attribute
--- a/proto/bgp/bgp.c
+++ b/proto/bgp/bgp.c
@ -102,6 +102,7 @@
 * RFC 8212 - Default EBGP Route Propagation Behavior without Policies
 * RFC 8654 - Extended Message Support for BGP
 * RFC 9117 - Revised Validation Procedure for BGP Flow Specifications
+ * RFC 9234 - Route Leak Prevention and Detection Using Roles
 * draft-ietf-idr-ext-opt-param-07
 * draft-uttaro-idr-bgp-persistence-04
 * draft-walton-bgp-hostname-capability-02
@ -1963,6 +1964,15 @@ bgp_postconfig(struct proto_config *CF)
  if (internal && cf->rs_client)
    cf_error("Only external neighbor can be RS client");

+  if (internal && (cf->local_role != BGP_ROLE_UNDEFINED))
+    cf_error("Local role cannot be set on IBGP sessions");
+
+  if (interior && (cf->local_role != BGP_ROLE_UNDEFINED))
+    log(L_WARN "BGP roles are not recommended to be used within AS confederations");
+
+  if (cf->require_roles && (cf->local_role == BGP_ROLE_UNDEFINED))
+    cf_error("Local role must be set if roles are required");
+
  if (!cf->confederation && cf->confederation_member)
    cf_error("Confederation ID must be set for member sessions");

@ -2325,6 +2335,15 @@ bgp_show_afis(int code, char *s, u32 *afis, uint count)
  cli_msg(code, b.start);
 }

+static const char *
+bgp_format_role_name(u8 role)
+{
+  static const char *bgp_role_names[] = { "provider", "rs_server", "rs_client", "customer", "peer" };
+  if (role == BGP_ROLE_UNDEFINED) return "undefined";
+  if (role < ARRAY_SIZE(bgp_role_names)) return bgp_role_names[role];
+  return "?";
+}
+
 static void
 bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)
 {
@ -2453,6 +2472,9 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)

  if (caps->hostname)
    cli_msg(-1006, "      Hostname: %s", caps->hostname);
+
+  if (caps->role != BGP_ROLE_UNDEFINED)
+    cli_msg(-1006, "      Role: %s", bgp_format_role_name(caps->role));
 }

 static void
--- a/proto/bgp/bgp.h
+++ b/proto/bgp/bgp.h
@ -113,6 +113,8 @@ struct bgp_config {
  int gr_mode;				/* Graceful restart mode (BGP_GR_*) */
  int llgr_mode;			/* Long-lived graceful restart mode (BGP_LLGR_*) */
  int setkey;				/* Set MD5 password to system SA/SP database */
+  u8  local_role;			/* Set peering role with neighbor [RFC 9234] */
+  int require_roles;			/* Require configured roles on both sides */
  /* Times below are in seconds */
  unsigned gr_time;			/* Graceful restart timeout */
  unsigned llgr_time;			/* Long-lived graceful restart stale time */
@ -166,6 +168,13 @@ struct bgp_channel_config {
 #define BGP_PT_INTERNAL		1
 #define BGP_PT_EXTERNAL		2

+#define BGP_ROLE_UNDEFINED 	255
+#define BGP_ROLE_PROVIDER 	0
+#define BGP_ROLE_RS_SERVER 	1
+#define BGP_ROLE_RS_CLIENT 	2
+#define BGP_ROLE_CUSTOMER 	3
+#define BGP_ROLE_PEER 		4
+
 #define NH_NO			0
 #define NH_ALL			1
 #define NH_IBGP			2
@ -226,6 +235,7 @@ struct bgp_caps {
  u8 ext_messages;			/* Extended message length,  RFC draft */
  u8 route_refresh;			/* Route refresh capability, RFC 2918 */
  u8 enhanced_refresh;			/* Enhanced route refresh,   RFC 7313 */
+  u8 role;				/* BGP role capability,      RFC 9234 */

  u8 gr_aware;				/* Graceful restart capability, RFC 4724 */
  u8 gr_flags;				/* Graceful restart flags */
@ -487,6 +497,12 @@ static inline int bgp_cc_is_ipv4(struct bgp_channel_config *c)
 static inline int bgp_cc_is_ipv6(struct bgp_channel_config *c)
 { return BGP_AFI(c->afi) == BGP_AFI_IPV6; }

+static inline int bgp_channel_is_role_applicable(struct bgp_channel *c)
+{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); }
+
+static inline int bgp_cc_is_role_applicable(struct bgp_channel_config *c)
+{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); }
+
 static inline uint bgp_max_packet_length(struct bgp_conn *conn)
 { return conn->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; }

@ -658,6 +674,7 @@ void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to);
 #define BA_AS4_AGGREGATOR       0x12	/* RFC 6793 */
 #define BA_AIGP			0x1a	/* RFC 7311 */
 #define BA_LARGE_COMMUNITY	0x20	/* RFC 8092 */
+#define BA_ONLY_TO_CUSTOMER	0x23	/* RFC 9234 */

 /* Bird's private internal BGP attributes */
 #define BA_MPLS_LABEL_STACK	0xfe	/* MPLS label stack transfer attribute */
--- a/proto/bgp/config.Y
+++ b/proto/bgp/config.Y
@ -31,7 +31,8 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE,
 	STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG,
 	LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS,
 	DYNAMIC, RANGE, NAME, DIGITS, BGP_AIGP, AIGP, ORIGINATE, COST, ENFORCE,
-	FIRST, FREE, VALIDATE, BASE)
+	FIRST, FREE, VALIDATE, BASE, ROLE, ROLES, PEER, PROVIDER, CUSTOMER,
+	RS_SERVER, RS_CLIENT, REQUIRE, BGP_OTC)

 %type <i> bgp_nh
 %type <i32> bgp_afi
@ -40,7 +41,7 @@ CF_KEYWORDS(CEASE, PREFIX, LIMIT, HIT, ADMINISTRATIVE, SHUTDOWN, RESET, PEER,
 	CONFIGURATION, CHANGE, DECONFIGURED, CONNECTION, REJECTED, COLLISION,
 	OUT, OF, RESOURCES)

-%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag
+%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag bgp_role_name

 CF_GRAMMAR

@ -72,6 +73,7 @@ bgp_proto_start: proto_start BGP {
     BGP_CFG->llgr_mode = -1;
     BGP_CFG->llgr_time = 3600;
     BGP_CFG->setkey = 1;
+     BGP_CFG->local_role = BGP_ROLE_UNDEFINED;
     BGP_CFG->dynamic_name = "dynbgp";
     BGP_CFG->check_link = -1;
   }
@ -114,6 +116,14 @@ bgp_cease_flag:
 | OUT OF RESOURCES		{ $$ = 1 << 8; }
 ;

+bgp_role_name:
+   PEER      { $$ = BGP_ROLE_PEER; }
+ | PROVIDER  { $$ = BGP_ROLE_PROVIDER; }
+ | CUSTOMER  { $$ = BGP_ROLE_CUSTOMER; }
+ | RS_SERVER { $$ = BGP_ROLE_RS_SERVER; }
+ | RS_CLIENT { $$ = BGP_ROLE_RS_CLIENT; }
+ ;
+
 bgp_proto:
   bgp_proto_start proto_name '{'
 | bgp_proto proto_item ';'
@ -197,6 +207,8 @@ bgp_proto:
 | bgp_proto BFD GRACEFUL ';' { init_bfd_opts(&BGP_CFG->bfd); BGP_CFG->bfd->mode = BGP_BFD_GRACEFUL; }
 | bgp_proto BFD { open_bfd_opts(&BGP_CFG->bfd); } bfd_opts { close_bfd_opts(); } ';'
 | bgp_proto ENFORCE FIRST AS bool ';' { BGP_CFG->enforce_first_as = $5; }
+ | bgp_proto LOCAL ROLE bgp_role_name ';' { BGP_CFG->local_role = $4; }
+ | bgp_proto REQUIRE ROLES bool ';' { BGP_CFG->require_roles = $4; }
 ;

 bgp_afi:
@ -343,6 +355,8 @@ dynamic_attr: BGP_AIGP
 	{ $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_AIGP)); } ;
 dynamic_attr: BGP_LARGE_COMMUNITY
 	{ $$ = f_new_dynamic_attr(EAF_TYPE_LC_SET, T_LCLIST, EA_CODE(PROTOCOL_BGP, BA_LARGE_COMMUNITY)); } ;
+dynamic_attr: BGP_OTC
+	{ $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_CODE(PROTOCOL_BGP, BA_ONLY_TO_CUSTOMER)); } ;



--- a/proto/bgp/packets.c
+++ b/proto/bgp/packets.c
@ -238,6 +238,7 @@ bgp_prepare_capabilities(struct bgp_conn *conn)
  caps->ext_messages = p->cf->enable_extended_messages;
  caps->route_refresh = p->cf->enable_refresh;
  caps->enhanced_refresh = p->cf->enable_refresh;
+  caps->role = p->cf->local_role;

  if (caps->as4_support)
    caps->as4_number = p->public_as;
@ -350,6 +351,13 @@ bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
    *buf++ = 0;			/* Capability data length */
  }

+  if (caps->role != BGP_ROLE_UNDEFINED)
+  {
+    *buf++ = 9;			/* Capability 9: Announce chosen BGP role */
+    *buf++ = 1;			/* Capability data length */
+    *buf++ = caps->role;
+  }
+
  if (caps->gr_aware)
  {
    *buf++ = 64;		/* Capability 64: Support for graceful restart */
@ -449,11 +457,15 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len)
  struct bgp_proto *p = conn->bgp;
  struct bgp_caps *caps;
  struct bgp_af_caps *ac;
+  uint err_subcode = 0;
  int i, cl;
  u32 af;

  if (!conn->remote_caps)
+  {
    caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + sizeof(struct bgp_af_caps));
+    caps->role = BGP_ROLE_UNDEFINED;
+  }
  else
  {
    caps = conn->remote_caps;
@ -513,6 +525,21 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len)
      caps->ext_messages = 1;
      break;

+    case  9: /* BGP role capability, RFC 9234 */
+      if (cl != 1)
+        goto err;
+
+      /* Reserved value */
+      if (pos[2] == BGP_ROLE_UNDEFINED)
+      { err_subcode = 11; goto err; }
+
+      /* Multiple inconsistent values */
+      if ((caps->role != BGP_ROLE_UNDEFINED) && (caps->role != pos[2]))
+      { err_subcode = 11; goto err; }
+
+      caps->role = pos[2];
+      break;
+
    case 64: /* Graceful restart capability, RFC 4724 */
      if (cl % 4 != 2)
 	goto err;
@ -638,7 +665,7 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len)

 err:
  mb_free(caps);
-  bgp_error(conn, 2, 0, NULL, 0);
+  bgp_error(conn, 2, err_subcode, NULL, 0);
  return -1;
 }

@ -854,6 +881,22 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
    conn->received_as = asn;
  }

+  /* RFC 9234 4.2 - check role agreement */
+  u8 local_role = p->cf->local_role;
+  u8 neigh_role = caps->role;
+
+  if ((local_role != BGP_ROLE_UNDEFINED) &&
+      (neigh_role != BGP_ROLE_UNDEFINED) &&
+      !((local_role == BGP_ROLE_PEER && neigh_role == BGP_ROLE_PEER) ||
+	(local_role == BGP_ROLE_CUSTOMER && neigh_role == BGP_ROLE_PROVIDER) ||
+	(local_role == BGP_ROLE_PROVIDER && neigh_role == BGP_ROLE_CUSTOMER) ||
+	(local_role == BGP_ROLE_RS_CLIENT && neigh_role == BGP_ROLE_RS_SERVER) ||
+	(local_role == BGP_ROLE_RS_SERVER && neigh_role == BGP_ROLE_RS_CLIENT)))
+  { bgp_error(conn, 2, 11, NULL, 0); return; }
+
+  if ((p->cf->require_roles) && (neigh_role == BGP_ROLE_UNDEFINED))
+  { bgp_error(conn, 2, 11, NULL, 0); return; }
+
  /* Check the other connection */
  other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
  switch (other->state)
@ -2985,6 +3028,7 @@ static struct {
  { 2, 6, "Unacceptable hold time" },
  { 2, 7, "Required capability missing" }, /* [RFC5492] */
  { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
+  { 2,11, "Role mismatch" }, /* From Open Policy, RFC 9234 */
  { 3, 0, "Invalid UPDATE message" },
  { 3, 1, "Malformed attribute list" },
  { 3, 2, "Unrecognized well-known attribute" },
--- a/sysdep/cf/README
+++ b/sysdep/cf/README
@ -4,7 +4,6 @@ Available configuration variables:
 CONFIG_AUTO_ROUTES	Device routes are added automagically by the kernel
 CONFIG_SELF_CONSCIOUS	We're able to recognize whether route was installed by us
 CONFIG_MULTIPLE_TABLES	The kernel supports multiple routing tables
-CONFIG_ALL_TABLES_AT_ONCE	Kernel scanner wants to process all tables at once
 CONFIG_SINGLE_ROUTE	There is only one route per network

 CONFIG_MC_PROPER_SRC	Multicast packets have source address according to socket saddr field
--- a/sysdep/cf/linux.h
+++ b/sysdep/cf/linux.h
@ -9,7 +9,6 @@
 #define CONFIG_AUTO_ROUTES
 #define CONFIG_SELF_CONSCIOUS
 #define CONFIG_MULTIPLE_TABLES
-#define CONFIG_ALL_TABLES_AT_ONCE
 #define CONFIG_IP6_SADR_KERNEL

 #define CONFIG_MC_PROPER_SRC
--- a/sysdep/linux/netlink.c
+++ b/sysdep/linux/netlink.c
@ -74,51 +74,16 @@
 #endif

 #define krt_ipv4(p) ((p)->af == AF_INET)
-#define krt_ecmp6(p) ((p)->af == AF_INET6)

 const int rt_default_ecmp = 16;

-/*
- * Structure nl_parse_state keeps state of received route processing. Ideally,
- * we could just independently parse received Netlink messages and immediately
- * propagate received routes to the rest of BIRD, but older Linux kernel (before
- * version 4.11) represents and announces IPv6 ECMP routes not as one route with
- * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of
- * routes with the same prefix. More recent kernels work as with IPv4.
- *
- * Therefore, BIRD keeps currently processed route in nl_parse_state structure
- * and postpones its propagation until we expect it to be final; i.e., when
- * non-matching route is received or when the scan ends. When another matching
- * route is received, it is merged with the already processed route to form an
- * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the
- * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6
- * routes with RTA_MULTIPATH set are just considered non-matching.
- *
- * This is ignored for asynchronous notifications (every notification is handled
- * as a separate route). It is not an issue for our routes, as we ignore such
- * notifications anyways. But importing alien IPv6 ECMP routes does not work
- * properly with older kernels.
- *
- * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes
- * for the same prefix.
- */
-
 struct nl_parse_state
 {
+  struct krt_proto *proto;
  struct linpool *pool;
  int scan;
-  int merge;

-  net *net;
-  rta *attrs;
-  struct krt_proto *proto;
-  s8 new;
-  s8 krt_src;
-  u8 krt_type;
-  u8 krt_proto;
-  u32 krt_metric;
-
-  u32 rta_flow;		/* Used during parsing */
+  u32 rta_flow;
 };

 /*
@ -161,16 +126,13 @@ nl_open_sock(struct nl_sock *nl)
    }
 }

-static void
+static int
 nl_set_strict_dump(struct nl_sock *nl UNUSED, int strict UNUSED)
 {
-  /*
-   * Strict checking is not necessary, it improves behavior on newer kernels.
-   * If it is not available (missing SOL_NETLINK compile-time, or ENOPROTOOPT
-   * run-time), we can just ignore it.
-   */
 #ifdef SOL_NETLINK
-  setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict));
+  return setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict));
+#else
+  return -1;
 #endif
 }

@ -198,10 +160,17 @@ nl_cfg_rx_buffer_size(struct config *cfg)
 static void
 nl_open(void)
 {
+  if ((nl_scan.fd >= 0) && (nl_req.fd >= 0))
+    return;
+
  nl_open_sock(&nl_scan);
  nl_open_sock(&nl_req);

-  nl_set_strict_dump(&nl_scan, 1);
+  if (nl_set_strict_dump(&nl_scan, 1) < 0)
+  {
+    log(L_WARN "KRT: Netlink strict checking failed, will scan all tables at once");
+    krt_use_shared_scan();
+  }
 }

 static void
@ -256,11 +225,13 @@ nl_request_dump_addr(int af)
 }

 static void
-nl_request_dump_route(int af)
+nl_request_dump_route(int af, int table_id)
 {
  struct {
    struct nlmsghdr nh;
    struct rtmsg rtm;
+    struct rtattr rta;
+    u32 table_id;
  } req = {
    .nh.nlmsg_type = RTM_GETROUTE,
    .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
@ -269,7 +240,17 @@ nl_request_dump_route(int af)
    .rtm.rtm_family = af,
  };

-  send(nl_scan.fd, &req, sizeof(req), 0);
+  if (table_id < 256)
+    req.rtm.rtm_table = table_id;
+  else
+  {
+    req.rta.rta_type = RTA_TABLE;
+    req.rta.rta_len = RTA_LENGTH(4);
+    req.table_id = table_id;
+    req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + req.rta.rta_len;
+  }
+
+  send(nl_scan.fd, &req, req.nh.nlmsg_len, 0);
  nl_scan.last_hdr = NULL;
 }

@ -1325,7 +1306,7 @@ nh_bufsize(struct nexthop *nh)
 }

 static int
-nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
+nl_send_route(struct krt_proto *p, rte *e, int op)
 {
  eattr *ea;
  net *net = e->net;
@ -1407,15 +1388,17 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)

  /* For route delete, we do not specify remaining route attributes */
  if (op == NL_OP_DELETE)
-    goto dest;
+    goto done;

  /* Default scope is LINK for device routes, UNIVERSE otherwise */
  if (p->af == AF_MPLS)
    r->r.rtm_scope = RT_SCOPE_UNIVERSE;
  else if (ea = ea_find(eattrs, EA_KRT_SCOPE))
    r->r.rtm_scope = ea->u.data;
+  else if (a->dest == RTD_UNICAST && ipa_zero(a->nh.gw))
+    r->r.rtm_scope = RT_SCOPE_LINK;
  else
-    r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
+    r->r.rtm_scope = RT_SCOPE_UNIVERSE;

  if (ea = ea_find(eattrs, EA_KRT_PREFSRC))
    nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data);
@ -1438,13 +1421,12 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
  if (metrics[0])
    nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX);

-
-dest:
-  switch (dest)
+  switch (a->dest)
    {
    case RTD_UNICAST:
      r->r.rtm_type = RTN_UNICAST;
-      if (nh->next && !krt_ecmp6(p))
+      struct nexthop *nh = &(a->nh);
+      if (nh->next)
 	nl_add_multipath(&r->h, rsize, nh, p->af, eattrs);
      else
      {
@ -1470,82 +1452,53 @@ dest:
      bug("krt_capable inconsistent with nl_send_route");
    }

+done:
  /* Ignore missing for DELETE */
  return nl_exchange(&r->h, (op == NL_OP_DELETE));
 }

 static inline int
-nl_add_rte(struct krt_proto *p, rte *e)
+nl_allow_replace(struct krt_proto *p, rte *new)
 {
-  rta *a = e->attrs;
-  int err = 0;
+  /*
+   * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for
+   * matching rtm_protocol, but that is OK when dedicated priority is used.
+   *
+   * For IPv6, the NL_OP_REPLACE is still broken even in Linux 4.19 LTS
+   * (although it seems to be fixed in Linux 5.10 LTS) for sequence:
+   *
+   * ip route add 2001:db8::/32 via fe80::1 dev eth0
+   * ip route replace 2001:db8::/32 dev eth0
+   *
+   * (it ends with two routes instead of replacing the first by the second one)
+   *
+   * Replacing with direct and special type (e.g. unreachable) routes does not
+   * work, but replacing with regular routes works reliably.
+   */

-  if (krt_ecmp6(p) && a->nh.next)
-  {
-    struct nexthop *nh = &(a->nh);
+  if (krt_ipv4(p))
+    return 1;

-    err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, nh);
-    if (err < 0)
-      return err;
-
-    for (nh = nh->next; nh; nh = nh->next)
-      err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, nh);
-
-    return err;
-  }
-
-  return nl_send_route(p, e, NL_OP_ADD, a->dest, &(a->nh));
+  rta *a = new->attrs;
+  return (a->dest == RTD_UNICAST) && ipa_nonzero(a->nh.gw);
 }

-static inline int
-nl_delete_rte(struct krt_proto *p, rte *e)
-{
-  int err = 0;
-
-  /* For IPv6, we just repeatedly request DELETE until we get error */
-  do
-    err = nl_send_route(p, e, NL_OP_DELETE, RTD_NONE, NULL);
-  while (krt_ecmp6(p) && !err);
-
-  return err;
-}
-
-static inline int
-nl_replace_rte(struct krt_proto *p, rte *e)
-{
-  rta *a = e->attrs;
-  return nl_send_route(p, e, NL_OP_REPLACE, a->dest, &(a->nh));
-}
-
-
 void
 krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old)
 {
  int err = 0;

-  /*
-   * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for
-   * matching rtm_protocol, but that is OK when dedicated priority is used.
-   *
-   * We do not use NL_OP_REPLACE for IPv6, as it has broken semantics for ECMP
-   * and with some kernel versions ECMP replace crashes kernel. Would need more
-   * testing and checks for kernel versions.
-   *
-   * For IPv6, we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the
-   * old route value, so we do not try to optimize IPv6 ECMP reconfigurations.
-   */
-
-  if (krt_ipv4(p) && old && new)
+  if (old && new && nl_allow_replace(p, new))
  {
-    err = nl_replace_rte(p, new);
+    err = nl_send_route(p, new, NL_OP_REPLACE);
  }
  else
  {
    if (old)
-      nl_delete_rte(p, old);
+      nl_send_route(p, old, NL_OP_DELETE);

    if (new)
-      err = nl_add_rte(p, new);
+      err = nl_send_route(p, new, NL_OP_ADD);
  }

  if (new)
@ -1557,71 +1510,6 @@ krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old)
  }
 }

-static int
-nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family)
-{
-  /* Route merging is used for IPv6 scans */
-  if (!s->scan || (rtm_family != AF_INET6))
-    return 0;
-
-  /* Saved and new route must have same network, proto/table, and priority */
-  if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority))
-    return 0;
-
-  /* Both must be regular unicast routes */
-  if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST))
-    return 0;
-
-  return 1;
-}
-
-static void
-nl_announce_route(struct nl_parse_state *s)
-{
-  rte *e = rte_get_temp(s->attrs, s->proto->p.main_source);
-  e->net = s->net;
-
-  ea_list *ea = alloca(sizeof(ea_list) + 2 * sizeof(eattr));
-  *ea = (ea_list) { .count = 2, .next = e->attrs->eattrs };
-  e->attrs->eattrs = ea;
-
-  ea->attrs[0] = (eattr) {
-    .id = EA_KRT_SOURCE,
-    .type = EAF_TYPE_INT,
-    .u.data = s->krt_proto,
-  };
-  ea->attrs[1] = (eattr) {
-    .id = EA_KRT_METRIC,
-    .type = EAF_TYPE_INT,
-    .u.data = s->krt_metric,
-  };
-
-  if (s->scan)
-    krt_got_route(s->proto, e, s->krt_src);
-  else
-    krt_got_route_async(s->proto, e, s->new, s->krt_src);
-
-  s->net = NULL;
-  s->attrs = NULL;
-  s->proto = NULL;
-  lp_flush(s->pool);
-}
-
-static inline void
-nl_parse_begin(struct nl_parse_state *s, int scan)
-{
-  memset(s, 0, sizeof (struct nl_parse_state));
-  s->pool = nl_linpool;
-  s->scan = scan;
-}
-
-static inline void
-nl_parse_end(struct nl_parse_state *s)
-{
-  if (s->net)
-    nl_announce_route(s);
-}
-

 #define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0)
 #define SKIP(ARG, ...)  do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0)
@ -1759,13 +1647,29 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)

  net *net = net_get(p->p.main_channel->table, n);

-  if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type, i->rtm_family))
-    nl_announce_route(s);
-
  rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE);
  ra->source = RTS_INHERIT;
  ra->scope = SCOPE_UNIVERSE;

+  {
+    ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + 2 * sizeof(eattr));
+    *ea = (ea_list) { .flags = EALF_SORTED, .count = 2 };
+    ea->next = ra->eattrs;
+    ra->eattrs = ea;
+
+    ea->attrs[0] = (eattr) {
+      .id = EA_KRT_SOURCE,
+      .type = EAF_TYPE_INT,
+      .u.data = i->rtm_protocol
+    };
+
+    ea->attrs[1] = (eattr) {
+      .id = EA_KRT_METRIC,
+      .type = EAF_TYPE_INT,
+      .u.data = priority,
+    };
+  }
+
  if (a[RTA_FLOW])
    s->rta_flow = rta_get_u32(a[RTA_FLOW]);
  else
@ -1942,60 +1846,40 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
 	}
    }

-  /*
-   * Ideally, now we would send the received route to the rest of kernel code.
-   * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we
-   * postpone it and merge next hops until the end of the sequence. Note that
-   * when doing merging of next hops, we expect the new route to be unipath.
-   * Otherwise, we ignore additional next hops in nexthop_insert().
-   */
+  rte *e = rte_get_temp(ra, p->p.main_source);
+  e->net = net;

-  if (!s->net)
-  {
-    /* Store the new route */
-    s->net = net;
-    s->attrs = ra;
-    s->proto = p;
-    s->new = new;
-    s->krt_src = krt_src;
-    s->krt_type = i->rtm_type;
-    s->krt_proto = i->rtm_protocol;
-    s->krt_metric = priority;
-  }
+  if (s->scan)
+    krt_got_route(p, e, krt_src);
  else
-  {
-    /* Merge next hops with the stored route */
-    rta *oa = s->attrs;
+    krt_got_route_async(p, e, new, krt_src);

-    struct nexthop *nhs = &oa->nh;
-    nexthop_insert(&nhs, &ra->nh);
-
-    /* Perhaps new nexthop is inserted at the first position */
-    if (nhs == &ra->nh)
-    {
-      /* Swap rtas */
-      s->attrs = ra;
-
-      /* Keep old eattrs */
-      ra->eattrs = oa->eattrs;
-    }
-  }
+  lp_flush(s->pool);
 }

 void
-krt_do_scan(struct krt_proto *p UNUSED)	/* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
+krt_do_scan(struct krt_proto *p)
 {
-  struct nlmsghdr *h;
-  struct nl_parse_state s;
+  struct nl_parse_state s = {
+    .proto = p,
+    .pool = nl_linpool,
+    .scan = 1,
+  };

-  nl_parse_begin(&s, 1);
-  nl_request_dump_route(AF_UNSPEC);
+  /* Table-specific scan or shared scan */
+  if (p)
+    nl_request_dump_route(p->af, krt_table_id(p));
+  else
+    nl_request_dump_route(AF_UNSPEC, 0);
+
+  struct nlmsghdr *h;
  while (h = nl_get_scan())
+  {
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
      nl_parse_route(&s, h);
    else
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
-  nl_parse_end(&s);
+  }
 }

 /*
@ -2010,16 +1894,18 @@ static struct config *nl_last_config;	/* For tracking changes to nl_async_bufsiz
 static void
 nl_async_msg(struct nlmsghdr *h)
 {
-  struct nl_parse_state s;
+  struct nl_parse_state s = {
+    .proto = NULL,
+    .pool = nl_linpool,
+    .scan = 0,
+  };

  switch (h->nlmsg_type)
    {
    case RTM_NEWROUTE:
    case RTM_DELROUTE:
      DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type);
-      nl_parse_begin(&s, 0);
      nl_parse_route(&s, h);
-      nl_parse_end(&s);
      break;
    case RTM_NEWLINK:
    case RTM_DELLINK:
--- a/sysdep/unix/krt.c
+++ b/sysdep/unix/krt.c
@ -785,18 +785,17 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src)
  rte_free(e);
 }

+
 /*
 *	Periodic scanning
 */

-
-#ifdef CONFIG_ALL_TABLES_AT_ONCE
-
-static timer *krt_scan_timer;
-static int krt_scan_count;
+static timer *krt_scan_all_timer;
+static int krt_scan_all_count;
+static _Bool krt_scan_all_tables;

 static void
-krt_scan(timer *t UNUSED)
+krt_scan_all(timer *t UNUSED)
 {
  struct krt_proto *p;
  node *n;
@ -817,35 +816,42 @@ krt_scan(timer *t UNUSED)
 }

 static void
-krt_scan_timer_start(struct krt_proto *p)
+krt_scan_all_timer_start(struct krt_proto *p)
 {
-  if (!krt_scan_count)
-    krt_scan_timer = tm_new_init(krt_pool, krt_scan, NULL, KRT_CF->scan_time, 0);
+  if (!krt_scan_all_count)
+    krt_scan_all_timer = tm_new_init(krt_pool, krt_scan_all, NULL, KRT_CF->scan_time, 0);

-  krt_scan_count++;
+  krt_scan_all_count++;

-  tm_start(krt_scan_timer, 1 S);
+  tm_start(krt_scan_all_timer, 1 S);
 }

 static void
-krt_scan_timer_stop(struct krt_proto *p UNUSED)
+krt_scan_all_timer_stop(void)
 {
-  krt_scan_count--;
+  ASSERT(krt_scan_all_count > 0);

-  if (!krt_scan_count)
+  krt_scan_all_count--;
+
+  if (!krt_scan_all_count)
  {
-    rfree(krt_scan_timer);
-    krt_scan_timer = NULL;
+    rfree(krt_scan_all_timer);
+    krt_scan_all_timer = NULL;
  }
 }

 static void
-krt_scan_timer_kick(struct krt_proto *p UNUSED)
+krt_scan_all_timer_kick(void)
 {
-  tm_start(krt_scan_timer, 0);
+  tm_start(krt_scan_all_timer, 0);
+}
+
+void
+krt_use_shared_scan(void)
+{
+  krt_scan_all_tables = 1;
 }

-#else

 static void
 krt_scan(timer *t)
@ -863,26 +869,33 @@ krt_scan(timer *t)
 static void
 krt_scan_timer_start(struct krt_proto *p)
 {
-  p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0);
-  tm_start(p->scan_timer, 1 S);
+  if (krt_scan_all_tables)
+    krt_scan_all_timer_start(p);
+  else
+  {
+    p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0);
+    tm_start(p->scan_timer, 1 S);
+  }
 }

 static void
 krt_scan_timer_stop(struct krt_proto *p)
 {
-  tm_stop(p->scan_timer);
+  if (krt_scan_all_tables)
+    krt_scan_all_timer_stop();
+  else
+    tm_stop(p->scan_timer);
 }

 static void
 krt_scan_timer_kick(struct krt_proto *p)
 {
-  tm_start(p->scan_timer, 0);
+  if (krt_scan_all_tables)
+    krt_scan_all_timer_kick();
+  else
+    tm_start(p->scan_timer, 0);
 }

-#endif
-
-
-

 /*
 *	Updates
@ -992,11 +1005,6 @@ krt_postconfig(struct proto_config *CF)
  if (! proto_cf_main_channel(CF))
    cf_error("Channel not specified");

-#ifdef CONFIG_ALL_TABLES_AT_ONCE
-  if (krt_cf->scan_time != cf->scan_time)
-    cf_error("All kernel syncers must use the same table scan interval");
-#endif
-
  struct channel_config *cc = proto_cf_main_channel(CF);
  struct rtable_config *tab = cc->table;
  if (tab->krt_attached)
--- a/sysdep/unix/krt.h
+++ b/sysdep/unix/krt.h
@ -55,10 +55,7 @@ struct krt_proto {
  struct rtable *krt_table;	/* Internal table of inherited routes */
 #endif

-#ifndef CONFIG_ALL_TABLES_AT_ONCE
  timer *scan_timer;
-#endif
-
  struct bmap sync_map;		/* Keeps track which exported routes were successfully written to kernel */
  struct bmap seen_map;		/* Routes seen during last periodic scan */
  node krt_node;		/* Node in krt_proto_list */
@ -79,6 +76,7 @@ extern pool *krt_pool;

 struct proto_config * kif_init_config(int class);
 void kif_request_scan(void);
+void krt_use_shared_scan(void);
 void krt_got_route(struct krt_proto *p, struct rte *e, s8 src);
 void krt_got_route_async(struct krt_proto *p, struct rte *e, int new, s8 src);