diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 099ae6e9..94a37a73 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -75,51 +75,16 @@ #endif #define krt_ipv4(p) ((p)->af == AF_INET) -#define krt_ecmp6(p) ((p)->af == AF_INET6) const int rt_default_ecmp = 16; -/* - * Structure nl_parse_state keeps state of received route processing. Ideally, - * we could just independently parse received Netlink messages and immediately - * propagate received routes to the rest of BIRD, but older Linux kernel (before - * version 4.11) represents and announces IPv6 ECMP routes not as one route with - * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of - * routes with the same prefix. More recent kernels work as with IPv4. - * - * Therefore, BIRD keeps currently processed route in nl_parse_state structure - * and postpones its propagation until we expect it to be final; i.e., when - * non-matching route is received or when the scan ends. When another matching - * route is received, it is merged with the already processed route to form an - * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the - * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6 - * routes with RTA_MULTIPATH set are just considered non-matching. - * - * This is ignored for asynchronous notifications (every notification is handled - * as a separate route). It is not an issue for our routes, as we ignore such - * notifications anyways. But importing alien IPv6 ECMP routes does not work - * properly with older kernels. - * - * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes - * for the same prefix. - */ - struct nl_parse_state { + struct krt_proto *proto; struct linpool *pool; int scan; - int merge; - net_addr *net; - ea_list *attrs; - struct krt_proto *proto; - s8 new; - s8 krt_src; - u8 krt_type; - u8 krt_proto; - u32 krt_metric; - - u32 rta_flow; /* Used during parsing */ + u32 rta_flow; }; /* @@ -1453,10 +1418,13 @@ nh_bufsize(struct nexthop_adata *nhad) } static int -nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nexthop_adata *nh) +nl_send_route(struct krt_proto *p, const rte *e, int op) { eattr *ea; ea_list *eattrs = e->attrs; + eattr *nhea = ea_find(eattrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); int bufsize = 128 + KRT_METRICS_MAX*8 + (nh ? nh_bufsize(nh) : 0); u32 priority = 0; @@ -1534,15 +1502,17 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho /* For route delete, we do not specify remaining route attributes */ if (op == NL_OP_DELETE) - goto dest; + goto done; /* Default scope is LINK for device routes, UNIVERSE otherwise */ if (p->af == AF_MPLS) r->r.rtm_scope = RT_SCOPE_UNIVERSE; else if (ea = ea_find(eattrs, &ea_krt_scope)) r->r.rtm_scope = ea->u.data; + else if (dest == RTD_UNICAST && ipa_zero(nh->nh.gw)) + r->r.rtm_scope = RT_SCOPE_LINK; else - r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->nh.gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; + r->r.rtm_scope = RT_SCOPE_UNIVERSE; if (ea = ea_find(eattrs, &ea_krt_prefsrc)) nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data); @@ -1565,13 +1535,11 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho if (metrics[0]) nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX); - -dest: switch (dest) { case RTD_UNICAST: r->r.rtm_type = RTN_UNICAST; - if (!NEXTHOP_ONE(nh) && !krt_ecmp6(p)) + if (!NEXTHOP_ONE(nh)) nl_add_multipath(&r->h, rsize, nh, p->af, eattrs); else { @@ -1597,99 +1565,56 @@ dest: bug("krt_capable inconsistent with nl_send_route"); } +done: /* Ignore missing for DELETE */ return nl_exchange(&r->h, (op == NL_OP_DELETE)); } static inline int -nl_add_rte(struct krt_proto *p, rte *e) +nl_allow_replace(struct krt_proto *p, rte *new) { - ea_list *ea = e->attrs; - int err = 0; + /* + * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for + * matching rtm_protocol, but that is OK when dedicated priority is used. + * + * For IPv6, the NL_OP_REPLACE is still broken even in Linux 4.19 LTS + * (although it seems to be fixed in Linux 5.10 LTS) for sequence: + * + * ip route add 2001:db8::/32 via fe80::1 dev eth0 + * ip route replace 2001:db8::/32 dev eth0 + * + * (it ends with two routes instead of replacing the first by the second one) + * + * Replacing with direct and special type (e.g. unreachable) routes does not + * work, but replacing with regular routes work reliably + */ - eattr *nhea = ea_find(ea, &ea_gen_nexthop); - struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + if (krt_ipv4(p)) + return 1; - if (krt_ecmp6(p) && nhad && NEXTHOP_IS_REACHABLE(nhad) && !NEXTHOP_ONE(nhad)) - { - uint cnt = 0; - NEXTHOP_WALK(nh, nhad) - { - struct { - struct nexthop_adata nhad; - u32 labels[MPLS_MAX_LABEL_STACK]; - } nhx; - memcpy(&nhx.nhad.nh, nh, NEXTHOP_SIZE(nh)); - nhx.nhad.ad.length = (void *) NEXTHOP_NEXT(&nhx.nhad.nh) - (void *) nhx.nhad.ad.data; + eattr *nhea = ea_find(new->attrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); - if (!cnt++) - { - err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, &nhx.nhad); - if (err < 0) - return err; - } - else - err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, &nhx.nhad); - } - - return err; - } - - return nl_send_route(p, e, NL_OP_ADD, - NEXTHOP_IS_REACHABLE(nhad) ? RTD_UNICAST : nhad->dest, nhad); + return (dest == RTD_UNICAST) && ipa_nonzero(nh->nh.gw); } -static inline int -nl_delete_rte(struct krt_proto *p, const rte *e) -{ - int err = 0; - - /* For IPv6, we just repeatedly request DELETE until we get error */ - do - err = nl_send_route(p, e, NL_OP_DELETE, RTD_NONE, NULL); - while (krt_ecmp6(p) && !err); - - return err; -} - -static inline int -nl_replace_rte(struct krt_proto *p, rte *e) -{ - eattr *nhea = ea_find(e->attrs, &ea_gen_nexthop); - struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; - return nl_send_route(p, e, NL_OP_REPLACE, - NEXTHOP_IS_REACHABLE(nhad) ? RTD_UNICAST : nhad->dest, nhad); -} - - void krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const rte *old) { int err = 0; - /* - * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for - * matching rtm_protocol, but that is OK when dedicated priority is used. - * - * We do not use NL_OP_REPLACE for IPv6, as it has broken semantics for ECMP - * and with some kernel versions ECMP replace crashes kernel. Would need more - * testing and checks for kernel versions. - * - * For IPv6, we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the - * old route value, so we do not try to optimize IPv6 ECMP reconfigurations. - */ - - if (krt_ipv4(p) && old && new) + if (old && new && nl_allow_replace(p, new)) { - err = nl_replace_rte(p, new); + err = nl_send_route(p, new, NL_OP_REPLACE); } else { if (old) - nl_delete_rte(p, old); + nl_send_route(p, old, NL_OP_DELETE); if (new) - err = nl_add_rte(p, new); + err = nl_send_route(p, new, NL_OP_ADD); } if (new) @@ -1701,68 +1626,6 @@ krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const r } } -static int -nl_mergable_route(struct nl_parse_state *s, const net_addr *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family) -{ - /* Route merging is used for IPv6 scans */ - if (!s->scan || (rtm_family != AF_INET6)) - return 0; - - /* Saved and new route must have same network, proto/table, and priority */ - if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority)) - return 0; - - /* Both must be regular unicast routes */ - if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST)) - return 0; - - return 1; -} - -static void -nl_announce_route(struct nl_parse_state *s) -{ - rte e0 = { - .attrs = s->attrs, - .net = s->net, - }; - - EA_LOCAL_LIST(2) ea = { - .l = { .count = 2, .next = e0.attrs }, - .a = { - EA_LITERAL_EMBEDDED(&ea_krt_source, 0, s->krt_proto), - EA_LITERAL_EMBEDDED(&ea_krt_metric, 0, s->krt_metric), - }, - }; - - e0.attrs = &ea.l; - - if (s->scan) - krt_got_route(s->proto, &e0, s->krt_src); - else - krt_got_route_async(s->proto, &e0, s->new, s->krt_src); - - s->net = NULL; - s->attrs = NULL; - s->proto = NULL; - lp_flush(s->pool); -} - -static inline void -nl_parse_begin(struct nl_parse_state *s, int scan) -{ - memset(s, 0, sizeof (struct nl_parse_state)); - s->pool = nl_linpool; - s->scan = scan; -} - -static inline void -nl_parse_end(struct nl_parse_state *s) -{ - if (s->net) - nl_announce_route(s); -} - #define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0) #define SKIP(ARG, ...) do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0) @@ -1898,11 +1761,10 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) net6_prefix(&src), net6_pxlen(&src)); } - if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type, i->rtm_family)) - nl_announce_route(s); - ea_list *ra = NULL; ea_set_attr_u32(&ra, &ea_gen_source, 0, RTS_INHERIT); + ea_set_attr_u32(&ra, &ea_krt_source, 0, i->rtm_protocol); + ea_set_attr_u32(&ra, &ea_krt_metric, 0, priority); if (a[RTA_FLOW]) s->rta_flow = rta_get_u32(a[RTA_FLOW]); @@ -2044,56 +1906,27 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) EA_LITERAL_EMBEDDED(&ea_krt_metrics[t], 0, metrics[t])); } - /* - * Ideally, now we would send the received route to the rest of kernel code. - * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we - * postpone it and merge next hops until the end of the sequence. Note that - * when doing merging of next hops, we expect the new route to be unipath. - * Otherwise, we ignore additional next hops in nexthop_insert(). - */ + rte e0 = { + .net = net, + .attrs = ra, + }; - if (!s->net) - { - /* Store the new route */ - s->net = lp_alloc(s->pool, net->length); - net_copy(s->net, net); - - ea_set_attr_data(&ra, &ea_gen_nexthop, 0, - nhad.ad.data, nhad.ad.length); - - s->attrs = ra; - s->proto = p; - s->new = new; - s->krt_src = krt_src; - s->krt_type = i->rtm_type; - s->krt_proto = i->rtm_protocol; - s->krt_metric = priority; - } + if (s->scan) + krt_got_route(p, &e0, krt_src); else - { - /* Merge next hops with the stored route */ - eattr *nhea = ea_find(s->attrs, &ea_gen_nexthop); - struct nexthop_adata *nhad_old = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + krt_got_route_async(p, &e0, new, krt_src); - if (nhad_old) - ea_set_attr(&s->attrs, - EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, - &(nexthop_merge(nhad_old, &nhad.nhad, - KRT_CF->merge_paths, s->pool)->ad) - )); - else - ea_set_attr_data(&s->attrs, &ea_gen_nexthop, 0, - nhad.ad.data, nhad.ad.length); - } + lp_flush(s->pool); } void krt_do_scan(struct krt_proto *p) { - struct nlmsghdr *h; - struct nl_parse_state s; - - nl_parse_begin(&s, 1); + struct nl_parse_state s = { + .proto = p, + .pool = nl_linpool, + .scan = 1, + }; /* Table-specific scan or shared scan */ if (p) @@ -2101,6 +1934,7 @@ krt_do_scan(struct krt_proto *p) else nl_request_dump_route(AF_UNSPEC, 0); + struct nlmsghdr *h; while (h = nl_get_scan()) { if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) @@ -2108,8 +1942,6 @@ krt_do_scan(struct krt_proto *p) else log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); } - - nl_parse_end(&s); } /* @@ -2124,16 +1956,18 @@ static struct config *nl_last_config; /* For tracking changes to nl_async_bufsiz static void nl_async_msg(struct nlmsghdr *h) { - struct nl_parse_state s; + struct nl_parse_state s = { + .proto = NULL, + .pool = nl_linpool, + .scan = 0, + }; switch (h->nlmsg_type) { case RTM_NEWROUTE: case RTM_DELROUTE: DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type); - nl_parse_begin(&s, 0); nl_parse_route(&s, h); - nl_parse_end(&s); break; case RTM_NEWLINK: case RTM_DELLINK: