From f8e273b5e7a3c721f4a30cf27a0b4fe54602e83f Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Mon, 14 Jun 2021 16:30:59 +0200 Subject: [PATCH 001/196] Nest: Fix export of tmpattrs through pipes In most cases of export there is no need to store back temporary attributes to rte, as receivers (protocols) access eattr list anyway. But pipe copies the original rte with old values, so we should store tmpattrs also during export. Thanks to Paul Donohue for the bugreport. --- nest/rt-table.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nest/rt-table.c b/nest/rt-table.c index 13209dd7..a7e31d85 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -618,6 +618,9 @@ export_filter_(struct channel *c, rte *rt0, rte **rt_free, linpool *pool, int si goto reject; } + /* Needed for pipes */ + rte_store_tmp_attrs(rt, pool, NULL); + accept: if (rt != rt0) *rt_free = rt; From 3ebabab2778d05212cc07ebccf583159d5e0890a Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Mon, 14 Jun 2021 17:58:37 +0200 Subject: [PATCH 002/196] Revert "Nest: Fix export of tmpattrs through pipes" This reverts commit f8e273b5e7a3c721f4a30cf27a0b4fe54602e83f. --- nest/rt-table.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/nest/rt-table.c b/nest/rt-table.c index a7e31d85..13209dd7 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -618,9 +618,6 @@ export_filter_(struct channel *c, rte *rt0, rte **rt_free, linpool *pool, int si goto reject; } - /* Needed for pipes */ - rte_store_tmp_attrs(rt, pool, NULL); - accept: if (rt != rt0) *rt_free = rt; From 1b9bf4e192a252db861acadc7f800d7046435a3f Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Mon, 14 Jun 2021 20:02:50 +0200 Subject: [PATCH 003/196] Nest: Fix export of tmpattrs through pipes Pipes copy the original rte with old values, so they require rte to be exported with stored tmpattrs. Other protocols access stored attributes using eattr list, so they require rte to be exported with expanded tmpattrs. 
This is a temporary hack, we plan to remove the whole tmpattr mechanism. Thanks to Paul Donohue for the bugreport. --- nest/rt-table.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nest/rt-table.c b/nest/rt-table.c index 13209dd7..390b3277 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -618,6 +618,12 @@ export_filter_(struct channel *c, rte *rt0, rte **rt_free, linpool *pool, int si goto reject; } +#ifdef CONFIG_PIPE + /* Pipes need rte with stored tmpattrs, remaining protocols need expanded tmpattrs */ + if (p->proto == &proto_pipe) + rte_store_tmp_attrs(rt, pool, NULL); +#endif + accept: if (rt != rt0) *rt_free = rt; From f761be6b30633054a54369eee7d08b951a366e5e Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Thu, 17 Jun 2021 16:56:51 +0200 Subject: [PATCH 004/196] Nest: Clean up main channel handling Remove assumption that main channel is the only channel. --- nest/protocol.h | 2 +- proto/ospf/config.Y | 5 ++--- proto/radv/config.Y | 1 + proto/radv/radv.c | 2 +- proto/rip/rip.c | 2 +- proto/rpki/rpki.c | 2 +- proto/static/static.c | 2 +- sysdep/unix/krt.c | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/nest/protocol.h b/nest/protocol.h index 48eb01d2..abcc505d 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -616,7 +616,7 @@ struct channel { struct channel_config *proto_cf_find_channel(struct proto_config *p, uint net_type); static inline struct channel_config *proto_cf_main_channel(struct proto_config *pc) -{ struct channel_config *cc = HEAD(pc->channels); return NODE_VALID(cc) ? 
cc : NULL; } +{ return proto_cf_find_channel(pc, pc->net_type); } struct channel *proto_find_channel_by_table(struct proto *p, struct rtable *t); struct channel *proto_find_channel_by_name(struct proto *p, const char *n); diff --git a/proto/ospf/config.Y b/proto/ospf/config.Y index fd2cfe8a..4b7d5a36 100644 --- a/proto/ospf/config.Y +++ b/proto/ospf/config.Y @@ -85,7 +85,7 @@ ospf_proto_finish(void) struct ospf_iface_patt *ic; /* Define default channel */ - if (EMPTY_LIST(this_proto->channels)) + if (! proto_cf_main_channel(this_proto)) { uint net_type = this_proto->net_type = ospf_cfg_is_v2() ? NET_IP4 : NET_IP6; channel_config_new(NULL, net_label[net_type], net_type, this_proto); @@ -248,8 +248,7 @@ ospf_channel_start: net_type ospf_af_mc $$ = this_channel = channel_config_get(NULL, net_label[$1], $1, this_proto); /* Save the multicast flag */ - if (this_channel == proto_cf_main_channel(this_proto)) - OSPF_CFG->af_mc = $2; + OSPF_CFG->af_mc = $2; }; ospf_channel: ospf_channel_start channel_opt_list channel_end; diff --git a/proto/radv/config.Y b/proto/radv/config.Y index dda9cfcd..8d4a3ab9 100644 --- a/proto/radv/config.Y +++ b/proto/radv/config.Y @@ -46,6 +46,7 @@ proto: radv_proto ; radv_proto_start: proto_start RADV { this_proto = proto_config_new(&proto_radv, $1); + this_proto->net_type = NET_IP6; init_list(&RADV_CFG->patt_list); init_list(&RADV_CFG->pref_list); diff --git a/proto/radv/radv.c b/proto/radv/radv.c index b4235917..66e8eb4b 100644 --- a/proto/radv/radv.c +++ b/proto/radv/radv.c @@ -564,7 +564,7 @@ radv_postconfig(struct proto_config *CF) // struct radv_config *cf = (void *) CF; /* Define default channel */ - if (EMPTY_LIST(CF->channels)) + if (! 
proto_cf_main_channel(CF)) channel_config_new(NULL, net_label[NET_IP6], NET_IP6, CF); } diff --git a/proto/rip/rip.c b/proto/rip/rip.c index 8b4719f7..e1a235a0 100644 --- a/proto/rip/rip.c +++ b/proto/rip/rip.c @@ -1105,7 +1105,7 @@ rip_postconfig(struct proto_config *CF) // struct rip_config *cf = (void *) CF; /* Define default channel */ - if (EMPTY_LIST(CF->channels)) + if (! proto_cf_main_channel(CF)) channel_config_new(NULL, net_label[CF->net_type], CF->net_type, CF); } diff --git a/proto/rpki/rpki.c b/proto/rpki/rpki.c index 799cb877..ab0837f3 100644 --- a/proto/rpki/rpki.c +++ b/proto/rpki/rpki.c @@ -923,7 +923,7 @@ rpki_postconfig(struct proto_config *CF) { /* Define default channel */ if (EMPTY_LIST(CF->channels)) - channel_config_new(NULL, net_label[CF->net_type], CF->net_type, CF); + cf_error("Channel not specified"); } static void diff --git a/proto/static/static.c b/proto/static/static.c index 661f1aac..2789c1bb 100644 --- a/proto/static/static.c +++ b/proto/static/static.c @@ -434,7 +434,7 @@ static_postconfig(struct proto_config *CF) struct static_config *cf = (void *) CF; struct static_route *r; - if (EMPTY_LIST(CF->channels)) + if (! proto_cf_main_channel(CF)) cf_error("Channel not specified"); struct channel_config *cc = proto_cf_main_channel(CF); diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index ceb88563..7c2614b1 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -1013,7 +1013,7 @@ krt_postconfig(struct proto_config *CF) if (cf->c.class == SYM_TEMPLATE) return; - if (EMPTY_LIST(CF->channels)) + if (! proto_cf_main_channel(CF)) cf_error("Channel not specified"); #ifdef CONFIG_ALL_TABLES_AT_ONCE From 13225f1dbff54619476f2d8f6bc779dbb4983e3e Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Sun, 5 Apr 2020 03:24:46 +0200 Subject: [PATCH 005/196] Filter: Faster prefix sets Use 16-way (4bit) branching in prefix trie instead of basic binary branching. 
The change makes IPv4 prefix sets almost 3x faster, but with more memory consumption and much more complicated algorithm. Together with a previous filter change, it makes IPv4 prefix sets about ~4.3x faster and slightly smaller (on my test data). --- filter/data.h | 12 +- filter/test.conf | 6 + filter/trie.c | 287 ++++++++++++++++++++++++++++++++++------------- lib/birdlib.h | 3 + lib/ip.h | 17 ++- 5 files changed, 244 insertions(+), 81 deletions(-) diff --git a/filter/data.h b/filter/data.h index d296776d..21967deb 100644 --- a/filter/data.h +++ b/filter/data.h @@ -140,18 +140,22 @@ struct f_tree { void *data; }; +#define TRIE_STEP 4 + struct f_trie_node4 { ip4_addr addr, mask, accept; - uint plen; - struct f_trie_node4 *c[2]; + u16 plen; + u16 local; + struct f_trie_node4 *c[1 << TRIE_STEP]; }; struct f_trie_node6 { ip6_addr addr, mask, accept; - uint plen; - struct f_trie_node6 *c[2]; + u16 plen; + u16 local; + struct f_trie_node6 *c[1 << TRIE_STEP]; }; struct f_trie_node diff --git a/filter/test.conf b/filter/test.conf index 3a8804a1..1edcd64e 100644 --- a/filter/test.conf +++ b/filter/test.conf @@ -558,6 +558,12 @@ prefix set pxs; bt_assert(2000::/29 !~ pxs); bt_assert(1100::/10 !~ pxs); bt_assert(2010::/26 !~ pxs); + + pxs = [ 52E0::/13{13,128} ]; + bt_assert(52E7:BE81:379B:E6FD:541F:B0D0::/93 ~ pxs); + + pxs = [ 41D8:8718::/30{0,30}, 413A:99A8:6C00::/38{38,128} ]; + bt_assert(4180::/9 ~ pxs); } bt_test_suite(t_prefix6_set, "Testing prefix IPv6 sets"); diff --git a/filter/trie.c b/filter/trie.c index 1a4e1ac3..cf805afc 100644 --- a/filter/trie.c +++ b/filter/trie.c @@ -86,7 +86,10 @@ #define ipa_mkmask(x) ip6_mkmask(x) #define ipa_masklen(x) ip6_masklen(&x) #define ipa_pxlen(x,y) ip6_pxlen(x,y) -#define ipa_getbit(x,n) ip6_getbit(x,n) +#define ipa_getbit(a,p) ip6_getbit(a,p) +#define ipa_getbits(a,p,n) ip6_getbits(a,p,n) +#define ipa_setbits(a,p,n) ip6_setbits(a,p,n) +#define trie_local_mask(a,b,c) trie_local_mask6(a,b,c) #define ipt_from_ip4(x) _MI6(_I(x), 
0, 0, 0) #define ipt_to_ip4(x) _MI4(_I0(x)) @@ -109,10 +112,11 @@ f_new_trie(linpool *lp, uint data_size) } static inline struct f_trie_node4 * -new_node4(struct f_trie *t, int plen, ip4_addr paddr, ip4_addr pmask, ip4_addr amask) +new_node4(struct f_trie *t, uint plen, uint local, ip4_addr paddr, ip4_addr pmask, ip4_addr amask) { struct f_trie_node4 *n = lp_allocz(t->lp, sizeof(struct f_trie_node4) + t->data_size); n->plen = plen; + n->local = local; n->addr = paddr; n->mask = pmask; n->accept = amask; @@ -120,10 +124,11 @@ new_node4(struct f_trie *t, int plen, ip4_addr paddr, ip4_addr pmask, ip4_addr a } static inline struct f_trie_node6 * -new_node6(struct f_trie *t, int plen, ip6_addr paddr, ip6_addr pmask, ip6_addr amask) +new_node6(struct f_trie *t, uint plen, uint local, ip6_addr paddr, ip6_addr pmask, ip6_addr amask) { struct f_trie_node6 *n = lp_allocz(t->lp, sizeof(struct f_trie_node6) + t->data_size); n->plen = plen; + n->local = local; n->addr = paddr; n->mask = pmask; n->accept = amask; @@ -131,24 +136,24 @@ new_node6(struct f_trie *t, int plen, ip6_addr paddr, ip6_addr pmask, ip6_addr a } static inline struct f_trie_node * -new_node(struct f_trie *t, int plen, ip_addr paddr, ip_addr pmask, ip_addr amask) +new_node(struct f_trie *t, uint plen, uint local, ip_addr paddr, ip_addr pmask, ip_addr amask) { if (t->ipv4) - return (struct f_trie_node *) new_node4(t, plen, ipt_to_ip4(paddr), ipt_to_ip4(pmask), ipt_to_ip4(amask)); + return (struct f_trie_node *) new_node4(t, plen, local, ipt_to_ip4(paddr), ipt_to_ip4(pmask), ipt_to_ip4(amask)); else - return (struct f_trie_node *) new_node6(t, plen, ipa_to_ip6(paddr), ipa_to_ip6(pmask), ipa_to_ip6(amask)); + return (struct f_trie_node *) new_node6(t, plen, local, ipa_to_ip6(paddr), ipa_to_ip6(pmask), ipa_to_ip6(amask)); } static inline void attach_node4(struct f_trie_node4 *parent, struct f_trie_node4 *child) { - parent->c[ip4_getbit(child->addr, parent->plen) ? 
1 : 0] = child; + parent->c[ip4_getbits(child->addr, parent->plen, TRIE_STEP)] = child; } static inline void attach_node6(struct f_trie_node6 *parent, struct f_trie_node6 *child) { - parent->c[ip6_getbit(child->addr, parent->plen) ? 1 : 0] = child; + parent->c[ip6_getbits(child->addr, parent->plen, TRIE_STEP)] = child; } static inline void @@ -160,10 +165,139 @@ attach_node(struct f_trie_node *parent, struct f_trie_node *child, int v4) attach_node6(&parent->v6, &child->v6); } + +static inline uint +trie_local_mask4(ip4_addr px, uint plen, uint nlen) +{ + uint step = plen - nlen; + uint pos = (1u << step) + ip4_getbits(px, nlen, step); + return 1u << pos; +} + +static inline uint +trie_local_mask6(ip6_addr px, uint plen, uint nlen) +{ + uint step = plen - nlen; + uint pos = (1u << step) + ip6_getbits(px, nlen, step); + return 1u << pos; +} + +static inline uint +trie_amask_to_local(ip_addr px, ip_addr amask, uint nlen) +{ + uint local = 0; + + for (uint plen = MAX(nlen, 1); plen < (nlen + TRIE_STEP); plen++) + if (ipa_getbit(amask, plen - 1)) + local |= trie_local_mask(px, plen, nlen); + + return local; +} + #define GET_ADDR(N,F,X) ((X) ? ipt_from_ip4((N)->v4.F) : ipa_from_ip6((N)->v6.F)) #define SET_ADDR(N,F,X,V) ({ if (X) (N)->v4.F =ipt_to_ip4(V); else (N)->v6.F =ipa_to_ip6(V); }) +#define ADD_LOCAL(N,X,V) ({ uint v_ = (V); if (X) (N)->v4.local |= v_; else (N)->v6.local |= v_; }) + #define GET_CHILD(N,F,X,I) ((X) ? (struct f_trie_node *) (N)->v4.c[I] : (struct f_trie_node *) (N)->v6.c[I]) + + +static void * +trie_add_node(struct f_trie *t, uint plen, ip_addr px, uint local, uint l, uint h) +{ + uint l_ = l ? (l - 1) : 0; + ip_addr amask = (l_ < h) ? 
ipa_xor(ipa_mkmask(l_), ipa_mkmask(h)) : IPA_NONE; + ip_addr pmask = ipa_mkmask(plen); + ip_addr paddr = ipa_and(px, pmask); + struct f_trie_node *o = NULL; + struct f_trie_node *n = &t->root; + int v4 = t->ipv4; + + /* Add all bits for each active level (0x0002 0x000c 0x00f0 0xff00) */ + for (uint i = 0; i < TRIE_STEP; i++) + if ((l <= (plen + i)) && ((plen + i) <= h)) + local |= ((1u << (1u << i)) - 1) << (1u << i); + + DBG("Insert node %I/%u (%I %x)\n", paddr, plen, amask, local); + while (n) + { + ip_addr naddr = GET_ADDR(n, addr, v4); + ip_addr nmask = GET_ADDR(n, mask, v4); + ip_addr accept = GET_ADDR(n, accept, v4); + ip_addr cmask = ipa_and(nmask, pmask); + uint nlen = v4 ? n->v4.plen : n->v6.plen; + + DBG("Found node %I/%u (%I %x)\n", + naddr, nlen, accept, v4 ? n->v4.local : n->v6.local); + + if (ipa_compare(ipa_and(paddr, cmask), ipa_and(naddr, cmask))) + { + /* We are out of path - we have to add branching node 'b' + between node 'o' and node 'n', and attach new node 'a' + as the other child of 'b'. 
*/ + int blen = ROUND_DOWN_POW2(ipa_pxlen(paddr, naddr), TRIE_STEP); + ip_addr bmask = ipa_mkmask(blen); + ip_addr baddr = ipa_and(px, bmask); + + /* Merge accept masks from children to get accept mask for node 'b' */ + ip_addr baccm = ipa_and(ipa_or(amask, accept), bmask); + uint bloc = trie_amask_to_local(naddr, accept, blen) | + trie_amask_to_local(paddr, amask, blen); + + struct f_trie_node *a = new_node(t, plen, local, paddr, pmask, amask); + struct f_trie_node *b = new_node(t, blen, bloc, baddr, bmask, baccm); + attach_node(o, b, v4); + attach_node(b, n, v4); + attach_node(b, a, v4); + + DBG("Case 1\n"); + return a; + } + + if (plen < nlen) + { + /* We add new node 'a' between node 'o' and node 'n' */ + amask = ipa_or(amask, ipa_and(accept, pmask)); + local |= trie_amask_to_local(naddr, accept, plen); + struct f_trie_node *a = new_node(t, plen, local, paddr, pmask, amask); + attach_node(o, a, v4); + attach_node(a, n, v4); + + DBG("Case 2\n"); + return a; + } + + if (plen == nlen) + { + /* We already found added node in trie. Just update accept and local mask */ + accept = ipa_or(accept, amask); + SET_ADDR(n, accept, v4, accept); + ADD_LOCAL(n, v4, local); + + DBG("Case 3\n"); + return n; + } + + /* Update accept mask part M2 and go deeper */ + accept = ipa_or(accept, ipa_and(amask, nmask)); + SET_ADDR(n, accept, v4, accept); + ADD_LOCAL(n, v4, trie_amask_to_local(paddr, amask, nlen)); + + DBG("Step %u\n", ipa_getbits(paddr, nlen)); + + /* n->plen < plen and plen <= 32 (128) */ + o = n; + n = GET_CHILD(n, c, v4, ipa_getbits(paddr, nlen, TRIE_STEP)); + } + + /* We add new tail node 'a' after node 'o' */ + struct f_trie_node *a = new_node(t, plen, local, paddr, pmask, amask); + attach_node(o, a, v4); + + DBG("Case 4\n"); + return a; +} + /** * trie_add_prefix * @t: trie to add to @@ -180,7 +314,6 @@ attach_node(struct f_trie_node *parent, struct f_trie_node *child, int v4) * a pointer to the root node is returned. 
Returns NULL when called with * mismatched IPv4/IPv6 net type. */ - void * trie_add_prefix(struct f_trie *t, const net_addr *net, uint l, uint h) { @@ -203,82 +336,66 @@ trie_add_prefix(struct f_trie *t, const net_addr *net, uint l, uint h) return NULL; } + DBG("\nInsert net %N (%u-%u)\n", net, l, h); + if (l == 0) t->zero = 1; - else - l--; if (h < plen) plen = h; - ip_addr amask = ipa_xor(ipa_mkmask(l), ipa_mkmask(h)); - ip_addr pmask = ipa_mkmask(plen); - ip_addr paddr = ipa_and(px, pmask); - struct f_trie_node *o = NULL; - struct f_trie_node *n = &t->root; + /* Primary node length, plen rounded down */ + uint nlen = ROUND_DOWN_POW2(plen, TRIE_STEP); - while (n) - { - ip_addr naddr = GET_ADDR(n, addr, v4); - ip_addr nmask = GET_ADDR(n, mask, v4); - ip_addr accept = GET_ADDR(n, accept, v4); - ip_addr cmask = ipa_and(nmask, pmask); - uint nlen = v4 ? n->v4.plen : n->v6.plen; + if (plen == nlen) + return trie_add_node(t, nlen, px, 0, l, h); - if (ipa_compare(ipa_and(paddr, cmask), ipa_and(naddr, cmask))) - { - /* We are out of path - we have to add branching node 'b' - between node 'o' and node 'n', and attach new node 'a' - as the other child of 'b'. */ - int blen = ipa_pxlen(paddr, naddr); - ip_addr bmask = ipa_mkmask(blen); - ip_addr baddr = ipa_and(px, bmask); + /* Secondary node length, plen rouned up */ + uint slen = nlen + TRIE_STEP; + void *node = NULL; - /* Merge accept masks from children to get accept mask for node 'b' */ - ip_addr baccm = ipa_and(ipa_or(amask, accept), bmask); + /* + * For unaligned prefix lengths it is more complicated. We need to encode + * matching prefixes of lengths from l to h. 
There are three cases of lengths: + * + * 1) 0..nlen are encoded by the accept mask of the primary node + * 2) nlen..(slen-1) are encoded by the local mask of the primary node + * 3) slen..max are encoded in secondary nodes + */ - struct f_trie_node *a = new_node(t, plen, paddr, pmask, amask); - struct f_trie_node *b = new_node(t, blen, baddr, bmask, baccm); - attach_node(o, b, v4); - attach_node(b, n, v4); - attach_node(b, a, v4); - return a; - } + if (l < slen) + { + uint local = 0; - if (plen < nlen) - { - /* We add new node 'a' between node 'o' and node 'n' */ - amask = ipa_or(amask, ipa_and(accept, pmask)); - struct f_trie_node *a = new_node(t, plen, paddr, pmask, amask); - attach_node(o, a, v4); - attach_node(a, n, v4); - return a; - } + /* Compute local bits for accepted nlen..(slen-1) prefixes */ + for (uint i = 0; i < TRIE_STEP; i++) + if ((l <= (nlen + i)) && ((nlen + i) <= h)) + { + uint pos = (1u << i) + ipa_getbits(px, nlen, i); + uint len = ((nlen + i) <= plen) ? 1 : (1u << (nlen + i - plen)); - if (plen == nlen) - { - /* We already found added node in trie. Just update accept mask */ - accept = ipa_or(accept, amask); - SET_ADDR(n, accept, v4, accept); - return n; - } + /* We need to fill 'len' bits starting at 'pos' position */ + local |= ((1u << len) - 1) << pos; + } - /* Update accept mask part M2 and go deeper */ - accept = ipa_or(accept, ipa_and(amask, nmask)); - SET_ADDR(n, accept, v4, accept); + /* Add the primary node */ + node = trie_add_node(t, nlen, px, local, l, nlen); + } - /* n->plen < plen and plen <= 32 (128) */ - o = n; - n = GET_CHILD(n, c, v4, ipa_getbit(paddr, nlen) ? 
1 : 0); - } + if (slen <= h) + { + uint l2 = MAX(l, slen); + uint max = (1u << (slen - plen)); - /* We add new tail node 'a' after node 'o' */ - struct f_trie_node *a = new_node(t, plen, paddr, pmask, amask); - attach_node(o, a, v4); + /* Add secondary nodes */ + for (uint i = 0; i < max; i++) + node = trie_add_node(t, slen, ipa_setbits(px, slen - 1, i), 0, l2, h); + } - return a; + return node; } + static int trie_match_net4(const struct f_trie *t, ip4_addr px, uint plen) { @@ -289,6 +406,8 @@ trie_match_net4(const struct f_trie *t, ip4_addr px, uint plen) return t->zero; int plentest = plen - 1; + uint nlen = ROUND_DOWN_POW2(plen, TRIE_STEP); + uint local = trie_local_mask4(px, plen, nlen); const struct f_trie_node4 *n = &t->root.v4; while (n) @@ -299,6 +418,10 @@ trie_match_net4(const struct f_trie *t, ip4_addr px, uint plen) if (ip4_compare(ip4_and(paddr, cmask), ip4_and(n->addr, cmask))) return 0; + /* Check local mask */ + if ((n->plen == nlen) && (n->local & local)) + return 1; + /* Check accept mask */ if (ip4_getbit(n->accept, plentest)) return 1; @@ -308,7 +431,7 @@ trie_match_net4(const struct f_trie *t, ip4_addr px, uint plen) return 0; /* Choose children */ - n = n->c[(ip4_getbit(paddr, n->plen)) ? 
1 : 0]; + n = n->c[ip4_getbits(paddr, n->plen, TRIE_STEP)]; } return 0; @@ -324,6 +447,8 @@ trie_match_net6(const struct f_trie *t, ip6_addr px, uint plen) return t->zero; int plentest = plen - 1; + uint nlen = ROUND_DOWN_POW2(plen, TRIE_STEP); + uint local = trie_local_mask6(px, plen, nlen); const struct f_trie_node6 *n = &t->root.v6; while (n) @@ -334,6 +459,10 @@ trie_match_net6(const struct f_trie *t, ip6_addr px, uint plen) if (ip6_compare(ip6_and(paddr, cmask), ip6_and(n->addr, cmask))) return 0; + /* Check local mask */ + if ((n->plen == nlen) && (n->local & local)) + return 1; + /* Check accept mask */ if (ip6_getbit(n->accept, plentest)) return 1; @@ -343,7 +472,7 @@ trie_match_net6(const struct f_trie *t, ip6_addr px, uint plen) return 0; /* Choose children */ - n = n->c[(ip6_getbit(paddr, n->plen)) ? 1 : 0]; + n = n->c[ip6_getbits(paddr, n->plen, TRIE_STEP)]; } return 0; @@ -392,7 +521,11 @@ trie_node_same4(const struct f_trie_node4 *t1, const struct f_trie_node4 *t2) (! ip4_equal(t1->accept, t2->accept))) return 0; - return trie_node_same4(t1->c[0], t2->c[0]) && trie_node_same4(t1->c[1], t2->c[1]); + for (uint i = 0; i < (1 << TRIE_STEP); i++) + if (! trie_node_same4(t1->c[i], t2->c[i])) + return 0; + + return 1; } static int @@ -409,7 +542,11 @@ trie_node_same6(const struct f_trie_node6 *t1, const struct f_trie_node6 *t2) (! ip6_equal(t1->accept, t2->accept))) return 0; - return trie_node_same6(t1->c[0], t2->c[0]) && trie_node_same6(t1->c[1], t2->c[1]); + for (uint i = 0; i < (1 << TRIE_STEP); i++) + if (! 
trie_node_same6(t1->c[i], t2->c[i])) + return 0; + + return 1; } /** @@ -440,8 +577,8 @@ trie_node_format4(const struct f_trie_node4 *t, buffer *buf) if (ip4_nonzero(t->accept)) buffer_print(buf, "%I4/%d{%I4}, ", t->addr, t->plen, t->accept); - trie_node_format4(t->c[0], buf); - trie_node_format4(t->c[1], buf); + for (uint i = 0; i < (1 << TRIE_STEP); i++) + trie_node_format4(t->c[i], buf); } static void @@ -453,8 +590,8 @@ trie_node_format6(const struct f_trie_node6 *t, buffer *buf) if (ip6_nonzero(t->accept)) buffer_print(buf, "%I6/%d{%I6}, ", t->addr, t->plen, t->accept); - trie_node_format6(t->c[0], buf); - trie_node_format6(t->c[1], buf); + for (uint i = 0; i < (1 << TRIE_STEP); i++) + trie_node_format6(t->c[i], buf); } /** diff --git a/lib/birdlib.h b/lib/birdlib.h index 431b7c0d..81d4908a 100644 --- a/lib/birdlib.h +++ b/lib/birdlib.h @@ -32,6 +32,9 @@ struct align_probe { char x; long int y; }; #define MAX(a,b) MAX_(a,b) #endif +#define ROUND_DOWN_POW2(a,b) ((a) & ~((b)-1)) +#define ROUND_UP_POW2(a,b) (((a)+((b)-1)) & ~((b)-1)) + #define U64(c) UINT64_C(c) #define ABS(a) ((a)>=0 ? 
(a) : -(a)) #define DELTA(a,b) (((a)>=(b))?(a)-(b):(b)-(a)) diff --git a/lib/ip.h b/lib/ip.h index 5b179acb..cc36ce64 100644 --- a/lib/ip.h +++ b/lib/ip.h @@ -280,10 +280,16 @@ static inline uint ip6_pxlen(ip6_addr a, ip6_addr b) } static inline u32 ip4_getbit(ip4_addr a, uint pos) -{ return _I(a) & (0x80000000 >> pos); } +{ return (_I(a) >> (31 - pos)) & 1; } + +static inline u32 ip4_getbits(ip4_addr a, uint pos, uint n) +{ return (_I(a) >> ((32 - n) - pos)) & ((1u << n) - 1); } static inline u32 ip6_getbit(ip6_addr a, uint pos) -{ return a.addr[pos / 32] & (0x80000000 >> (pos % 32)); } +{ return (a.addr[pos / 32] >> (31 - (pos % 32))) & 0x1; } + +static inline u32 ip6_getbits(ip6_addr a, uint pos, uint n) +{ return (a.addr[pos / 32] >> ((32 - n) - (pos % 32))) & ((1u << n) - 1); } static inline u32 ip4_setbit(ip4_addr *a, uint pos) { return _I(*a) |= (0x80000000 >> pos); } @@ -297,6 +303,13 @@ static inline u32 ip4_clrbit(ip4_addr *a, uint pos) static inline u32 ip6_clrbit(ip6_addr *a, uint pos) { return a->addr[pos / 32] &= ~(0x80000000 >> (pos % 32)); } +static inline ip4_addr ip4_setbits(ip4_addr a, uint pos, uint val) +{ _I(a) |= val << (31 - pos); return a; } + +static inline ip6_addr ip6_setbits(ip6_addr a, uint pos, uint val) +{ a.addr[pos / 32] |= val << (31 - pos % 32); return a; } + + static inline ip4_addr ip4_opposite_m1(ip4_addr a) { return _MI4(_I(a) ^ 1); } From 562a2b8c29a50cca5731b0a19e99a87a261ab4ef Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Sun, 5 Apr 2020 03:56:07 +0200 Subject: [PATCH 006/196] Filter: Fix trie test Generated prefixes must be valid. 
--- filter/trie_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/filter/trie_test.c b/filter/trie_test.c index b2b36716..5e931e4e 100644 --- a/filter/trie_test.c +++ b/filter/trie_test.c @@ -58,7 +58,8 @@ get_random_ip6_prefix(void) struct f_prefix p; u8 pxlen = xrandom(120)+8; ip6_addr ip6 = ip6_build(bt_random(),bt_random(),bt_random(),bt_random()); - net_addr_ip6 net6 = NET_ADDR_IP6(ip6, pxlen); + ip6_addr mask = ip6_mkmask(pxlen); + net_addr_ip6 net6 = NET_ADDR_IP6(ip6_and(ip6, mask), pxlen); p.net = *((net_addr*) &net6); @@ -87,7 +88,7 @@ generate_random_ipv6_prefixes(list *prefixes) struct f_prefix_node *px = calloc(1, sizeof(struct f_prefix_node)); px->prefix = f; - bt_debug("ADD\t" PRIip6 "/%d %d-%d\n", ARGip6(net6_prefix(&px->prefix.net)), px->prefix.net.pxlen, px->prefix.lo, px->prefix.hi); + bt_debug("ADD\t" PRIip6 "/%d{%d,%d}\n", ARGip6(net6_prefix(&px->prefix.net)), px->prefix.net.pxlen, px->prefix.lo, px->prefix.hi); add_tail(prefixes, &px->n); } } From dd61278c9db1d4bea29f0a21aa460c7fe931eb32 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Mon, 6 Apr 2020 14:20:16 +0200 Subject: [PATCH 007/196] Filter: Update trie documentation --- filter/trie.c | 113 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 44 deletions(-) diff --git a/filter/trie.c b/filter/trie.c index cf805afc..dbed5ace 100644 --- a/filter/trie.c +++ b/filter/trie.c @@ -1,7 +1,8 @@ /* * Filters: Trie for prefix sets * - * Copyright 2009 Ondrej Zajicek + * (c) 2009--2020 Ondrej Zajicek + * (c) 2009--2020 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -9,53 +10,68 @@ /** * DOC: Trie for prefix sets * - * We use a (compressed) trie to represent prefix sets. Every node - * in the trie represents one prefix (&addr/&plen) and &plen also - * indicates the index of the bit in the address that is used to - * branch at the node. 
If we need to represent just a set of - * prefixes, it would be simple, but we have to represent a - * set of prefix patterns. Each prefix pattern consists of - * &ppaddr/&pplen and two integers: &low and &high, and a prefix - * &paddr/&plen matches that pattern if the first MIN(&plen, &pplen) - * bits of &paddr and &ppaddr are the same and &low <= &plen <= &high. + * We use a (compressed) trie to represent prefix sets. Every node in the trie + * represents one prefix (&addr/&plen) and &plen also indicates the index of + * bits in the address that are used to branch at the node. Note that such + * prefix is not necessary a member of the prefix set, it is just a canonical + * prefix associated with a node. Prefix lengths of nodes are aligned to + * multiples of &TRIE_STEP (4) and there is 16-way branching in each + * node. Therefore, we say that a node is associated with a range of prefix + * lengths (&plen .. &plen + TRIE_STEP - 1). * - * We use a bitmask (&accept) to represent accepted prefix lengths - * at a node. As there are 33 prefix lengths (0..32 for IPv4), but - * there is just one prefix of zero length in the whole trie so we - * have &zero flag in &f_trie (indicating whether the trie accepts - * prefix 0.0.0.0/0) as a special case, and &accept bitmask + * The prefix set is not just a set of prefixes, it is defined by a set of + * prefix patterns. Each prefix pattern consists of &ppaddr/&pplen and two + * integers: &low and &high. The tested prefix &paddr/&plen matches that pattern + * if the first MIN(&plen, &pplen) bits of &paddr and &ppaddr are the same and + * &low <= &plen <= &high. + * + * There are two ways to represent accepted prefixes for a node. First, there is + * a bitmask &local, which represents independently all 15 prefixes that extend + * the canonical prefix of the node and are within a range of prefix lengths + * associated with the node. E.g., for node 10.0.0.0/8 they are 10.0.0.0/8, + * 10.0.0.0/9, 10.128.0.0/9, .. 10.224.0.0/11. 
This order (first by length, then + * lexicographically) is used for indexing the bitmask &local, starting at + * position 1. I.e., index is 2^(plen - base) + offset within the same length, + * see function trie_local_mask6() for details. + * + * Second, we use a bitmask &accept to represent accepted prefix lengths at a + * node. The bit is set means that all prefixes of given length that are either + * subprefixes or superprefixes of the canonical prefix are accepted. As there + * are 33 prefix lengths (0..32 for IPv4), but there is just one prefix of zero + * length in the whole trie so we have &zero flag in &f_trie (indicating whether + * the trie accepts prefix 0.0.0.0/0) as a special case, and &accept bitmask * represents accepted prefix lengths from 1 to 32. * - * There are two cases in prefix matching - a match when the length - * of the prefix is smaller that the length of the prefix pattern, - * (&plen < &pplen) and otherwise. The second case is simple - we - * just walk through the trie and look at every visited node - * whether that prefix accepts our prefix length (&plen). The - * first case is tricky - we don't want to examine every descendant - * of a final node, so (when we create the trie) we have to propagate - * that information from nodes to their ascendants. + * One complication is handling of prefix patterns with unaligned prefix length. + * When such pattern is to be added, we add a primary node above (with rounded + * down prefix length &nlen) and a set of secondary nodes below (with rounded up + * prefix lengths &slen). Accepted prefix lengths of the original prefix pattern + * are then represented in different places based on their lengths. For prefixes + * shorter than &nlen, it is &accept bitmask of the primary node, for prefixes + * between &nlen and &slen - 1 it is &local bitmask of the primary node, and for + * prefixes longer of equal &slen it is &accept bitmasks of secondary nodes. 
* - * Suppose that we have two masks (M1 and M2) for a node. Mask M1 - * represents accepted prefix lengths by just the node and mask M2 - * represents accepted prefix lengths by the node or any of its - * descendants. Therefore M2 is a bitwise or of M1 and children's - * M2 and this is a maintained invariant during trie building. - * Basically, when we want to match a prefix, we walk through the trie, - * check mask M1 for our prefix length and when we came to - * final node, we check mask M2. + * There are two cases in prefix matching - a match when the length of the + * prefix is smaller that the length of the prefix pattern, (&plen < &pplen) and + * otherwise. The second case is simple - we just walk through the trie and look + * at every visited node whether that prefix accepts our prefix length (&plen). + * The first case is tricky - we do not want to examine every descendant of a + * final node, so (when we create the trie) we have to propagate that + * information from nodes to their ascendants. * - * There are two differences in the real implementation. First, - * we use a compressed trie so there is a case that we skip our - * final node (if it is not in the trie) and we came to node that - * is either extension of our prefix, or completely out of path - * In the first case, we also have to check M2. + * There are two kinds of propagations - propagation from child's &accept + * bitmask to parent's &accept bitmask, and propagation from child's &accept + * bitmask to parent's &local bitmask. The first kind is simple - as all + * superprefixes of a parent are also all superprefixes of appropriate length of + * a child, then we can just add (by bitwise or) a child &accept mask masked by + * parent prefix length mask to the parent &accept mask. This handles prefixes + * shorter than node &plen. * - * Second, we really need not to maintain two separate bitmasks. 
- * Checks for mask M1 are always larger than &applen and we need - * just the first &pplen bits of mask M2 (if trie compression - * hadn't been used it would suffice to know just $applen-th bit), - * so we have to store them together in &accept mask - the first - * &pplen bits of mask M2 and then mask M1. + * The second kind of propagation is necessary to handle superprefixes of a + * child that are represented by parent &local mask - that are in the range of + * prefix lengths associated with the parent. For each accepted (by child + * &accept mask) prefix length from that range, we need to set appropriate bit + * in &local mask. See function trie_amask_to_local() for details. * * There are four cases when we walk through a trie: * @@ -65,8 +81,7 @@ * - we are beyond the end of path (node length > &plen) * - we are still on path and keep walking (node length < &plen) * - * The walking code in trie_match_prefix() is structured according to - * these cases. + * The walking code in trie_match_net() is structured according to these cases. */ #include "nest/bird.h" @@ -166,6 +181,10 @@ attach_node(struct f_trie_node *parent, struct f_trie_node *child, int v4) } +/* + * Compute appropriate mask representing prefix px/plen in local bitmask of node + * with prefix length nlen. Assuming that nlen <= plen < (nlen + TRIE_STEP). + */ static inline uint trie_local_mask4(ip4_addr px, uint plen, uint nlen) { @@ -182,6 +201,12 @@ trie_local_mask6(ip6_addr px, uint plen, uint nlen) return 1u << pos; } +/* + * Compute an appropriate local mask (for a node with prefix length nlen) + * representing prefixes of px that are accepted by amask and fall within the + * range associated with that node. Used for propagation of child accept mask + * to parent local mask. 
+ */ static inline uint trie_amask_to_local(ip_addr px, ip_addr amask, uint nlen) { From e709dc09e61c4604821d10b81604d38616b81a0b Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Tue, 21 Apr 2020 13:49:29 +0200 Subject: [PATCH 008/196] Filter: Improve prefix trie tests Add tests explicitly matching insides and outsides of trie and update tests to do testing of both IPv4 and IPv6 tries. --- filter/trie_test.c | 402 +++++++++++++++++++++++++++++++++++---------- test/birdtest.c | 6 + test/birdtest.h | 2 + 3 files changed, 323 insertions(+), 87 deletions(-) diff --git a/filter/trie_test.c b/filter/trie_test.c index 5e931e4e..6418427e 100644 --- a/filter/trie_test.c +++ b/filter/trie_test.c @@ -14,7 +14,7 @@ #include "conf/conf.h" #define TESTS_NUM 10 -#define PREFIXES_NUM 10 +#define PREFIXES_NUM 32 #define PREFIX_TESTS_NUM 10000 #define BIG_BUFFER_SIZE 10000 @@ -31,145 +31,371 @@ xrandom(u32 max) return (bt_random() % max); } +static inline uint +get_exp_random(void) +{ + uint r, n = 0; + + for (r = bt_random(); r & 1; r = r >> 1) + n++; + + return n; +} + +static inline int +matching_ip4_nets(const net_addr_ip4 *a, const net_addr_ip4 *b) +{ + ip4_addr cmask = ip4_mkmask(MIN(a->pxlen, b->pxlen)); + return ip4_compare(ip4_and(a->prefix, cmask), ip4_and(b->prefix, cmask)) == 0; +} + +static inline int +matching_ip6_nets(const net_addr_ip6 *a, const net_addr_ip6 *b) +{ + ip6_addr cmask = ip6_mkmask(MIN(a->pxlen, b->pxlen)); + return ip6_compare(ip6_and(a->prefix, cmask), ip6_and(b->prefix, cmask)) == 0; +} + +static inline int +matching_nets(const net_addr *a, const net_addr *b) +{ + if (a->type != b->type) + return 0; + + return (a->type == NET_IP4) ? 
+ matching_ip4_nets((const net_addr_ip4 *) a, (const net_addr_ip4 *) b) : + matching_ip6_nets((const net_addr_ip6 *) a, (const net_addr_ip6 *) b); +} + static int -is_prefix_included(list *prefixes, struct f_prefix *needle) +is_prefix_included(list *prefixes, const net_addr *needle) { struct f_prefix_node *n; WALK_LIST(n, *prefixes) - { - ip6_addr cmask = ip6_mkmask(MIN(n->prefix.net.pxlen, needle->net.pxlen)); - - ip6_addr ip = net6_prefix(&n->prefix.net); - ip6_addr needle_ip = net6_prefix(&needle->net); - - if ((ipa_compare(ipa_and(ip, cmask), ipa_and(needle_ip, cmask)) == 0) && - (n->prefix.lo <= needle->net.pxlen) && (needle->net.pxlen <= n->prefix.hi)) + if (matching_nets(&n->prefix.net, needle) && + (n->prefix.lo <= needle->pxlen) && (needle->pxlen <= n->prefix.hi)) { - bt_debug("FOUND\t" PRIip6 "/%d %d-%d\n", ARGip6(net6_prefix(&n->prefix.net)), n->prefix.net.pxlen, n->prefix.lo, n->prefix.hi); + char buf[64]; + bt_format_net(buf, 64, &n->prefix.net); + bt_debug("FOUND %s %d-%d\n", buf, n->prefix.lo, n->prefix.hi); + return 1; /* OK */ } - } + return 0; /* FAIL */ } -static struct f_prefix -get_random_ip6_prefix(void) +static void +get_random_net(net_addr *net, int v6) { - struct f_prefix p; - u8 pxlen = xrandom(120)+8; - ip6_addr ip6 = ip6_build(bt_random(),bt_random(),bt_random(),bt_random()); - ip6_addr mask = ip6_mkmask(pxlen); - net_addr_ip6 net6 = NET_ADDR_IP6(ip6_and(ip6, mask), pxlen); - - p.net = *((net_addr*) &net6); - - if (bt_random() % 2) + if (!v6) { - p.lo = 0; - p.hi = p.net.pxlen; + uint pxlen = xrandom(24)+8; + ip4_addr ip4 = ip4_from_u32((u32) bt_random()); + net_fill_ip4(net, ip4_and(ip4, ip4_mkmask(pxlen)), pxlen); } else { - p.lo = p.net.pxlen; - p.hi = net_max_prefix_length[p.net.type]; + uint pxlen = xrandom(120)+8; + ip6_addr ip6 = ip6_build(bt_random(), bt_random(), bt_random(), bt_random()); + net_fill_ip6(net, ip6_and(ip6, ip6_mkmask(pxlen)), pxlen); } - - return p; } static void -generate_random_ipv6_prefixes(list *prefixes) 
+get_random_prefix(struct f_prefix *px, int v6) { - int i; - for (i = 0; i < PREFIXES_NUM; i++) + get_random_net(&px->net, v6); + + if (bt_random() % 2) { - struct f_prefix f = get_random_ip6_prefix(); - - struct f_prefix_node *px = calloc(1, sizeof(struct f_prefix_node)); - px->prefix = f; - - bt_debug("ADD\t" PRIip6 "/%d{%d,%d}\n", ARGip6(net6_prefix(&px->prefix.net)), px->prefix.net.pxlen, px->prefix.lo, px->prefix.hi); - add_tail(prefixes, &px->n); + px->lo = 0; + px->hi = px->net.pxlen; + } + else + { + px->lo = px->net.pxlen; + px->hi = net_max_prefix_length[px->net.type]; } } +static void +get_random_ip4_subnet(net_addr_ip4 *net, const net_addr_ip4 *src, int pxlen) +{ + *net = NET_ADDR_IP4(ip4_and(src->prefix, ip4_mkmask(pxlen)), pxlen); + + if (pxlen > src->pxlen) + { + ip4_addr rnd = ip4_from_u32((u32) bt_random()); + ip4_addr mask = ip4_xor(ip4_mkmask(src->pxlen), ip4_mkmask(pxlen)); + net->prefix = ip4_or(net->prefix, ip4_and(rnd, mask)); + } +} + +static void +get_random_ip6_subnet(net_addr_ip6 *net, const net_addr_ip6 *src, int pxlen) +{ + *net = NET_ADDR_IP6(ip6_and(src->prefix, ip6_mkmask(pxlen)), pxlen); + + if (pxlen > src->pxlen) + { + ip6_addr rnd = ip6_build(bt_random(), bt_random(), bt_random(), bt_random()); + ip6_addr mask = ip6_xor(ip6_mkmask(src->pxlen), ip6_mkmask(pxlen)); + net->prefix = ip6_or(net->prefix, ip6_and(rnd, mask)); + } +} + +static void +get_random_subnet(net_addr *net, const net_addr *src, int pxlen) +{ + if (src->type == NET_IP4) + get_random_ip4_subnet((net_addr_ip4 *) net, (const net_addr_ip4 *) src, pxlen); + else + get_random_ip6_subnet((net_addr_ip6 *) net, (const net_addr_ip6 *) src, pxlen); +} + +static void +get_inner_net(net_addr *net, const struct f_prefix *src) +{ + int pxlen, step; + + if (bt_random() % 2) + { + step = get_exp_random(); + step = MIN(step, src->hi - src->lo); + pxlen = (bt_random() % 2) ? 
(src->lo + step) : (src->hi - step); + } + else + pxlen = src->lo + bt_random() % (src->hi - src->lo + 1); + + get_random_subnet(net, &src->net, pxlen); +} + +static void +swap_random_bits_ip4(net_addr_ip4 *net, int num) +{ + for (int i = 0; i < num; i++) + { + ip4_addr swap = IP4_NONE; + ip4_setbit(&swap, bt_random() % net->pxlen); + net->prefix = ip4_xor(net->prefix, swap); + } +} + +static void +swap_random_bits_ip6(net_addr_ip6 *net, int num) +{ + for (int i = 0; i < num; i++) + { + ip6_addr swap = IP6_NONE; + ip6_setbit(&swap, bt_random() % net->pxlen); + net->prefix = ip6_xor(net->prefix, swap); + } +} + +static void +swap_random_bits(net_addr *net, int num) +{ + if (net->type == NET_IP4) + swap_random_bits_ip4((net_addr_ip4 *) net, num); + else + swap_random_bits_ip6((net_addr_ip6 *) net, num); +} + +static void +get_outer_net(net_addr *net, const struct f_prefix *src) +{ + int pxlen, step; + int inside = 0; + int max = net_max_prefix_length[src->net.type]; + + if ((src->lo > 0) && (bt_random() % 3)) + { + step = 1 + get_exp_random(); + step = MIN(step, src->lo); + pxlen = src->lo - step; + } + else if ((src->hi < max) && (bt_random() % 2)) + { + step = 1 + get_exp_random(); + step = MIN(step, max - src->hi); + pxlen = src->hi + step; + } + else + { + pxlen = src->lo + bt_random() % (src->hi - src->lo + 1); + inside = 1; + } + + get_random_subnet(net, &src->net, pxlen); + + /* Perhaps swap some bits in prefix */ + if ((net->pxlen > 0) && (inside || (bt_random() % 4))) + swap_random_bits(net, 1 + get_exp_random()); +} + +static list * +make_random_prefix_list(linpool *lp, int num, int v6) +{ + list *prefixes = lp_allocz(lp, sizeof(struct f_prefix_node)); + init_list(prefixes); + + for (int i = 0; i < num; i++) + { + struct f_prefix_node *px = lp_allocz(lp, sizeof(struct f_prefix_node)); + get_random_prefix(&px->prefix, v6); + add_tail(prefixes, &px->n); + + char buf[64]; + bt_format_net(buf, 64, &px->prefix.net); + bt_debug("ADD %s{%d,%d}\n", buf, 
px->prefix.lo, px->prefix.hi); + } + + return prefixes; +} + +static struct f_trie * +make_trie_from_prefix_list(linpool *lp, list *prefixes) +{ + struct f_trie *trie = f_new_trie(lp, 0); + + struct f_prefix_node *n; + WALK_LIST(n, *prefixes) + trie_add_prefix(trie, &n->prefix.net, n->prefix.lo, n->prefix.hi); + + return trie; +} + +static void +test_match_net(list *prefixes, struct f_trie *trie, const net_addr *net) +{ + char buf[64]; + bt_format_net(buf, 64, net); + bt_debug("TEST %s\n", buf); + + int should_be = is_prefix_included(prefixes, net); + int is_there = trie_match_net(trie, net); + + bt_assert_msg(should_be == is_there, "Prefix %s %s match", buf, + (should_be ? "should" : "should not")); +} + static int -t_match_net(void) +t_match_random_net(void) { bt_bird_init(); bt_config_parse(BT_CONFIG_SIMPLE); - uint round; - for (round = 0; round < TESTS_NUM; round++) + int v6 = 0; + linpool *lp = lp_new_default(&root_pool); + for (int round = 0; round < TESTS_NUM; round++) { - list prefixes; /* of structs f_extended_prefix */ - init_list(&prefixes); - struct f_trie *trie = f_new_trie(config->mem, 0); + list *prefixes = make_random_prefix_list(lp, PREFIXES_NUM, v6); + struct f_trie *trie = make_trie_from_prefix_list(lp, prefixes); - generate_random_ipv6_prefixes(&prefixes); - struct f_prefix_node *n; - WALK_LIST(n, prefixes) + for (int i = 0; i < PREFIX_TESTS_NUM; i++) { - trie_add_prefix(trie, &n->prefix.net, n->prefix.lo, n->prefix.hi); + net_addr net; + get_random_net(&net, v6); + test_match_net(prefixes, trie, &net); } - int i; - for (i = 0; i < PREFIX_TESTS_NUM; i++) - { - struct f_prefix f = get_random_ip6_prefix(); - bt_debug("TEST\t" PRIip6 "/%d\n", ARGip6(net6_prefix(&f.net)), f.net.pxlen); - - int should_be = is_prefix_included(&prefixes, &f); - int is_there = trie_match_net(trie, &f.net); - bt_assert_msg(should_be == is_there, "Prefix " PRIip6 "/%d %s", ARGip6(net6_prefix(&f.net)), f.net.pxlen, (should_be ? 
"should be found in trie" : "should not be found in trie")); - } - - struct f_prefix_node *nxt; - WALK_LIST_DELSAFE(n, nxt, prefixes) - { - free(n); - } + v6 = !v6; + lp_flush(lp); } bt_bird_cleanup(); return 1; } +static int +t_match_inner_net(void) +{ + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); + + int v6 = 0; + linpool *lp = lp_new_default(&root_pool); + for (int round = 0; round < TESTS_NUM; round++) + { + list *prefixes = make_random_prefix_list(lp, PREFIXES_NUM, v6); + struct f_trie *trie = make_trie_from_prefix_list(lp, prefixes); + + struct f_prefix_node *n = HEAD(*prefixes); + for (int i = 0; i < PREFIX_TESTS_NUM; i++) + { + net_addr net; + get_inner_net(&net, &n->prefix); + test_match_net(prefixes, trie, &net); + + n = NODE_VALID(NODE_NEXT(n)) ? NODE_NEXT(n) : HEAD(*prefixes); + } + + v6 = !v6; + lp_flush(lp); + } + + bt_bird_cleanup(); + return 1; +} + +static int +t_match_outer_net(void) +{ + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); + + int v6 = 0; + linpool *lp = lp_new_default(&root_pool); + for (int round = 0; round < TESTS_NUM; round++) + { + list *prefixes = make_random_prefix_list(lp, PREFIXES_NUM, v6); + struct f_trie *trie = make_trie_from_prefix_list(lp, prefixes); + + struct f_prefix_node *n = HEAD(*prefixes); + for (int i = 0; i < PREFIX_TESTS_NUM; i++) + { + net_addr net; + get_outer_net(&net, &n->prefix); + test_match_net(prefixes, trie, &net); + + n = NODE_VALID(NODE_NEXT(n)) ? 
NODE_NEXT(n) : HEAD(*prefixes); + } + + v6 = !v6; + lp_flush(lp); + } + + v6 = !v6; + bt_bird_cleanup(); + return 1; +} + static int t_trie_same(void) { bt_bird_init(); bt_config_parse(BT_CONFIG_SIMPLE); - int round; - for (round = 0; round < TESTS_NUM*4; round++) + int v6 = 0; + linpool *lp = lp_new_default(&root_pool); + for (int round = 0; round < TESTS_NUM*4; round++) { - struct f_trie * trie1 = f_new_trie(config->mem, 0); - struct f_trie * trie2 = f_new_trie(config->mem, 0); - - list prefixes; /* a list of f_extended_prefix structures */ - init_list(&prefixes); - int i; - for (i = 0; i < 100; i++) - generate_random_ipv6_prefixes(&prefixes); + list *prefixes = make_random_prefix_list(lp, 100 * PREFIXES_NUM, v6); + struct f_trie *trie1 = f_new_trie(lp, 0); + struct f_trie *trie2 = f_new_trie(lp, 0); struct f_prefix_node *n; - WALK_LIST(n, prefixes) - { + WALK_LIST(n, *prefixes) trie_add_prefix(trie1, &n->prefix.net, n->prefix.lo, n->prefix.hi); - } - WALK_LIST_BACKWARDS(n, prefixes) - { + + WALK_LIST_BACKWARDS(n, *prefixes) trie_add_prefix(trie2, &n->prefix.net, n->prefix.lo, n->prefix.hi); - } bt_assert(trie_same(trie1, trie2)); - struct f_prefix_node *nxt; - WALK_LIST_DELSAFE(n, nxt, prefixes) - { - free(n); - } + v6 = !v6; + lp_flush(lp); } return 1; @@ -180,7 +406,9 @@ main(int argc, char *argv[]) { bt_init(argc, argv); - bt_test_suite(t_match_net, "Testing random prefix matching"); + bt_test_suite(t_match_random_net, "Testing random prefix matching"); + bt_test_suite(t_match_inner_net, "Testing random inner prefix matching"); + bt_test_suite(t_match_outer_net, "Testing random outer prefix matching"); bt_test_suite(t_trie_same, "A trie filled forward should be same with a trie filled backward."); return bt_exit_value(); diff --git a/test/birdtest.c b/test/birdtest.c index a1da078f..d739e78b 100644 --- a/test/birdtest.c +++ b/test/birdtest.c @@ -501,6 +501,12 @@ bt_fmt_ipa(char *buf, size_t size, const void *data) bsnprintf(buf, size, "(null)"); } +void 
+bt_format_net(char *buf, size_t size, const void *data) +{ + bsnprintf(buf, size, "%N", (const net_addr *) data); +} + int bt_is_char(byte c) { diff --git a/test/birdtest.h b/test/birdtest.h index caec529b..7a0c2fc4 100644 --- a/test/birdtest.h +++ b/test/birdtest.h @@ -165,6 +165,8 @@ struct bt_batch { void bt_fmt_str(char *buf, size_t size, const void *data); void bt_fmt_unsigned(char *buf, size_t size, const void *data); void bt_fmt_ipa(char *buf, size_t size, const void *data); +void bt_format_net(char *buf, size_t size, const void *data); + int bt_assert_batch__(struct bt_batch *opts); int bt_is_char(byte c); From 067f69a56de0e0e61d423ec5aa68095aa28e3124 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Sat, 25 Sep 2021 16:00:30 +0200 Subject: [PATCH 009/196] Filter: Add prefix trie benchmarks Add trie tests intended as benchmarks that use external datasets instead of generated prefixes. As datasets are not included, they are commented out by default. --- filter/trie_test.c | 235 +++++++++++++++++++++++++++++++++++++++++++++ test/birdtest.c | 6 ++ test/birdtest.h | 1 + 3 files changed, 242 insertions(+) diff --git a/filter/trie_test.c b/filter/trie_test.c index 6418427e..3e8ce84d 100644 --- a/filter/trie_test.c +++ b/filter/trie_test.c @@ -16,7 +16,10 @@ #define TESTS_NUM 10 #define PREFIXES_NUM 32 #define PREFIX_TESTS_NUM 10000 +#define PREFIX_BENCH_NUM 100000000 +#define TRIE_BUFFER_SIZE 1024 +#define TEST_BUFFER_SIZE (1024*1024) #define BIG_BUFFER_SIZE 10000 /* Wrapping structure for storing f_prefixes structures in list */ @@ -266,6 +269,142 @@ make_trie_from_prefix_list(linpool *lp, list *prefixes) return trie; } +/* + * Read sequence of prefixes from file handle and return prefix list. + * Each prefix is on one line, sequence terminated by empty line or eof. + * Arg @plus means prefix should include all longer ones. 
+ */ +static list * +read_prefix_list(linpool *lp, FILE *f, int v6, int plus) +{ + ASSERT(!v6); + + uint a0, a1, a2, a3, pl; + char s[32]; + int n; + + list *pxlist = lp_allocz(lp, sizeof(struct f_prefix_node)); + init_list(pxlist); + + errno = 0; + while (fgets(s, 32, f)) + { + if (s[0] == '\n') + return pxlist; + + n = sscanf(s, "%u.%u.%u.%u/%u", &a0, &a1, &a2, &a3, &pl); + + if (n != 5) + bt_abort_msg("Invalid content of trie_data"); + + struct f_prefix_node *px = lp_allocz(lp, sizeof(struct f_prefix_node)); + net_fill_ip4(&px->prefix.net, ip4_build(a0, a1, a2, a3), pl); + px->prefix.lo = pl; + px->prefix.hi = plus ? IP4_MAX_PREFIX_LENGTH : pl; + add_tail(pxlist, &px->n); + + char buf[64]; + bt_format_net(buf, 64, &px->prefix.net); + bt_debug("ADD %s{%d,%d}\n", buf, px->prefix.lo, px->prefix.hi); + } + + bt_syscall(errno, "fgets()"); + return EMPTY_LIST(*pxlist) ? NULL : pxlist; +} + +/* + * Open file, read multiple sequences of prefixes from it. Fill @data with + * prefix lists and @trie with generated tries. Return number of sequences / + * tries. Use separate linpool @lp0 for prefix lists and @lp1 for tries. + * Arg @plus means prefix should include all longer ones. + */ +static int +read_prefix_file(const char *filename, int plus, + linpool *lp0, linpool *lp1, + list *data[], struct f_trie *trie[]) +{ + FILE *f = fopen(filename, "r"); + bt_syscall(!f, "fopen(%s)", filename); + + int n = 0; + list *pxlist; + while (pxlist = read_prefix_list(lp0, f, 0, plus)) + { + data[n] = pxlist; + trie[n] = make_trie_from_prefix_list(lp1, pxlist); + bt_debug("NEXT\n"); + n++; + } + + fclose(f); + bt_debug("DONE reading %d tries\n", n); + + return n; +} + +/* + * Select random subset of @dn prefixes from prefix list @src of length @sn, + * and store them to buffer @dst (of size @dn). Prefixes may be chosen multiple + * times. Randomize order of prefixes in @dst buffer. 
+ */ +static void +select_random_prefix_subset(list *src[], net_addr dst[], int sn, int dn) +{ + int pn = 0; + + if (!dn) + return; + + /* Compute total prefix number */ + for (int i = 0; i < sn; i++) + pn += list_length(src[i]); + + /* Chance of selecting a prefix */ + int rnd = (pn / dn) + 10; + int n = 0; + + /* Iterate indefinitely over src array */ + for (int i = 0; 1; i++, i = (i < sn) ? i : 0) + { + struct f_prefix_node *px; + WALK_LIST(px, *src[i]) + { + if (xrandom(rnd) != 0) + continue; + + net_copy(&dst[n], &px->prefix.net); + n++; + + /* We have enough */ + if (n == dn) + goto done; + } + } + +done: + /* Shuffle networks */ + for (int i = 0; i < dn; i++) + { + int j = xrandom(dn); + + if (i == j) + continue; + + net_addr tmp; + net_copy(&tmp, &dst[i]); + net_copy(&dst[i], &dst[j]); + net_copy(&dst[j], &tmp); + } +} + +/* Fill @dst buffer with @dn randomly generated /32 prefixes */ +static void +make_random_addresses(net_addr dst[], int dn) +{ + for (int i = 0; i < dn; i++) + net_fill_ip4(&dst[i], ip4_from_u32((u32) bt_random()), IP4_MAX_PREFIX_LENGTH); +} + static void test_match_net(list *prefixes, struct f_trie *trie, const net_addr *net) { @@ -371,6 +510,99 @@ t_match_outer_net(void) return 1; } +/* + * Read prefixes from @filename, build set of tries, prepare test data and do + * PREFIX_BENCH_NUM trie lookups. With @plus = 0, use random subset of known + * prefixes as test data, with @plus = 1, use randomly generated /32 prefixes + * as test data. 
+ */ +static int +benchmark_trie_dataset(const char *filename, int plus) +{ + int n = 0; + linpool *lp0 = lp_new_default(&root_pool); + linpool *lp1 = lp_new_default(&root_pool); + list *data[TRIE_BUFFER_SIZE]; + struct f_trie *trie[TRIE_BUFFER_SIZE]; + net_addr *nets; + + bt_reset_suite_case_timer(); + bt_log_suite_case_result(1, "Reading %s", filename, n); + n = read_prefix_file(filename, plus, lp0, lp1, data, trie); + bt_log_suite_case_result(1, "Read prefix data, %d lists, ", n); + + size_t trie_size = rmemsize(lp1) * 1000 / (1024*1024); + bt_log_suite_case_result(1, "Trie size %u.%03u MB", + (uint) (trie_size / 1000), (uint) (trie_size % 1000)); + + int t = PREFIX_BENCH_NUM / n; + int tb = MIN(t, TEST_BUFFER_SIZE); + nets = lp_alloc(lp0, tb * sizeof(net_addr)); + + if (!plus) + select_random_prefix_subset(data, nets, n, tb); + else + make_random_addresses(nets, tb); + + bt_log_suite_case_result(1, "Make test data, %d (%d) tests", t, tb); + bt_reset_suite_case_timer(); + + /* + int match = 0; + for (int i = 0; i < t; i++) + for (int j = 0; j < n; j++) + test_match_net(data[j], trie[j], &nets[i]); + */ + + int match = 0; + for (int i = 0; i < t; i++) + for (int j = 0; j < n; j++) + if (trie_match_net(trie[j], &nets[i % TEST_BUFFER_SIZE])) + match++; + + bt_log_suite_case_result(1, "Matching done, %d / %d matches", match, t * n); + + rfree(lp0); + rfree(lp1); + + return 1; +} + +static int UNUSED +t_bench_trie_datasets_subset(void) +{ + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); + + /* Specific datasets, not included */ + benchmark_trie_dataset("trie-data-bgp-1", 0); + benchmark_trie_dataset("trie-data-bgp-10", 0); + benchmark_trie_dataset("trie-data-bgp-100", 0); + benchmark_trie_dataset("trie-data-bgp-1000", 0); + + bt_bird_cleanup(); + + return 1; +} + +static int UNUSED +t_bench_trie_datasets_random(void) +{ + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); + + /* Specific datasets, not included */ + benchmark_trie_dataset("trie-data-bgp-1", 
1); + benchmark_trie_dataset("trie-data-bgp-10", 1); + benchmark_trie_dataset("trie-data-bgp-100", 1); + benchmark_trie_dataset("trie-data-bgp-1000", 1); + + bt_bird_cleanup(); + + return 1; +} + + static int t_trie_same(void) { @@ -411,5 +643,8 @@ main(int argc, char *argv[]) bt_test_suite(t_match_outer_net, "Testing random outer prefix matching"); bt_test_suite(t_trie_same, "A trie filled forward should be same with a trie filled backward."); + // bt_test_suite(t_bench_trie_datasets_subset, "Benchmark tries from datasets by random subset of nets"); + // bt_test_suite(t_bench_trie_datasets_random, "Benchmark tries from datasets by generated addresses"); + return bt_exit_value(); } diff --git a/test/birdtest.c b/test/birdtest.c index d739e78b..053954e1 100644 --- a/test/birdtest.c +++ b/test/birdtest.c @@ -309,6 +309,12 @@ bt_log_suite_case_result(int result, const char *fmt, ...) } } +void +bt_reset_suite_case_timer(void) +{ + clock_gettime(CLOCK_MONOTONIC, &bt_suite_case_begin); +} + int bt_test_suite_base(int (*fn)(const void *), const char *id, const void *fn_arg, int forked, int timeout, const char *dsc, ...) { diff --git a/test/birdtest.h b/test/birdtest.h index 7a0c2fc4..ad5f8f9c 100644 --- a/test/birdtest.h +++ b/test/birdtest.h @@ -32,6 +32,7 @@ extern const char *bt_test_id; void bt_init(int argc, char *argv[]); int bt_exit_value(void); +void bt_reset_suite_case_timer(void); int bt_test_suite_base(int (*test_fn)(const void *), const char *test_id, const void *test_fn_argument, int forked, int timeout, const char *dsc, ...); static inline u64 bt_random(void) { return ((u64) random() & 0xffffffff) | ((u64) random() << 32); } From 9f24fef5e91fb4df301242ede91ee7ac1b46b8a8 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Wed, 20 Oct 2021 01:51:28 +0200 Subject: [PATCH 010/196] Conf: Fix crash during shutdown BIRD implements shutdown by reconfiguring to fake empty configuration. 
Such fake config structure is created from the last running config and shares some data, including symbol table. This allows access to (removed) routing tables and causes crash when 'show route' command is used during shutdown. Clean up symbol table, table list and links to default tables, so removed routing tables cannot be accessed during shutdown. --- conf/conf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conf/conf.c b/conf/conf.c index 58abcde1..a2b01667 100644 --- a/conf/conf.c +++ b/conf/conf.c @@ -520,6 +520,9 @@ order_shutdown(int gr) memcpy(c, config, sizeof(struct config)); init_list(&c->protos); init_list(&c->tables); + init_list(&c->symbols); + memset(c->def_tables, 0, sizeof(c->def_tables)); + HASH_INIT(c->sym_hash, c->pool, 4); c->shutdown = 1; c->gr_down = gr; From 71c18d9f53ec0ea5eb512fdb6510d0c3350f96b4 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Sat, 13 Nov 2021 21:11:18 +0100 Subject: [PATCH 011/196] Trie: Simplify network matching code Introduce ipX_prefix_equal() and use it to simplify network matching code. 
--- filter/trie.c | 22 +++++------------ lib/ip.h | 18 ++++++++++++++ lib/ip_test.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 16 deletions(-) diff --git a/filter/trie.c b/filter/trie.c index dbed5ace..5d9cc952 100644 --- a/filter/trie.c +++ b/filter/trie.c @@ -424,9 +424,6 @@ trie_add_prefix(struct f_trie *t, const net_addr *net, uint l, uint h) static int trie_match_net4(const struct f_trie *t, ip4_addr px, uint plen) { - ip4_addr pmask = ip4_mkmask(plen); - ip4_addr paddr = ip4_and(px, pmask); - if (plen == 0) return t->zero; @@ -437,10 +434,8 @@ trie_match_net4(const struct f_trie *t, ip4_addr px, uint plen) while (n) { - ip4_addr cmask = ip4_and(n->mask, pmask); - /* We are out of path */ - if (ip4_compare(ip4_and(paddr, cmask), ip4_and(n->addr, cmask))) + if (!ip4_prefix_equal(px, n->addr, MIN(plen, n->plen))) return 0; /* Check local mask */ @@ -452,11 +447,11 @@ trie_match_net4(const struct f_trie *t, ip4_addr px, uint plen) return 1; /* We finished trie walk and still no match */ - if (plen <= n->plen) + if (nlen <= n->plen) return 0; /* Choose children */ - n = n->c[ip4_getbits(paddr, n->plen, TRIE_STEP)]; + n = n->c[ip4_getbits(px, n->plen, TRIE_STEP)]; } return 0; @@ -465,9 +460,6 @@ trie_match_net4(const struct f_trie *t, ip4_addr px, uint plen) static int trie_match_net6(const struct f_trie *t, ip6_addr px, uint plen) { - ip6_addr pmask = ip6_mkmask(plen); - ip6_addr paddr = ip6_and(px, pmask); - if (plen == 0) return t->zero; @@ -478,10 +470,8 @@ trie_match_net6(const struct f_trie *t, ip6_addr px, uint plen) while (n) { - ip6_addr cmask = ip6_and(n->mask, pmask); - /* We are out of path */ - if (ip6_compare(ip6_and(paddr, cmask), ip6_and(n->addr, cmask))) + if (!ip6_prefix_equal(px, n->addr, MIN(plen, n->plen))) return 0; /* Check local mask */ @@ -493,11 +483,11 @@ trie_match_net6(const struct f_trie *t, ip6_addr px, uint plen) return 1; /* We finished trie walk and still no match */ - if (plen <= 
n->plen) + if (nlen <= n->plen) return 0; /* Choose children */ - n = n->c[ip6_getbits(paddr, n->plen, TRIE_STEP)]; + n = n->c[ip6_getbits(px, n->plen, TRIE_STEP)]; } return 0; diff --git a/lib/ip.h b/lib/ip.h index cc36ce64..9eef2e16 100644 --- a/lib/ip.h +++ b/lib/ip.h @@ -279,6 +279,24 @@ static inline uint ip6_pxlen(ip6_addr a, ip6_addr b) return 32 * i + 31 - u32_log2(a.addr[i] ^ b.addr[i]); } +static inline int ip4_prefix_equal(ip4_addr a, ip4_addr b, uint n) +{ + return (_I(a) ^ _I(b)) < ((u64) 1 << (32 - n)); +} + +static inline int ip6_prefix_equal(ip6_addr a, ip6_addr b, uint n) +{ + uint n0 = n / 32; + uint n1 = n % 32; + + return + ((n0 <= 0) || (_I0(a) == _I0(b))) && + ((n0 <= 1) || (_I1(a) == _I1(b))) && + ((n0 <= 2) || (_I2(a) == _I2(b))) && + ((n0 <= 3) || (_I3(a) == _I3(b))) && + (!n1 || ((a.addr[n0] ^ b.addr[n0]) < (1u << (32 - n1)))); +} + static inline u32 ip4_getbit(ip4_addr a, uint pos) { return (_I(a) >> (31 - pos)) & 1; } diff --git a/lib/ip_test.c b/lib/ip_test.c index 36d10d68..eee0a427 100644 --- a/lib/ip_test.c +++ b/lib/ip_test.c @@ -167,6 +167,70 @@ t_ip6_ntop(void) return bt_assert_batch(test_vectors, test_ipa_ntop, bt_fmt_ipa, bt_fmt_str); } +static int +t_ip4_prefix_equal(void) +{ + bt_assert( ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x1234ffff), 16)); + bt_assert(!ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x1234ffff), 17)); + bt_assert( ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345000), 21)); + bt_assert(!ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345000), 22)); + + bt_assert( ip4_prefix_equal(ip4_from_u32(0x00000000), ip4_from_u32(0xffffffff), 0)); + bt_assert( ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345678), 0)); + + bt_assert( ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345678), 32)); + bt_assert(!ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345679), 32)); + bt_assert(!ip4_prefix_equal(ip4_from_u32(0x12345678), 
ip4_from_u32(0x92345678), 32)); + + return 1; +} + +static int +t_ip6_prefix_equal(void) +{ + bt_assert( ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x1234ffff, 0xfefefefe, 0xdcdcdcdc), + 48)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x1234ffff, 0xfefefefe, 0xdcdcdcdc), + 49)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20020db8, 0x12345678, 0xfefefefe, 0xdcdcdcdc), + 48)); + + bt_assert( ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x12345678, 0xfefefefe, 0xdcdcdcdc), + 64)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x1234567e, 0xfefefefe, 0xdcdcdcdc), + 64)); + + bt_assert( ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20002020), + ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + 106)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20002020), + ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + 107)); + + bt_assert( ip6_prefix_equal(ip6_build(0xfeef0db8, 0x87654321, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x12345678, 0xfefefefe, 0xdcdcdcdc), + 0)); + + bt_assert( ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + 128)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202021), + 128)); + + return 1; +} + int main(int argc, char *argv[]) { @@ -176,6 +240,8 @@ main(int argc, char *argv[]) bt_test_suite(t_ip6_pton, "Converting IPv6 string to ip6_addr struct"); bt_test_suite(t_ip4_ntop, "Converting ip4_addr struct to IPv4 string"); bt_test_suite(t_ip6_ntop, "Converting ip6_addr struct to 
IPv6 string"); + bt_test_suite(t_ip4_prefix_equal, "Testing ip4_prefix_equal()"); + bt_test_suite(t_ip6_prefix_equal, "Testing ip6_prefix_equal()"); return bt_exit_value(); } From 062e69bf520e5788913bdd564076ad9892b24a87 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Fri, 19 Nov 2021 18:04:32 +0100 Subject: [PATCH 012/196] Trie: Implement trie walking code Trie walking allows enumeration of prefixes in a trie in the usual lexicographic order. Optionally, trie enumeration can be restricted to a chosen subnet (and its descendants). --- filter/data.h | 23 ++++- filter/trie.c | 245 ++++++++++++++++++++++++++++++++++++++++++++- filter/trie_test.c | 158 +++++++++++++++++++++++++++-- 3 files changed, 413 insertions(+), 13 deletions(-) diff --git a/filter/data.h b/filter/data.h index 21967deb..4a0ee865 100644 --- a/filter/data.h +++ b/filter/data.h @@ -140,7 +140,8 @@ struct f_tree { void *data; }; -#define TRIE_STEP 4 +#define TRIE_STEP 4 +#define TRIE_STACK_LENGTH 33 struct f_trie_node4 { @@ -175,6 +176,16 @@ struct f_trie struct f_trie_node root; /* Root trie node */ }; +struct f_trie_walk_state +{ + u8 ipv4; + u8 accept_length; /* Current inter-node prefix position */ + u8 start_pos; /* Initial prefix position in stack[0] */ + u8 local_pos; /* Current intra-node prefix position */ + u8 stack_pos; /* Current node in stack below */ + const struct f_trie_node *stack[TRIE_STACK_LENGTH]; +}; + struct f_tree *f_new_tree(void); struct f_tree *build_tree(struct f_tree *); const struct f_tree *find_tree(const struct f_tree *t, const struct f_val *val); @@ -185,9 +196,19 @@ void tree_walk(const struct f_tree *t, void (*hook)(const struct f_tree *, void struct f_trie *f_new_trie(linpool *lp, uint data_size); void *trie_add_prefix(struct f_trie *t, const net_addr *n, uint l, uint h); int trie_match_net(const struct f_trie *t, const net_addr *n); +void trie_walk_init(struct f_trie_walk_state *s, const struct f_trie *t, const net_addr *from); +int trie_walk_next(struct 
f_trie_walk_state *s, net_addr *net); int trie_same(const struct f_trie *t1, const struct f_trie *t2); void trie_format(const struct f_trie *t, buffer *buf); +#define TRIE_WALK(trie, net, from) ({ \ + net_addr net; \ + struct f_trie_walk_state tws_; \ + trie_walk_init(&tws_, trie, from); \ + while (trie_walk_next(&tws_, &net)) + +#define TRIE_WALK_END }) + #define F_CMP_ERROR 999 const char *f_type_name(enum f_type t); diff --git a/filter/trie.c b/filter/trie.c index 5d9cc952..21b5b5d7 100644 --- a/filter/trie.c +++ b/filter/trie.c @@ -1,8 +1,8 @@ /* * Filters: Trie for prefix sets * - * (c) 2009--2020 Ondrej Zajicek - * (c) 2009--2020 CZ.NIC z.s.p.o. + * (c) 2009--2021 Ondrej Zajicek + * (c) 2009--2021 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -82,6 +82,24 @@ * - we are still on path and keep walking (node length < &plen) * * The walking code in trie_match_net() is structured according to these cases. + * + * Iteration over prefixes in a trie can be done using TRIE_WALK() macro, or + * directly using trie_walk_init() and trie_walk_next() functions. The second + * approeach allows suspending the iteration and continuing in it later. + * Prefixes are enumerated in the usual lexicographic order and may be + * restricted to a subset of the trie (all subnets of a specified prefix). + * + * Note that the trie walk does not reliably enumerate `implicit' prefixes + * defined by &low and &high fields in prefix patterns, it is supposed to be + * used on tries constructed from `explicit' prefixes (&low == &plen == &high + * in call to trie_add_prefix()). 
+ * + * The trie walk has three basic state variables stored in the struct + * &f_trie_walk_state -- the current node in &stack[stack_pos], &accept_length + * for iteration over inter-node prefixes (non-branching prefixes on compressed + * path between the current node and its parent node, stored in the bitmap + * &accept of the current node) and &local_pos for iteration over intra-node + * prefixes (stored in the bitmap &local). */ #include "nest/bird.h" @@ -224,7 +242,7 @@ trie_amask_to_local(ip_addr px, ip_addr amask, uint nlen) #define ADD_LOCAL(N,X,V) ({ uint v_ = (V); if (X) (N)->v4.local |= v_; else (N)->v6.local |= v_; }) -#define GET_CHILD(N,F,X,I) ((X) ? (struct f_trie_node *) (N)->v4.c[I] : (struct f_trie_node *) (N)->v6.c[I]) +#define GET_CHILD(N,X,I) ((X) ? (struct f_trie_node *) (N)->v4.c[I] : (struct f_trie_node *) (N)->v6.c[I]) static void * @@ -312,7 +330,7 @@ trie_add_node(struct f_trie *t, uint plen, ip_addr px, uint local, uint l, uint /* n->plen < plen and plen <= 32 (128) */ o = n; - n = GET_CHILD(n, c, v4, ipa_getbits(paddr, nlen, TRIE_STEP)); + n = GET_CHILD(n, v4, ipa_getbits(paddr, nlen, TRIE_STEP)); } /* We add new tail node 'a' after node 'o' */ @@ -522,6 +540,225 @@ trie_match_net(const struct f_trie *t, const net_addr *n) } } + +#define SAME_PREFIX(A,B,X,L) ((X) ? ip4_prefix_equal((A)->v4.addr, net4_prefix(B), (L)) : ip6_prefix_equal((A)->v6.addr, net6_prefix(B), (L))) +#define GET_NET_BITS(N,X,A,B) ((X) ? ip4_getbits(net4_prefix(N), (A), (B)) : ip6_getbits(net6_prefix(N), (A), (B))) + +/** + * trie_walk_init + * @s: walk state + * @t: trie + * @net: optional subnet for walk + * + * Initialize walk state for subsequent walk through nodes of the trie @t by + * trie_walk_next(). The argument @net allows to restrict walk to given subnet, + * otherwise full walk over all nodes is used. This is done by finding node at + * or below @net and starting position in it. 
+ */ +void +trie_walk_init(struct f_trie_walk_state *s, const struct f_trie *t, const net_addr *net) +{ + *s = (struct f_trie_walk_state) { + .ipv4 = t->ipv4, + .accept_length = 0, + .start_pos = 1, + .local_pos = 1, + .stack_pos = 0, + .stack[0] = &t->root + }; + + if (!net) + return; + + /* We want to find node of level at least plen */ + int plen = ROUND_DOWN_POW2(net->pxlen, TRIE_STEP); + const struct f_trie_node *n = &t->root; + const int v4 = t->ipv4; + + while (n) + { + int nlen = v4 ? n->v4.plen : n->v6.plen; + + /* We are out of path */ + if (!SAME_PREFIX(n, net, v4, MIN(net->pxlen, nlen))) + break; + + /* We found final node */ + if (nlen >= plen) + { + if (nlen == plen) + { + /* Find proper local_pos, while accept_length is not used */ + int step = net->pxlen - plen; + s->start_pos = s->local_pos = (1u << step) + GET_NET_BITS(net, v4, plen, step); + s->accept_length = plen; + } + else + { + /* Start from pos 1 in local node, but first try accept mask */ + s->accept_length = net->pxlen; + } + + s->stack[0] = n; + return; + } + + /* Choose child */ + n = GET_CHILD(n, v4, GET_NET_BITS(net, v4, nlen, TRIE_STEP)); + } + + s->stack[0] = NULL; + return; +} + +#define GET_ACCEPT_BIT(N,X,B) ((X) ? ip4_getbit((N)->v4.accept, (B)) : ip6_getbit((N)->v6.accept, (B))) +#define GET_LOCAL_BIT(N,X,B) (((X) ? (N)->v4.local : (N)->v6.local) & (1u << (B))) + +/** + * trie_walk_next + * @s: walk state + * @net: return value + * + * Find the next prefix in the trie walk and return it in the buffer @net. + * Prefixes are walked in the usual lexicographic order and may be restricted + * to a subset of the trie during walk setup by trie_walk_init(). Note that the + * trie walk does not iterate reliably over 'implicit' prefixes defined by &low + * and &high fields in prefix patterns, it is supposed to be used on tries + * constructed from 'explicit' prefixes (&low == &plen == &high in call to + * trie_add_prefix()). 
+ * + * Result: 1 if the next prefix was found, 0 for the end of walk. + */ +int +trie_walk_next(struct f_trie_walk_state *s, net_addr *net) +{ + const struct f_trie_node *n = s->stack[s->stack_pos]; + int len = s->accept_length; + int pos = s->local_pos; + int v4 = s->ipv4; + + /* + * The walk has three basic state variables -- n, len and pos. In each node n, + * we first walk superprefixes (by len in &accept bitmask), and then we walk + * internal positions (by pos in &local bitmask). These positions are: + * + * 1 + * 2 3 + * 4 5 6 7 + * 8 9 A B C D E F + * + * We walk them depth-first, including virtual positions 10-1F that are + * equivalent of position 1 in child nodes 0-F. + */ + + if (!n) + { + memset(net, 0, v4 ? sizeof(net_addr_ip4) : sizeof(net_addr_ip6)); + return 0; + } + +next_node:; + /* Current node prefix length */ + int nlen = v4 ? n->v4.plen : n->v6.plen; + + /* First, check for accept prefix */ + for (; len < nlen; len++) + if (GET_ACCEPT_BIT(n, v4, len - 1)) + { + if (v4) + net_fill_ip4(net, ip4_and(n->v4.addr, ip4_mkmask(len)), len); + else + net_fill_ip6(net, ip6_and(n->v6.addr, ip6_mkmask(len)), len); + + s->local_pos = pos; + s->accept_length = len + 1; + return 1; + } + +next_pos: + /* Bottom of this node */ + if (pos >= (1 << TRIE_STEP)) + { + const struct f_trie_node *child = GET_CHILD(n, v4, pos - (1 << TRIE_STEP)); + int dir = 0; + + /* No child node */ + if (!child) + { + /* Step up until return from left child (pos is even) */ + do + { + /* Step up from start node */ + if ((s->stack_pos == 0) && (pos == s->start_pos)) + { + s->stack[0] = NULL; + memset(net, 0, v4 ? sizeof(net_addr_ip4) : sizeof(net_addr_ip6)); + return 0; + } + + /* Top of this node */ + if (pos == 1) + { + ASSERT(s->stack_pos); + const struct f_trie_node *old = n; + + /* Move to parent node */ + s->stack_pos--; + n = s->stack[s->stack_pos]; + nlen = v4 ? n->v4.plen : n->v6.plen; + + pos = v4 ? 
+ ip4_getbits(old->v4.addr, nlen, TRIE_STEP) : + ip6_getbits(old->v6.addr, nlen, TRIE_STEP); + pos += (1 << TRIE_STEP); + len = nlen; + + ASSERT(GET_CHILD(n, v4, pos - (1 << TRIE_STEP)) == old); + } + + /* Step up */ + dir = pos % 2; + pos = pos / 2; + } + while (dir); + + /* Continue with step down to the right child */ + pos = 2 * pos + 1; + goto next_pos; + } + + /* Move to child node */ + pos = 1; + len = nlen + TRIE_STEP; + + s->stack_pos++; + n = s->stack[s->stack_pos] = child; + goto next_node; + } + + /* Check for local prefix */ + if (GET_LOCAL_BIT(n, v4, pos)) + { + /* Convert pos to address of local network */ + int x = (pos >= 2) + (pos >= 4) + (pos >= 8); + int y = pos & ((1u << x) - 1); + + if (v4) + net_fill_ip4(net, !x ? n->v4.addr : ip4_setbits(n->v4.addr, nlen + x - 1, y), nlen + x); + else + net_fill_ip6(net, !x ? n->v6.addr : ip6_setbits(n->v6.addr, nlen + x - 1, y), nlen + x); + + s->local_pos = 2 * pos; + s->accept_length = len; + return 1; + } + + /* Step down */ + pos = 2 * pos; + goto next_pos; +} + + static int trie_node_same4(const struct f_trie_node4 *t1, const struct f_trie_node4 *t2) { diff --git a/filter/trie_test.c b/filter/trie_test.c index 3e8ce84d..bb9a2f26 100644 --- a/filter/trie_test.c +++ b/filter/trie_test.c @@ -45,6 +45,13 @@ get_exp_random(void) return n; } +static int +compare_prefixes(const void *a, const void *b) +{ + return net_compare(&((const struct f_prefix *) a)->net, + &((const struct f_prefix *) b)->net); +} + static inline int matching_ip4_nets(const net_addr_ip4 *a, const net_addr_ip4 *b) { @@ -106,11 +113,15 @@ get_random_net(net_addr *net, int v6) } static void -get_random_prefix(struct f_prefix *px, int v6) +get_random_prefix(struct f_prefix *px, int v6, int tight) { get_random_net(&px->net, v6); - if (bt_random() % 2) + if (tight) + { + px->lo = px->hi = px->net.pxlen; + } + else if (bt_random() % 2) { px->lo = 0; px->hi = px->net.pxlen; @@ -238,7 +249,7 @@ get_outer_net(net_addr *net, const struct f_prefix 
*src) } static list * -make_random_prefix_list(linpool *lp, int num, int v6) +make_random_prefix_list(linpool *lp, int num, int v6, int tight) { list *prefixes = lp_allocz(lp, sizeof(struct f_prefix_node)); init_list(prefixes); @@ -246,7 +257,7 @@ make_random_prefix_list(linpool *lp, int num, int v6) for (int i = 0; i < num; i++) { struct f_prefix_node *px = lp_allocz(lp, sizeof(struct f_prefix_node)); - get_random_prefix(&px->prefix, v6); + get_random_prefix(&px->prefix, v6, tight); add_tail(prefixes, &px->n); char buf[64]; @@ -429,7 +440,7 @@ t_match_random_net(void) linpool *lp = lp_new_default(&root_pool); for (int round = 0; round < TESTS_NUM; round++) { - list *prefixes = make_random_prefix_list(lp, PREFIXES_NUM, v6); + list *prefixes = make_random_prefix_list(lp, PREFIXES_NUM, v6, 0); struct f_trie *trie = make_trie_from_prefix_list(lp, prefixes); for (int i = 0; i < PREFIX_TESTS_NUM; i++) @@ -457,7 +468,7 @@ t_match_inner_net(void) linpool *lp = lp_new_default(&root_pool); for (int round = 0; round < TESTS_NUM; round++) { - list *prefixes = make_random_prefix_list(lp, PREFIXES_NUM, v6); + list *prefixes = make_random_prefix_list(lp, PREFIXES_NUM, v6, 0); struct f_trie *trie = make_trie_from_prefix_list(lp, prefixes); struct f_prefix_node *n = HEAD(*prefixes); @@ -488,7 +499,7 @@ t_match_outer_net(void) linpool *lp = lp_new_default(&root_pool); for (int round = 0; round < TESTS_NUM; round++) { - list *prefixes = make_random_prefix_list(lp, PREFIXES_NUM, v6); + list *prefixes = make_random_prefix_list(lp, PREFIXES_NUM, v6, 0); struct f_trie *trie = make_trie_from_prefix_list(lp, prefixes); struct f_prefix_node *n = HEAD(*prefixes); @@ -613,7 +624,7 @@ t_trie_same(void) linpool *lp = lp_new_default(&root_pool); for (int round = 0; round < TESTS_NUM*4; round++) { - list *prefixes = make_random_prefix_list(lp, 100 * PREFIXES_NUM, v6); + list *prefixes = make_random_prefix_list(lp, 100 * PREFIXES_NUM, v6, 0); struct f_trie *trie1 = f_new_trie(lp, 0); struct 
f_trie *trie2 = f_new_trie(lp, 0); @@ -630,6 +641,136 @@ t_trie_same(void) lp_flush(lp); } + bt_bird_cleanup(); + return 1; +} + +static inline void +log_networks(const net_addr *a, const net_addr *b) +{ + if (bt_verbose >= BT_VERBOSE_ABSOLUTELY_ALL) + { + char buf0[64]; + char buf1[64]; + bt_format_net(buf0, 64, a); + bt_format_net(buf1, 64, b); + bt_debug("Found %s expected %s\n", buf0, buf1); + } +} + +static int +t_trie_walk(void) +{ + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); + + linpool *lp = lp_new_default(&root_pool); + for (int round = 0; round < TESTS_NUM*8; round++) + { + int level = round / TESTS_NUM; + int v6 = level % 2; + int num = PREFIXES_NUM * (int[]){1, 10, 100, 1000}[level / 2]; + int pos = 0, end = 0; + list *prefixes = make_random_prefix_list(lp, num, v6, 1); + struct f_trie *trie = make_trie_from_prefix_list(lp, prefixes); + struct f_prefix *pxset = malloc((num + 1) * sizeof(struct f_prefix)); + + struct f_prefix_node *n; + WALK_LIST(n, *prefixes) + pxset[pos++] = n->prefix; + memset(&pxset[pos], 0, sizeof (struct f_prefix)); + + qsort(pxset, num, sizeof(struct f_prefix), compare_prefixes); + + + /* Full walk */ + bt_debug("Full walk (round %d, %d nets)\n", round, num); + + pos = 0; + TRIE_WALK(trie, net, NULL) + { + log_networks(&net, &pxset[pos].net); + bt_assert(net_equal(&net, &pxset[pos].net)); + + /* Skip possible duplicates */ + while (net_equal(&pxset[pos].net, &pxset[pos + 1].net)) + pos++; + + pos++; + } + TRIE_WALK_END; + + bt_assert(pos == num); + bt_debug("Full walk done\n"); + + + /* Prepare net for subnet walk - start with random prefix */ + pos = bt_random() % num; + end = pos + (int[]){2, 2, 3, 4}[level / 2]; + end = MIN(end, num); + + struct f_prefix from = pxset[pos]; + + /* Find a common superprefix to several subsequent prefixes */ + for (; pos < end; pos++) + { + if (net_equal(&from.net, &pxset[pos].net)) + continue; + + int common = !v6 ? 
+ ip4_pxlen(net4_prefix(&from.net), net4_prefix(&pxset[pos].net)) : + ip6_pxlen(net6_prefix(&from.net), net6_prefix(&pxset[pos].net)); + from.net.pxlen = MIN(from.net.pxlen, common); + + if (!v6) + ((net_addr_ip4 *) &from.net)->prefix = + ip4_and(net4_prefix(&from.net), net4_prefix(&pxset[pos].net)); + else + ((net_addr_ip6 *) &from.net)->prefix = + ip6_and(net6_prefix(&from.net), net6_prefix(&pxset[pos].net)); + } + + /* Fix irrelevant bits */ + if (!v6) + ((net_addr_ip4 *) &from.net)->prefix = + ip4_and(net4_prefix(&from.net), ip4_mkmask(net4_pxlen(&from.net))); + else + ((net_addr_ip6 *) &from.net)->prefix = + ip6_and(net6_prefix(&from.net), ip6_mkmask(net6_pxlen(&from.net))); + + + /* Find initial position for final prefix */ + for (pos = 0; pos < num; pos++) + if (compare_prefixes(&pxset[pos], &from) >= 0) + break; + + int p0 = pos; + char buf0[64]; + bt_format_net(buf0, 64, &from.net); + bt_debug("Subnet walk for %s (round %d, %d nets)\n", buf0, round, num); + + /* Subnet walk */ + TRIE_WALK(trie, net, &from.net) + { + log_networks(&net, &pxset[pos].net); + bt_assert(net_equal(&net, &pxset[pos].net)); + bt_assert(net_in_netX(&net, &from.net)); + + /* Skip possible duplicates */ + while (net_equal(&pxset[pos].net, &pxset[pos + 1].net)) + pos++; + + pos++; + } + TRIE_WALK_END; + + bt_assert((pos == num) || !net_in_netX(&pxset[pos].net, &from.net)); + bt_debug("Subnet walk done for %s (found %d nets)\n", buf0, pos - p0); + + lp_flush(lp); + } + + bt_bird_cleanup(); return 1; } @@ -642,6 +783,7 @@ main(int argc, char *argv[]) bt_test_suite(t_match_inner_net, "Testing random inner prefix matching"); bt_test_suite(t_match_outer_net, "Testing random outer prefix matching"); bt_test_suite(t_trie_same, "A trie filled forward should be same with a trie filled backward."); + bt_test_suite(t_trie_walk, "Testing TRIE_WALK() on random tries"); // bt_test_suite(t_bench_trie_datasets_subset, "Benchmark tries from datasets by random subset of nets"); // 
bt_test_suite(t_bench_trie_datasets_random, "Benchmark tries from datasets by generated addresses"); From 644e9ca94e2d10ba0c2de45f94523da2414328e3 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Wed, 24 Nov 2021 17:30:13 +0100 Subject: [PATCH 013/196] Directly mapped pages are kept for future use if temporarily not needed --- lib/resource.h | 1 + nest/cmds.c | 7 +++++- sysdep/unix/alloc.c | 53 +++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/lib/resource.h b/lib/resource.h index e65455c8..76e3745f 100644 --- a/lib/resource.h +++ b/lib/resource.h @@ -98,6 +98,7 @@ void buffer_realloc(void **buf, unsigned *size, unsigned need, unsigned item_siz u64 get_page_size(void); void *alloc_page(void); void free_page(void *); +extern uint pages_kept; #ifdef HAVE_LIBDMALLOC /* diff --git a/nest/cmds.c b/nest/cmds.c index 18f39eb5..f58923a7 100644 --- a/nest/cmds.c +++ b/nest/cmds.c @@ -91,7 +91,12 @@ cmd_show_memory(void) print_size("Routing tables:", rmemsize(rt_table_pool)); print_size("Route attributes:", rmemsize(rta_pool)); print_size("Protocols:", rmemsize(proto_pool)); - print_size("Total:", rmemsize(&root_pool)); + size_t total = rmemsize(&root_pool); +#ifdef HAVE_MMAP + print_size("Standby memory:", get_page_size() * pages_kept); + total += get_page_size() * pages_kept; +#endif + print_size("Total:", total); cli_msg(0, ""); } diff --git a/sysdep/unix/alloc.c b/sysdep/unix/alloc.c index c525f713..5dd70c99 100644 --- a/sysdep/unix/alloc.c +++ b/sysdep/unix/alloc.c @@ -8,6 +8,8 @@ #include "nest/bird.h" #include "lib/resource.h" +#include "lib/lists.h" +#include "lib/event.h" #include #include @@ -17,8 +19,17 @@ #endif #ifdef HAVE_MMAP +#define KEEP_PAGES 512 + static u64 page_size = 0; static _Bool use_fake = 0; + +uint pages_kept = 0; +static list pages_list; + +static void cleanup_pages(void *data); +static event page_cleanup_event = { .hook = cleanup_pages }; + #else static const u64 page_size = 4096; /* 
Fake page size */ #endif @@ -48,6 +59,15 @@ void * alloc_page(void) { #ifdef HAVE_MMAP + if (pages_kept) + { + node *page = TAIL(pages_list); + rem_node(page); + pages_kept--; + memset(page, 0, get_page_size()); + return page; + } + if (!use_fake) { void *ret = mmap(NULL, get_page_size(), PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); @@ -71,10 +91,39 @@ free_page(void *ptr) #ifdef HAVE_MMAP if (!use_fake) { - if (munmap(ptr, get_page_size()) < 0) - bug("munmap(%p) failed: %m", ptr); + if (!pages_kept) + init_list(&pages_list); + + memset(ptr, 0, sizeof(node)); + add_tail(&pages_list, ptr); + + if (++pages_kept > KEEP_PAGES) + ev_schedule(&page_cleanup_event); } else #endif free(ptr); } + +#ifdef HAVE_MMAP +static void +cleanup_pages(void *data UNUSED) +{ + for (uint seen = 0; (pages_kept > KEEP_PAGES) && (seen < KEEP_PAGES); seen++) + { + void *ptr = HEAD(pages_list); + rem_node(ptr); + if (munmap(ptr, get_page_size()) == 0) + pages_kept--; +#ifdef ENOMEM + else if (errno == ENOMEM) + add_tail(&pages_list, ptr); +#endif + else + bug("munmap(%p) failed: %m", ptr); + } + + if (pages_kept > KEEP_PAGES) + ev_schedule(&page_cleanup_event); +} +#endif From 14fc24f3a53ebc5525b854ccdc93274aa74a400f Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Fri, 26 Nov 2021 03:26:36 +0100 Subject: [PATCH 014/196] Trie: Implement longest-prefix-match queries and walks The prefix trie now supports longest-prefix-match query by function trie_match_longest_ipX() and it can be extended to iteration over all covering prefixes for a given prefix (from longest to shortest) using TRIE_WALK_TO_ROOT_IPx() macro. 
--- filter/data.h | 51 ++++++++++++ filter/trie.c | 190 ++++++++++++++++++++++++++++++++++++++++++++- filter/trie_test.c | 115 +++++++++++++++++++++++++++ test/birdtest.c | 5 +- 4 files changed, 359 insertions(+), 2 deletions(-) diff --git a/filter/data.h b/filter/data.h index 4a0ee865..28c7a888 100644 --- a/filter/data.h +++ b/filter/data.h @@ -196,11 +196,61 @@ void tree_walk(const struct f_tree *t, void (*hook)(const struct f_tree *, void struct f_trie *f_new_trie(linpool *lp, uint data_size); void *trie_add_prefix(struct f_trie *t, const net_addr *n, uint l, uint h); int trie_match_net(const struct f_trie *t, const net_addr *n); +int trie_match_longest_ip4(const struct f_trie *t, const net_addr_ip4 *net, net_addr_ip4 *dst, ip4_addr *found0); +int trie_match_longest_ip6(const struct f_trie *t, const net_addr_ip6 *net, net_addr_ip6 *dst, ip6_addr *found0); void trie_walk_init(struct f_trie_walk_state *s, const struct f_trie *t, const net_addr *from); int trie_walk_next(struct f_trie_walk_state *s, net_addr *net); int trie_same(const struct f_trie *t1, const struct f_trie *t2); void trie_format(const struct f_trie *t, buffer *buf); +static inline int +trie_match_next_longest_ip4(net_addr_ip4 *n, ip4_addr *found) +{ + while (n->pxlen) + { + n->pxlen--; + ip4_clrbit(&n->prefix, n->pxlen); + + if (ip4_getbit(*found, n->pxlen)) + return 1; + } + + return 0; +} + +static inline int +trie_match_next_longest_ip6(net_addr_ip6 *n, ip6_addr *found) +{ + while (n->pxlen) + { + n->pxlen--; + ip6_clrbit(&n->prefix, n->pxlen); + + if (ip6_getbit(*found, n->pxlen)) + return 1; + } + + return 0; +} + + +#define TRIE_WALK_TO_ROOT_IP4(trie, net, dst) ({ \ + net_addr_ip4 dst; \ + ip4_addr _found; \ + for (int _n = trie_match_longest_ip4(trie, net, &dst, &_found); \ + _n; \ + _n = trie_match_next_longest_ip4(&dst, &_found)) + +#define TRIE_WALK_TO_ROOT_IP6(trie, net, dst) ({ \ + net_addr_ip6 dst; \ + ip6_addr _found; \ + for (int _n = trie_match_longest_ip6(trie, net, &dst, &_found); 
\ + _n; \ + _n = trie_match_next_longest_ip6(&dst, &_found)) + +#define TRIE_WALK_TO_ROOT_END }) + + #define TRIE_WALK(trie, net, from) ({ \ net_addr net; \ struct f_trie_walk_state tws_; \ @@ -209,6 +259,7 @@ void trie_format(const struct f_trie *t, buffer *buf); #define TRIE_WALK_END }) + #define F_CMP_ERROR 999 const char *f_type_name(enum f_type t); diff --git a/filter/trie.c b/filter/trie.c index 21b5b5d7..66b56297 100644 --- a/filter/trie.c +++ b/filter/trie.c @@ -85,7 +85,7 @@ * * Iteration over prefixes in a trie can be done using TRIE_WALK() macro, or * directly using trie_walk_init() and trie_walk_next() functions. The second - * approeach allows suspending the iteration and continuing in it later. + * approach allows suspending the iteration and continuing in it later. * Prefixes are enumerated in the usual lexicographic order and may be * restricted to a subset of the trie (all subnets of a specified prefix). * @@ -100,6 +100,13 @@ * path between the current node and its parent node, stored in the bitmap * &accept of the current node) and &local_pos for iteration over intra-node * prefixes (stored in the bitmap &local). + * + * The trie also supports longest-prefix-match query by trie_match_longest_ip4() + * and it can be extended to iteration over all covering prefixes for a given + * prefix (from longest to shortest) using TRIE_WALK_TO_ROOT_IP4() macro. There + * are also IPv6 versions (for practical reasons, these functions and macros are + * separate for IPv4 and IPv6). There is the same limitation to enumeration of + * `implicit' prefixes like with the previous TRIE_WALK() macro. */ #include "nest/bird.h" @@ -541,6 +548,187 @@ trie_match_net(const struct f_trie *t, const net_addr *n) } +/** + * trie_match_longest_ip4 + * @t: trie + * @net: net address + * @dst: return value + * @found0: optional returned bitmask of found nodes + * + * Perform longest prefix match for the address @net and return the resulting + * prefix in the buffer @dst. 
The bitmask @found0 is used to report lengths of + * prefixes on the path from the root to the resulting prefix. E.g., if there is + * also a /20 shorter matching prefix, then 20-th bit is set in @found0. This + * can be used to enumerate all matching prefixes for the network @net using + * function trie_match_next_longest_ip4() or macro TRIE_WALK_TO_ROOT_IP4(). + * + * This function assumes IPv4 trie, there is also an IPv6 variant. + * + * Result: 1 if a matching prefix was found, 0 if not. + */ +int +trie_match_longest_ip4(const struct f_trie *t, const net_addr_ip4 *net, net_addr_ip4 *dst, ip4_addr *found0) +{ + ASSERT(t->ipv4); + + const struct f_trie_node4 *n = &t->root.v4; + int len = 0; + + ip4_addr found = IP4_NONE; + int last = -1; + + while (n) + { + /* We are out of path */ + if (!ip4_prefix_equal(net->prefix, n->addr, MIN(net->pxlen, n->plen))) + goto done; + + /* Check accept mask */ + for (; len < n->plen; len++) + { + if (len > net->pxlen) + goto done; + + if (ip4_getbit(n->accept, len - 1)) + { + /* len is always < 32 due to len < n->plen */ + ip4_setbit(&found, len); + last = len; + } + } + + /* Special case for max length, there is only one valid local position */ + if (len == IP4_MAX_PREFIX_LENGTH) + { + if (n->local & (1u << 1)) + last = len; + + goto done; + } + + /* Check local mask */ + for (int pos = 1; pos < (1 << TRIE_STEP); pos = 2 * pos + ip4_getbit(net->prefix, len), len++) + { + if (len > net->pxlen) + goto done; + + if (n->local & (1u << pos)) + { + /* len is always < 32 due to special case above */ + ip4_setbit(&found, len); + last = len; + } + } + + /* Choose child */ + n = n->c[ip4_getbits(net->prefix, n->plen, TRIE_STEP)]; + } + +done: + if (last < 0) + return 0; + + net_copy_ip4(dst, net); + dst->prefix = ip4_and(dst->prefix, ip4_mkmask(last)); + dst->pxlen = last; + + if (found0) + *found0 = found; + + return 1; +} + + +/** + * trie_match_longest_ip6 + * @t: trie + * @net: net address + * @dst: return value + * @found0: optional 
returned bitmask of found nodes + * + * Perform longest prefix match for the address @net and return the resulting + * prefix in the buffer @dst. The bitmask @found0 is used to report lengths of + * prefixes on the path from the root to the resulting prefix. E.g., if there is + * also a /20 shorter matching prefix, then 20-th bit is set in @found0. This + * can be used to enumerate all matching prefixes for the network @net using + * function trie_match_next_longest_ip6() or macro TRIE_WALK_TO_ROOT_IP6(). + * + * This function assumes IPv6 trie, there is also an IPv4 variant. + * + * Result: 1 if a matching prefix was found, 0 if not. + */ +int +trie_match_longest_ip6(const struct f_trie *t, const net_addr_ip6 *net, net_addr_ip6 *dst, ip6_addr *found0) +{ + ASSERT(!t->ipv4); + + const struct f_trie_node6 *n = &t->root.v6; + int len = 0; + + ip6_addr found = IP6_NONE; + int last = -1; + + while (n) + { + /* We are out of path */ + if (!ip6_prefix_equal(net->prefix, n->addr, MIN(net->pxlen, n->plen))) + goto done; + + /* Check accept mask */ + for (; len < n->plen; len++) + { + if (len > net->pxlen) + goto done; + + if (ip6_getbit(n->accept, len - 1)) + { + /* len is always < 128 due to len < n->plen */ + ip6_setbit(&found, len); + last = len; + } + } + + /* Special case for max length, there is only one valid local position */ + if (len == IP6_MAX_PREFIX_LENGTH) + { + if (n->local & (1u << 1)) + last = len; + + goto done; + } + + /* Check local mask */ + for (int pos = 1; pos < (1 << TRIE_STEP); pos = 2 * pos + ip6_getbit(net->prefix, len), len++) + { + if (len > net->pxlen) + goto done; + + if (n->local & (1u << pos)) + { + /* len is always < 128 due to special case above */ + ip6_setbit(&found, len); + last = len; + } + } + + /* Choose child */ + n = n->c[ip6_getbits(net->prefix, n->plen, TRIE_STEP)]; + } + +done: + if (last < 0) + return 0; + + net_copy_ip6(dst, net); + dst->prefix = ip6_and(dst->prefix, ip6_mkmask(last)); + dst->pxlen = last; + + if (found0) + 
*found0 = found; + + return 1; +} + #define SAME_PREFIX(A,B,X,L) ((X) ? ip4_prefix_equal((A)->v4.addr, net4_prefix(B), (L)) : ip6_prefix_equal((A)->v6.addr, net6_prefix(B), (L))) #define GET_NET_BITS(N,X,A,B) ((X) ? ip4_getbits(net4_prefix(N), (A), (B)) : ip6_getbits(net6_prefix(N), (A), (B))) diff --git a/filter/trie_test.c b/filter/trie_test.c index bb9a2f26..eee48284 100644 --- a/filter/trie_test.c +++ b/filter/trie_test.c @@ -774,6 +774,120 @@ t_trie_walk(void) return 1; } +static int +find_covering_nets(struct f_prefix *prefixes, int num, const net_addr *net, net_addr *found) +{ + struct f_prefix key; + net_addr *n = &key.net; + int found_num = 0; + + net_copy(n, net); + + while (1) + { + struct f_prefix *px = + bsearch(&key, prefixes, num, sizeof(struct f_prefix), compare_prefixes); + + if (px) + { + net_copy(&found[found_num], n); + found_num++; + } + + if (n->pxlen == 0) + return found_num; + + n->pxlen--; + + if (n->type == NET_IP4) + ip4_clrbit(&((net_addr_ip4 *) n)->prefix, n->pxlen); + else + ip6_clrbit(&((net_addr_ip6 *) n)->prefix, n->pxlen); + } +} + +static int +t_trie_walk_to_root(void) +{ + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); + + linpool *lp = lp_new_default(&root_pool); + for (int round = 0; round < TESTS_NUM * 4; round++) + { + int level = round / TESTS_NUM; + int v6 = level % 2; + int num = PREFIXES_NUM * (int[]){32, 512}[level / 2]; + int pos = 0; + int st = 0, sn = 0, sm = 0; + + list *prefixes = make_random_prefix_list(lp, num, v6, 1); + struct f_trie *trie = make_trie_from_prefix_list(lp, prefixes); + struct f_prefix *pxset = malloc((num + 1) * sizeof(struct f_prefix)); + + struct f_prefix_node *pxn; + WALK_LIST(pxn, *prefixes) + pxset[pos++] = pxn->prefix; + memset(&pxset[pos], 0, sizeof (struct f_prefix)); + + qsort(pxset, num, sizeof(struct f_prefix), compare_prefixes); + + int i; + for (i = 0; i < (PREFIX_TESTS_NUM / 10); i++) + { + net_addr from; + get_random_net(&from, v6); + + net_addr found[129]; + int found_num = 
find_covering_nets(pxset, num, &from, found); + int n = 0; + + if (bt_verbose >= BT_VERBOSE_ABSOLUTELY_ALL) + { + char buf[64]; + bt_format_net(buf, 64, &from); + bt_debug("Lookup for %s (expect %d)\n", buf, found_num); + } + + /* Walk to root, separate for IPv4 and IPv6 */ + if (!v6) + { + TRIE_WALK_TO_ROOT_IP4(trie, (net_addr_ip4 *) &from, net) + { + log_networks((net_addr *) &net, &found[n]); + bt_assert((n < found_num) && net_equal((net_addr *) &net, &found[n])); + n++; + } + TRIE_WALK_TO_ROOT_END; + } + else + { + TRIE_WALK_TO_ROOT_IP6(trie, (net_addr_ip6 *) &from, net) + { + log_networks((net_addr *) &net, &found[n]); + bt_assert((n < found_num) && net_equal((net_addr *) &net, &found[n])); + n++; + } + TRIE_WALK_TO_ROOT_END; + } + + bt_assert(n == found_num); + + /* Stats */ + st += n; + sn += !!n; + sm = MAX(sm, n); + } + + bt_debug("Success in %d / %d, sum %d, max %d\n", sn, i, st, sm); + + lp_flush(lp); + } + + bt_bird_cleanup(); + return 1; +} + int main(int argc, char *argv[]) { @@ -784,6 +898,7 @@ main(int argc, char *argv[]) bt_test_suite(t_match_outer_net, "Testing random outer prefix matching"); bt_test_suite(t_trie_same, "A trie filled forward should be same with a trie filled backward."); bt_test_suite(t_trie_walk, "Testing TRIE_WALK() on random tries"); + bt_test_suite(t_trie_walk_to_root, "Testing TRIE_WALK_TO_ROOT() on random tries"); // bt_test_suite(t_bench_trie_datasets_subset, "Benchmark tries from datasets by random subset of nets"); // bt_test_suite(t_bench_trie_datasets_random, "Benchmark tries from datasets by generated addresses"); diff --git a/test/birdtest.c b/test/birdtest.c index 053954e1..6ad743ce 100644 --- a/test/birdtest.c +++ b/test/birdtest.c @@ -510,7 +510,10 @@ bt_fmt_ipa(char *buf, size_t size, const void *data) void bt_format_net(char *buf, size_t size, const void *data) { - bsnprintf(buf, size, "%N", (const net_addr *) data); + if (data) + bsnprintf(buf, size, "%N", (const net_addr *) data); + else + bsnprintf(buf, size, 
"(null)"); } int From f772afc525156498900770ffe5a98349df89a45c Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Sat, 27 Nov 2021 00:21:12 +0100 Subject: [PATCH 015/196] Memory statistics split into Effective and Overhead This feature is intended mostly for checking that BIRD's allocation strategies don't consume much memory space. There are some cases where withdrawing routes in a specific order leads to memory fragmentation and this output should give the user at least a notion of how much memory is actually used for data storage and how much memory is "just allocated" or used for overhead. Also raising the "system allocator overhead estimation" from 8 to 16 bytes; it is probably even more. I've found 16 as a local minimum in best scenarios among reachable machines. I couldn't find any reasonable method to estimate this value when BIRD starts up. This commit also fixes the inaccurate computation of memory overhead for slabs where the "system allocator overhead estimation" was improperly added to the size of mmap-ed memory.
--- lib/mempool.c | 12 +++++++----- lib/resource.c | 32 +++++++++++++++++++++++--------- lib/resource.h | 12 +++++++++--- lib/slab.c | 33 ++++++++++++++++++++++++--------- nest/cmds.c | 44 +++++++++++++++++++++++++++++++++++--------- 5 files changed, 98 insertions(+), 35 deletions(-) diff --git a/lib/mempool.c b/lib/mempool.c index 758882ce..90d7c774 100644 --- a/lib/mempool.c +++ b/lib/mempool.c @@ -45,7 +45,7 @@ struct linpool { static void lp_free(resource *); static void lp_dump(resource *); static resource *lp_lookup(resource *, unsigned long); -static size_t lp_memsize(resource *r); +static struct resmem lp_memsize(resource *r); static struct resclass lp_class = { "LinPool", @@ -287,7 +287,7 @@ lp_dump(resource *r) m->total_large); } -static size_t +static struct resmem lp_memsize(resource *r) { linpool *m = (linpool *) r; @@ -299,9 +299,11 @@ lp_memsize(resource *r) for(c=m->first_large; c; c=c->next) cnt++; - return ALLOC_OVERHEAD + sizeof(struct linpool) + - cnt * (ALLOC_OVERHEAD + sizeof(struct lp_chunk)) + - m->total + m->total_large; + return (struct resmem) { + .effective = m->total + m->total_large, + .overhead = ALLOC_OVERHEAD + sizeof(struct linpool) + + cnt * (ALLOC_OVERHEAD + sizeof(struct lp_chunk)), + }; } diff --git a/lib/resource.c b/lib/resource.c index 4c4b92ec..5d4c7780 100644 --- a/lib/resource.c +++ b/lib/resource.c @@ -2,6 +2,7 @@ * BIRD Resource Manager * * (c) 1998--2000 Martin Mares + * (c) 2021 Maria Matejka * * Can be freely distributed and used under the terms of the GNU GPL. 
*/ @@ -37,7 +38,7 @@ struct pool { static void pool_dump(resource *); static void pool_free(resource *); static resource *pool_lookup(resource *, unsigned long); -static size_t pool_memsize(resource *P); +static struct resmem pool_memsize(resource *P); static struct resclass pool_class = { "Pool", @@ -97,15 +98,22 @@ pool_dump(resource *P) indent -= 3; } -static size_t +static struct resmem pool_memsize(resource *P) { pool *p = (pool *) P; resource *r; - size_t sum = sizeof(pool) + ALLOC_OVERHEAD; + struct resmem sum = { + .effective = 0, + .overhead = sizeof(pool) + ALLOC_OVERHEAD, + }; WALK_LIST(r, p->inside) - sum += rmemsize(r); + { + struct resmem add = rmemsize(r); + sum.effective += add.effective; + sum.overhead += add.overhead; + } return sum; } @@ -193,14 +201,17 @@ rdump(void *res) debug("NULL\n"); } -size_t +struct resmem rmemsize(void *res) { resource *r = res; if (!r) - return 0; + return (struct resmem) {}; if (!r->class->memsize) - return r->class->size + ALLOC_OVERHEAD; + return (struct resmem) { + .effective = r->class->size - sizeof(resource), + .overhead = ALLOC_OVERHEAD + sizeof(resource), + }; return r->class->memsize(r); } @@ -305,11 +316,14 @@ mbl_lookup(resource *r, unsigned long a) return NULL; } -static size_t +static struct resmem mbl_memsize(resource *r) { struct mblock *m = (struct mblock *) r; - return ALLOC_OVERHEAD + sizeof(struct mblock) + m->size; + return (struct resmem) { + .effective = m->size, + .overhead = ALLOC_OVERHEAD + sizeof(struct mblock), + }; } static struct resclass mb_class = { diff --git a/lib/resource.h b/lib/resource.h index 76e3745f..9ec41ed8 100644 --- a/lib/resource.h +++ b/lib/resource.h @@ -2,6 +2,7 @@ * BIRD Resource Manager * * (c) 1998--1999 Martin Mares + * (c) 2021 Maria Matejka * * Can be freely distributed and used under the terms of the GNU GPL. 
*/ @@ -11,6 +12,11 @@ #include "lib/lists.h" +struct resmem { + size_t effective; /* Memory actually used for data storage */ + size_t overhead; /* Overhead memory imposed by allocator strategies */ +}; + /* Resource */ typedef struct resource { @@ -26,11 +32,11 @@ struct resclass { void (*free)(resource *); /* Freeing function */ void (*dump)(resource *); /* Dump to debug output */ resource *(*lookup)(resource *, unsigned long); /* Look up address (only for debugging) */ - size_t (*memsize)(resource *); /* Return size of memory used by the resource, may be NULL */ + struct resmem (*memsize)(resource *); /* Return size of memory used by the resource, may be NULL */ }; /* Estimate of system allocator overhead per item, for memory consumtion stats */ -#define ALLOC_OVERHEAD 8 +#define ALLOC_OVERHEAD 16 /* Generic resource manipulation */ @@ -40,7 +46,7 @@ void resource_init(void); pool *rp_new(pool *, const char *); /* Create new pool */ void rfree(void *); /* Free single resource */ void rdump(void *); /* Dump to debug output */ -size_t rmemsize(void *res); /* Return size of memory used by the resource */ +struct resmem rmemsize(void *res); /* Return size of memory used by the resource */ void rlookup(unsigned long); /* Look up address (only for debugging) */ void rmove(void *, pool *); /* Move to a different pool */ diff --git a/lib/slab.c b/lib/slab.c index b0a01ae7..6cab6b7b 100644 --- a/lib/slab.c +++ b/lib/slab.c @@ -42,7 +42,7 @@ static void slab_free(resource *r); static void slab_dump(resource *r); static resource *slab_lookup(resource *r, unsigned long addr); -static size_t slab_memsize(resource *r); +static struct resmem slab_memsize(resource *r); #ifdef FAKE_SLAB @@ -128,7 +128,7 @@ slab_dump(resource *r) debug("(%d objects per %d bytes)\n", cnt, s->size); } -static size_t +static struct resmem slab_memsize(resource *r) { slab *s = (slab *) r; @@ -138,7 +138,10 @@ slab_memsize(resource *r) WALK_LIST(o, s->objs) cnt++; - return ALLOC_OVERHEAD + 
sizeof(struct slab) + cnt * (ALLOC_OVERHEAD + s->size); + return (struct resmem) { + .effective = cnt * s->size, + .overhead = ALLOC_OVERHEAD + sizeof(struct slab) + cnt * ALLOC_OVERHEAD, + }; } @@ -363,21 +366,33 @@ slab_dump(resource *r) debug("(%de+%dp+%df blocks per %d objs per %d bytes)\n", ec, pc, fc, s->objs_per_slab, s->obj_size); } -static size_t +static struct resmem slab_memsize(resource *r) { slab *s = (slab *) r; size_t heads = 0; struct sl_head *h; - WALK_LIST(h, s->empty_heads) - heads++; - WALK_LIST(h, s->partial_heads) - heads++; WALK_LIST(h, s->full_heads) heads++; - return ALLOC_OVERHEAD + sizeof(struct slab) + heads * (ALLOC_OVERHEAD + get_page_size()); + size_t items = heads * s->objs_per_slab; + + WALK_LIST(h, s->partial_heads) + { + heads++; + items += h->num_full; + } + + WALK_LIST(h, s->empty_heads) + heads++; + + size_t eff = items * s->obj_size; + + return (struct resmem) { + .effective = eff, + .overhead = ALLOC_OVERHEAD + sizeof(struct slab) + heads * get_page_size() - eff, + }; } static resource * diff --git a/nest/cmds.c b/nest/cmds.c index f58923a7..1a16f9c7 100644 --- a/nest/cmds.c +++ b/nest/cmds.c @@ -67,18 +67,43 @@ cmd_show_symbols(struct sym_show_data *sd) } } -static void -print_size(char *dsc, size_t val) +#define SIZE_SUFFIX " kMGT" +#define SIZE_FORMAT "% 4u.%1u % 1cB" +#define SIZE_ARGS(a) (a).val, (a).decimal, SIZE_SUFFIX[(a).magnitude] + +struct size_args { + u64 val:48; + u64 decimal:8; + u64 magnitude:8; +}; + +static struct size_args +get_size_args(u64 val) { - char *px = " kMG"; - int i = 0; - while ((val >= 10000) && (i < 3)) +#define VALDEC 10 /* One decimal place */ + val *= VALDEC; + + uint i = 0; + while ((val >= 10000 * VALDEC) && (i < 4)) { val = (val + 512) / 1024; i++; } - cli_msg(-1018, "%-17s %4u %cB", dsc, (unsigned) val, px[i]); + return (struct size_args) { + .val = (val / VALDEC), + .decimal = (val % VALDEC), + .magnitude = i, + }; +} + +static void +print_size(char *dsc, struct resmem vals) +{ + 
struct size_args effective = get_size_args(vals.effective); + struct size_args overhead = get_size_args(vals.overhead); + + cli_msg(-1018, "%-17s " SIZE_FORMAT " " SIZE_FORMAT, dsc, SIZE_ARGS(effective), SIZE_ARGS(overhead)); } extern pool *rt_table_pool; @@ -88,13 +113,14 @@ void cmd_show_memory(void) { cli_msg(-1018, "BIRD memory usage"); + cli_msg(-1018, "%-17s Effective Overhead", ""); print_size("Routing tables:", rmemsize(rt_table_pool)); print_size("Route attributes:", rmemsize(rta_pool)); print_size("Protocols:", rmemsize(proto_pool)); - size_t total = rmemsize(&root_pool); + struct resmem total = rmemsize(&root_pool); #ifdef HAVE_MMAP - print_size("Standby memory:", get_page_size() * pages_kept); - total += get_page_size() * pages_kept; + print_size("Standby memory:", (struct resmem) { .overhead = get_page_size() * pages_kept }); + total.overhead += get_page_size() * pages_kept; #endif print_size("Total:", total); cli_msg(0, ""); From 78ddfd2600a31305a78dc205b65deba6fb2e0240 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Mon, 29 Nov 2021 19:00:24 +0100 Subject: [PATCH 016/196] Trie: Clarify handling of less-common net types For convenience, Trie functions generally accept as input values not only NET_IPx types of nets, but also NET_VPNx and NET_ROAx types. But returned values are always NET_IPx types. 
--- filter/trie.c | 62 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/filter/trie.c b/filter/trie.c index 66b56297..50d349fe 100644 --- a/filter/trie.c +++ b/filter/trie.c @@ -373,9 +373,23 @@ trie_add_prefix(struct f_trie *t, const net_addr *net, uint l, uint h) switch (net->type) { - case NET_IP4: px = ipt_from_ip4(net4_prefix(net)); v4 = 1; break; - case NET_IP6: px = ipa_from_ip6(net6_prefix(net)); v4 = 0; break; - default: bug("invalid type"); + case NET_IP4: + case NET_VPN4: + case NET_ROA4: + px = ipt_from_ip4(net4_prefix(net)); + v4 = 1; + break; + + case NET_IP6: + case NET_VPN6: + case NET_ROA6: + case NET_IP6_SADR: + px = ipa_from_ip6(net6_prefix(net)); + v4 = 0; + break; + + default: + bug("invalid type"); } if (t->ipv4 != v4) @@ -562,7 +576,9 @@ trie_match_net(const struct f_trie *t, const net_addr *n) * can be used to enumerate all matching prefixes for the network @net using * function trie_match_next_longest_ip4() or macro TRIE_WALK_TO_ROOT_IP4(). * - * This function assumes IPv4 trie, there is also an IPv6 variant. + * This function assumes IPv4 trie, there is also an IPv6 variant. The @net + * argument is typed as net_addr_ip4, but would accept any IPv4-based net_addr, + * like net4_prefix(). Anyway, returned @dst is always net_addr_ip4. * * Result: 1 if a matching prefix was found, 0 if not. 
*/ @@ -571,6 +587,9 @@ trie_match_longest_ip4(const struct f_trie *t, const net_addr_ip4 *net, net_addr { ASSERT(t->ipv4); + const ip4_addr prefix = net->prefix; + const int pxlen = net->pxlen; + const struct f_trie_node4 *n = &t->root.v4; int len = 0; @@ -580,13 +599,13 @@ trie_match_longest_ip4(const struct f_trie *t, const net_addr_ip4 *net, net_addr while (n) { /* We are out of path */ - if (!ip4_prefix_equal(net->prefix, n->addr, MIN(net->pxlen, n->plen))) + if (!ip4_prefix_equal(prefix, n->addr, MIN(pxlen, n->plen))) goto done; /* Check accept mask */ for (; len < n->plen; len++) { - if (len > net->pxlen) + if (len > pxlen) goto done; if (ip4_getbit(n->accept, len - 1)) @@ -607,9 +626,9 @@ trie_match_longest_ip4(const struct f_trie *t, const net_addr_ip4 *net, net_addr } /* Check local mask */ - for (int pos = 1; pos < (1 << TRIE_STEP); pos = 2 * pos + ip4_getbit(net->prefix, len), len++) + for (int pos = 1; pos < (1 << TRIE_STEP); pos = 2 * pos + ip4_getbit(prefix, len), len++) { - if (len > net->pxlen) + if (len > pxlen) goto done; if (n->local & (1u << pos)) @@ -621,16 +640,14 @@ trie_match_longest_ip4(const struct f_trie *t, const net_addr_ip4 *net, net_addr } /* Choose child */ - n = n->c[ip4_getbits(net->prefix, n->plen, TRIE_STEP)]; + n = n->c[ip4_getbits(prefix, n->plen, TRIE_STEP)]; } done: if (last < 0) return 0; - net_copy_ip4(dst, net); - dst->prefix = ip4_and(dst->prefix, ip4_mkmask(last)); - dst->pxlen = last; + *dst = NET_ADDR_IP4(ip4_and(prefix, ip4_mkmask(last)), last); if (found0) *found0 = found; @@ -653,7 +670,9 @@ done: * can be used to enumerate all matching prefixes for the network @net using * function trie_match_next_longest_ip6() or macro TRIE_WALK_TO_ROOT_IP6(). * - * This function assumes IPv6 trie, there is also an IPv4 variant. + * This function assumes IPv6 trie, there is also an IPv4 variant. The @net + * argument is typed as net_addr_ip6, but would accept any IPv6-based net_addr, + * like net6_prefix(). 
Anyway, returned @dst is always net_addr_ip6. * * Result: 1 if a matching prefix was found, 0 if not. */ @@ -662,6 +681,9 @@ trie_match_longest_ip6(const struct f_trie *t, const net_addr_ip6 *net, net_addr { ASSERT(!t->ipv4); + const ip6_addr prefix = net->prefix; + const int pxlen = net->pxlen; + const struct f_trie_node6 *n = &t->root.v6; int len = 0; @@ -671,13 +693,13 @@ trie_match_longest_ip6(const struct f_trie *t, const net_addr_ip6 *net, net_addr while (n) { /* We are out of path */ - if (!ip6_prefix_equal(net->prefix, n->addr, MIN(net->pxlen, n->plen))) + if (!ip6_prefix_equal(prefix, n->addr, MIN(pxlen, n->plen))) goto done; /* Check accept mask */ for (; len < n->plen; len++) { - if (len > net->pxlen) + if (len > pxlen) goto done; if (ip6_getbit(n->accept, len - 1)) @@ -698,9 +720,9 @@ trie_match_longest_ip6(const struct f_trie *t, const net_addr_ip6 *net, net_addr } /* Check local mask */ - for (int pos = 1; pos < (1 << TRIE_STEP); pos = 2 * pos + ip6_getbit(net->prefix, len), len++) + for (int pos = 1; pos < (1 << TRIE_STEP); pos = 2 * pos + ip6_getbit(prefix, len), len++) { - if (len > net->pxlen) + if (len > pxlen) goto done; if (n->local & (1u << pos)) @@ -712,16 +734,14 @@ trie_match_longest_ip6(const struct f_trie *t, const net_addr_ip6 *net, net_addr } /* Choose child */ - n = n->c[ip6_getbits(net->prefix, n->plen, TRIE_STEP)]; + n = n->c[ip6_getbits(prefix, n->plen, TRIE_STEP)]; } done: if (last < 0) return 0; - net_copy_ip6(dst, net); - dst->prefix = ip6_and(dst->prefix, ip6_mkmask(last)); - dst->pxlen = last; + *dst = NET_ADDR_IP6(ip6_and(prefix, ip6_mkmask(last)), last); if (found0) *found0 = found; From b21104c97e59128973501fc23570e2d929f48923 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Sat, 18 Dec 2021 00:58:47 +0100 Subject: [PATCH 017/196] Nest: Do not ignore secondary flag changes in ifa updates Compare all IA_* flags that are set by sysdep iface code. 
The old code ignores IA_SECONDARY flag when comparing whether iface address updates from kernel changed anything. This is usually not an issue as kernel removes all secondary addresses due to removal of the primary one, but it breaks when sysctl 'promote_secondaries' is enabled and kernel promotes secondary addresses to primary ones. Thanks to 'Alexander' for the bugreport. --- nest/iface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nest/iface.c b/nest/iface.c index 83a633a3..682340c5 100644 --- a/nest/iface.c +++ b/nest/iface.c @@ -591,7 +591,7 @@ ifa_update(struct ifa *a) if (ipa_equal(b->brd, a->brd) && ipa_equal(b->opposite, a->opposite) && b->scope == a->scope && - !((b->flags ^ a->flags) & IA_PEER)) + !((b->flags ^ a->flags) & (IA_SECONDARY | IA_PEER | IA_HOST))) { b->flags |= IA_UPDATED; return b; From 00410fd6c17697a5919cb32a44f7117dd3a0834a Mon Sep 17 00:00:00 2001 From: Simon Ruderich Date: Sat, 18 Dec 2021 03:17:48 +0100 Subject: [PATCH 018/196] Doc: bgp: remove "advertise ipv4" The option was removed in d15b0b0a ("BGP redesign", 2016-12-07) but the documentation wasn't updated. --- doc/bird.sgml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/doc/bird.sgml b/doc/bird.sgml index 39dadaf2..4e4804de 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -2607,13 +2607,6 @@ using the following configuration parameters: disabled. Default: on, with automatic fallback to off when received capability-related error. - BGP could use hardware link state into consideration. If enabled, BIRD tracks the link state of the associated interface and when link diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index e4d754b1..b6f21bc6 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -157,6 +157,8 @@ bgp_open(struct bgp_proto *p) ip_addr addr = p->cf->strict_bind ? p->cf->local_ip : (p->ipv4 ? IPA_NONE4 : IPA_NONE6); uint port = p->cf->local_port; + uint flags = p->cf->free_bind ? 
SKF_FREEBIND : 0; + uint flag_mask = SKF_FREEBIND; /* FIXME: Add some global init? */ if (!bgp_linpool) @@ -165,8 +167,11 @@ bgp_open(struct bgp_proto *p) /* We assume that cf->iface is defined iff cf->local_ip is link-local */ WALK_LIST(bs, bgp_sockets) - if (ipa_equal(bs->sk->saddr, addr) && (bs->sk->sport == port) && - (bs->sk->iface == ifa) && (bs->sk->vrf == p->p.vrf)) + if (ipa_equal(bs->sk->saddr, addr) && + (bs->sk->sport == port) && + (bs->sk->iface == ifa) && + (bs->sk->vrf == p->p.vrf) && + ((bs->sk->flags & flag_mask) == flags)) { bs->uc++; p->sock = bs; @@ -180,7 +185,7 @@ bgp_open(struct bgp_proto *p) sk->sport = port; sk->iface = ifa; sk->vrf = p->p.vrf; - sk->flags = 0; + sk->flags = flags; sk->tos = IP_PREC_INTERNET_CONTROL; sk->rbsize = BGP_RX_BUFFER_SIZE; sk->tbsize = BGP_TX_BUFFER_SIZE; diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index cca4b448..5e025ccd 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -86,6 +86,7 @@ struct bgp_config { int peer_type; /* Internal or external BGP (BGP_PT_*, optional) */ int multihop; /* Number of hops if multihop */ int strict_bind; /* Bind listening socket to local address */ + int free_bind; /* Bind listening socket with SKF_FREEBIND */ int ttl_security; /* Enable TTL security [RFC 5082] */ int compare_path_lengths; /* Use path lengths when selecting best route */ int med_metric; /* Compare MULTI_EXIT_DISC even between routes from differen ASes */ diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 2dfbdca9..7cbc9985 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -31,7 +31,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE, STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG, LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS, DYNAMIC, RANGE, NAME, DIGITS, BGP_AIGP, AIGP, ORIGINATE, COST, ENFORCE, - FIRST) + FIRST, FREE) %type bgp_nh %type bgp_afi @@ -155,6 +155,7 @@ bgp_proto: } | bgp_proto DYNAMIC NAME DIGITS expr ';' 
{ BGP_CFG->dynamic_name_digits = $5; if ($5>10) cf_error("Dynamic name digits must be at most 10"); } | bgp_proto STRICT BIND bool ';' { BGP_CFG->strict_bind = $4; } + | bgp_proto FREE BIND bool ';' { BGP_CFG->free_bind = $4; } | bgp_proto PATH METRIC bool ';' { BGP_CFG->compare_path_lengths = $4; } | bgp_proto MED METRIC bool ';' { BGP_CFG->med_metric = $4; } | bgp_proto IGP METRIC bool ';' { BGP_CFG->igp_metric = $4; } From d0dd1d20cd40e75e417d58569fac3ff0bf1db41a Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Fri, 14 Jan 2022 19:07:57 +0100 Subject: [PATCH 032/196] Netlink: Explicitly skip received cloned routes Kernel uses cloned routes to keep route cache entries, but reports them together with regular routes. They were skipped implicitly as they do not have rtm_protocol filled. Add explicit check for cloned flag and skip such routes explicitly. Also, improve debug logs of skipped routes. --- sysdep/linux/netlink.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index e127052a..7cea5322 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -1535,7 +1535,8 @@ nl_parse_end(struct nl_parse_state *s) } -#define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0) +#define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0) +#define SKIP(ARG, ...) 
do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0) static void nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) @@ -1588,10 +1589,10 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) return; if (!a[RTA_DST]) - SKIP("MPLS route without RTA_DST"); + SKIP0("MPLS route without RTA_DST\n"); if (rta_get_mpls(a[RTA_DST], rta_mpls_stack) != 1) - SKIP("MPLS route with multi-label RTA_DST"); + SKIP0("MPLS route with multi-label RTA_DST\n"); net_fill_mpls(&dst, rta_mpls_stack[0]); break; @@ -1609,6 +1610,9 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) else table_id = i->rtm_table; + if (i->rtm_flags & RTM_F_CLONED) + SKIP("cloned\n"); + /* Do we know this table? */ p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id); if (!p) From e818f16448e918ed07633480291283f3449dd9e4 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Fri, 14 Jan 2022 21:53:40 +0100 Subject: [PATCH 033/196] Netlink: Enable strict checking for KRT dumps Add strict checking for netlink KRT dumps to avoid PMTU cache records from FNHE table dump along with KRT. Linux Kernel added FNHE table dump to the netlink API in patch: https://patchwork.ozlabs.org/project/netdev/patch/8d3b68cd37fb5fddc470904cdd6793fcf480c6c1.1561131177.git.sbrivio@redhat.com/ Therefore, since Linux 5.3 these route cache entries are dumped together with regular routes during periodic KRT scans, which in some cases may be a huge amount of useless data. This can be avoided by using strict checking for netlink dumps: https://lore.kernel.org/netdev/20181008031644.15989-1-dsahern@kernel.org/ The patch mitigates the risk of receiving an unknown and potentially large number of FNHE records that would block BIRD I/O in each sync. There is a known issue caused by the GRE tunnels on Linux that seems to be creating one FNHE record for each destination IP address that is routed through the tunnel, even when the PMTU equals the GRE interface MTU. 
Thanks to Tomas Hlavacek for the original patch. --- sysdep/linux/netlink.c | 70 +++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 7cea5322..71b290fd 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -130,7 +130,7 @@ struct nl_sock uint last_size; }; -#define NL_RX_SIZE 8192 +#define NL_RX_SIZE 32768 #define NL_OP_DELETE 0 #define NL_OP_ADD (NLM_F_CREATE|NLM_F_EXCL) @@ -157,11 +157,19 @@ nl_open_sock(struct nl_sock *nl) } } +static void +nl_set_strict_dump(struct nl_sock *nl, int strict) +{ + setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +} + static void nl_open(void) { nl_open_sock(&nl_scan); nl_open_sock(&nl_req); + + nl_set_strict_dump(&nl_scan, 1); } static void @@ -180,20 +188,60 @@ nl_send(struct nl_sock *nl, struct nlmsghdr *nh) } static void -nl_request_dump(int af, int cmd) +nl_request_dump_link(void) { struct { struct nlmsghdr nh; - struct rtgenmsg g; + struct ifinfomsg ifi; } req = { - .nh.nlmsg_type = cmd, - .nh.nlmsg_len = sizeof(req), + .nh.nlmsg_type = RTM_GETLINK, + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, - .g.rtgen_family = af + .nh.nlmsg_seq = ++(nl_scan.seq), + .ifi.ifi_family = AF_UNSPEC, }; - nl_send(&nl_scan, &req.nh); + + send(nl_scan.fd, &req, sizeof(req), 0); + nl_scan.last_hdr = NULL; } +static void +nl_request_dump_addr(int af) +{ + struct { + struct nlmsghdr nh; + struct ifaddrmsg ifa; + } req = { + .nh.nlmsg_type = RTM_GETADDR, + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nh.nlmsg_seq = ++(nl_scan.seq), + .ifa.ifa_family = af, + }; + + send(nl_scan.fd, &req, sizeof(req), 0); + nl_scan.last_hdr = NULL; +} + +static void +nl_request_dump_route(int af) +{ + struct { + struct nlmsghdr nh; + struct rtmsg rtm; + } req = { + .nh.nlmsg_type = RTM_GETROUTE, + 
.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), + .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nh.nlmsg_seq = ++(nl_scan.seq), + .rtm.rtm_family = af, + }; + + send(nl_scan.fd, &req, sizeof(req), 0); + nl_scan.last_hdr = NULL; +} + + static struct nlmsghdr * nl_get_reply(struct nl_sock *nl) { @@ -1151,7 +1199,7 @@ kif_do_scan(struct kif_proto *p UNUSED) if_start_update(); - nl_request_dump(AF_UNSPEC, RTM_GETLINK); + nl_request_dump_link(); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWLINK || h->nlmsg_type == RTM_DELLINK) nl_parse_link(h, 1); @@ -1178,14 +1226,14 @@ kif_do_scan(struct kif_proto *p UNUSED) } } - nl_request_dump(AF_INET, RTM_GETADDR); + nl_request_dump_addr(AF_INET); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR) nl_parse_addr(h, 1); else log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type); - nl_request_dump(AF_INET6, RTM_GETADDR); + nl_request_dump_addr(AF_INET6); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR) nl_parse_addr(h, 1); @@ -1902,7 +1950,7 @@ krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NUL struct nl_parse_state s; nl_parse_begin(&s, 1); - nl_request_dump(AF_UNSPEC, RTM_GETROUTE); + nl_request_dump_route(AF_UNSPEC); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) nl_parse_route(&s, h); From 8988264a64dc9985303332568832b108dba3acd3 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Fri, 14 Jan 2022 23:15:05 +0100 Subject: [PATCH 034/196] Netlink: Add workaround for older kernel headers --- sysdep/linux/netlink.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 71b290fd..27b1a617 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -69,6 +69,10 @@ #define RTA_ENCAP 22 #endif +#ifndef NETLINK_GET_STRICT_CHK +#define NETLINK_GET_STRICT_CHK 12 +#endif + 
#define krt_ipv4(p) ((p)->af == AF_INET) #define krt_ecmp6(p) ((p)->af == AF_INET6) From bbc33f6ec310d98b9100fb883a2b8908ede1b5a8 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Sat, 15 Jan 2022 22:39:40 +0100 Subject: [PATCH 035/196] Netlink: Add another workaround for older kernel headers Unfortunately, SOL_NETLINK is both recently added and arch-dependent, so we cannot just define it. --- sysdep/linux/netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 27b1a617..ccd62f26 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -164,7 +164,14 @@ nl_open_sock(struct nl_sock *nl) static void nl_set_strict_dump(struct nl_sock *nl, int strict) { + /* + * Strict checking is not necessary, it improves behavior on newer kernels. + * If it is not available (missing SOL_NETLINK compile-time, or ENOPROTOOPT + * run-time), we can just ignore it. + */ +#ifdef SOL_NETLINK setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +#endif } static void From 81ee6cda2e60bbd3d97ab63da30657a54b09feda Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Mon, 17 Jan 2022 05:11:29 +0100 Subject: [PATCH 036/196] Netlink: Add option to specify netlink socket receive buffer size Add option 'netlink rx buffer' to specify netlink socket receive buffer size. Uses SO_RCVBUFFORCE, so it can override rmem_max limit. Thanks to Trisha Biswas and Michal for the original patches. --- doc/bird.sgml | 6 + sysdep/linux/krt-sys.h | 1 + sysdep/linux/netlink.Y | 4 +- sysdep/linux/netlink.c | 54 + sysdep/linux/netlink.c.orig | 2179 +++++++++++++++++++++++++++++++++++ 5 files changed, 2243 insertions(+), 1 deletion(-) create mode 100644 sysdep/linux/netlink.c.orig diff --git a/doc/bird.sgml b/doc/bird.sgml index 0112622e..f10b15e2 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -3248,6 +3248,12 @@ channels. allows to specify a limit on maximal number of nexthops in one route. 
By default, multipath merging is disabled. If enabled, default value of the limit is 16. + +