From 038fcf1c8b7716415235384de5dc47d07dc45b85 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Mon, 27 Sep 2021 17:44:19 +0200 Subject: [PATCH 1/8] Locking route attributes cache To access route attribute cache from multiple threads at once, we have to lock the cache on writing. The route attributes data structures are safe to read unless somebody tries to tamper with the cache itself. --- nest/route.h | 10 +++++----- nest/rt-attr.c | 29 +++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/nest/route.h b/nest/route.h index a01eff1a..b327e42a 100644 --- a/nest/route.h +++ b/nest/route.h @@ -590,7 +590,7 @@ struct rte_src { typedef struct rta { struct rta *next, **pprev; /* Hash chain */ - u32 uc; /* Use count */ + _Atomic u32 uc; /* Use count */ u32 hash_key; /* Hash over important fields */ struct ea_list *eattrs; /* Extended Attribute chain */ struct hostentry *hostentry; /* Hostentry for recursive next-hops */ @@ -862,15 +862,15 @@ static inline size_t rta_size(const rta *a) { return sizeof(rta) + sizeof(u32)*a #define RTA_MAX_SIZE (sizeof(rta) + sizeof(u32)*MPLS_MAX_LABEL_STACK) rta *rta_lookup(rta *); /* Get rta equivalent to this one, uc++ */ static inline int rta_is_cached(rta *r) { return r->cached; } -static inline rta *rta_clone(rta *r) { r->uc++; return r; } +static inline rta *rta_clone(rta *r) { ASSERT_DIE(0 < atomic_fetch_add_explicit(&r->uc, 1, memory_order_acq_rel)); return r; } void rta__free(rta *r); -static inline void rta_free(rta *r) { if (r && !--r->uc) rta__free(r); } +static inline void rta_free(rta *r) { if (r && (1 == atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel))) rta__free(r); } rta *rta_do_cow(rta *o, linpool *lp); static inline rta * rta_cow(rta *r, linpool *lp) { return rta_is_cached(r) ? rta_do_cow(r, lp) : r; } static inline void rta_uncache(rta *r) { r->cached = 0; r->uc = 0; } -void rta_dump(rta *); +void rta_dump(const rta *); void rta_dump_all(void); -void rta_show(struct cli *, rta *); +void rta_show(struct cli *, const rta *); u32 rt_get_igp_metric(rte *); struct hostentry * rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep); diff --git a/nest/rt-attr.c b/nest/rt-attr.c index f7e33d72..20f9835d 100644 --- a/nest/rt-attr.c +++ b/nest/rt-attr.c @@ -1281,9 +1281,16 @@ rta_lookup(rta *o) ea_normalize(o->eattrs); h = rta_hash(o); + + RTA_LOCK; + for(r=rta_hash_table[h & rta_cache_mask]; r; r=r->next) if (r->hash_key == h && rta_same(r, o)) - return rta_clone(r); + { + atomic_fetch_add_explicit(&r->uc, 1, memory_order_acq_rel); + RTA_UNLOCK; + return r; + } r = rta_copy(o); r->hash_key = h; @@ -1294,12 +1301,21 @@ rta_lookup(rta *o) if (++rta_cache_count > rta_cache_limit) rta_rehash(); + RTA_UNLOCK; return r; } void rta__free(rta *a) { + RTA_LOCK; + if (atomic_load_explicit(&a->uc, memory_order_acquire)) + { + /* Somebody has cloned this rta inbetween. This sometimes happens. */ + RTA_UNLOCK; + return; + } + ASSERT(rta_cache_count && a->cached); rta_cache_count--; *a->pprev = a->next; @@ -1311,6 +1327,7 @@ rta__free(rta *a) ea_free(a->eattrs); a->cached = 0; sl_free(rta_slab(a), a); + RTA_UNLOCK; } rta * @@ -1335,7 +1352,7 @@ rta_do_cow(rta *o, linpool *lp) * This function takes a &rta and dumps its contents to the debug output. 
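A minimal standalone sketch of the locking scheme the hunks above establish, using invented names (cached_obj, cache_lock, obj_*) rather than BIRD's own; it illustrates the pattern, it is not code from the patch. Holders of a reference may clone and release without the cache lock, lookups and removals run under the lock, and the free path re-checks the counter under the lock because a lookup may have taken a fresh reference after the last one was dropped.

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

/* Illustrative sketch only, not part of the patch. */

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;   /* stands in for RTA_LOCK */

struct cached_obj {
  struct cached_obj *next, **pprev;     /* hash chain, guarded by cache_lock */
  _Atomic unsigned uc;                  /* use count, updated without the lock */
};

static struct cached_obj *
obj_clone(struct cached_obj *o)
{
  /* Cloning needs no lock, but the caller must already own a reference */
  unsigned prev = atomic_fetch_add_explicit(&o->uc, 1, memory_order_acq_rel);
  assert(prev > 0);
  return o;
}

static void
obj_try_remove(struct cached_obj *o)
{
  pthread_mutex_lock(&cache_lock);

  if (atomic_load_explicit(&o->uc, memory_order_acquire))
  {
    /* Somebody looked the object up and re-referenced it in between; keep it */
    pthread_mutex_unlock(&cache_lock);
    return;
  }

  *o->pprev = o->next;                  /* unlink from the hash chain */
  if (o->next)
    o->next->pprev = o->pprev;

  pthread_mutex_unlock(&cache_lock);
  free(o);
}

static void
obj_release(struct cached_obj *o)
{
  /* Dropping the last reference triggers removal from the cache */
  if (atomic_fetch_sub_explicit(&o->uc, 1, memory_order_acq_rel) == 1)
    obj_try_remove(o);
}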
*/ void -rta_dump(rta *a) +rta_dump(const rta *a) { static char *rts[] = { "", "RTS_STATIC", "RTS_INHERIT", "RTS_DEVICE", "RTS_STAT_DEV", "RTS_REDIR", "RTS_RIP", @@ -1350,7 +1367,7 @@ rta_dump(rta *a) debug(" !CACHED"); debug(" <-%I", a->from); if (a->dest == RTD_UNICAST) - for (struct nexthop *nh = &(a->nh); nh; nh = nh->next) + for (const struct nexthop *nh = &(a->nh); nh; nh = nh->next) { if (ipa_nonzero(nh->gw)) debug(" ->%I", nh->gw); if (nh->labels) debug(" L %d", nh->label[0]); @@ -1377,6 +1394,8 @@ rta_dump_all(void) rta *a; uint h; + RTA_LOCK; + debug("Route attribute cache (%d entries, rehash at %d):\n", rta_cache_count, rta_cache_limit); for(h=0; hnext) @@ -1386,10 +1405,12 @@ rta_dump_all(void) debug("\n"); } debug("\n"); + + RTA_UNLOCK; } void -rta_show(struct cli *c, rta *a) +rta_show(struct cli *c, const rta *a) { cli_printf(c, -1008, "\tType: %s %s", rta_src_names[a->source], ip_scope_text(a->scope)); From 18f66055e3f7eec2108e18679652aee47f5de6b2 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Wed, 29 Sep 2021 17:59:50 +0200 Subject: [PATCH 2/8] Global table update pool removed --- nest/proto.c | 2 + nest/protocol.h | 3 ++ nest/route.h | 8 +-- nest/rt-table.c | 116 +++++++++++++++++------------------------- proto/bgp/packets.c | 4 +- proto/static/static.c | 2 +- 6 files changed, 61 insertions(+), 74 deletions(-) diff --git a/nest/proto.c b/nest/proto.c index 930fad1d..f55f347a 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -245,6 +245,8 @@ proto_add_channel(struct proto *p, struct channel_config *cf) channel_init_limit(c, &c->in_limit, PLD_IN, &cf->in_limit); channel_init_limit(c, &c->out_limit, PLD_OUT, &cf->out_limit); + c->rte_update_pool = lp_new_default(proto_pool); + c->net_type = cf->net_type; c->ra_mode = cf->ra_mode; c->preference = cf->preference; diff --git a/nest/protocol.h b/nest/protocol.h index 440297a1..1647fbba 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -509,6 +509,9 @@ struct channel { u8 limit_actions[PLD_MAX]; /* Limit actions enum */ u8 limit_active; /* Flags for active limits */ + linpool *rte_update_pool; + uint rte_update_nest_cnt; + struct channel_import_stats { /* Import - from protocol to core */ u32 updates_received; /* Number of route updates received */ diff --git a/nest/route.h b/nest/route.h index b327e42a..e2dc9986 100644 --- a/nest/route.h +++ b/nest/route.h @@ -197,6 +197,8 @@ typedef struct rtable { struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */ + linpool *nhu_lp; /* Linpool used for NHU */ + list subscribers; /* Subscribers for notifications */ struct timer *settle_timer; /* Settle time for notifications */ @@ -874,12 +876,12 @@ void rta_show(struct cli *, const rta *); u32 rt_get_igp_metric(rte *); struct hostentry * rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep); -void rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls); +void rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls, linpool *lp); static inline void -rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr gw, ip_addr ll, mpls_label_stack *mls) +rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr gw, ip_addr ll, mpls_label_stack *mls, linpool *lp) { - rta_apply_hostentry(a, rt_get_hostentry(tab, gw, ll, dep), mls); + rta_apply_hostentry(a, rt_get_hostentry(tab, gw, ll, dep), mls, lp); } /* diff --git a/nest/rt-table.c b/nest/rt-table.c index c67f5bf8..e307449a 100644 --- 
a/nest/rt-table.c +++ b/nest/rt-table.c @@ -49,8 +49,6 @@ pool *rt_table_pool; -static linpool *rte_update_pool; - list routing_tables; /* Data structures for export journal */ @@ -74,9 +72,6 @@ static void rt_feed_channel(void *); static inline void rt_export_used(rtable *tab); static void rt_export_cleanup(void *tab); -static inline void rte_update_lock(void); -static inline void rte_update_unlock(void); - const char *rt_import_state_name_array[TIS_MAX] = { [TIS_DOWN] = "DOWN", [TIS_UP] = "UP", @@ -112,6 +107,19 @@ const char *rt_export_state_name(u8 state) struct event_cork rt_cork; +static inline void +rte_update_lock(struct channel *c) +{ + c->rte_update_nest_cnt++; +} + +static inline void +rte_update_unlock(struct channel *c) +{ + if (!--c->rte_update_nest_cnt) + lp_flush(c->rte_update_pool); +} + /* Like fib_route(), but skips empty net entries */ static inline void * net_route_ip4(rtable *t, net_addr_ip4 *n) @@ -525,7 +533,7 @@ reject_noset: static inline rte * export_filter(struct channel *c, rte *rt, int silent) { - return export_filter_(c, rt, rte_update_pool, silent); + return export_filter_(c, rt, c->rte_update_pool, silent); } void do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte *old); @@ -618,6 +626,8 @@ rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_p { struct channel *c = SKIP_BACK(struct channel, out_req, req); + rte_update_lock(c); + rte nb0, *new_best = NULL; const rte *old_best = NULL; @@ -672,13 +682,12 @@ done: } /* Nothing to export */ - if (!new_best && !old_best) - { + if (new_best || old_best) + do_rt_notify(c, n, new_best, old_best); + else DBG("rt_notify_accepted: nothing to export\n"); - return; - } - do_rt_notify(c, n, new_best, old_best); + rte_update_unlock(c); } @@ -750,6 +759,7 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen { struct channel *c = SKIP_BACK(struct channel, out_req, req); + rte_update_lock(c); // struct proto *p = c->proto; #if 0 /* TODO: Find whether this check is possible when processing multiple changes at once. */ @@ -786,17 +796,19 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen } /* Prepare new merged route */ - rte *new_merged = count ? rt_export_merged(c, feed, count, rte_update_pool, 0) : NULL; + rte *new_merged = count ? rt_export_merged(c, feed, count, c->rte_update_pool, 0) : NULL; if (new_merged || old_best) do_rt_notify(c, n, new_merged, old_best); + + rte_update_unlock(c); } void rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - + rte_update_lock(c); rte *old = RTES_OR_NULL(rpe->old_best); struct rte_storage *new_best = rpe->new_best; @@ -812,13 +824,15 @@ rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_ rte n0, *new = RTES_CLONE(new_best, &n0); rt_notify_basic(c, net, new, old); } + + rte_update_unlock(c); } void rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - + rte_update_lock(c); struct rte_src *src = rpe->new ? 
rpe->new->rte.src : rpe->old->rte.src; rte *old = RTES_OR_NULL(rpe->old); struct rte_storage *new_any = rpe->new; @@ -835,18 +849,23 @@ rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pend rte n0, *new = RTES_CLONE(new_any, &n0); rt_notify_basic(c, net, new, old); } + + rte_update_unlock(c); } void rt_feed_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, out_req, req); + rte_update_lock(c); for (uint i=0; irpe_next); if (!c->rpe_next) break; - - rte_update_unlock(); } rt_send_export_event(c); @@ -1447,21 +1462,6 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr } -static int rte_update_nest_cnt; /* Nesting counter to allow recursive updates */ - -static inline void -rte_update_lock(void) -{ - rte_update_nest_cnt++; -} - -static inline void -rte_update_unlock(void) -{ - if (!--rte_update_nest_cnt) - lp_flush(rte_update_pool); -} - rte * channel_preimport(struct rt_import_request *req, rte *new, rte *old) { @@ -1534,7 +1534,7 @@ rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src const struct filter *filter = c->in_filter; struct channel_import_stats *stats = &c->import_stats; - rte_update_lock(); + rte_update_lock(c); if (new) { new->net = n; @@ -1549,7 +1549,7 @@ rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src new = NULL; } else if ((filter == FILTER_REJECT) || - ((fr = f_run(filter, new, rte_update_pool, 0)) > F_ACCEPT)) + ((fr = f_run(filter, new, c->rte_update_pool, 0)) > F_ACCEPT)) { stats->updates_filtered++; channel_rte_trace_in(D_FILTERS, c, new, "filtered out"); @@ -1565,7 +1565,7 @@ rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src rte_import(&c->in_req, n, new, src); - rte_update_unlock(); + rte_update_unlock(c); } void @@ -1593,25 +1593,6 @@ rte_import(struct rt_import_request *req, const net_addr *n, rte *new, struct rt rte_recalculate(hook, nn, new, src); } -/* Independent call to rte_announce(), used from next hop - recalculation, outside of rte_update(). new must be non-NULL */ -static inline void -rte_announce_i(rtable *tab, net *net, struct rte_storage *new, struct rte_storage *old, - struct rte_storage *new_best, struct rte_storage *old_best) -{ - rte_update_lock(); - rte_announce(tab, net, new, old, new_best, old_best); - rte_update_unlock(); -} - -static inline void -rte_discard(net *net, rte *old) /* Non-filtered route deletion, used during garbage collection */ -{ - rte_update_lock(); - rte_recalculate(old->sender, net, NULL, old->src); - rte_update_unlock(); -} - /* Check rtable for best route to given net whether it would be exported do p */ int rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter) @@ -1626,14 +1607,14 @@ rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filte if (!rte_is_valid(&rt)) return 0; - rte_update_lock(); + rte_update_lock(c); /* Rest is stripped down export_filter() */ int v = c->proto->preexport ? 
c->proto->preexport(c, &rt) : 0; if (v == RIC_PROCESS) - v = (f_run(filter, &rt, rte_update_pool, FF_SILENT) <= F_ACCEPT); + v = (f_run(filter, &rt, c->rte_update_pool, FF_SILENT) <= F_ACCEPT); - rte_update_unlock(); + rte_update_unlock(c); return v > 0; } @@ -2123,6 +2104,8 @@ rt_setup(pool *pp, struct rtable_config *cf) t->rl_pipe = (struct tbf) TBF_DEFAULT_LOG_LIMITS; + t->nhu_lp = lp_new_default(p); + return t; } @@ -2137,7 +2120,6 @@ rt_init(void) { rta_init(); rt_table_pool = rp_new(&root_pool, "Routing tables"); - rte_update_pool = lp_new_default(rt_table_pool); init_list(&routing_tables); ev_init_cork(&rt_cork, "Route Table Cork"); } @@ -2211,7 +2193,7 @@ again: return; } - rte_discard(n, &e->rte); + rte_recalculate(e->rte.sender, n, NULL, e->rte.src); limit--; goto rescan; @@ -2462,7 +2444,7 @@ rta_next_hop_outdated(rta *a) } void -rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls) +rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls, linpool *lp) { a->hostentry = he; a->dest = he->dest; @@ -2497,7 +2479,7 @@ no_nexthop: else { nhr = nhp; - nhp = (nhp ? (nhp->next = lp_alloc(rte_update_pool, NEXTHOP_MAX_SIZE)) : &(a->nh)); + nhp = (nhp ? (nhp->next = lp_alloc(lp, NEXTHOP_MAX_SIZE)) : &(a->nh)); } memset(nhp, 0, NEXTHOP_MAX_SIZE); @@ -2561,7 +2543,7 @@ rt_next_hop_update_rte(rtable *tab, net *n, rte *old) mpls_label_stack mls = { .len = a->nh.labels_orig }; memcpy(mls.stack, &a->nh.label[a->nh.labels - mls.len], mls.len * sizeof(u32)); - rta_apply_hostentry(a, old->attrs->hostentry, &mls); + rta_apply_hostentry(a, old->attrs->hostentry, &mls, tab->nhu_lp); a->cached = 0; rte e0 = *old; @@ -2615,6 +2597,8 @@ rt_next_hop_update_net(rtable *tab, net *n) new->rte.lastmod = current_time(); new->rte.id = hmap_first_zero(&tab->id_map); hmap_set(&tab->id_map, new->rte.id); + + lp_flush(tab->nhu_lp); } ASSERT_DIE(pos == count); @@ -2645,7 +2629,7 @@ rt_next_hop_update_net(rtable *tab, net *n) { "autoupdated [+best]", "autoupdated [best]" } }; rt_rte_trace_in(D_ROUTES, updates[i].new->rte.sender->req, &updates[i].new->rte, best_indicator[nb][ob]); - rte_announce_i(tab, n, updates[i].new, updates[i].old, new, old_best); + rte_announce(tab, n, updates[i].new, updates[i].old, new, old_best); } return count; @@ -2869,21 +2853,17 @@ rt_feed_channel(void *data) uint count = rte_feed_count(n); if (count) { - rte_update_lock(); rte **feed = alloca(count * sizeof(rte *)); rte_feed_obtain(n, feed, count); c->req->export_bulk(c->req, n->n.addr, NULL, feed, count); max_feed -= count; - rte_update_unlock(); } } else if (n->routes && rte_is_valid(&n->routes->rte)) { - rte_update_lock(); struct rt_pending_export rpe = { .new = n->routes, .new_best = n->routes }; c->req->export_one(c->req, n->n.addr, &rpe); max_feed--; - rte_update_unlock(); } for (struct rt_pending_export *rpe = n->first; rpe; rpe = rpe_next(rpe, NULL)) diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index d2d5b174..ea9adb4c 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -971,7 +971,7 @@ bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll) s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table); if (!s->mpls) - rta_apply_hostentry(a, s->hostentry, NULL); + rta_apply_hostentry(a, s->hostentry, NULL, s->pool); /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */ } @@ -1005,7 +1005,7 @@ bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum) ms.len = lnum; memcpy(ms.stack, labels, 4*lnum); - 
rta_apply_hostentry(a, s->hostentry, &ms); + rta_apply_hostentry(a, s->hostentry, &ms, s->pool); } } diff --git a/proto/static/static.c b/proto/static/static.c index b0c9d640..45791e8e 100644 --- a/proto/static/static.c +++ b/proto/static/static.c @@ -98,7 +98,7 @@ static_announce_rte(struct static_proto *p, struct static_route *r) if (r->dest == RTDX_RECURSIVE) { rtable *tab = ipa_is_ip4(r->via) ? p->igp_table_ip4 : p->igp_table_ip6; - rta_set_recursive_next_hop(p->p.main_channel->table, a, tab, r->via, IPA_NONE, r->mls); + rta_set_recursive_next_hop(p->p.main_channel->table, a, tab, r->via, IPA_NONE, r->mls, static_lp); } /* Already announced */ From c7d0c5b2523a8cbfcaee9a235955dd5e58fab671 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Wed, 27 Oct 2021 12:42:05 +0000 Subject: [PATCH 3/8] Route subscription uses events --- nest/proto.c | 17 +++++++++-------- nest/route.h | 3 +-- nest/rt-table.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/nest/proto.c b/nest/proto.c index f55f347a..35af3c6c 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -319,9 +319,9 @@ proto_remove_channels(struct proto *p) } static void -channel_roa_in_changed(struct rt_subscription *s) +channel_roa_in_changed(void *_data) { - struct channel *c = s->data; + struct channel *c = _data; CD(c, "Reload triggered by RPKI change"); @@ -329,9 +329,9 @@ channel_roa_in_changed(struct rt_subscription *s) } static void -channel_roa_out_changed(struct rt_subscription *s) +channel_roa_out_changed(void *_data) { - struct channel *c = s->data; + struct channel *c = _data; CD(c, "Feeding triggered by RPKI change"); c->refeed_pending = 1; @@ -349,14 +349,14 @@ struct roa_subscription { static int channel_roa_is_subscribed(struct channel *c, rtable *tab, int dir) { - void (*hook)(struct rt_subscription *) = + void (*hook)(void *) = dir ? channel_roa_in_changed : channel_roa_out_changed; struct roa_subscription *s; node *n; WALK_LIST2(s, n, c->roa_subscriptions, roa_node) - if ((s->s.tab == tab) && (s->s.hook == hook)) + if ((s->s.tab == tab) && (s->s.event->hook == hook)) return 1; return 0; @@ -370,9 +370,9 @@ channel_roa_subscribe(struct channel *c, rtable *tab, int dir) return; struct roa_subscription *s = mb_allocz(c->proto->pool, sizeof(struct roa_subscription)); + s->s.event = ev_new_init(c->proto->pool, dir ? channel_roa_in_changed : channel_roa_out_changed, c); + s->s.event->list = proto_work_list(c->proto); - s->s.hook = dir ? 
channel_roa_in_changed : channel_roa_out_changed; - s->s.data = c; rt_subscribe(tab, &s->s); add_tail(&c->roa_subscriptions, &s->roa_node); @@ -383,6 +383,7 @@ channel_roa_unsubscribe(struct roa_subscription *s) { rt_unsubscribe(&s->s); rem_node(&s->roa_node); + rfree(s->s.event); mb_free(s); } diff --git a/nest/route.h b/nest/route.h index e2dc9986..683c966e 100644 --- a/nest/route.h +++ b/nest/route.h @@ -213,8 +213,7 @@ typedef struct rtable { struct rt_subscription { node n; rtable *tab; - void (*hook)(struct rt_subscription *b); - void *data; + event *event; }; #define NHU_CLEAN 0 diff --git a/nest/rt-table.c b/nest/rt-table.c index e307449a..fb0496bd 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -1979,7 +1979,7 @@ rt_settle_timer(timer *t) struct rt_subscription *s; WALK_LIST(s, tab->subscribers) - s->hook(s); + ev_send(s->event->list, s->event); } static void From c73b5d2d3d94204d2a81d93efd02c4c115859353 Mon Sep 17 00:00:00 2001 From: Eugene Bogomazov Date: Mon, 11 Jul 2022 17:19:34 +0200 Subject: [PATCH 4/8] BGP: Implement BGP roles Implement BGP roles as described in RFC 9234. It is a mechanism for route leak prevention and automatic route filtering based on common BGP topology relationships. It defines role capability (controlled by 'local role' option) and OTC route attribute, which is used for automatic route filtering and leak detection. Minor changes done by commiter. --- doc/bird.sgml | 28 ++++++++++++++++++ proto/bgp/attrs.c | 69 +++++++++++++++++++++++++++++++++++++++++++-- proto/bgp/bgp.c | 19 +++++++++++++ proto/bgp/bgp.h | 17 +++++++++++ proto/bgp/config.Y | 16 +++++++++-- proto/bgp/packets.c | 46 +++++++++++++++++++++++++++++- 6 files changed, 189 insertions(+), 6 deletions(-) diff --git a/doc/bird.sgml b/doc/bird.sgml index 89b1541c..c1ce1b91 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -2377,6 +2377,7 @@ avoid routing loops. - BGP Administrative Shutdown Communication - Default EBGP Route Propagation Behavior without Policies - Revised Validation Procedure for BGP Flow Specifications + - Route Leak Prevention and Detection Using Roles Route selection rules @@ -2817,6 +2818,28 @@ using the following configuration parameters: protocol itself (for example, if a route is received through eBGP and therefore does not have such attribute). Default: 100 (0 in pre-1.2.0 versions of BIRD). + + + This attribute is defined in . OTC is a flag that marks + routes that should be sent only to customers. If is configured it set automatically. 
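The attrs.c hunks that follow implement the ingress and egress sides of this attribute. Reduced to plain decision functions, the rules look roughly as sketched below; the names are invented for illustration, and a zero OTC value stands for "attribute not present", which the real code tracks through the attribute list instead.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative sketch of the RFC 9234 OTC rules, not part of the patch. */

enum role { ROLE_PROVIDER, ROLE_RS_SERVER, ROLE_RS_CLIENT, ROLE_CUSTOMER, ROLE_PEER };

/* Ingress: returns false when the route is a leak and must be rejected;
 * sets *otc when the attribute has to be added on reception. */
static bool
otc_ingress(enum role local, uint32_t neighbor_as, uint32_t *otc)
{
  if (*otc && (local == ROLE_PROVIDER || local == ROLE_RS_SERVER))
    return false;                       /* OTC received from a downstream */

  if (*otc && (local == ROLE_PEER) && (*otc != neighbor_as))
    return false;                       /* OTC carries a different ASN than the peer's */

  if (!*otc && (local == ROLE_CUSTOMER || local == ROLE_PEER || local == ROLE_RS_CLIENT))
    *otc = neighbor_as;                 /* mark routes learned from upstream or peer */

  return true;
}

/* Egress: OTC-marked routes may not propagate towards providers, peers or
 * route servers; towards customers, a missing OTC is filled with the local ASN. */
static bool
otc_egress(enum role local, uint32_t local_as, uint32_t *otc)
{
  if (*otc && (local == ROLE_CUSTOMER || local == ROLE_PEER || local == ROLE_RS_CLIENT))
    return false;

  if (!*otc && (local == ROLE_PROVIDER || local == ROLE_PEER || local == ROLE_RS_SERVER))
    *otc = local_as;

  return true;
}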
Example diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 9dd9fb1a..0d2116b7 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -896,6 +896,18 @@ bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint fla bgp_set_attr_ptr(to, s->pool, BA_LARGE_COMMUNITY, flags, ad); } + +static void +bgp_decode_otc(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to) +{ + if (len != 4) + WITHDRAW(BAD_LENGTH, "OTC", len); + + u32 val = get_u32(data); + bgp_set_attr_u32(to, s->pool, BA_ONLY_TO_CUSTOMER, flags, val); +} + + static void bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a) { @@ -1109,6 +1121,13 @@ static const struct bgp_attr_desc bgp_attr_table[] = { .encode = bgp_encode_u32s, .decode = bgp_decode_large_community, }, + [BA_ONLY_TO_CUSTOMER] = { + .name = "otc", + .type = EAF_TYPE_INT, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_u32, + .decode = bgp_decode_otc, + }, [BA_MPLS_LABEL_STACK] = { .name = "mpls_label_stack", .type = EAF_TYPE_INT_SET, @@ -1445,6 +1464,29 @@ bgp_finish_attrs(struct bgp_parse_state *s, rta *a) REPORT("Discarding AIGP attribute received on non-AIGP session"); bgp_unset_attr(&a->eattrs, s->pool, BA_AIGP); } + + /* Handle OTC ingress procedure, RFC 9234 */ + if (bgp_channel_is_role_applicable(s->channel)) + { + struct bgp_proto *p = s->proto; + eattr *e = bgp_find_attr(a->eattrs, BA_ONLY_TO_CUSTOMER); + + /* Reject routes from downstream if they are leaked */ + if (e && (p->cf->local_role == BGP_ROLE_PROVIDER || + p->cf->local_role == BGP_ROLE_RS_SERVER)) + WITHDRAW("Route leak detected - OTC attribute from downstream"); + + /* Reject routes from peers if they are leaked */ + if (e && (p->cf->local_role == BGP_ROLE_PEER) && (e->u.data != p->cf->remote_as)) + WITHDRAW("Route leak detected - OTC attribute with mismatched ASN (%u)", + (uint) e->u.data); + + /* Mark routes from upstream if it did not happened before */ + if (!e && (p->cf->local_role == BGP_ROLE_CUSTOMER || + p->cf->local_role == BGP_ROLE_PEER || + p->cf->local_role == BGP_ROLE_RS_CLIENT)) + bgp_set_attr_u32(&a->eattrs, s->pool, BA_ONLY_TO_CUSTOMER, 0, p->cf->remote_as); + } } @@ -1673,6 +1715,7 @@ bgp_preexport(struct channel *C, rte **new, struct linpool *pool UNUSED) { rte *e = *new; struct proto *SRC = e->attrs->src->proto; + struct bgp_channel *c = (struct bgp_channel *) C; struct bgp_proto *p = (struct bgp_proto *) C->proto; struct bgp_proto *src = (SRC->proto == &proto_bgp) ? 
(struct bgp_proto *) SRC : NULL; @@ -1702,11 +1745,11 @@ bgp_preexport(struct channel *C, rte **new, struct linpool *pool UNUSED) } /* Handle well-known communities, RFC 1997 */ - struct eattr *c; + struct eattr *a; if (p->cf->interpret_communities && - (c = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)))) + (a = bgp_find_attr(e->attrs->eattrs, BA_COMMUNITY))) { - const struct adata *d = c->u.ptr; + const struct adata *d = a->u.ptr; /* Do not export anywhere */ if (int_set_contains(d, BGP_COMM_NO_ADVERTISE)) @@ -1725,6 +1768,16 @@ bgp_preexport(struct channel *C, rte **new, struct linpool *pool UNUSED) return -1; } + /* Do not export routes marked with OTC to upstream, RFC 9234 */ + if (bgp_channel_is_role_applicable(c)) + { + a = bgp_find_attr(e->attrs->eattrs, BA_ONLY_TO_CUSTOMER); + if (a && (p->cf->local_role==BGP_ROLE_CUSTOMER || + p->cf->local_role==BGP_ROLE_PEER || + p->cf->local_role==BGP_ROLE_RS_CLIENT)) + return -1; + } + return 0; } @@ -1834,6 +1887,16 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at } } + /* Mark routes for downstream with OTC, RFC 9234 */ + if (bgp_channel_is_role_applicable(c)) + { + a = bgp_find_attr(attrs, BA_ONLY_TO_CUSTOMER); + if (!a && (p->cf->local_role == BGP_ROLE_PROVIDER || + p->cf->local_role == BGP_ROLE_PEER || + p->cf->local_role == BGP_ROLE_RS_SERVER)) + bgp_set_attr_u32(&attrs, pool, BA_ONLY_TO_CUSTOMER, 0, p->public_as); + } + /* * Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above * conditions. Presence and validity of quasi-mandatory NEXT_HOP attribute diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 89507f1a..3b28a338 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -102,6 +102,7 @@ * RFC 8212 - Default EBGP Route Propagation Behavior without Policies * RFC 8654 - Extended Message Support for BGP * RFC 9117 - Revised Validation Procedure for BGP Flow Specifications + * RFC 9234 - Route Leak Prevention and Detection Using Roles * draft-ietf-idr-ext-opt-param-07 * draft-uttaro-idr-bgp-persistence-04 * draft-walton-bgp-hostname-capability-02 @@ -1983,6 +1984,12 @@ bgp_postconfig(struct proto_config *CF) if (internal && cf->rs_client) cf_error("Only external neighbor can be RS client"); + if (internal && (cf->local_role != BGP_ROLE_UNDEFINED)) + cf_error("Local role cannot be set on IBGP sessions"); + + if (cf->require_roles && (cf->local_role == BGP_ROLE_UNDEFINED)) + cf_error("Local role must be set if roles are required"); + if (!cf->confederation && cf->confederation_member) cf_error("Confederation ID must be set for member sessions"); @@ -2345,6 +2352,15 @@ bgp_show_afis(int code, char *s, u32 *afis, uint count) cli_msg(code, b.start); } +static const char * +bgp_format_role_name(u8 role) +{ + static const char *bgp_role_names[] = { "provider", "rs_server", "rs_client", "customer", "peer" }; + if (role == BGP_ROLE_UNDEFINED) return "undefined"; + if (role < 5) return bgp_role_names[role]; + return "?"; +} + static void bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) { @@ -2473,6 +2489,9 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) if (caps->hostname) cli_msg(-1006, " Hostname: %s", caps->hostname); + + if (caps->role != BGP_ROLE_UNDEFINED) + cli_msg(-1006, " Role: %s", bgp_format_role_name(caps->role)); } static void diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 7cd1c27d..fea87304 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -114,6 +114,8 @@ struct bgp_config { int gr_mode; /* Graceful 
restart mode (BGP_GR_*) */ int llgr_mode; /* Long-lived graceful restart mode (BGP_LLGR_*) */ int setkey; /* Set MD5 password to system SA/SP database */ + u8 local_role; /* Set peering role with neighbor [RFC 9234] */ + int require_roles; /* Require configured roles on both sides */ /* Times below are in seconds */ unsigned gr_time; /* Graceful restart timeout */ unsigned llgr_time; /* Long-lived graceful restart stale time */ @@ -167,6 +169,13 @@ struct bgp_channel_config { #define BGP_PT_INTERNAL 1 #define BGP_PT_EXTERNAL 2 +#define BGP_ROLE_UNDEFINED 255 +#define BGP_ROLE_PROVIDER 0 +#define BGP_ROLE_RS_SERVER 1 +#define BGP_ROLE_RS_CLIENT 2 +#define BGP_ROLE_CUSTOMER 3 +#define BGP_ROLE_PEER 4 + #define NH_NO 0 #define NH_ALL 1 #define NH_IBGP 2 @@ -223,6 +232,7 @@ struct bgp_caps { u8 ext_messages; /* Extended message length, RFC draft */ u8 route_refresh; /* Route refresh capability, RFC 2918 */ u8 enhanced_refresh; /* Enhanced route refresh, RFC 7313 */ + u8 role; /* BGP role capability, RFC 9234 */ u8 gr_aware; /* Graceful restart capability, RFC 4724 */ u8 gr_flags; /* Graceful restart flags */ @@ -485,6 +495,12 @@ static inline int bgp_cc_is_ipv4(struct bgp_channel_config *c) static inline int bgp_cc_is_ipv6(struct bgp_channel_config *c) { return BGP_AFI(c->afi) == BGP_AFI_IPV6; } +static inline int bgp_channel_is_role_applicable(struct bgp_channel *c) +{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); } + +static inline int bgp_cc_is_role_applicable(struct bgp_channel_config *c) +{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); } + static inline uint bgp_max_packet_length(struct bgp_conn *conn) { return conn->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; } @@ -660,6 +676,7 @@ void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to); #define BA_AS4_AGGREGATOR 0x12 /* RFC 6793 */ #define BA_AIGP 0x1a /* RFC 7311 */ #define BA_LARGE_COMMUNITY 0x20 /* RFC 8092 */ +#define BA_ONLY_TO_CUSTOMER 0x23 /* RFC 9234 */ /* Bird's private internal BGP attributes */ #define BA_MPLS_LABEL_STACK 0xfe /* MPLS label stack transfer attribute */ diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 241aa7c2..a00aa156 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -31,7 +31,8 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE, STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG, LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS, DYNAMIC, RANGE, NAME, DIGITS, BGP_AIGP, AIGP, ORIGINATE, COST, ENFORCE, - FIRST, FREE, VALIDATE, BASE) + FIRST, FREE, VALIDATE, BASE, ROLE, ROLES, PEER, PROVIDER, CUSTOMER, + RS_SERVER, RS_CLIENT, REQUIRE) %type bgp_nh %type bgp_afi @@ -40,7 +41,7 @@ CF_KEYWORDS(CEASE, PREFIX, LIMIT, HIT, ADMINISTRATIVE, SHUTDOWN, RESET, PEER, CONFIGURATION, CHANGE, DECONFIGURED, CONNECTION, REJECTED, COLLISION, OUT, OF, RESOURCES) -%type bgp_cease_mask bgp_cease_list bgp_cease_flag +%type bgp_cease_mask bgp_cease_list bgp_cease_flag bgp_role_name CF_GRAMMAR @@ -72,6 +73,7 @@ bgp_proto_start: proto_start BGP { BGP_CFG->llgr_mode = -1; BGP_CFG->llgr_time = 3600; BGP_CFG->setkey = 1; + BGP_CFG->local_role = BGP_ROLE_UNDEFINED; BGP_CFG->dynamic_name = "dynbgp"; BGP_CFG->check_link = -1; } @@ -114,6 +116,14 @@ bgp_cease_flag: | OUT OF RESOURCES { $$ = 1 << 8; } ; +bgp_role_name: + PEER { $$ = BGP_ROLE_PEER; } + | PROVIDER { $$ = BGP_ROLE_PROVIDER; } + | CUSTOMER { $$ = BGP_ROLE_CUSTOMER; } + | RS_SERVER { $$ = BGP_ROLE_RS_SERVER; } + | RS_CLIENT { $$ = 
BGP_ROLE_RS_CLIENT; } + ; + bgp_proto: bgp_proto_start proto_name '{' | bgp_proto proto_item ';' @@ -197,6 +207,8 @@ bgp_proto: | bgp_proto BFD GRACEFUL ';' { init_bfd_opts(&BGP_CFG->bfd); BGP_CFG->bfd->mode = BGP_BFD_GRACEFUL; } | bgp_proto BFD { open_bfd_opts(&BGP_CFG->bfd); } bfd_opts { close_bfd_opts(); } ';' | bgp_proto ENFORCE FIRST AS bool ';' { BGP_CFG->enforce_first_as = $5; } + | bgp_proto LOCAL ROLE bgp_role_name ';' { BGP_CFG->local_role = $4; } + | bgp_proto REQUIRE ROLES bool ';' { BGP_CFG->require_roles = $4; } ; bgp_afi: diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index f13625e2..54423a1a 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -238,6 +238,7 @@ bgp_prepare_capabilities(struct bgp_conn *conn) caps->ext_messages = p->cf->enable_extended_messages; caps->route_refresh = p->cf->enable_refresh; caps->enhanced_refresh = p->cf->enable_refresh; + caps->role = p->cf->local_role; if (caps->as4_support) caps->as4_number = p->public_as; @@ -350,6 +351,13 @@ bgp_write_capabilities(struct bgp_conn *conn, byte *buf) *buf++ = 0; /* Capability data length */ } + if (caps->role != BGP_ROLE_UNDEFINED) + { + *buf++ = 9; /* Capability 9: Announce chosen BGP role */ + *buf++ = 1; /* Capability data length */ + *buf++ = caps->role; + } + if (caps->gr_aware) { *buf++ = 64; /* Capability 64: Support for graceful restart */ @@ -449,11 +457,15 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) struct bgp_proto *p = conn->bgp; struct bgp_caps *caps; struct bgp_af_caps *ac; + uint err_subcode = 0; int i, cl; u32 af; if (!conn->remote_caps) + { caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + sizeof(struct bgp_af_caps)); + caps->role = BGP_ROLE_UNDEFINED; + } else { caps = conn->remote_caps; @@ -513,6 +525,21 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) caps->ext_messages = 1; break; + case 9: /* BGP role capability, RFC 9234 */ + if (cl != 1) + goto err; + + /* Reserved value */ + if (pos[2] == BGP_ROLE_UNDEFINED) + { err_subcode = 11; goto err; } + + /* Multiple inconsistent values */ + if ((caps->role != BGP_ROLE_UNDEFINED) && (caps->role != pos[2])) + { err_subcode = 11; goto err; } + + caps->role = pos[2]; + break; + case 64: /* Graceful restart capability, RFC 4724 */ if (cl % 4 != 2) goto err; @@ -638,7 +665,7 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) err: mb_free(caps); - bgp_error(conn, 2, 0, NULL, 0); + bgp_error(conn, 2, err_subcode, NULL, 0); return -1; } @@ -854,6 +881,22 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) conn->received_as = asn; } + /* RFC 9234 4.2 - check role agreement */ + u8 local_role = p->cf->local_role; + u8 neigh_role = caps->role; + + if ((local_role != BGP_ROLE_UNDEFINED) && + (neigh_role != BGP_ROLE_UNDEFINED) && + !((local_role == BGP_ROLE_PEER && neigh_role == BGP_ROLE_PEER) || + (local_role == BGP_ROLE_CUSTOMER && neigh_role == BGP_ROLE_PROVIDER) || + (local_role == BGP_ROLE_PROVIDER && neigh_role == BGP_ROLE_CUSTOMER) || + (local_role == BGP_ROLE_RS_CLIENT && neigh_role == BGP_ROLE_RS_SERVER) || + (local_role == BGP_ROLE_RS_SERVER && neigh_role == BGP_ROLE_RS_CLIENT))) + { bgp_error(conn, 2, 11, NULL, 0); return; } + + if ((p->cf->require_roles) && (neigh_role == BGP_ROLE_UNDEFINED)) + { bgp_error(conn, 2, 11, NULL, 0); return; } + /* Check the other connection */ other = (conn == &p->outgoing_conn) ? 
&p->incoming_conn : &p->outgoing_conn; switch (other->state) @@ -2984,6 +3027,7 @@ static struct { { 2, 6, "Unacceptable hold time" }, { 2, 7, "Required capability missing" }, /* [RFC5492] */ { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */ + { 2,11, "Role mismatch" }, /* From Open Policy, RFC 9234 */ { 3, 0, "Invalid UPDATE message" }, { 3, 1, "Malformed attribute list" }, { 3, 2, "Unrecognized well-known attribute" }, From 971721c9b50d361e886762f1c7d0392e10f74021 Mon Sep 17 00:00:00 2001 From: Ondrej Zajicek Date: Tue, 12 Jul 2022 15:03:17 +0200 Subject: [PATCH 5/8] BGP: Minor improvements to BGP roles Add support for bgp_otc in filters and warning for configuration inside confederations. --- doc/bird.sgml | 3 ++- proto/bgp/bgp.c | 5 ++++- proto/bgp/config.Y | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/bird.sgml b/doc/bird.sgml index c1ce1b91..a0d9c405 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -2829,7 +2829,8 @@ using the following configuration parameters: leaks created by third parties. This option is valid for EBGP sessions, but it is not recommended to be - used within AS confederations. + used within AS confederations (which would require manual filtering of + values are: local_role != BGP_ROLE_UNDEFINED)) cf_error("Local role cannot be set on IBGP sessions"); + if (interior && (cf->local_role != BGP_ROLE_UNDEFINED)) + log(L_WARN "BGP roles are not recommended to be used within AS confederations"); + if (cf->require_roles && (cf->local_role == BGP_ROLE_UNDEFINED)) cf_error("Local role must be set if roles are required"); @@ -2357,7 +2360,7 @@ bgp_format_role_name(u8 role) { static const char *bgp_role_names[] = { "provider", "rs_server", "rs_client", "customer", "peer" }; if (role == BGP_ROLE_UNDEFINED) return "undefined"; - if (role < 5) return bgp_role_names[role]; + if (role < ARRAY_SIZE(bgp_role_names)) return bgp_role_names[role]; return "?"; } diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index a00aa156..cb410a5e 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -32,7 +32,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE, LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS, DYNAMIC, RANGE, NAME, DIGITS, BGP_AIGP, AIGP, ORIGINATE, COST, ENFORCE, FIRST, FREE, VALIDATE, BASE, ROLE, ROLES, PEER, PROVIDER, CUSTOMER, - RS_SERVER, RS_CLIENT, REQUIRE) + RS_SERVER, RS_CLIENT, REQUIRE, BGP_OTC) %type bgp_nh %type bgp_afi @@ -355,6 +355,8 @@ dynamic_attr: BGP_AIGP { $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_AIGP)); } ; dynamic_attr: BGP_LARGE_COMMUNITY { $$ = f_new_dynamic_attr(EAF_TYPE_LC_SET, T_LCLIST, EA_CODE(PROTOCOL_BGP, BA_LARGE_COMMUNITY)); } ; +dynamic_attr: BGP_OTC + { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_CODE(PROTOCOL_BGP, BA_ONLY_TO_CUSTOMER)); } ; From 4d48ede51dfff9a59572a6b7a21a1bbf159dec60 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Fri, 22 Jul 2022 15:37:21 +0200 Subject: [PATCH 6/8] Revert "Export table: Delay freeing of old stored route." This reverts commit cee0cd148c9b71bf47d007c850193b5fbf9486c1. This change is not needed in version 2 and the surrounding code has disappeared mostly in version 3. 
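For reference alongside the role patches above: the session-level check added to bgp_rx_open() only admits complementary role pairs. Condensed into a standalone predicate with invented names, purely as an illustration, it reads:

#include <stdbool.h>

/* Illustrative sketch of the RFC 9234 section 4.2 role agreement check. */

enum role {
  ROLE_PROVIDER, ROLE_RS_SERVER, ROLE_RS_CLIENT, ROLE_CUSTOMER, ROLE_PEER,
  ROLE_UNDEFINED
};

/* A session comes up only when the two roles complement each other; if either
 * side left its role unset, the check is skipped unless strict mode ("require
 * roles") demands a configured role from the neighbor. */
static bool
roles_agree(enum role local, enum role remote)
{
  if (local == ROLE_UNDEFINED || remote == ROLE_UNDEFINED)
    return true;

  return (local == ROLE_PEER      && remote == ROLE_PEER)
      || (local == ROLE_CUSTOMER  && remote == ROLE_PROVIDER)
      || (local == ROLE_PROVIDER  && remote == ROLE_CUSTOMER)
      || (local == ROLE_RS_CLIENT && remote == ROLE_RS_SERVER)
      || (local == ROLE_RS_SERVER && remote == ROLE_RS_CLIENT);
}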
--- nest/route.h | 2 +- nest/rt-table.c | 63 +++++++++++++++++++++---------------------------- 2 files changed, 28 insertions(+), 37 deletions(-) diff --git a/nest/route.h b/nest/route.h index a4f19fc4..7aec7117 100644 --- a/nest/route.h +++ b/nest/route.h @@ -339,7 +339,7 @@ int rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src int rt_reload_channel(struct channel *c); void rt_reload_channel_abort(struct channel *c); void rt_prune_sync(rtable *t, int all); -int rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old, rte **old_exported, int refeed); +int rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed); struct rtable_config *rt_new_table(struct symbol *s, uint addr_type); static inline int rt_is_ip(rtable *tab) diff --git a/nest/rt-table.c b/nest/rt-table.c index 4500c888..abb29fe1 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -620,29 +620,6 @@ rte_cow_rta(rte *r, linpool *lp) return r; } -/** - * rte_free - delete a &rte - * @e: &rte to be deleted - * - * rte_free() deletes the given &rte from the routing table it's linked to. - */ -void -rte_free(rte *e) -{ - rt_unlock_source(e->src); - if (rta_is_cached(e->attrs)) - rta_free(e->attrs); - sl_free(e); -} - -static inline void -rte_free_quick(rte *e) -{ - rt_unlock_source(e->src); - rta_free(e->attrs); - sl_free(e); -} - static int /* Actually better or at least as good as */ rte_better(rte *new, rte *old) { @@ -799,14 +776,8 @@ do_rt_notify(struct channel *c, net *net, rte *new, rte *old, int refeed) } /* Apply export table */ - struct rte *old_exported = NULL; - if (c->out_table) - { - if (!rte_update_out(c, net->n.addr, new, old, &old_exported, refeed)) - return; - } - else if (c->out_filter == FILTER_ACCEPT) - old_exported = old; + if (c->out_table && !rte_update_out(c, net->n.addr, new, old, refeed)) + return; if (new) stats->exp_updates_accepted++; @@ -836,9 +807,6 @@ do_rt_notify(struct channel *c, net *net, rte *new, rte *old, int refeed) } p->rt_notify(p, c, net, new, old); - - if (c->out_table && old_exported) - rte_free_quick(old_exported); } static void @@ -1198,6 +1166,29 @@ rte_validate(rte *e) return 1; } +/** + * rte_free - delete a &rte + * @e: &rte to be deleted + * + * rte_free() deletes the given &rte from the routing table it's linked to. 
+ */ +void +rte_free(rte *e) +{ + rt_unlock_source(e->src); + if (rta_is_cached(e->attrs)) + rta_free(e->attrs); + sl_free(e); +} + +static inline void +rte_free_quick(rte *e) +{ + rt_unlock_source(e->src); + rta_free(e->attrs); + sl_free(e); +} + static int rte_same(rte *x, rte *y) { @@ -3258,7 +3249,7 @@ again: */ int -rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, rte **old_exported, int refeed) +rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed) { struct rtable *tab = c->out_table; struct rte_src *src; @@ -3302,7 +3293,7 @@ rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, rte ** /* Remove the old rte */ *pos = old->next; - *old_exported = old; + rte_free_quick(old); tab->rt_count--; break; From 534d0a4b44aa193da785ae180475a448f57805e2 Mon Sep 17 00:00:00 2001 From: Ondrej Zajicek Date: Sun, 24 Jul 2022 02:15:20 +0200 Subject: [PATCH 7/8] KRT: Scan routing tables separetely on linux to avoid congestion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove compile-time sysdep option CONFIG_ALL_TABLES_AT_ONCE, replace it with runtime ability to run either separate table scans or shared scan. On Linux, use separate table scans by default when the netlink socket option NETLINK_GET_STRICT_CHK is available, but retreat to shared scan when it fails. Running separate table scans has advantages where some routing tables are managed independently, e.g. when multiple routing daemons are running on the same machine, as kernel routing table modification performance is significantly reduced when the table is modified while it is being scanned. Thanks Daniel Gröber for the original patch and Toke Høiland-Jørgensen for suggestions. --- sysdep/cf/README | 1 - sysdep/cf/linux.h | 1 - sysdep/linux/netlink.c | 49 +++++++++++++++++++++------- sysdep/unix/krt.c | 72 +++++++++++++++++++++++------------------- sysdep/unix/krt.h | 4 +-- 5 files changed, 78 insertions(+), 49 deletions(-) diff --git a/sysdep/cf/README b/sysdep/cf/README index 9a7a4afa..68078bbe 100644 --- a/sysdep/cf/README +++ b/sysdep/cf/README @@ -4,7 +4,6 @@ Available configuration variables: CONFIG_AUTO_ROUTES Device routes are added automagically by the kernel CONFIG_SELF_CONSCIOUS We're able to recognize whether route was installed by us CONFIG_MULTIPLE_TABLES The kernel supports multiple routing tables -CONFIG_ALL_TABLES_AT_ONCE Kernel scanner wants to process all tables at once CONFIG_SINGLE_ROUTE There is only one route per network CONFIG_MC_PROPER_SRC Multicast packets have source address according to socket saddr field diff --git a/sysdep/cf/linux.h b/sysdep/cf/linux.h index 047d3764..c640bef4 100644 --- a/sysdep/cf/linux.h +++ b/sysdep/cf/linux.h @@ -9,7 +9,6 @@ #define CONFIG_AUTO_ROUTES #define CONFIG_SELF_CONSCIOUS #define CONFIG_MULTIPLE_TABLES -#define CONFIG_ALL_TABLES_AT_ONCE #define CONFIG_IP6_SADR_KERNEL #define CONFIG_MC_PROPER_SRC diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 29b744cb..bc65e0d2 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -161,16 +161,13 @@ nl_open_sock(struct nl_sock *nl) } } -static void +static int nl_set_strict_dump(struct nl_sock *nl UNUSED, int strict UNUSED) { - /* - * Strict checking is not necessary, it improves behavior on newer kernels. - * If it is not available (missing SOL_NETLINK compile-time, or ENOPROTOOPT - * run-time), we can just ignore it. 
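The netlink hunks that follow turn this silently-ignored option into an explicit capability probe: strict checking is what makes filtered, per-table RTM_GETROUTE dumps reliable, so when it cannot be enabled the kernel syncer falls back to one shared scan of all tables. A standalone sketch of that probe, with invented names (use_shared_scan, enable_strict_or_fall_back) and trimmed error handling:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>

/* Illustrative sketch only, not part of the patch. */

static int use_shared_scan;             /* set when per-table dumps are unavailable */

static void
enable_strict_or_fall_back(int nl_fd)
{
  int strict = 1;

#if defined(SOL_NETLINK) && defined(NETLINK_GET_STRICT_CHK)
  if (setsockopt(nl_fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK,
                 &strict, sizeof(strict)) == 0)
    return;                             /* kernel accepts filtered per-table dumps */
#endif

  fprintf(stderr, "netlink strict checking unavailable, scanning all tables at once\n");
  use_shared_scan = 1;
}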
- */ #ifdef SOL_NETLINK - setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); + return setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +#else + return -1; #endif } @@ -198,10 +195,17 @@ nl_cfg_rx_buffer_size(struct config *cfg) static void nl_open(void) { + if ((nl_scan.fd >= 0) && (nl_req.fd >= 0)) + return; + nl_open_sock(&nl_scan); nl_open_sock(&nl_req); - nl_set_strict_dump(&nl_scan, 1); + if (nl_set_strict_dump(&nl_scan, 1) < 0) + { + log(L_WARN "KRT: Netlink strict checking failed, will scan all tables at once"); + krt_use_shared_scan(); + } } static void @@ -256,11 +260,13 @@ nl_request_dump_addr(int af) } static void -nl_request_dump_route(int af) +nl_request_dump_route(int af, int table_id) { struct { struct nlmsghdr nh; struct rtmsg rtm; + struct rtattr rta; + u32 table_id; } req = { .nh.nlmsg_type = RTM_GETROUTE, .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), @@ -269,7 +275,17 @@ nl_request_dump_route(int af) .rtm.rtm_family = af, }; - send(nl_scan.fd, &req, sizeof(req), 0); + if (table_id < 256) + req.rtm.rtm_table = table_id; + else + { + req.rta.rta_type = RTA_TABLE; + req.rta.rta_len = RTA_LENGTH(4); + req.table_id = table_id; + req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + req.rta.rta_len; + } + + send(nl_scan.fd, &req, req.nh.nlmsg_len, 0); nl_scan.last_hdr = NULL; } @@ -1976,18 +1992,27 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) } void -krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */ +krt_do_scan(struct krt_proto *p) { struct nlmsghdr *h; struct nl_parse_state s; nl_parse_begin(&s, 1); - nl_request_dump_route(AF_UNSPEC); + + /* Table-specific scan or shared scan */ + if (p) + nl_request_dump_route(p->af, krt_table_id(p)); + else + nl_request_dump_route(AF_UNSPEC, 0); + while (h = nl_get_scan()) + { if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) nl_parse_route(&s, h); else log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); + } + nl_parse_end(&s); } diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index a02cf977..342adee5 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -783,18 +783,17 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new) rte_free(e); } + /* * Periodic scanning */ - -#ifdef CONFIG_ALL_TABLES_AT_ONCE - -static timer *krt_scan_timer; -static int krt_scan_count; +static timer *krt_scan_all_timer; +static int krt_scan_all_count; +static _Bool krt_scan_all_tables; static void -krt_scan(timer *t UNUSED) +krt_scan_all(timer *t UNUSED) { struct krt_proto *p; node *n; @@ -815,35 +814,42 @@ krt_scan(timer *t UNUSED) } static void -krt_scan_timer_start(struct krt_proto *p) +krt_scan_all_timer_start(struct krt_proto *p) { - if (!krt_scan_count) - krt_scan_timer = tm_new_init(krt_pool, krt_scan, NULL, KRT_CF->scan_time, 0); + if (!krt_scan_all_count) + krt_scan_all_timer = tm_new_init(krt_pool, krt_scan_all, NULL, KRT_CF->scan_time, 0); - krt_scan_count++; + krt_scan_all_count++; - tm_start(krt_scan_timer, 1 S); + tm_start(krt_scan_all_timer, 1 S); } static void -krt_scan_timer_stop(struct krt_proto *p UNUSED) +krt_scan_all_timer_stop(void) { - krt_scan_count--; + ASSERT(krt_scan_all_count > 0); - if (!krt_scan_count) + krt_scan_all_count--; + + if (!krt_scan_all_count) { - rfree(krt_scan_timer); - krt_scan_timer = NULL; + rfree(krt_scan_all_timer); + krt_scan_all_timer = NULL; } } static void -krt_scan_timer_kick(struct krt_proto *p UNUSED) +krt_scan_all_timer_kick(void) { 
- tm_start(krt_scan_timer, 0); + tm_start(krt_scan_all_timer, 0); +} + +void +krt_use_shared_scan(void) +{ + krt_scan_all_tables = 1; } -#else static void krt_scan(timer *t) @@ -861,26 +867,33 @@ krt_scan(timer *t) static void krt_scan_timer_start(struct krt_proto *p) { - p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0); - tm_start(p->scan_timer, 1 S); + if (krt_scan_all_tables) + krt_scan_all_timer_start(p); + else + { + p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0); + tm_start(p->scan_timer, 1 S); + } } static void krt_scan_timer_stop(struct krt_proto *p) { - tm_stop(p->scan_timer); + if (krt_scan_all_tables) + krt_scan_all_timer_stop(); + else + tm_stop(p->scan_timer); } static void krt_scan_timer_kick(struct krt_proto *p) { - tm_start(p->scan_timer, 0); + if (krt_scan_all_tables) + krt_scan_all_timer_kick(); + else + tm_start(p->scan_timer, 0); } -#endif - - - /* * Updates @@ -1016,11 +1029,6 @@ krt_postconfig(struct proto_config *CF) if (! proto_cf_main_channel(CF)) cf_error("Channel not specified"); -#ifdef CONFIG_ALL_TABLES_AT_ONCE - if (krt_cf->scan_time != cf->scan_time) - cf_error("All kernel syncers must use the same table scan interval"); -#endif - struct channel_config *cc = proto_cf_main_channel(CF); struct rtable_config *tab = cc->table; if (tab->krt_attached) diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index 62228f08..1536e502 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -52,10 +52,7 @@ struct krt_proto { struct rtable *krt_table; /* Internal table of inherited routes */ #endif -#ifndef CONFIG_ALL_TABLES_AT_ONCE timer *scan_timer; -#endif - struct bmap sync_map; /* Keeps track which exported routes were successfully written to kernel */ struct bmap seen_map; /* Routes seen during last periodic scan */ node krt_node; /* Node in krt_proto_list */ @@ -76,6 +73,7 @@ extern pool *krt_pool; struct proto_config * kif_init_config(int class); void kif_request_scan(void); +void krt_use_shared_scan(void); void krt_got_route(struct krt_proto *p, struct rte *e); void krt_got_route_async(struct krt_proto *p, struct rte *e, int new); From 97476e002d7dfb24a4613ac401b8f3192ca68d05 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Wed, 3 Aug 2022 11:57:29 +0200 Subject: [PATCH 8/8] BGP: The bucket/prefix hashes are now a resource to allow for proper cleanup --- proto/bgp/attrs.c | 134 ++++++++++++++++++++++++++------------------ proto/bgp/bgp.c | 9 +-- proto/bgp/bgp.h | 29 ++++++---- proto/bgp/packets.c | 10 ++-- 4 files changed, 103 insertions(+), 79 deletions(-) diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index a7b1a7ed..2ff3f3f3 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -1522,8 +1522,8 @@ bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to) HASH_DEFINE_REHASH_FN(RBH, struct bgp_bucket) -void -bgp_init_bucket_table(struct bgp_channel *c) +static void +bgp_init_bucket_table(struct bgp_pending_tx *c) { HASH_INIT(c->bucket_hash, c->pool, 8); @@ -1531,24 +1531,8 @@ bgp_init_bucket_table(struct bgp_channel *c) c->withdraw_bucket = NULL; } -void -bgp_free_bucket_table(struct bgp_channel *c) -{ - HASH_FREE(c->bucket_hash); - - struct bgp_bucket *b; - WALK_LIST_FIRST(b, c->bucket_queue) - { - rem_node(&b->send_node); - mb_free(b); - } - - mb_free(c->withdraw_bucket); - c->withdraw_bucket = NULL; -} - static struct bgp_bucket * -bgp_get_bucket(struct bgp_channel *c, ea_list *new) +bgp_get_bucket(struct bgp_pending_tx *c, ea_list *new) { /* Hash and lookup */ u32 hash = ea_hash(new); @@ -1577,7 
+1561,7 @@ bgp_get_bucket(struct bgp_channel *c, ea_list *new) } static struct bgp_bucket * -bgp_get_withdraw_bucket(struct bgp_channel *c) +bgp_get_withdraw_bucket(struct bgp_pending_tx *c) { if (!c->withdraw_bucket) { @@ -1589,15 +1573,17 @@ bgp_get_withdraw_bucket(struct bgp_channel *c) } static void -bgp_free_bucket_xx(struct bgp_channel *c, struct bgp_bucket *b) +bgp_free_bucket(struct bgp_pending_tx *c, struct bgp_bucket *b) { HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b); mb_free(b); } int -bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_done_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { + struct bgp_pending_tx *c = bc->ptx; + /* Won't free the withdraw bucket */ if (b == c->withdraw_bucket) return 0; @@ -1608,21 +1594,23 @@ bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b) if (b->px_uc || !EMPTY_LIST(b->prefixes)) return 0; - bgp_free_bucket_xx(c, b); + bgp_free_bucket(c, b); return 1; } void -bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_defer_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { + struct bgp_pending_tx *c = bc->ptx; rem_node(&b->send_node); add_tail(&c->bucket_queue, &b->send_node); } void -bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_withdraw_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { - struct bgp_proto *p = (void *) c->c.proto; + struct bgp_proto *p = (void *) bc->c.proto; + struct bgp_pending_tx *c = bc->ptx; struct bgp_bucket *wb = bgp_get_withdraw_bucket(c); log(L_ERR "%s: Attribute list too long", p->p.name); @@ -1643,7 +1631,7 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) #define PXH_KEY(px) px->net, px->path_id, px->hash #define PXH_NEXT(px) px->next -#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (c->add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2) +#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2) #define PXH_FN(n,i,h) h #define PXH_REHASH bgp_pxh_rehash @@ -1652,28 +1640,20 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix) -void -bgp_init_prefix_table(struct bgp_channel *c) +static void +bgp_init_prefix_table(struct bgp_channel *bc) { + struct bgp_pending_tx *c = bc->ptx; HASH_INIT(c->prefix_hash, c->pool, 8); - uint alen = net_addr_length[c->c.net_type]; + uint alen = net_addr_length[bc->c.net_type]; c->prefix_slab = alen ? sl_new(c->pool, sizeof(struct bgp_prefix) + alen) : NULL; } -void -bgp_free_prefix_table(struct bgp_channel *c) -{ - HASH_FREE(c->prefix_hash); - - rfree(c->prefix_slab); - c->prefix_slab = NULL; -} - static struct bgp_prefix * -bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id) +bgp_get_prefix(struct bgp_pending_tx *c, const net_addr *net, u32 path_id, int add_path_tx) { - u32 path_id_hash = c->add_path_tx ? path_id : 0; + u32 path_id_hash = add_path_tx ? 
path_id : 0; /* We must use a different hash function than the rtable */ u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id_hash)); struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id_hash, hash); @@ -1696,15 +1676,16 @@ bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id) return px; } -static void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px); +static void bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px); static inline int bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *b) { +#define IS_WITHDRAW_BUCKET(b) ((b) == c->ptx->withdraw_bucket) #define BPX_TRACE(what) do { \ if (c->c.debug & D_ROUTES) log(L_TRACE "%s.%s < %s %N %uG %s", \ c->c.proto->name, c->c.name, what, \ - px->net, px->path_id, (b == c->withdraw_bucket) ? "withdraw" : "update"); } while (0) + px->net, px->path_id, IS_WITHDRAW_BUCKET(b) ? "withdraw" : "update"); } while (0) px->lastmod = current_time(); /* Already queued for the same bucket */ @@ -1722,7 +1703,7 @@ bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucke } /* The new bucket is the same as we sent before */ - if ((px->last == b) || c->c.out_table && !px->last && (b == c->withdraw_bucket)) + if ((px->last == b) || c->c.out_table && !px->last && IS_WITHDRAW_BUCKET(b)) { if (px->cur) BPX_TRACE("reverted"); @@ -1731,15 +1712,15 @@ bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucke /* Well, we haven't sent anything yet */ if (!px->last) - bgp_free_prefix(c, px); + bgp_free_prefix(c->ptx, px); px->cur = NULL; return 0; } /* Enqueue the bucket if it has been empty */ - if ((b != c->withdraw_bucket) && EMPTY_LIST(b->prefixes)) - add_tail(&c->bucket_queue, &b->send_node); + if (!IS_WITHDRAW_BUCKET(b) && EMPTY_LIST(b->prefixes)) + add_tail(&c->ptx->bucket_queue, &b->send_node); /* Enqueue to the new bucket and indicate the change */ add_tail(&b->prefixes, &px->buck_node_xx); @@ -1752,7 +1733,7 @@ bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucke } static void -bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px) +bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px) { HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px); @@ -1780,7 +1761,7 @@ bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket px->last->px_uc--; /* Ref the current sent version */ - if (buck != c->withdraw_bucket) + if (!IS_WITHDRAW_BUCKET(buck)) { px->last = buck; px->last->px_uc++; @@ -1790,7 +1771,47 @@ bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket /* Prefixes belonging to the withdraw bucket are freed always */ } - bgp_free_prefix(c, px); + bgp_free_prefix(c->ptx, px); +} + +static void +bgp_pending_tx_rfree(resource *r) +{ + UNUSED struct bgp_pending_tx *ptx = SKIP_BACK(struct bgp_pending_tx, r, r); + + /* Do nothing for now. 
*/ +} + +static void bgp_pending_tx_dump(resource *r UNUSED) { debug("\n"); } + +static struct resclass bgp_pending_tx_class = { + .name = "BGP Pending TX", + .size = sizeof(struct bgp_pending_tx), + .free = bgp_pending_tx_rfree, + .dump = bgp_pending_tx_dump, +}; + +void +bgp_init_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(!c->ptx); + + pool *p = rp_new(c->pool, "BGP Pending TX"); + c->ptx = ralloc(p, &bgp_pending_tx_class); + c->ptx->pool = p; + + bgp_init_bucket_table(c->ptx); + bgp_init_prefix_table(c); +} + +void +bgp_free_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(c->ptx); + ASSERT_DIE(c->ptx->pool); + + rfree(c->ptx->pool); + c->ptx = NULL; } @@ -1802,7 +1823,8 @@ static void bgp_out_table_feed(void *data) { struct rt_export_hook *hook = data; - struct bgp_channel *c = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->table); + struct bgp_channel *bc = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->table); + struct bgp_pending_tx *c = bc->ptx; int max = 512; @@ -1897,8 +1919,8 @@ bgp_out_table_feed(void *data) static struct rt_export_hook * bgp_out_table_export_start(struct rt_exporter *re, struct rt_export_request *req UNUSED) { - struct bgp_channel *c = SKIP_BACK(struct bgp_channel, prefix_exporter, re); - pool *p = rp_new(c->c.proto->pool, "Export hook"); + struct bgp_channel *bc = SKIP_BACK(struct bgp_channel, prefix_exporter, re); + pool *p = rp_new(bc->c.proto->pool, "Export hook"); struct rt_export_hook *hook = mb_allocz(p, sizeof(struct rt_export_hook)); hook->pool = p; hook->event = ev_new_init(p, bgp_out_table_feed, hook); @@ -2132,16 +2154,16 @@ bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, c log(L_ERR "%s: Invalid route %N withdrawn", p->p.name, n); /* If attributes are invalid, we fail back to withdraw */ - buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c); + buck = attrs ? 
bgp_get_bucket(c->ptx, attrs) : bgp_get_withdraw_bucket(c->ptx); path = new->src->global_id; } else { - buck = bgp_get_withdraw_bucket(c); + buck = bgp_get_withdraw_bucket(c->ptx); path = old->src->global_id; } - if (bgp_update_prefix(c, bgp_get_prefix(c, n, path), buck)) + if (bgp_update_prefix(c, bgp_get_prefix(c->ptx, n, path, c->add_path_tx), buck)) bgp_schedule_packet(p->conn, c, PKT_UPDATE); } diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 33849b0b..e6d3f125 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -787,10 +787,8 @@ bgp_handle_graceful_restart(struct bgp_proto *p) } /* Reset bucket and prefix tables */ - bgp_free_bucket_table(c); - bgp_free_prefix_table(c); - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); + bgp_free_pending_tx(c); + bgp_init_pending_tx(c); c->packets_to_send = 0; } @@ -1800,8 +1798,7 @@ bgp_channel_start(struct channel *C) if (c->cf->export_table) bgp_setup_out_table(c); - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); + bgp_init_pending_tx(c); c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0); diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 469f0cb9..def7b102 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -351,14 +351,8 @@ struct bgp_channel { /* Rest are zeroed when down */ pool *pool; - HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ - struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ - list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ - - HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ - slab *prefix_slab; /* Slab holding prefix nodes */ - - struct rt_exporter prefix_exporter; /* Table-like exporter for prefix_hash */ + struct bgp_pending_tx *ptx; /* Routes waiting to be sent */ + struct rt_exporter prefix_exporter; /* Table-like exporter for ptx */ ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */ ip_addr link_addr; /* Link-local version of next_hop_addr */ @@ -401,6 +395,18 @@ struct bgp_bucket { ea_list eattrs[0]; /* Per-bucket extended attributes */ }; +struct bgp_pending_tx { + resource r; + pool *pool; + + HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ + struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ + list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ + + HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ + slab *prefix_slab; /* Slab holding prefix nodes */ +}; + struct bgp_export_state { struct bgp_proto *proto; struct bgp_channel *channel; @@ -566,13 +572,12 @@ void bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to); void bgp_setup_out_table(struct bgp_channel *c); -void bgp_init_bucket_table(struct bgp_channel *c); -void bgp_free_bucket_table(struct bgp_channel *c); +void bgp_init_pending_tx(struct bgp_channel *c); +void bgp_free_pending_tx(struct bgp_channel *c); + void bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b); int bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b); -void bgp_init_prefix_table(struct bgp_channel *c); -void bgp_free_prefix_table(struct bgp_channel *c); void bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck); int bgp_rte_better(struct rte *, struct rte *); diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index de976588..57c00eb5 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -2167,7 +2167,7 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu * var IPv4 Network Layer Reachability Information 
*/ - ASSERT_DIE(s->channel->withdraw_bucket != buck); + ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck); int lr, la; @@ -2190,7 +2190,7 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu static byte * bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - ASSERT_DIE(s->channel->withdraw_bucket != buck); + ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck); /* * 2 B IPv4 Withdrawn Routes Length (zero) @@ -2330,7 +2330,7 @@ again: ; }; /* Try unreachable bucket */ - if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) + if ((buck = c->ptx->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) { res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ? bgp_create_ip_unreach(&s, buck, buf, end): @@ -2340,9 +2340,9 @@ again: ; } /* Try reachable buckets */ - if (!EMPTY_LIST(c->bucket_queue)) + if (!EMPTY_LIST(c->ptx->bucket_queue)) { - buck = HEAD(c->bucket_queue); + buck = HEAD(c->ptx->bucket_queue); /* Cleanup empty buckets */ if (bgp_done_bucket(c, buck))