mirror of
https://gitlab.nic.cz/labs/bird.git
synced 2024-12-22 09:41:54 +00:00
Implements deterministic MED handling.
Thanks to Alexander V. Chernikov for many suggestions.
This commit is contained in:
parent
cf7f064531
commit
be4cd99a36
@ -1316,6 +1316,21 @@ for each neighbor using the following configuration parameters:
|
||||
received from the same AS (which is the standard behavior).
|
||||
Default: off.
|
||||
|
||||
<tag>deterministic med <m/switch/</tag> BGP route selection
|
||||
algorithm is often viewed as a comparison between individual
|
||||
routes (e.g. if a new route appears and is better than the
|
||||
current best one, it is chosen as the new best one). But the
|
||||
proper route selection, as specified by RFC 4271, cannot be
|
||||
fully implemented in that way. The problem is mainly in
|
||||
handling the MED attribute. BIRD, by default, uses a
|
||||
simplification based on individual route comparison, which in
|
||||
some cases may lead to temporally dependent behavior (i.e. the
|
||||
selection is dependent on the order in which routes appeared).
|
||||
This option enables a different (and slower) algorithm
|
||||
implementing proper RFC 4271 route selection, which is
|
||||
deterministic. An alternative way to get deterministic
|
||||
behavior is to use the <cf/med metric/ option. Default: off.
|
||||
|
||||
<tag>igp metric <m/switch/</tag> Enable comparison of internal
|
||||
distances to boundary routers during best route selection. Default: on.
|
||||
|
||||
|
@ -267,6 +267,26 @@ void store_tmp_attrs(rte *e, ea_list *attrs)
|
||||
int import_control(struct proto *p, rte **e, ea_list **attrs, struct linpool *pool)
|
||||
{ DUMMY; }
|
||||
|
||||
/**
|
||||
* rte_recalculate - prepare routes for comparison
|
||||
* @table: a routing table
|
||||
* @net: a network entry
|
||||
* @new: new route for the network
|
||||
* @old: old route for the network
|
||||
* @old_best: old best route for the network (may be NULL)
|
||||
*
|
||||
* This hook is called when a route change (from @old to @new for a
|
||||
* @net entry) is propagated to a @table. It may be used to prepare
|
||||
* routes for comparison by rte_better() in the best route
|
||||
* selection. @new may or may not be in @net->routes list,
|
||||
* @old is not there.
|
||||
*
|
||||
* Result: 1 if the ordering implied by rte_better() changes enough
|
||||
* that a full best route calculation has to be done, 0 otherwise.
|
||||
*/
|
||||
int rte_recalculate(struct rtable *table, struct network *net, struct rte *new, struct rte *old, struct rte *old_best)
|
||||
{ DUMMY; }
|
||||
|
||||
/**
|
||||
* rte_better - compare metrics of two routes
|
||||
* @new: the new route
|
||||
|
@ -178,12 +178,14 @@ struct proto {
|
||||
/*
|
||||
* Routing entry hooks (called only for rte's belonging to this protocol):
|
||||
*
|
||||
* rte_recalculate Called at the beginning of the best route selection
|
||||
* rte_better Compare two rte's and decide which one is better (1=first, 0=second).
|
||||
* rte_same Compare two rte's and decide whether they are identical (1=yes, 0=no).
|
||||
* rte_insert Called whenever a rte is inserted to a routing table.
|
||||
* rte_remove Called whenever a rte is removed from the routing table.
|
||||
*/
|
||||
|
||||
int (*rte_recalculate)(struct rtable *, struct network *, struct rte *, struct rte *, struct rte *);
|
||||
int (*rte_better)(struct rte *, struct rte *);
|
||||
int (*rte_same)(struct rte *, struct rte *);
|
||||
void (*rte_insert)(struct network *, struct rte *);
|
||||
|
@ -200,6 +200,11 @@ typedef struct rte {
|
||||
u32 tag; /* External route tag */
|
||||
u32 router_id; /* Router that originated this route */
|
||||
} ospf;
|
||||
#endif
|
||||
#ifdef CONFIG_BGP
|
||||
struct {
|
||||
u8 suppressed; /* Used for deterministic MED comparison */
|
||||
} bgp;
|
||||
#endif
|
||||
struct { /* Routes generated by krt sync (both temporary and inherited ones) */
|
||||
s8 src; /* Alleged route source (see krt.h) */
|
||||
|
@ -498,6 +498,9 @@ rte_recalculate(rtable *table, net *net, struct proto *p, struct proto *src, rte
|
||||
|
||||
rte_announce(table, RA_ANY, net, new, old, tmpa);
|
||||
|
||||
if (src->rte_recalculate && src->rte_recalculate(table, net, new, old, old_best))
|
||||
goto do_recalculate;
|
||||
|
||||
if (new && rte_better(new, old_best))
|
||||
{
|
||||
/* The first case - the new route is clearly optimal, we link it
|
||||
@ -516,6 +519,7 @@ rte_recalculate(rtable *table, net *net, struct proto *p, struct proto *src, rte
|
||||
that route at the first position and announce it. New optimal
|
||||
route might be NULL if there are no more routes */
|
||||
|
||||
do_recalculate:
|
||||
/* Add the new route to the list */
|
||||
if (new)
|
||||
{
|
||||
@ -1015,27 +1019,36 @@ rt_next_hop_update_net(rtable *tab, net *n)
|
||||
if (!old_best)
|
||||
return 0;
|
||||
|
||||
new_best = NULL;
|
||||
for (k = &n->routes; e = *k; k = &e->next)
|
||||
if (rta_next_hop_outdated(e->attrs))
|
||||
{
|
||||
new = rt_next_hop_update_rte(tab, e);
|
||||
*k = new;
|
||||
|
||||
rte_announce_i(tab, RA_ANY, n, new, e);
|
||||
rte_trace_in(D_ROUTES, new->sender, new, "updated");
|
||||
|
||||
/* Call a pre-comparison hook */
|
||||
/* Not really an efficient way to compute this */
|
||||
if (e->attrs->proto->rte_recalculate)
|
||||
e->attrs->proto->rte_recalculate(tab, n, new, e, NULL);
|
||||
|
||||
if (e != old_best)
|
||||
rte_free_quick(e);
|
||||
else /* Freeing of the old best rte is postponed */
|
||||
free_old_best = 1;
|
||||
|
||||
e = new;
|
||||
count++;
|
||||
}
|
||||
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
/* Find the new best route */
|
||||
new_best = NULL;
|
||||
for (k = &n->routes; e = *k; k = &e->next)
|
||||
{
|
||||
if (rta_next_hop_outdated(e->attrs))
|
||||
{
|
||||
new = rt_next_hop_update_rte(tab, e);
|
||||
*k = new;
|
||||
|
||||
rte_announce_i(tab, RA_ANY, n, new, e);
|
||||
rte_trace_in(D_ROUTES, new->sender, new, "updated");
|
||||
|
||||
if (e != old_best)
|
||||
rte_free_quick(e);
|
||||
else /* Freeing of the old best rte is postponed */
|
||||
free_old_best = 1;
|
||||
|
||||
e = new;
|
||||
count++;
|
||||
}
|
||||
|
||||
if (!new_best || rte_better(e, *new_best))
|
||||
new_best = k;
|
||||
}
|
||||
|
@ -1125,6 +1125,14 @@ bgp_rte_better(rte *new, rte *old)
|
||||
eattr *x, *y;
|
||||
u32 n, o;
|
||||
|
||||
/* Skip suppressed routes (see bgp_rte_recalculate()) */
|
||||
n = new->u.bgp.suppressed;
|
||||
o = old->u.bgp.suppressed;
|
||||
if (n > o)
|
||||
return 0;
|
||||
if (n < o)
|
||||
return 1;
|
||||
|
||||
/* RFC 4271 9.1.2.1. Route resolvability test */
|
||||
n = rte_resolvable(new);
|
||||
o = rte_resolvable(old);
|
||||
@ -1167,14 +1175,15 @@ bgp_rte_better(rte *new, rte *old)
|
||||
return 0;
|
||||
|
||||
/* RFC 4271 9.1.2.2. c) Compare MED's */
|
||||
/* This is noncompliant. Proper RFC 4271 path selection cannot be
|
||||
* interpreted as finding the best path in some ordering.
|
||||
* Therefore, it cannot be implemented in BIRD without some ugly
|
||||
* hacks. This is just an approximation, which in specific
|
||||
* situations may lead to persistent routing loops, because it is
|
||||
* nondeterministic - it depends on the order in which routes
|
||||
* appeared. But it is also the same behavior as used by default in
|
||||
* Cisco routers, so it is probably not a big issue.
|
||||
/* Proper RFC 4271 path selection cannot be interpreted as finding
|
||||
* the best path in some ordering. It is implemented partially in
|
||||
* bgp_rte_recalculate() when deterministic_med option is
|
||||
* active. Without that option, the behavior is just an
|
||||
* approximation, which in specific situations may lead to
|
||||
* persistent routing loops, because it is nondeterministic - it
|
||||
* depends on the order in which routes appeared. But it is also the
|
||||
* same behavior as used by default in Cisco routers, so it is
|
||||
* probably not a big issue.
|
||||
*/
|
||||
if (new_bgp->cf->med_metric || old_bgp->cf->med_metric ||
|
||||
(bgp_get_neighbor(new) == bgp_get_neighbor(old)))
|
||||
@ -1236,6 +1245,148 @@ bgp_rte_better(rte *new, rte *old)
|
||||
return (ipa_compare(new_bgp->cf->remote_ip, old_bgp->cf->remote_ip) < 0);
|
||||
}
|
||||
|
||||
|
||||
static inline int
|
||||
same_group(rte *r, u32 lpref, u32 lasn)
|
||||
{
|
||||
return (r->pref == lpref) && (bgp_get_neighbor(r) == lasn);
|
||||
}
|
||||
|
||||
static inline int
|
||||
use_deterministic_med(rte *r)
|
||||
{
|
||||
return ((struct bgp_proto *) r->attrs->proto)->cf->deterministic_med;
|
||||
}
|
||||
|
||||
int
|
||||
bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best)
|
||||
{
|
||||
rte *r, *s;
|
||||
rte *key = new ? new : old;
|
||||
u32 lpref = key->pref;
|
||||
u32 lasn = bgp_get_neighbor(key);
|
||||
int old_is_group_best = 0;
|
||||
|
||||
/*
|
||||
* Proper RFC 4271 path selection is a bit complicated, it cannot be
|
||||
* implemented just by rte_better(), because it is not a linear
|
||||
* ordering. But it can be splitted to two levels, where the lower
|
||||
* level chooses the best routes in each group of routes from the
|
||||
* same neighboring AS and higher level chooses the best route (with
|
||||
* a slightly different ordering) between the best-in-group routes.
|
||||
*
|
||||
* When deterministic_med is disabled, we just ignore this issue and
|
||||
* choose the best route by bgp_rte_better() alone. If enabled, the
|
||||
* lower level of the route selection is done here (for the group
|
||||
* to which the changed route belongs), all routes in group are
|
||||
* marked as suppressed, just chosen best-in-group is not.
|
||||
*
|
||||
* Global best route selection then implements higher level by
|
||||
* choosing between non-suppressed routes (as they are always
|
||||
* preferred over suppressed routes). Routes from BGP protocols
|
||||
* that do not set deterministic_med are just never suppressed. As
|
||||
* they do not participate in the lower level selection, it is OK
|
||||
* that this fn is not called for them.
|
||||
*
|
||||
* The idea is simple, the implementation is more problematic,
|
||||
* mostly because of optimizations in rte_recalculate() that
|
||||
* avoids full recalculation in most cases.
|
||||
*
|
||||
* We can assume that at least one of new, old is non-NULL and both
|
||||
* are from the same protocol with enabled deterministic_med. We
|
||||
* group routes by both neighbor AS (lasn) and preference (lpref),
|
||||
* because bgp_rte_better() does not handle preference itself.
|
||||
*/
|
||||
|
||||
/* If new and old are from different groups, we just process that
|
||||
as two independent events */
|
||||
if (new && old && !same_group(old, lpref, lasn))
|
||||
{
|
||||
int i1, i2;
|
||||
i1 = bgp_rte_recalculate(table, net, NULL, old, old_best);
|
||||
i2 = bgp_rte_recalculate(table, net, new, NULL, old_best);
|
||||
return i1 || i2;
|
||||
}
|
||||
|
||||
/*
|
||||
* We could find the best-in-group and then make some shortcuts like
|
||||
* in rte_recalculate, but as we would have to walk through all
|
||||
* net->routes just to find it, it is probably not worth. So we
|
||||
* just have two simpler fast cases that use just the old route.
|
||||
* We also set suppressed flag to avoid using it in bgp_rte_better().
|
||||
*/
|
||||
|
||||
if (new)
|
||||
new->u.bgp.suppressed = 1;
|
||||
|
||||
if (old)
|
||||
{
|
||||
old_is_group_best = !old->u.bgp.suppressed;
|
||||
old->u.bgp.suppressed = 1;
|
||||
int new_is_better = new && bgp_rte_better(new, old);
|
||||
|
||||
/* The first case - replace not best with worse (or remove not best) */
|
||||
if (!old_is_group_best && !new_is_better)
|
||||
return 0;
|
||||
|
||||
/* The second case - replace the best with better */
|
||||
if (old_is_group_best && new_is_better)
|
||||
{
|
||||
/* new is best-in-group, the see discussion below - this is
|
||||
a special variant of NBG && OBG. From OBG we can deduce
|
||||
that same_group(old_best) iff (old == old_best) */
|
||||
new->u.bgp.suppressed = 0;
|
||||
return (old == old_best);
|
||||
}
|
||||
}
|
||||
|
||||
/* The default case - find a new best-in-group route */
|
||||
r = new; /* new may not be in the list */
|
||||
for (s=net->routes; s; s=s->next)
|
||||
if (use_deterministic_med(s) && same_group(s, lpref, lasn))
|
||||
{
|
||||
s->u.bgp.suppressed = 1;
|
||||
if (!r || bgp_rte_better(s, r))
|
||||
r = s;
|
||||
}
|
||||
|
||||
/* Simple case - the last route in group disappears */
|
||||
if (!r)
|
||||
return 0;
|
||||
|
||||
/* Found best-in-group */
|
||||
r->u.bgp.suppressed = 0;
|
||||
|
||||
/*
|
||||
* There are generally two reasons why we have to force
|
||||
* recalculation (return 1): First, the new route may be wrongfully
|
||||
* chosen to be the best in the first case check in
|
||||
* rte_recalculate(), this may happen only if old_best is from the
|
||||
* same group. Second, another (different than new route)
|
||||
* best-in-group is chosen and that may be the proper best (although
|
||||
* rte_recalculate() without ignore that possibility).
|
||||
*
|
||||
* There are three possible cases according to whether the old route
|
||||
* was the best in group (OBG, stored in old_is_group_best) and
|
||||
* whether the new route is the best in group (NBG, tested by r == new).
|
||||
* These cases work even if old or new is NULL.
|
||||
*
|
||||
* NBG -> new is a possible candidate for the best route, so we just
|
||||
* check for the first reason using same_group().
|
||||
*
|
||||
* !NBG && OBG -> Second reason applies, return 1
|
||||
*
|
||||
* !NBG && !OBG -> Best in group does not change, old != old_best,
|
||||
* rte_better(new, old_best) is false and therefore
|
||||
* the first reason does not apply, return 0
|
||||
*/
|
||||
|
||||
if (r == new)
|
||||
return old_best && same_group(old_best, lpref, lasn);
|
||||
else
|
||||
return old_is_group_best;
|
||||
}
|
||||
|
||||
static struct adata *
|
||||
bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool)
|
||||
{
|
||||
@ -1614,6 +1765,11 @@ bgp_get_route_info(rte *e, byte *buf, ea_list *attrs)
|
||||
eattr *o = ea_find(attrs, EA_CODE(EAP_BGP, BA_ORIGIN));
|
||||
u32 origas;
|
||||
|
||||
/*
|
||||
if (e->u.bgp.suppressed)
|
||||
buf += bsprintf(buf, " -");
|
||||
*/
|
||||
|
||||
buf += bsprintf(buf, " (%d", e->pref);
|
||||
if (e->attrs->hostentry)
|
||||
{
|
||||
|
@ -908,6 +908,10 @@ bgp_init(struct proto_config *C)
|
||||
P->import_control = bgp_import_control;
|
||||
P->neigh_notify = bgp_neigh_notify;
|
||||
P->reload_routes = bgp_reload_routes;
|
||||
|
||||
if (c->deterministic_med)
|
||||
P->rte_recalculate = bgp_rte_recalculate;
|
||||
|
||||
p->cf = c;
|
||||
p->local_as = c->local_as;
|
||||
p->remote_as = c->remote_as;
|
||||
|
@ -29,6 +29,7 @@ struct bgp_config {
|
||||
int med_metric; /* Compare MULTI_EXIT_DISC even between routes from differen ASes */
|
||||
int igp_metric; /* Use IGP metrics when selecting best route */
|
||||
int prefer_older; /* Prefer older routes according to RFC 5004 */
|
||||
int deterministic_med; /* Use more complicated algo to have strict RFC 4271 MED comparison */
|
||||
u32 default_local_pref; /* Default value for LOCAL_PREF attribute */
|
||||
u32 default_med; /* Default value for MULTI_EXIT_DISC attribute */
|
||||
int capabilities; /* Enable capability handshake [RFC3392] */
|
||||
@ -185,6 +186,7 @@ byte *bgp_attach_attr_wa(struct ea_list **to, struct linpool *pool, unsigned att
|
||||
struct rta *bgp_decode_attrs(struct bgp_conn *conn, byte *a, unsigned int len, struct linpool *pool, int mandatory);
|
||||
int bgp_get_attr(struct eattr *e, byte *buf, int buflen);
|
||||
int bgp_rte_better(struct rte *, struct rte *);
|
||||
int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best);
|
||||
void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs);
|
||||
int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *);
|
||||
void bgp_attr_init(struct bgp_proto *);
|
||||
|
@ -25,7 +25,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY,
|
||||
CLUSTER, ID, AS4, ADVERTISE, IPV4, CAPABILITIES, LIMIT, PASSIVE,
|
||||
PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH,
|
||||
INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP,
|
||||
TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY)
|
||||
TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC)
|
||||
|
||||
CF_GRAMMAR
|
||||
|
||||
@ -82,6 +82,7 @@ bgp_proto:
|
||||
| bgp_proto MED METRIC bool ';' { BGP_CFG->med_metric = $4; }
|
||||
| bgp_proto IGP METRIC bool ';' { BGP_CFG->igp_metric = $4; }
|
||||
| bgp_proto PREFER OLDER bool ';' { BGP_CFG->prefer_older = $4; }
|
||||
| bgp_proto DETERMINISTIC MED bool ';' { BGP_CFG->deterministic_med = $4; }
|
||||
| bgp_proto DEFAULT BGP_MED expr ';' { BGP_CFG->default_med = $4; }
|
||||
| bgp_proto DEFAULT BGP_LOCAL_PREF expr ';' { BGP_CFG->default_local_pref = $4; }
|
||||
| bgp_proto SOURCE ADDRESS ipa ';' { BGP_CFG->source_addr = $4; }
|
||||
|
Loading…
Reference in New Issue
Block a user