diff --git a/nest/config.Y b/nest/config.Y index 920a3054..fa726efb 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -51,7 +51,7 @@ CF_ENUM(T_ENUM_RTS, RTS_, DUMMY, STATIC, INHERIT, DEVICE, STATIC_DEVICE, REDIREC RIP, OSPF, OSPF_IA, OSPF_EXT1, OSPF_EXT2, BGP, PIPE) CF_ENUM(T_ENUM_SCOPE, SCOPE_, HOST, LINK, SITE, ORGANIZATION, UNIVERSE) CF_ENUM(T_ENUM_RTC, RTC_, UNICAST, BROADCAST, MULTICAST, ANYCAST) -CF_ENUM(T_ENUM_RTD, RTD_, ROUTER, DEVICE, BLACKHOLE, UNREACHABLE, PROHIBIT) +CF_ENUM(T_ENUM_RTD, RTD_, ROUTER, DEVICE, BLACKHOLE, UNREACHABLE, PROHIBIT, MULTIPATH) %type idval %type imexport diff --git a/nest/route.h b/nest/route.h index a849bf00..8f9dff9a 100644 --- a/nest/route.h +++ b/nest/route.h @@ -170,7 +170,7 @@ struct hostentry { struct hostentry *next; /* Next in hash chain */ unsigned hash_key; /* Hash key */ unsigned uc; /* Use count */ - struct iface *iface; /* Chosen outgoing interface */ + struct rta *src; /* Source rta entry */ ip_addr gw; /* Chosen next hop */ byte dest; /* Chosen route destination type (RTD_...) */ u32 igp_metric; /* Chosen route IGP metric */ @@ -266,6 +266,14 @@ void rt_show(struct rt_show_data *); * construction of BGP route attribute lists. 
*/ +/* Multipath next-hop */ +struct mpnh { + ip_addr gw; /* Next hop */ + struct iface *iface; /* Outgoing interface */ + struct mpnh *next; + unsigned char weight; +}; + typedef struct rta { struct rta *next, **pprev; /* Hash chain */ struct proto *proto; /* Protocol instance that originally created the route */ @@ -282,6 +290,7 @@ typedef struct rta { ip_addr from; /* Advertising router */ struct hostentry *hostentry; /* Hostentry for recursive next-hops */ struct iface *iface; /* Outgoing interface */ + struct mpnh *nexthops; /* Next-hops for multipath routes */ struct ea_list *eattrs; /* Extended Attribute chain */ } rta; @@ -309,7 +318,8 @@ typedef struct rta { #define RTD_BLACKHOLE 2 /* Silently drop packets */ #define RTD_UNREACHABLE 3 /* Reject as unreachable */ #define RTD_PROHIBIT 4 /* Administratively prohibited */ -#define RTD_NONE 5 /* Invalid RTD */ +#define RTD_MULTIPATH 5 /* Multipath route (nexthops != NULL) */ +#define RTD_NONE 6 /* Invalid RTD */ #define RTAF_CACHED 1 /* This is a cached rta */ @@ -387,6 +397,10 @@ void ea_format(eattr *e, byte *buf); #define EA_FORMAT_BUF_SIZE 256 ea_list *ea_append(ea_list *to, ea_list *what); +int mpnh__same(struct mpnh *x, struct mpnh *y); /* Compare multipath nexthops */ +static inline int mpnh_same(struct mpnh *x, struct mpnh *y) +{ return (x == y) || mpnh__same(x, y); } + void rta_init(void); rta *rta_lookup(rta *); /* Get rta equivalent to this one, uc++ */ static inline rta *rta_clone(rta *r) { r->uc++; return r; } @@ -403,12 +417,14 @@ void rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr *gw, i * count. Cached rta locks its hostentry (increases its use count), * uncached rta does not lock it. Hostentry with zero use count is * removed asynchronously during host cache update, therefore it is - * safe to hold such hostentry temorarily. 
There is no need - * a lock for hostentry->dep table, because that table contains routes - * responsible for that hostentry, and therefore is non-empty if given - * hostentry has non-zero use count. The protocol responsible for routes - * with recursive next hops should also hold a lock for a table governing - * that routes (argument tab to rta_set_recursive_next_hop()). + * safe to hold such hostentry temporarily. Hostentry holds a lock for + * a 'source' rta, mainly to share multipath nexthops. There is no + * need to hold a lock for hostentry->dep table, because that table + * contains routes responsible for that hostentry, and therefore is + * non-empty if given hostentry has non-zero use count. The protocol + * responsible for routes with recursive next hops should also hold a + * lock for a table governing those routes (argument tab to + * rta_set_recursive_next_hop()). */ static inline void rt_lock_hostentry(struct hostentry *he) { if (he) he->uc++; } diff --git a/nest/rt-attr.c b/nest/rt-attr.c index ce6fe85d..c1f9c793 100644 --- a/nest/rt-attr.c +++ b/nest/rt-attr.c @@ -57,9 +57,65 @@ pool *rta_pool; static slab *rta_slab; +static slab *mpnh_slab; struct protocol *attr_class_to_protocol[EAP_MAX]; +static inline unsigned int +mpnh_hash(struct mpnh *x) +{ + unsigned int h = 0; + for (; x; x = x->next) + h ^= ipa_hash(x->gw); + + return h; +} + +int +mpnh__same(struct mpnh *x, struct mpnh *y) +{ + for (; x && y; x = x->next, y = y->next) + if (!ipa_equal(x->gw, y->gw) || (x->iface != y->iface) || (x->weight != y->weight)) + return 0; + + return x == y; +} + +static struct mpnh * +mpnh_copy(struct mpnh *o) +{ + struct mpnh *first = NULL; + struct mpnh **last = &first; + + for (; o; o = o->next) + { + struct mpnh *n = sl_alloc(mpnh_slab); + n->gw = o->gw; + n->iface = o->iface; + n->next = NULL; + n->weight = o->weight; + + *last = n; + last = &(n->next); + } + + return first; +} + +static void +mpnh_free(struct mpnh *o) +{ + struct mpnh *n; + + while (o) 
+ { + n = o->next; + sl_free(mpnh_slab, o); + o = n; + } +} + + /* * Extended Attributes */ @@ -587,7 +643,8 @@ rta_alloc_hash(void) static inline unsigned int rta_hash(rta *a) { - return (a->proto->hash_key ^ ipa_hash(a->gw) ^ ea_hash(a->eattrs)) & 0xffff; + return (a->proto->hash_key ^ ipa_hash(a->gw) ^ + mpnh_hash(a->nexthops) ^ ea_hash(a->eattrs)) & 0xffff; } static inline int @@ -604,6 +661,7 @@ rta_same(rta *x, rta *y) ipa_equal(x->from, y->from) && x->iface == y->iface && x->hostentry == y->hostentry && + mpnh_same(x->nexthops, y->nexthops) && ea_same(x->eattrs, y->eattrs)); } @@ -614,6 +672,7 @@ rta_copy(rta *o) memcpy(r, o, sizeof(rta)); r->uc = 1; + r->nexthops = mpnh_copy(o->nexthops); r->eattrs = ea_list_copy(o->eattrs); return r; } @@ -707,6 +766,7 @@ rta__free(rta *a) a->next->pprev = a->pprev; a->aflags = 0; /* Poison the entry */ rt_unlock_hostentry(a->hostentry); + mpnh_free(a->nexthops); ea_free(a->eattrs); sl_free(rta_slab, a); } @@ -798,6 +858,7 @@ rta_init(void) { rta_pool = rp_new(&root_pool, "Attributes"); rta_slab = sl_new(rta_pool, sizeof(rta)); + mpnh_slab = sl_new(rta_pool, sizeof(struct mpnh)); rta_alloc_hash(); } diff --git a/nest/rt-table.c b/nest/rt-table.c index a4976f03..73b05d08 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -962,29 +962,31 @@ rt_preconfig(struct config *c) * triggered by rt_schedule_nhu(). 
*/ -static inline int -hostentry_diff(struct hostentry *he, struct iface *iface, ip_addr gw, - byte dest, u32 igp_metric) -{ - return (he->iface != iface) || !ipa_equal(he->gw, gw) || - (he->dest != dest) || (he->igp_metric != igp_metric); -} - static inline int rta_next_hop_outdated(rta *a) { struct hostentry *he = a->hostentry; - return he && hostentry_diff(he, a->iface, a->gw, a->dest, a->igp_metric); + + if (!he) + return 0; + + if (!he->src) + return a->dest != RTD_UNREACHABLE; + + return (a->iface != he->src->iface) || !ipa_equal(a->gw, he->gw) || + (a->dest != he->dest) || (a->igp_metric != he->igp_metric) || + !mpnh_same(a->nexthops, he->src->nexthops); } static inline void rta_apply_hostentry(rta *a, struct hostentry *he) { a->hostentry = he; - a->iface = he->iface; + a->iface = he->src ? he->src->iface : NULL; a->gw = he->gw; a->dest = he->dest; a->igp_metric = he->igp_metric; + a->nexthops = he->src ? he->src->nexthops : NULL; } static inline rte * @@ -1388,6 +1390,7 @@ hc_new_hostentry(struct hostcache *hc, ip_addr a, ip_addr ll, rtable *dep, unsig he->tab = dep; he->hash_key = k; he->uc = 0; + he->src = NULL; add_tail(&hc->hostentries, &he->ln); hc_insert(hc, he); @@ -1402,6 +1405,8 @@ hc_new_hostentry(struct hostcache *hc, ip_addr a, ip_addr ll, rtable *dep, unsig static void hc_delete_hostentry(struct hostcache *hc, struct hostentry *he) { + rta_free(he->src); + rem_node(&he->ln); hc_remove(hc, he); sl_free(hc->slab, he); @@ -1436,6 +1441,8 @@ rt_free_hostcache(rtable *tab) WALK_LIST(n, hc->hostentries) { struct hostentry *he = SKIP_BACK(struct hostentry, ln, n); + rta_free(he->src); + if (he->uc) log(L_ERR "Hostcache is not empty in table %s", tab->name); } @@ -1488,7 +1495,7 @@ rt_get_igp_metric(rte *rt) return rt->u.rip.metric; /* Device routes */ - if (a->dest != RTD_ROUTER) + if ((a->dest != RTD_ROUTER) && (a->dest != RTD_MULTIPATH)) return 0; return IGP_METRIC_UNKNOWN; @@ -1497,12 +1504,15 @@ rt_get_igp_metric(rte *rt) static int 
rt_update_hostentry(rtable *tab, struct hostentry *he) { - struct iface *old_iface = he->iface; - ip_addr old_gw = he->gw; - byte old_dest = he->dest; - u32 old_metric = he->igp_metric; + rta *old_src = he->src; int pxlen = 0; + /* Reset the hostentry */ + he->src = NULL; + he->gw = IPA_NONE; + he->dest = RTD_UNREACHABLE; + he->igp_metric = 0; + net *n = net_route(tab, he->addr, MAX_PREFIX_LENGTH); if (n) { @@ -1513,53 +1523,41 @@ rt_update_hostentry(rtable *tab, struct hostentry *he) { /* Recursive route should not depend on another recursive route */ log(L_WARN "Next hop address %I resolvable through recursive route for %I/%d", - he->addr, n->n.prefix, n->n.pxlen); - he->iface = NULL; - he->gw = IPA_NONE; - he->dest = RTD_UNREACHABLE; + he->addr, n->n.prefix, pxlen); + goto done; } - else if (a->dest == RTD_DEVICE) + + if (a->dest == RTD_DEVICE) { if (if_local_addr(he->addr, a->iface)) { /* The host address is a local address, this is not valid */ log(L_WARN "Next hop address %I is a local address of iface %s", he->addr, a->iface->name); - he->iface = NULL; - he->gw = IPA_NONE; - he->dest = RTD_UNREACHABLE; + goto done; } - else - { - /* The host is directly reachable, use link as a gateway */ - he->iface = a->iface; - he->gw = he->link; - he->dest = RTD_ROUTER; - } + + /* The host is directly reachable, use link as a gateway */ + he->gw = he->link; + he->dest = RTD_ROUTER; } else { /* The host is reachable through some route entry */ - he->iface = a->iface; he->gw = a->gw; he->dest = a->dest; } - he->igp_metric = he->iface ? 
rt_get_igp_metric(n->routes) : 0; - } - else - { - /* The host is unreachable */ - he->iface = NULL; - he->gw = IPA_NONE; - he->dest = RTD_UNREACHABLE; - he->igp_metric = 0; + he->src = rta_clone(a); + he->igp_metric = rt_get_igp_metric(n->routes); } + done: /* Add a prefix range to the trie */ trie_add_prefix(tab->hostcache->trie, he->addr, MAX_PREFIX_LENGTH, pxlen, MAX_PREFIX_LENGTH); - return hostentry_diff(he, old_iface, old_gw, old_dest, old_metric); + rta_free(old_src); + return old_src != he->src; } static void @@ -1630,6 +1628,7 @@ rt_format_via(rte *e, byte *via) case RTD_BLACKHOLE: bsprintf(via, "blackhole"); break; case RTD_UNREACHABLE: bsprintf(via, "unreachable"); break; case RTD_PROHIBIT: bsprintf(via, "prohibited"); break; + case RTD_MULTIPATH: bsprintf(via, "multipath"); break; default: bsprintf(via, "???"); } } @@ -1641,6 +1640,7 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, ea_list *tm byte tm[TM_DATETIME_BUFFER_SIZE], info[256]; rta *a = e->attrs; int primary = (e->net->routes == e); + struct mpnh *nh; rt_format_via(e, via); tm_format_datetime(tm, &config->tf_route, e->lastmod); @@ -1663,6 +1663,8 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, ea_list *tm bsprintf(info, " (%d)", e->pref); cli_printf(c, -1007, "%-18s %s [%s %s%s]%s%s", ia, via, a->proto->name, tm, from, primary ? 
" *" : "", info); + for (nh = a->nexthops; nh; nh = nh->next) + cli_printf(c, -1007, "\tvia %I on %s weight %d", nh->gw, nh->iface->name, nh->weight + 1); if (d->verbose) rta_show(c, a, tmpa); } diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index ef5d024e..ff231b17 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -1015,6 +1015,13 @@ bgp_get_neighbor(rte *r) return ((struct bgp_proto *) r->attrs->proto)->remote_as; } +static inline int +rte_resolvable(rte *rt) +{ + int rd = rt->attrs->dest; + return (rd == RTD_ROUTER) || (rd == RTD_DEVICE) || (rd == RTD_MULTIPATH); +} + int bgp_rte_better(rte *new, rte *old) { @@ -1024,9 +1031,8 @@ bgp_rte_better(rte *new, rte *old) u32 n, o; /* RFC 4271 9.1.2.1. Route resolvability test */ - /* non-NULL iface means it is either RTD_ROUTER or RTD_DEVICE route */ - n = new->attrs->iface != NULL; - o = old->attrs->iface != NULL; + n = rte_resolvable(new); + o = rte_resolvable(old); if (n > o) return 1; if (n < o) @@ -1502,7 +1508,7 @@ bgp_get_route_info(rte *e, byte *buf, ea_list *attrs) buf += bsprintf(buf, " (%d", e->pref); if (e->attrs->hostentry) { - if (!e->attrs->iface) + if (!rte_resolvable(e)) buf += bsprintf(buf, "/-"); else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN) buf += bsprintf(buf, "/?");