From 57c574d82a44d10143aba7aaea6d1384d850c079 Mon Sep 17 00:00:00 2001 From: Ondrej Zajicek Date: Tue, 7 Dec 2010 23:35:39 +0100 Subject: [PATCH] Multipath support for OSPF --- proto/ospf/config.Y | 7 +- proto/ospf/iface.c | 3 + proto/ospf/ospf.c | 18 ++ proto/ospf/ospf.h | 10 +- proto/ospf/rt.c | 596 +++++++++++++++++++++++++++--------------- proto/ospf/rt.h | 33 ++- proto/ospf/topology.c | 4 +- proto/ospf/topology.h | 6 +- 8 files changed, 443 insertions(+), 234 deletions(-) diff --git a/proto/ospf/config.Y b/proto/ospf/config.Y index 3af879d6..59e1fbe0 100644 --- a/proto/ospf/config.Y +++ b/proto/ospf/config.Y @@ -51,7 +51,7 @@ CF_KEYWORDS(HELLO, TRANSMIT, PRIORITY, DEAD, NONBROADCAST, POINTOPOINT, TYPE) CF_KEYWORDS(NONE, SIMPLE, AUTHENTICATION, STRICT, CRYPTOGRAPHIC) CF_KEYWORDS(ELIGIBLE, POLL, NETWORKS, HIDDEN, VIRTUAL, CHECK, LINK) CF_KEYWORDS(RX, BUFFER, LARGE, NORMAL, STUBNET, HIDDEN, SUMMARY) -CF_KEYWORDS(WAIT, DELAY, LSADB) +CF_KEYWORDS(WAIT, DELAY, LSADB, ECMP, LIMIT, WEIGHT) %type opttext @@ -76,7 +76,9 @@ ospf_proto: ospf_proto_item: proto_item | RFC1583COMPAT bool { OSPF_CFG->rfc1583 = $2; } - | TICK expr { OSPF_CFG->tick = $2 ; if($2<=0) cf_error("Tick must be greater than zero"); } + | ECMP bool { OSPF_CFG->ecmp = $2 ? DEFAULT_ECMP_LIMIT : 0; } + | ECMP bool LIMIT expr { OSPF_CFG->ecmp = $2 ? 
$4 : 0; if ($4 < 0) cf_error("ECMP limit cannot be negative"); } + | TICK expr { OSPF_CFG->tick = $2; if($2<=0) cf_error("Tick must be greater than zero"); } | ospf_area '}' ; @@ -193,6 +195,7 @@ ospf_iface_item: | STRICT NONBROADCAST bool { OSPF_PATT->strictnbma = $3 ; } | STUB bool { OSPF_PATT->stub = $2 ; } | CHECK LINK bool { OSPF_PATT->check_link = $3; } + | ECMP WEIGHT expr { OSPF_PATT->ecmp_weight = $3 - 1; if (($3<1) || ($3>256)) cf_error("ECMP weight must be in range 1-256"); } | NEIGHBORS '{' ipa_list '}' | AUTHENTICATION NONE { OSPF_PATT->autype = OSPF_AUTH_NONE ; } | AUTHENTICATION SIMPLE { OSPF_PATT->autype = OSPF_AUTH_SIMPLE ; } diff --git a/proto/ospf/iface.c b/proto/ospf/iface.c index 83ea1c29..8b21f94b 100644 --- a/proto/ospf/iface.c +++ b/proto/ospf/iface.c @@ -436,6 +436,7 @@ ospf_iface_new(struct proto_ospf *po, struct iface *iface, struct ifa *addr, ifa->ioprob = OSPF_I_OK; ifa->rxbuf = ip->rxbuf; ifa->check_link = ip->check_link; + ifa->ecmp_weight = ip->ecmp_weight; #ifdef OSPFv2 ifa->autype = ip->autype; @@ -795,6 +796,8 @@ ospf_iface_info(struct ospf_iface *ifa) ifa->stub ? 
"(stub)" : ""); cli_msg(-1015, "\tPriority: %u", ifa->priority); cli_msg(-1015, "\tCost: %u", ifa->cost); + if (ifa->oa->po->ecmp) + cli_msg(-1015, "\tECMP weight: %d", ((int) ifa->ecmp_weight) + 1); cli_msg(-1015, "\tHello timer: %u", ifa->helloint); if (ifa->type == OSPF_IT_NBMA) diff --git a/proto/ospf/ospf.c b/proto/ospf/ospf.c index 026d9751..19e68e24 100644 --- a/proto/ospf/ospf.c +++ b/proto/ospf/ospf.c @@ -147,6 +147,7 @@ ospf_start(struct proto *p) po->router_id = proto_get_router_id(p->cf); po->rfc1583 = c->rfc1583; po->ebit = 0; + po->ecmp = c->ecmp; po->tick = c->tick; po->disp_timer = tm_new(p->pool); po->disp_timer->data = po; @@ -157,6 +158,7 @@ ospf_start(struct proto *p) po->lsab_size = 256; po->lsab_used = 0; po->lsab = mb_alloc(p->pool, po->lsab_size); + po->nhpool = lp_new(p->pool, 12*sizeof(struct mpnh)); init_list(&(po->iface_list)); init_list(&(po->area_list)); fib_init(&po->rtf, p->pool, sizeof(ort), 0, ospf_rt_initort); @@ -514,6 +516,13 @@ ospf_shutdown(struct proto *p) if (ifa->state > OSPF_IS_DOWN) ospf_iface_shutdown(ifa); + /* Cleanup locked rta entries */ + FIB_WALK(&po->rtf, nftmp) + { + rta_free(((ort *) nftmp)->old_rta); + } + FIB_WALK_END; + return PS_DOWN; } @@ -648,6 +657,7 @@ ospf_reconfigure(struct proto *p, struct proto_config *c) schedule_rtcalc(po); po->tick = new->tick; + po->ecmp = new->ecmp; po->disp_timer->recurrent = po->tick; tm_start(po->disp_timer, 1); @@ -767,6 +777,14 @@ ospf_reconfigure(struct proto *p, struct proto_config *c) ospf_iface_sm(ifa, ifa->check_link ? 
ISM_LOOP : ISM_UNLOOP); } + /* ECMP weight */ + if (oldip->ecmp_weight != newip->ecmp_weight) + { + ifa->ecmp_weight = newip->ecmp_weight; + OSPF_TRACE(D_EVENTS, "Changing ECMP weight of interface %s from %d to %d", + ifa->iface->name, (int)oldip->ecmp_weight + 1, (int)newip->ecmp_weight + 1); + } + /* strict nbma */ if ((oldip->strictnbma == 0) && (newip->strictnbma != 0)) { diff --git a/proto/ospf/ospf.h b/proto/ospf/ospf.h index 3345d4fc..2ef0180c 100644 --- a/proto/ospf/ospf.h +++ b/proto/ospf/ospf.h @@ -74,6 +74,7 @@ do { if ((p->debug & D_PACKETS) || OSPF_FORCE_DEBUG) \ #define DEFAULT_OSPFTICK 1 #define DEFAULT_RFC1583 0 /* compatibility with rfc1583 */ #define DEFAULT_STUB_COST 1000 +#define DEFAULT_ECMP_LIMIT 16 struct ospf_config @@ -81,6 +82,7 @@ struct ospf_config struct proto_config c; unsigned tick; int rfc1583; + int ecmp; list area_list; }; @@ -247,6 +249,7 @@ struct ospf_iface u8 sk_dr; /* Socket is a member of DRouters group */ u16 rxbuf; /* Buffer size */ u8 check_link; /* Whether iface link change is used */ + u8 ecmp_weight; /* Weight used for ECMP */ }; struct ospf_md5 @@ -730,11 +733,13 @@ struct proto_ospf list area_list; int areano; /* Number of area I belong to */ struct fib rtf; /* Routing table */ - int rfc1583; /* RFC1583 compatibility */ - int ebit; /* Did I originate any ext lsa? */ + byte rfc1583; /* RFC1583 compatibility */ + byte ebit; /* Did I originate any ext lsa? 
*/ + byte ecmp; /* Maximal number of nexthops in ECMP route, or 0 */ struct ospf_area *backbone; /* If exists */ void *lsab; /* LSA buffer used when originating router LSAs */ int lsab_size, lsab_used; + linpool *nhpool; /* Linpool used for next hops computed in SPF */ u32 router_id; }; @@ -756,6 +761,7 @@ struct ospf_iface_patt u32 vid; u16 rxbuf; u8 check_link; + u8 ecmp_weight; #define OSPF_RXBUF_NORMAL 0 #define OSPF_RXBUF_LARGE 1 #define OSPF_RXBUF_MINSIZE 256 /* Minimal allowed size */ diff --git a/proto/ospf/rt.c b/proto/ospf/rt.c index 55cd1cc3..6b8886ba 100644 --- a/proto/ospf/rt.c +++ b/proto/ospf/rt.c @@ -10,10 +10,7 @@ static void add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, u32 dist, - struct ospf_area *oa); -static int calc_next_hop(struct ospf_area *oa, - struct top_hash_entry *en, - struct top_hash_entry *par); + struct ospf_area *oa, struct ospf_lsa_rt_link *rtl); static void rt_sync(struct proto_ospf *po); /* In ospf_area->rtr we store paths to routers, but we use RID (and not IP address) @@ -25,20 +22,48 @@ static void rt_sync(struct proto_ospf *po); #endif -static inline void reset_ri(orta * orta) +static inline void reset_ri(ort *ort) { - bzero(orta, sizeof(orta)); + bzero(&ort->n, sizeof(orta)); } void ospf_rt_initort(struct fib_node *fn) { ort *ri = (ort *) fn; - reset_ri(&ri->n); - reset_ri(&ri->o); + reset_ri(ri); + ri->old_rta = NULL; ri->fn.x0 = 0; } +static inline int +unresolved_vlink(struct mpnh *nhs) +{ + return nhs && !nhs->iface; +} + +static inline struct mpnh * +new_nexthop(struct proto_ospf *po, ip_addr gw, struct iface *iface, unsigned char weight) +{ + struct mpnh *nh = lp_alloc(po->nhpool, sizeof(struct mpnh)); + nh->gw = gw; + nh->iface = iface; + nh->next = NULL; + nh->weight = weight; + return nh; +} + +static inline struct mpnh * +copy_nexthop(struct proto_ospf *po, struct mpnh *src) +{ + struct mpnh *nh = lp_alloc(po->nhpool, sizeof(struct mpnh)); + nh->gw = src->gw; + nh->iface = src->iface; 
+ nh->next = NULL; + nh->weight = src->weight; + return nh; +} + /* If new is better return 1 */ static int @@ -234,8 +259,7 @@ add_network(struct ospf_area *oa, ip_addr px, int pxlen, int metric, struct top_ .tag = 0, .rid = en->lsa.rt, .oa = oa, - .ifa = en->nhi, - .nh = en->nh + .nhs = en->nhs }; if (en == oa->rt) @@ -248,8 +272,8 @@ add_network(struct ospf_area *oa, ip_addr px, int pxlen, int metric, struct top_ * be removed in rt_sync(). */ - nf.ifa = find_stub_src(oa, px, pxlen); - nf.nh = IPA_NONE; + struct ospf_iface *ifa = find_stub_src(oa, px, pxlen); + nf.nhs = ifa ? new_nexthop(oa->po, IPA_NONE, ifa->iface, ifa->ecmp_weight) : NULL; } ri_install_net(oa->po, px, pxlen, &nf); @@ -372,7 +396,7 @@ ospf_rt_spfa_rtlinks(struct ospf_area *oa, struct top_hash_entry *act, struct to if (tmp) DBG("Going to add cand, Mydist: %u, Req: %u\n", tmp->dist, act->dist + rtl->metric); - add_cand(&oa->cand, tmp, act, act->dist + rtl->metric, oa); + add_cand(&oa->cand, tmp, act, act->dist + rtl->metric, oa, rtl); } } @@ -439,8 +463,7 @@ ospf_rt_spfa(struct ospf_area *oa) .tag = 0, .rid = act->lsa.rt, .oa = oa, - .ifa = act->nhi, - .nh = act->nh + .nhs = act->nhs }; ri_install_rt(oa, act->lsa.rt, &nf); } @@ -471,7 +494,7 @@ ospf_rt_spfa(struct ospf_area *oa) DBG("Found :-)\n"); else DBG("Not found!\n"); - add_cand(&oa->cand, tmp, act, act->dist, oa); + add_cand(&oa->cand, tmp, act, act->dist, oa, NULL); } break; } @@ -661,8 +684,7 @@ ospf_rt_sum(struct ospf_area *oa) .tag = 0, .rid = en->lsa.rt, /* ABR ID */ .oa = oa, - .ifa = abr->n.ifa, - .nh = abr->n.nh + .nhs = abr->n.nhs }; if (type == ORT_NET) @@ -762,13 +784,18 @@ ospf_rt_sum_tr(struct ospf_area *oa) metric = abr->n.metric1 + metric; /* IAC */ /* 16.3. 
(5) */ - if (metric <= re->n.metric1) + if ((metric < re->n.metric1) || + ((metric == re->n.metric1) && unresolved_vlink(re->n.nhs))) { /* We want to replace the next-hop even if the metric is equal - to replace a virtual next-hop through vlink with a real one */ + to replace a virtual next-hop through vlink with a real one. + Proper ECMP would merge nexthops here, but we do not do that. + We restrict nexthops to fit one area to simplify check + 12.4.3 p4 in decide_sum_lsa() */ + re->n.metric1 = metric; - re->n.nh = abr->n.nh; - re->n.ifa = abr->n.ifa; + re->n.voa = oa; + re->n.nhs = abr->n.nhs; } } } @@ -811,7 +838,7 @@ decide_sum_lsa(struct ospf_area *oa, ort *nf, int dest) return 0; /* 12.4.3 p4 */ - if (nf->n.ifa && (nf->n.ifa->oa->areaid == oa->areaid)) + if (nf->n.voa && (nf->n.voa->areaid == oa->areaid)) return 0; /* 12.4.3 p5 */ @@ -912,18 +939,20 @@ ospf_check_vlinks(struct proto_ospf *po) struct top_hash_entry *tmp; tmp = ospf_hash_find_rt(po->gr, iface->voa->areaid, iface->vid); - if (tmp && (tmp->color == INSPF) && ipa_nonzero(tmp->lb)) + if (tmp && (tmp->color == INSPF) && ipa_nonzero(tmp->lb) && tmp->nhs) { + struct ospf_iface *nhi = ospf_iface_find(po, tmp->nhs->iface); + if ((iface->state != OSPF_IS_PTP) - || (iface->vifa != tmp->nhi) + || (iface->vifa != nhi) || !ipa_equal(iface->vip, tmp->lb)) { OSPF_TRACE(D_EVENTS, "Vlink peer %R found", tmp->lsa.id); ospf_iface_sm(iface, ISM_DOWN); - iface->vifa = tmp->nhi; - iface->iface = tmp->nhi->iface; - iface->addr = tmp->nhi->addr; - iface->sk = tmp->nhi->sk; + iface->vifa = nhi; + iface->iface = nhi->iface; + iface->addr = nhi->addr; + iface->sk = nhi->sk; iface->cost = tmp->dist; iface->vip = tmp->lb; ospf_iface_sm(iface, ISM_UP); @@ -959,8 +988,8 @@ ospf_rt_abr(struct proto_ospf *po) /* RFC 2328 G.3 - incomplete resolution of virtual next hops */ - if (nf->n.type && nf->n.ifa && (nf->n.ifa->type == OSPF_IT_VLINK)) - reset_ri(&nf->n); + if (nf->n.type && unresolved_vlink(nf->n.nhs)) + reset_ri(nf); /* 
Compute condensed area networks */ @@ -979,7 +1008,7 @@ ospf_rt_abr(struct proto_ospf *po) /* 16.2. (3) */ if (nfi->n.type == RTS_OSPF_IA) - reset_ri(&nfi->n); + reset_ri(nfi); } if (anet->metric < nf->n.metric1) @@ -1055,10 +1084,10 @@ ospf_ext_spf(struct proto_ospf *po) struct proto *p = &po->proto; struct ospf_lsa_ext *le; int pxlen, ebit, rt_fwaddr_valid; - ip_addr ip, nh, rtid, rt_fwaddr; - struct ospf_iface *nhi = NULL; + ip_addr ip, rtid, rt_fwaddr; u32 br_metric, rt_metric, rt_tag; struct ospf_area *atmp; + struct mpnh* nhs = NULL; OSPF_TRACE(D_EVENTS, "Starting routing table calculation for ext routes"); @@ -1119,8 +1148,6 @@ ospf_ext_spf(struct proto_ospf *po) p->name, en->lsa.type, en->lsa.id, en->lsa.rt); continue; } - nhi = NULL; - nh = IPA_NONE; /* 16.4. (3) */ /* If there are more areas, we already precomputed preferred ASBR entries @@ -1138,8 +1165,7 @@ ospf_ext_spf(struct proto_ospf *po) if (!rt_fwaddr_valid) { nf2 = nf1; - nh = nf1->n.nh; - nhi = nf1->n.ifa; + nhs = nf1->n.nhs; br_metric = nf1->n.metric1; } else @@ -1152,12 +1178,13 @@ ospf_ext_spf(struct proto_ospf *po) continue; /* Next-hop is a part of a configured stubnet */ - if (!nf2->n.ifa) + if (!nf2->n.nhs) continue; - /* If nh is zero, it is a device route */ - nh = ipa_nonzero(nf2->n.nh) ? 
nf2->n.nh : rt_fwaddr; - nhi = nf2->n.ifa; + nhs = nf2->n.nhs; + /* If gw is zero, it is a device route */ + if (ipa_zero(nhs->gw)) + nhs = new_nexthop(po, rt_fwaddr, nhs->iface, nhs->weight); br_metric = nf2->n.metric1; } @@ -1183,14 +1210,14 @@ ospf_ext_spf(struct proto_ospf *po) nfa.tag = rt_tag; nfa.rid = en->lsa.rt; nfa.oa = nf1->n.oa; /* undefined in RFC 2328 */ - nfa.ifa = nhi; - nfa.nh = nh; + nfa.voa = NULL; + nfa.nhs = nhs; ri_install_ext(po, ip, pxlen, &nfa); } } -/* Cleanup of routing tables and data Cleanup */ +/* Cleanup of routing tables and data */ void ospf_rt_reset(struct proto_ospf *po) { @@ -1203,9 +1230,8 @@ ospf_rt_reset(struct proto_ospf *po) FIB_WALK(&po->rtf, nftmp) { ri = (ort *) nftmp; - memcpy(&ri->o, &ri->n, sizeof(orta)); /* Backup old data */ ri->fn.x0 = 0; - reset_ri(&ri->n); + reset_ri(ri); } FIB_WALK_END; @@ -1214,8 +1240,7 @@ ospf_rt_reset(struct proto_ospf *po) { en->color = OUTSPF; en->dist = LSINFINITY; - en->nhi = NULL; - en->nh = IPA_NONE; + en->nhs = NULL; en->lb = IPA_NONE; } @@ -1225,8 +1250,7 @@ ospf_rt_reset(struct proto_ospf *po) FIB_WALK(&oa->rtr, nftmp) { ri = (ort *) nftmp; - memcpy(&ri->o, &ri->n, sizeof(orta)); /* Backup old data */ - reset_ri(&ri->n); + reset_ri(ri); } FIB_WALK_END; @@ -1288,15 +1312,220 @@ ospf_rt_spf(struct proto_ospf *po) ospf_ext_spf(po); rt_sync(po); - + lp_flush(po->nhpool); + po->calcrt = 0; } + +static inline int +match_dr(struct ospf_iface *ifa, struct top_hash_entry *en) +{ +#ifdef OSPFv2 + return (ifa->drid == en->lsa.rt) && (ipa_to_u32(ifa->drip) == en->lsa.id); +#else /* OSPFv3 */ + return (ifa->drid == en->lsa.rt) && (ifa->dr_iface_id == en->lsa.id); +#endif +} + + +static inline int +match_rtlink(struct ospf_iface *ifa, struct ospf_lsa_rt_link *rtl) +{ +#ifdef OSPFv2 + return (ifa->type == OSPF_IT_PTP) && (ifa->cost == rtl->metric) && + (((ifa->addr->flags & IA_UNNUMBERED) ? 
ifa->iface->index : + ipa_to_u32(ifa->addr->ip)) == rtl->data); +#else /* OSPFv3 */ + return (ifa->type == OSPF_IT_PTP) && (ifa->cost == rtl->metric) && + (ifa->iface->index == rtl->lif); +#endif +} + +static inline int +inherit_nexthops(struct mpnh *pn) +{ + /* Proper nexthops (with defined GW) or dummy vlink nexthops (without iface) */ + return pn && (ipa_nonzero(pn->gw) || !pn->iface); +} + +static struct mpnh * +calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, + struct top_hash_entry *par, struct ospf_lsa_rt_link *rtl) +{ + // struct proto *p = &oa->po->proto; + struct proto_ospf *po = oa->po; + struct mpnh *pn = par->nhs; + struct ospf_iface *ifa; + u32 rid = en->lsa.rt; + + /* 16.1.1. The next hop calculation */ + DBG(" Next hop calculating for id: %R rt: %R type: %u\n", + en->lsa.id, en->lsa.rt, en->lsa.type); + + /* Usually, we inherit parent nexthops */ + if (inherit_nexthops(pn)) + return pn; + + /* + * There are three cases: + * 1) en is a local network (and par is root) + * 2) en is a ptp or ptmp neighbor (and par is root) + * 3) en is a bcast or nbma neighbor (and par is local network) + */ + + /* The first case - local network */ + if ((en->lsa.type == LSA_T_NET) && (par == oa->rt)) + { + WALK_LIST(ifa, po->iface_list) + if (match_dr(ifa, en)) + return new_nexthop(po, IPA_NONE, ifa->iface, ifa->ecmp_weight); + + return NULL; + } + + /* The second case - ptp or ptmp neighbor */ + if ((en->lsa.type == LSA_T_RT) && (par == oa->rt)) + { + if (rtl->type == LSART_VLNK) + return new_nexthop(po, IPA_NONE, NULL, 0); + + WALK_LIST(ifa, po->iface_list) + if (match_rtlink(ifa, rtl)) + { + struct ospf_neighbor *m = find_neigh(ifa, rid); + if (m && (m->state == NEIGHBOR_FULL)) + return new_nexthop(po, m->ip, ifa->iface, ifa->ecmp_weight); + } + + return NULL; + } + + /* The third case - bcast or nbma neighbor */ + if ((en->lsa.type == LSA_T_RT) && (par->lsa.type == LSA_T_NET)) + { + /* par->nhs should be defined from parent's calc_next_hop() */ + if 
(!pn) + goto bad; + +#ifdef OSPFv2 + /* + * In this case, next-hop is the same as link-back, which is + * already computed in link_back(). + */ + if (ipa_zero(en->lb)) + goto bad; + + return new_nexthop(po, en->lb, pn->iface, pn->weight); + +#else /* OSPFv3 */ + /* + * Next-hop is taken from lladdr field of Link-LSA, en->lb_id + * is computed in link_back(). + */ + struct top_hash_entry *lhe; + lhe = ospf_hash_find(po->gr, pn->iface->index, en->lb_id, rid, LSA_T_LINK); + + if (!lhe) + return NULL; + + struct ospf_lsa_link *llsa = lhe->lsa_body; + + if (ipa_zero(llsa->lladdr)) + return NULL; + + return new_nexthop(po, llsa->lladdr, pn->iface, pn->weight); +#endif + } + + bad: + /* Probably bug or some race condition, we log it */ + log(L_ERR "Unexpected case in next hop calculation"); + return NULL; +} + +/* Compare nexthops during merge. + We need to maintain nhs sorted to eliminate duplicates */ +static int +cmp_nhs(struct mpnh *s1, struct mpnh *s2) +{ + int r; + + if (!s1) + return 1; + + if (!s2) + return -1; + + r = ((int) s2->weight) - ((int) s1->weight); + if (r) + return r; + + r = ipa_compare(s1->gw, s2->gw); + if (r) + return r; + + return ((int) s1->iface->index) - ((int) s2->iface->index); +} + +static void +merge_nexthops(struct proto_ospf *po, struct top_hash_entry *en, + struct top_hash_entry *par, struct mpnh *new) +{ + if (en->nhs == new) + return; + + int r1 = en->nhs_reuse; + int r2 = (par->nhs != new); + int count = po->ecmp; + struct mpnh *s1 = en->nhs; + struct mpnh *s2 = new; + struct mpnh **n = &(en->nhs); + + /* + * r1, r2 signalize whether we can reuse nexthops from s1, s2. + * New nexthops (s2, new) can be reused if they are not inherited + * from the parent (i.e. it is allocated in calc_next_hop()). + * Current nexthops (s1, en->nhs) can be reused if they weren't + * inherited in previous steps (that is stored in nhs_reuse, + * i.e. created by merging or allocated in calc_next_hop()). 
+ * + * Generally, a node first inherits shared nexthops from its + * parent and later possibly gets reusable copy during merging. + */ + + while ((s1 || s2) && count--) + { + int cmp = cmp_nhs(s1, s2); + if (cmp < 0) + { + *n = r1 ? s1 : copy_nexthop(po, s1); + s1 = s1->next; + } + else if (cmp > 0) + { + *n = r2 ? s2 : copy_nexthop(po, s2); + s2 = s2->next; + } + else + { + *n = r1 ? s1 : (r2 ? s2 : copy_nexthop(po, s1)); + s1 = s1->next; + s2 = s2->next; + } + n = &((*n)->next); + } + *n = NULL; + + en->nhs_reuse=1; +} + /* Add LSA into list of candidates in Dijkstra's algorithm */ static void add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, - u32 dist, struct ospf_area *oa) + u32 dist, struct ospf_area *oa, struct ospf_lsa_rt_link *rtl) { + struct proto_ospf *po = oa->po; node *prev, *n; int added = 0; struct top_hash_entry *act; @@ -1321,24 +1550,48 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, return; /* 16.1. (2d), also checks that dist < LSINFINITY */ - if (dist >= en->dist) + if (dist > en->dist) return; - /* - * The line above (=) is not a bug, but we don't support multiple - * next hops. I'll start as soon as nest will - */ /* We should check whether there is a reverse link from en to par, */ if (!link_back(oa, en, par)) return; - if (!calc_next_hop(oa, en, par)) + struct mpnh *nhs = calc_next_hop(oa, en, par, rtl); + if (!nhs) { log(L_WARN "Cannot find next hop for LSA (Type: %04x, Id: %R, Rt: %R)", en->lsa.type, en->lsa.id, en->lsa.rt); return; } + if (dist == en->dist) + { + /* + * For multipath, we should merge nexthops. We do not mix dummy + * vlink nexthops, device nexthops and gateway nexthops. We merge + * gateway nexthops only. We prefer device nexthops over gateway + * nexthops and gateway nexthops over vlink nexthops. We either + * keep old nexthops, merge old and new, or replace old with new. + * + * We know that en->color == CANDIDATE and en->nhs is defined. 
+ */ + struct mpnh *onhs = en->nhs; + + /* Keep old ones */ + if (!po->ecmp || !nhs->iface || (onhs->iface && ipa_zero(onhs->gw))) + return; + + /* Merge old and new */ + if (ipa_nonzero(nhs->gw) && ipa_nonzero(onhs->gw)) + { + merge_nexthops(po, en, par, nhs); + return; + } + + /* Fallback to replace old ones */ + } + DBG(" Adding candidate: rt: %R, id: %R, type: %u\n", en->lsa.rt, en->lsa.id, en->lsa.type); @@ -1346,8 +1599,10 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, { /* We found a shorter path */ rem_node(&en->cn); } + en->nhs = nhs; en->dist = dist; en->color = CANDIDATE; + en->nhs_reuse = (par->nhs != nhs); prev = NULL; @@ -1361,8 +1616,7 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, { act = SKIP_BACK(struct top_hash_entry, cn, n); if ((act->dist > dist) || - ((act->dist == dist) && (act->lsa.type == LSA_T_NET))) - /* FIXME - shouldn't be here LSA_T_RT ??? */ + ((act->dist == dist) && (act->lsa.type == LSA_T_RT))) { if (prev == NULL) add_head(l, &en->cn); @@ -1381,132 +1635,16 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, } } - static inline int -match_dr(struct ospf_iface *ifa, struct top_hash_entry *en) +ort_changed(ort *nf, rta *nr) { -#ifdef OSPFv2 - return (ifa->drid == en->lsa.rt) && (ipa_to_u32(ifa->drip) == en->lsa.id); -#else /* OSPFv3 */ - return (ifa->drid == en->lsa.rt) && (ifa->dr_iface_id == en->lsa.id); -#endif -} - -static int -calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, - struct top_hash_entry *par) -{ - // struct proto *p = &oa->po->proto; - struct ospf_neighbor *neigh, *m; - struct proto_ospf *po = oa->po; - struct ospf_iface *ifa; - - /* 16.1.1. 
The next hop calculation */ - DBG(" Next hop called.\n"); - if (ipa_zero(par->nh)) - { - u32 rid = en->lsa.rt; - DBG(" Next hop calculating for id: %R rt: %R type: %u\n", - en->lsa.id, en->lsa.rt, en->lsa.type); - - /* - * There are three cases: - * 1) en is a local network (and par is root) - * 2) en is a ptp or ptmp neighbor (and par is root) - * 3) en is a bcast or nbma neighbor (and par is local network) - */ - - /* The first case - local network */ - if ((en->lsa.type == LSA_T_NET) && (par == oa->rt)) - { - WALK_LIST(ifa, po->iface_list) - if (match_dr(ifa, en)) - { - en->nh = IPA_NONE; - en->nhi = ifa; - return 1; - } - return 0; - } - - /* The second case - ptp or ptmp neighbor */ - if ((en->lsa.type == LSA_T_RT) && (par == oa->rt)) - { - /* - * We don't know which iface was used to reach this neighbor - * (there might be more parallel ifaces) so we will find - * the best PTP iface with given fully adjacent neighbor. - */ - neigh = NULL; - WALK_LIST(ifa, po->iface_list) - if ((ifa->type == OSPF_IT_PTP) || (ifa->type == OSPF_IT_VLINK)) - { - m = find_neigh(ifa, rid); - if (m && (m->state == NEIGHBOR_FULL)) - { - if (!neigh || (m->ifa->cost < neigh->ifa->cost)) - neigh = m; - } - } - - if (!neigh) - return 0; - - en->nh = neigh->ip; - en->nhi = neigh->ifa; - return 1; - } - - /* The third case - bcast or nbma neighbor */ - if ((en->lsa.type == LSA_T_RT) && (par->lsa.type == LSA_T_NET)) - { - /* par->nhi should be defined from parent's calc_next_hop() */ - if (!par->nhi) - goto bad; - -#ifdef OSPFv2 - /* - * In this case, next-hop is the same as link-back, which is - * already computed in link_back(). - */ - if (ipa_zero(en->lb)) - goto bad; - - en->nh = en->lb; - en->nhi = par->nhi; - return 1; - -#else /* OSPFv3 */ - /* - * Next-hop is taken from lladdr field of Link-LSA, en->lb_id - * is computed in link_back(). 
- */ - struct top_hash_entry *lhe; - lhe = ospf_hash_find(po->gr, par->nhi->iface->index, en->lb_id, rid, LSA_T_LINK); - - if (!lhe) - return 0; - - struct ospf_lsa_link *llsa = lhe->lsa_body; - - if (ipa_zero(llsa->lladdr)) - return 0; - - en->nh = llsa->lladdr; - en->nhi = par->nhi; - return 1; -#endif - } - - bad: - /* Probably bug or some race condition, we log it */ - log(L_ERR "Unexpected case in next hop calculation"); - return 0; - } - - en->nh = par->nh; - en->nhi = par->nhi; - return 1; + rta *or = nf->old_rta; + return !or || + (nf->n.metric1 != nf->old_metric1) || (nf->n.metric2 != nf->old_metric2) || + (nf->n.tag != nf->old_tag) || (nf->n.rid != nf->old_rid) || + (nr->source != or->source) || (nr->dest != or->dest) || + (nr->iface != or->iface) || !ipa_equal(nr->gw, or->gw) || + !mpnh_same(nr->nexthops, or->nexthops); } static void @@ -1530,57 +1668,83 @@ again1: { nf = (ort *) nftmp; - /* Sanity check of next-hop address */ - if (nf->n.type && ipa_nonzero(nf->n.nh)) + /* Sanity check of next-hop addresses, failure should not happen */ + if (nf->n.type) { - neighbor *ng = neigh_find2(p, &nf->n.nh, nf->n.ifa->iface, 0); - if (!ng || (ng->scope == SCOPE_HOST)) - reset_ri(&nf->n); + struct mpnh *nh; + for (nh = nf->n.nhs; nh; nh = nh->next) + if (ipa_nonzero(nh->gw)) + { + neighbor *ng = neigh_find2(p, &nh->gw, nh->iface, 0); + if (!ng || (ng->scope == SCOPE_HOST)) + { reset_ri(nf); break; } + } } if (po->areano > 1) check_sum_net_lsa(po, nf); /* Remove configured stubnets */ - if (!nf->n.ifa) - reset_ri(&nf->n); + if (!nf->n.nhs) + reset_ri(nf); - if (reload || memcmp(&nf->n, &nf->o, sizeof(orta))) + if (nf->n.type) /* Add the route */ { - net *ne = net_get(p->table, nf->fn.prefix, nf->fn.pxlen); + rta a0 = { + .proto = p, + .source = nf->n.type, + .scope = SCOPE_UNIVERSE, + .cast = RTC_UNICAST, + }; - if (nf->n.type) /* Add the route */ + if (nf->n.nhs->next) { - rta a0 = { - .proto = p, - .source = nf->n.type, - .scope = SCOPE_UNIVERSE, - .cast = 
RTC_UNICAST, - .iface = nf->n.ifa->iface - }; + a0.dest = RTD_MULTIPATH; + a0.nexthops = nf->n.nhs; + } + else if (ipa_nonzero(nf->n.nhs->gw)) + { + a0.dest = RTD_ROUTER; + a0.iface = nf->n.nhs->iface; + a0.gw = nf->n.nhs->gw; + } + else + { + a0.dest = RTD_DEVICE; + a0.iface = nf->n.nhs->iface; + } - if (ipa_nonzero(nf->n.nh)) - { - a0.dest = RTD_ROUTER; - a0.gw = nf->n.nh; - } - else - a0.dest = RTD_DEVICE; + if (reload || ort_changed(nf, &a0)) + { + net *ne = net_get(p->table, nf->fn.prefix, nf->fn.pxlen); + rta *a = rta_lookup(&a0); + rte *e = rte_get_temp(a); - rte *e = rte_get_temp(&a0); - e->u.ospf.metric1 = nf->n.metric1; - e->u.ospf.metric2 = nf->n.metric2; - e->u.ospf.tag = nf->n.tag; - e->u.ospf.router_id = nf->n.rid; + rta_free(nf->old_rta); + nf->old_rta = rta_clone(a); + e->u.ospf.metric1 = nf->old_metric1 = nf->n.metric1; + e->u.ospf.metric2 = nf->old_metric2 = nf->n.metric2; + e->u.ospf.tag = nf->old_tag = nf->n.tag; + e->u.ospf.router_id = nf->old_rid = nf->n.rid; e->pflags = 0; e->net = ne; e->pref = p->preference; + + + DBG("Mod rte type %d - %I/%d via %I on iface %s, met %d\n", a0.source, nf->fn.prefix, nf->fn.pxlen, a0.gw, a0.iface ? a0.iface->name : "(none)", nf->n.metric1); rte_update(p->table, ne, p, p, e); } - else /* Remove the route */ - rte_update(p->table, ne, p, p, NULL); + } + else if (nf->old_rta) + { + /* Remove the route */ + rta_free(nf->old_rta); + nf->old_rta = NULL; + + net *ne = net_get(p->table, nf->fn.prefix, nf->fn.pxlen); + rte_update(p->table, ne, p, p, NULL); } /* Remove unused rt entry. Entries with fn.x0 == 1 are persistent. 
*/ diff --git a/proto/ospf/rt.h b/proto/ospf/rt.h index 78156c75..bf234f58 100644 --- a/proto/ospf/rt.h +++ b/proto/ospf/rt.h @@ -40,21 +40,32 @@ typedef struct orta u32 tag; u32 rid; /* Router ID of real advertising router */ struct ospf_area *oa; - struct ospf_iface *ifa; /* Outgoing interface */ - ip_addr nh; /* Next hop */ + struct ospf_area *voa; /* Used when route is replaced in ospf_rt_sum_tr(), + NULL otherwise */ + struct mpnh *nhs; /* Next hops computed during SPF */ } orta; +// struct ospf_iface *ifa; /* Outgoing interface */ +// ip_addr nh; /* Next hop */ + + typedef struct ort { /* * We use fn.x0 to mark persistent rt entries, that are needed for summary * LSAs that don't have 'proper' rt entry (area networks + default to stubs) * to keep uid stable (used for LSA ID in OSPFv3 - see fibnode_to_lsaid()). + * + * old_* values are here to represent the last route update. old_rta + * is cached (we keep reference), mainly for multipath nexthops. + * old_rta == NULL means route was not in the last update, in that + * case other old_* values are not valid. 
*/ struct fib_node fn; orta n; - orta o; + u32 old_metric1, old_metric2, old_tag, old_rid; + rta *old_rta; } ort; @@ -64,18 +75,24 @@ ort; * - only router, network and AS-external LSAs * - lsa.age < LSA_MAXAGE * - dist < LSINFINITY (or 2*LSINFINITY for ext-LSAs) - * - nhi are non-NULL unless the node is oa->rt (calculating router itself) - * - beware, nhi is not valid after SPF calculation - * - nh is IFA_NONE iff the node is a local network + * - nhs is non-NULL unless the node is oa->rt (calculating router itself) + * - beware, nhs is not valid after SPF calculation * * Invariants for structs orta nodes of fib tables po->rtf, oa->rtr: * - nodes may be invalid (fn.type == 0), in that case other invariants don't hold * - n.metric1 may be at most a small multiple of LSINFINITY, * therefore sums do not overflow * - n.oa is always non-NULL - * - n.ifa is always non-NULL with one exception - configured stubnet - nodes (in po->rtf). In that case, n.nh is IFA_NONE. + * - n.nhs is always non-NULL with one exception - configured stubnet + * nodes (in po->rtf). * - oa->rtr does not contain calculating router itself + * + * There are three types of nexthops in nhs fields: + * - gateway nexthops (non-NULL iface, gw != IPA_NONE) + * - device nexthops (non-NULL iface, gw == IPA_NONE) + * - dummy vlink nexthops (NULL iface, gw == IPA_NONE) + * These three types don't mix, nhs field contains either + * one device, one vlink node, or one/more gateway nodes. 
*/ void ospf_rt_spf(struct proto_ospf *po); diff --git a/proto/ospf/topology.c b/proto/ospf/topology.c index 51e96c7f..e604bf87 100644 --- a/proto/ospf/topology.c +++ b/proto/ospf/topology.c @@ -1674,14 +1674,12 @@ ospf_hash_get(struct top_graph *f, u32 domain, u32 lsa, u32 rtr, u32 type) e = sl_alloc(f->hash_slab); e->color = OUTSPF; e->dist = LSINFINITY; - e->nhi = NULL; - e->nh = IPA_NONE; + e->nhs = NULL; e->lb = IPA_NONE; e->lsa.id = lsa; e->lsa.rt = rtr; e->lsa.type = type; e->lsa_body = NULL; - e->nhi = NULL; e->domain = domain; e->next = *ee; *ee = e; diff --git a/proto/ospf/topology.h b/proto/ospf/topology.h index b185c7f3..9521e3eb 100644 --- a/proto/ospf/topology.h +++ b/proto/ospf/topology.h @@ -20,9 +20,8 @@ struct top_hash_entry // struct ospf_area *oa; void *lsa_body; bird_clock_t inst_t; /* Time of installation into DB */ - ip_addr nh; /* Next hop */ + struct mpnh *nhs; /* Computed nexthops - valid only in ospf_rt_spf() */ ip_addr lb; /* In OSPFv2, link back address. In OSPFv3, any global address in the area useful for vlinks */ - struct ospf_iface *nhi; /* Next hop interface - valid only in ospf_rt_spf()*/ #ifdef OSPFv3 u32 lb_id; /* Interface ID of link back iface (for bcast or NBMA networks) */ #endif @@ -32,7 +31,8 @@ struct top_hash_entry #define OUTSPF 0 #define CANDIDATE 1 #define INSPF 2 - u8 padding; + u8 nhs_reuse; /* Whether nhs nodes can be reused during merging. + See a note in rt.c:merge_nexthops() */ }; struct top_graph