diff --git a/README b/README index daeb18bd..5d4bd9b2 100644 --- a/README +++ b/README @@ -64,6 +64,7 @@ What do we support: o Static routes o Inter-table protocol o IPv6 router advertisements + o Bidirectional Forwarding Detection (BFD) o Command-line interface (using the `birdc' client; to get some help, just press `?') o Soft reconfiguration -- no online commands for changing the diff --git a/conf/cf-lex.l b/conf/cf-lex.l index 61e04075..5a2a4d6b 100644 --- a/conf/cf-lex.l +++ b/conf/cf-lex.l @@ -70,7 +70,7 @@ struct sym_scope { static struct sym_scope *conf_this_scope; static int cf_hash(byte *c); -static struct symbol *cf_find_sym(byte *c, unsigned int h0); +static inline struct symbol * cf_get_sym(byte *c, uint h0); linpool *cfg_mem; @@ -194,7 +194,7 @@ else: { } k=k->next; } - cf_lval.s = cf_find_sym(yytext, h); + cf_lval.s = cf_get_sym(yytext, h); return SYM; } @@ -426,8 +426,9 @@ check_eof(void) } static struct symbol * -cf_new_sym(byte *c, unsigned int h) +cf_new_sym(byte *c, uint h0) { + uint h = h0 & (SYM_HASH_SIZE-1); struct symbol *s, **ht; int l; @@ -449,56 +450,77 @@ cf_new_sym(byte *c, unsigned int h) } static struct symbol * -cf_find_sym(byte *c, unsigned int h0) +cf_find_sym(struct config *cfg, byte *c, uint h0) { - unsigned int h = h0 & (SYM_HASH_SIZE-1); + uint h = h0 & (SYM_HASH_SIZE-1); struct symbol *s, **ht; - if (ht = new_config->sym_hash) + if (ht = cfg->sym_hash) { for(s = ht[h]; s; s=s->next) if (!strcmp(s->name, c) && s->scope->active) return s; } - if (new_config->sym_fallback) + if (ht = cfg->sym_fallback) { /* We know only top-level scope is active */ - for(s = new_config->sym_fallback[h]; s; s=s->next) + for(s = ht[h]; s; s=s->next) if (!strcmp(s->name, c) && s->scope->active) return s; } - return cf_new_sym(c, h); + + return NULL; +} + +static inline struct symbol * +cf_get_sym(byte *c, uint h0) +{ + return cf_find_sym(new_config, c, h0) ?: cf_new_sym(c, h0); } /** * cf_find_symbol - find a symbol by name + * @cfg: specificed config * @c: symbol name * - * This functions searches the symbol table for a symbol of given - * name. First it examines the current scope, then the second recent - * one and so on until it either finds the symbol and returns a pointer - * to its &symbol structure or reaches the end of the scope chain - * and returns %NULL to signify no match. + * This functions searches the symbol table in the config @cfg for a symbol of + * given name. First it examines the current scope, then the second recent one + * and so on until it either finds the symbol and returns a pointer to its + * &symbol structure or reaches the end of the scope chain and returns %NULL to + * signify no match. */ struct symbol * -cf_find_symbol(byte *c) +cf_find_symbol(struct config *cfg, byte *c) { - return cf_find_sym(c, cf_hash(c)); + return cf_find_sym(cfg, c, cf_hash(c)); +} + +/** + * cf_get_symbol - get a symbol by name + * @c: symbol name + * + * This functions searches the symbol table of the currently parsed config + * (@new_config) for a symbol of given name. It returns either the already + * existing symbol or a newly allocated undefined (%SYM_VOID) symbol if no + * existing symbol is found. + */ +struct symbol * +cf_get_symbol(byte *c) +{ + return cf_get_sym(c, cf_hash(c)); } struct symbol * cf_default_name(char *template, int *counter) { - char buf[32]; + char buf[SYM_MAX_LEN]; struct symbol *s; char *perc = strchr(template, '%'); for(;;) { bsprintf(buf, template, ++(*counter)); - s = cf_find_sym(buf, cf_hash(buf)); - if (!s) - break; + s = cf_get_sym(buf, cf_hash(buf)); if (s->class == SYM_VOID) return s; if (!perc) @@ -529,7 +551,7 @@ cf_define_symbol(struct symbol *sym, int type, void *def) { if (sym->scope == conf_this_scope) cf_error("Symbol already defined"); - sym = cf_new_sym(sym->name, cf_hash(sym->name) & (SYM_HASH_SIZE-1)); + sym = cf_new_sym(sym->name, cf_hash(sym->name)); } sym->class = type; sym->def = def; diff --git a/conf/conf.c b/conf/conf.c index 6933ef3c..b4474ce6 100644 --- a/conf/conf.c +++ b/conf/conf.c @@ -20,19 +20,19 @@ * * There can exist up to four different configurations at one time: an active * one (pointed to by @config), configuration we are just switching from - * (@old_config), one queued for the next reconfiguration (@future_config; - * if there is one and the user wants to reconfigure once again, we just - * free the previous queued config and replace it with the new one) and - * finally a config being parsed (@new_config). The stored @old_config - * is also used for undo reconfiguration, which works in a similar way. - * Reconfiguration could also have timeout (using @config_timer) and undo - * is automatically called if the new configuration is not confirmed later. + * (@old_config), one queued for the next reconfiguration (@future_config; if + * there is one and the user wants to reconfigure once again, we just free the + * previous queued config and replace it with the new one) and finally a config + * being parsed (@new_config). The stored @old_config is also used for undo + * reconfiguration, which works in a similar way. Reconfiguration could also + * have timeout (using @config_timer) and undo is automatically called if the + * new configuration is not confirmed later. The new config (@new_config) and + * associated linear pool (@cfg_mem) is non-NULL only during parsing. * - * Loading of new configuration is very simple: just call config_alloc() - * to get a new &config structure, then use config_parse() to parse a - * configuration file and fill all fields of the structure - * and finally ask the config manager to switch to the new - * config by calling config_commit(). + * Loading of new configuration is very simple: just call config_alloc() to get + * a new &config structure, then use config_parse() to parse a configuration + * file and fill all fields of the structure and finally ask the config manager + * to switch to the new config by calling config_commit(). * * CLI commands are parsed in a very similar way -- there is also a stripped-down * &config structure associated with them and they are lex-ed and parsed by the @@ -86,10 +86,15 @@ config_alloc(byte *name) linpool *l = lp_new(p, 4080); struct config *c = lp_allocz(l, sizeof(struct config)); + /* Duplication of name string in local linear pool */ + uint nlen = strlen(name) + 1; + char *ndup = lp_allocu(l, nlen); + memcpy(ndup, name, nlen); + c->mrtdump_file = -1; /* Hack, this should be sysdep-specific */ c->pool = p; - cfg_mem = c->mem = l; - c->file_name = cfg_strdup(name); + c->mem = l; + c->file_name = ndup; c->load_time = now; c->tf_route = c->tf_proto = (struct timeformat){"%T", "%F", 20*3600}; c->tf_base = c->tf_log = (struct timeformat){"%F %T", NULL, 0}; @@ -114,11 +119,13 @@ config_alloc(byte *name) int config_parse(struct config *c) { + int done = 0; DBG("Parsing configuration file `%s'\n", c->file_name); new_config = c; cfg_mem = c->mem; if (setjmp(conf_jmpbuf)) - return 0; + goto cleanup; + cf_lex_init(0, c); sysdep_preconfig(c); protos_preconfig(c); @@ -132,7 +139,12 @@ config_parse(struct config *c) if (!c->router_id) cf_error("Router ID must be configured manually on IPv6 routers"); #endif - return 1; + done = 1; + +cleanup: + new_config = NULL; + cfg_mem = NULL; + return done; } /** @@ -145,14 +157,22 @@ config_parse(struct config *c) int cli_parse(struct config *c) { - new_config = c; + int done = 0; c->sym_fallback = config->sym_hash; + new_config = c; cfg_mem = c->mem; if (setjmp(conf_jmpbuf)) - return 0; + goto cleanup; + cf_lex_init(1, c); cf_parse(); - return 1; + done = 1; + +cleanup: + c->sym_fallback = NULL; + new_config = NULL; + cfg_mem = NULL; + return done; } /** @@ -232,10 +252,6 @@ config_do_commit(struct config *c, int type) if (old_config && !config->shutdown) log(L_INFO "Reconfiguring"); - /* This should not be necessary, but it seems there are some - functions that access new_config instead of config */ - new_config = config; - if (old_config) old_config->obstacle_count++; @@ -249,9 +265,6 @@ config_do_commit(struct config *c, int type) DBG("protos_commit\n"); protos_commit(c, old_config, force_restart, type); - /* Just to be sure nobody uses that now */ - new_config = NULL; - int obs = 0; if (old_config) obs = --old_config->obstacle_count; diff --git a/conf/conf.h b/conf/conf.h index cca6da81..867d8f4d 100644 --- a/conf/conf.h +++ b/conf/conf.h @@ -149,7 +149,9 @@ int cf_lex(void); void cf_lex_init(int is_cli, struct config *c); void cf_lex_unwind(void); -struct symbol *cf_find_symbol(byte *c); +struct symbol *cf_find_symbol(struct config *cfg, byte *c); + +struct symbol *cf_get_symbol(byte *c); struct symbol *cf_default_name(char *template, int *counter); struct symbol *cf_define_symbol(struct symbol *symbol, int type, void *def); void cf_push_scope(struct symbol *); diff --git a/doc/bird.sgml b/doc/bird.sgml index 1c2dda4b..df83aacd 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -1571,7 +1571,7 @@ RFC 4271 It also supports the community attributes (RFC 1997), capability negotiation -(RFC 3392), +(RFC 5492), MD5 password authentication (RFC 2385), extended communities @@ -1707,7 +1707,11 @@ using the following configuration parameters: scan time - Time in seconds between two scans of the network interface list. On systems where we are notified about interface status changes asynchronously (such as newer versions of Linux), we need to scan the @@ -2227,6 +2230,18 @@ limitations can be overcome using another routing table and the pipe protocol. a graceful restart recovery is active, the Kernel protocol will defer synchronization of routing tables until the end of the recovery. Note that import of kernel routes to BIRD is not affected. + + merge paths switch [limit number] + Usually, only best routes are exported to the kernel protocol. With path + merging enabled, both best routes and equivalent non-best routes are + merged during export to generate one ECMP (equal-cost multipath) route + for each network. This is useful e.g. for BGP multipath. Note that best + routes are still pivotal for route export (responsible for most + properties of resulting ECMP routes), while exported non-best routes are + responsible just for additional multipath next hops. This option also + allows to specify a limit on maximal number of nexthops in one route. By + default, multipath merging is disabled. If enabled, default value of the + limit is 16. Attributes @@ -2254,6 +2269,20 @@ these attributes: The realm of the route. Can be used for traffic classification. +

In Linux, there is also a plenty of obscure route attributes mostly focused +on tuning TCP performance of local connections. BIRD supports most of these +attributes, see Linux or iproute2 documentation for their meaning. Attributes +Example

A simple configuration can look this way: @@ -3382,7 +3411,9 @@ of the protocol contains mainly a list of static routes: route - Static route through a neighboring router. + Static route through a neighboring router. For link-local next hops, + interface can be specified as a part of the address (e.g., + route Static multipath route. Contains several nexthops (gateways), possibly diff --git a/filter/filter.c b/filter/filter.c index 3b14fc0c..55062aca 100644 --- a/filter/filter.c +++ b/filter/filter.c @@ -471,26 +471,22 @@ static inline void f_rte_cow(void) static void f_rta_cow(void) { - if ((*f_rte)->attrs->aflags & RTAF_CACHED) { + if (!rta_is_cached((*f_rte)->attrs)) + return; - /* Prepare to modify rte */ - f_rte_cow(); + /* Prepare to modify rte */ + f_rte_cow(); - /* Store old rta to free it later */ - f_old_rta = (*f_rte)->attrs; + /* Store old rta to free it later, it stores reference from rte_cow() */ + f_old_rta = (*f_rte)->attrs; - /* - * Alloc new rta, do shallow copy and update rte. Fields eattrs - * and nexthops of rta are shared with f_old_rta (they will be - * copied when the cached rta will be obtained at the end of - * f_run()), also the lock of hostentry is inherited (we suppose - * hostentry is not changed by filters). - */ - rta *ra = lp_alloc(f_pool, sizeof(rta)); - memcpy(ra, f_old_rta, sizeof(rta)); - ra->aflags = 0; - (*f_rte)->attrs = ra; - } + /* + * Get shallow copy of rta. Fields eattrs and nexthops of rta are shared + * with f_old_rta (they will be copied when the cached rta will be obtained + * at the end of f_run()), also the lock of hostentry is inherited (we + * suppose hostentry is not changed by filters). + */ + (*f_rte)->attrs = rta_do_cow((*f_rte)->attrs, f_pool); } static struct tbf rl_runtime_err = TBF_DEFAULT_LOG_LIMITS; @@ -1531,6 +1527,30 @@ f_run(struct filter *filter, struct rte **rte, struct ea_list **tmp_attrs, struc return res.val.i; } +/* TODO: perhaps we could integrate f_eval(), f_eval_rte() and f_run() */ + +struct f_val +f_eval_rte(struct f_inst *expr, struct rte **rte, struct linpool *tmp_pool) +{ + struct ea_list *tmp_attrs = NULL; + + f_rte = rte; + f_old_rta = NULL; + f_tmp_attrs = &tmp_attrs; + f_pool = tmp_pool; + f_flags = 0; + + LOG_BUFFER_INIT(f_buf); + + /* Note that in this function we assume that rte->attrs is private / uncached */ + struct f_val res = interpret(expr); + + /* Hack to include EAF_TEMP attributes to the main list */ + (*rte)->attrs->eattrs = ea_append(tmp_attrs, (*rte)->attrs->eattrs); + + return res; +} + struct f_val f_eval(struct f_inst *expr, struct linpool *tmp_pool) { diff --git a/filter/filter.h b/filter/filter.h index 97d5a814..a680808e 100644 --- a/filter/filter.h +++ b/filter/filter.h @@ -107,6 +107,7 @@ struct ea_list; struct rte; int f_run(struct filter *filter, struct rte **rte, struct ea_list **tmp_attrs, struct linpool *tmp_pool, int flags); +struct f_val f_eval_rte(struct f_inst *expr, struct rte **rte, struct linpool *tmp_pool); struct f_val f_eval(struct f_inst *expr, struct linpool *tmp_pool); uint f_eval_int(struct f_inst *expr); u32 f_eval_asn(struct f_inst *expr); diff --git a/filter/filter_test.c b/filter/filter_test.c index a1a1243d..b5c35993 100644 --- a/filter/filter_test.c +++ b/filter/filter_test.c @@ -51,10 +51,10 @@ t_simple(void) */ struct symbol *sym = NULL; - sym = cf_find_symbol(TESTING_FILTER_NAME); + sym = cf_get_symbol(TESTING_FILTER_NAME); struct symbol *sym2 = NULL; - sym2 = cf_find_symbol(TESTING_FILTER_NAME "2"); + sym2 = cf_get_symbol(TESTING_FILTER_NAME "2"); struct filter *f = sym->def; diff --git a/lib/birdlib.h b/lib/birdlib.h index 77d03a8b..24e769db 100644 --- a/lib/birdlib.h +++ b/lib/birdlib.h @@ -33,6 +33,7 @@ #endif #define ABS(a) ((a)>=0 ? (a) : -(a)) +#define DELTA(a,b) (((a)>=(b))?(a)-(b):(b)-(a)) #define ARRAY_SIZE(a) (sizeof(a)/sizeof(*(a))) diff --git a/nest/bfd.h b/nest/bfd.h index 79c3c921..f1e95cb2 100644 --- a/nest/bfd.h +++ b/nest/bfd.h @@ -32,6 +32,12 @@ struct bfd_request { }; +#define BFD_STATE_ADMIN_DOWN 0 +#define BFD_STATE_DOWN 1 +#define BFD_STATE_INIT 2 +#define BFD_STATE_UP 3 + + #ifdef CONFIG_BFD struct bfd_request * bfd_request_session(pool *p, ip_addr addr, ip_addr local, struct iface *iface, void (*hook)(struct bfd_request *), void *data); diff --git a/nest/config.Y b/nest/config.Y index 939bed6a..799a09f9 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -94,6 +94,7 @@ rtrid: idval: NUM { $$ = $1; } + | '(' term ')' { $$ = f_eval_int($2); } | RTRID | IPA { #ifndef IPV6 @@ -102,6 +103,16 @@ idval: cf_error("Router IDs must be entered as hexadecimal numbers or IPv4 addresses in IPv6 version"); #endif } + | SYM { + if ($1->class == (SYM_CONSTANT | T_INT) || $1->class == (SYM_CONSTANT | T_QUAD)) + $$ = SYM_VAL($1).i; +#ifndef IPV6 + else if ($1->class == (SYM_CONSTANT | T_IP)) + $$ = ipa_to_u32(SYM_VAL($1).px.ip); +#endif + else + cf_error("Number of IPv4 address constant expected"); + } ; @@ -183,16 +194,18 @@ proto_name: } | FROM SYM { struct symbol *s = cf_default_name(this_proto->protocol->template, &this_proto->protocol->name_counter); + s->class = this_proto->class; + s->def = this_proto; this_proto->name = s->name; + if (($2->class != SYM_TEMPLATE) && ($2->class != SYM_PROTO)) cf_error("Template or protocol name expected"); proto_copy_config(this_proto, $2->def); } | SYM FROM SYM { - if (($3->class != SYM_TEMPLATE) && ($3->class != SYM_PROTO)) cf_error("Template or protocol name expected"); - cf_define_symbol($1, this_proto->class, this_proto); this_proto->name = $1->name; + if (($3->class != SYM_TEMPLATE) && ($3->class != SYM_PROTO)) cf_error("Template or protocol name expected"); proto_copy_config(this_proto, $3->def); } ; diff --git a/nest/proto.c b/nest/proto.c index 6531083c..d04da333 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -521,7 +521,7 @@ protos_commit(struct config *new, struct config *old, int force_reconfig, int ty WALK_LIST(oc, old->protos) { p = oc->proto; - sym = cf_find_symbol(oc->name); + sym = cf_find_symbol(new, oc->name); if (sym && sym->class == SYM_PROTO && !new->shutdown) { /* Found match, let's check if we can smoothly switch to new configuration */ diff --git a/nest/protocol.h b/nest/protocol.h index a51e9afd..8c49154f 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -158,6 +158,7 @@ struct proto { byte gr_wait; /* Route export to protocol is postponed until graceful restart */ byte down_sched; /* Shutdown is scheduled for later (PDS_*) */ byte down_code; /* Reason for shutdown (PDC_* codes) */ + byte merge_limit; /* Maximal number of nexthops for RA_MERGED */ u32 hash_key; /* Random key used for hashing of neighbors */ bird_clock_t last_state_change; /* Time of last state transition */ char *last_state_name_announced; /* Last state name we've announced to the user */ @@ -200,6 +201,7 @@ struct proto { * rte_recalculate Called at the beginning of the best route selection * rte_better Compare two rte's and decide which one is better (1=first, 0=second). * rte_same Compare two rte's and decide whether they are identical (1=yes, 0=no). + * rte_mergable Compare two rte's and decide whether they could be merged (1=yes, 0=no). * rte_insert Called whenever a rte is inserted to a routing table. * rte_remove Called whenever a rte is removed from the routing table. */ @@ -207,6 +209,7 @@ struct proto { int (*rte_recalculate)(struct rtable *, struct network *, struct rte *, struct rte *, struct rte *); int (*rte_better)(struct rte *, struct rte *); int (*rte_same)(struct rte *, struct rte *); + int (*rte_mergable)(struct rte *, struct rte *); void (*rte_insert)(struct network *, struct rte *); void (*rte_remove)(struct network *, struct rte *); diff --git a/nest/route.h b/nest/route.h index 3af75ad8..d0f8c688 100644 --- a/nest/route.h +++ b/nest/route.h @@ -78,7 +78,7 @@ void fit_put(struct fib_iterator *, struct fib_node *); #define FIB_WALK(fib, z) do { \ struct fib_node *z, **ff = (fib)->hash_table; \ - uint count = (fib)->hash_size; \ + uint count = (fib)->hash_size; \ while (count--) \ for(z = *ff++; z; z=z->next) @@ -88,8 +88,8 @@ void fit_put(struct fib_iterator *, struct fib_node *); #define FIB_ITERATE_START(fib, it, z) do { \ struct fib_node *z = fit_get(fib, it); \ - uint count = (fib)->hash_size; \ - uint hpos = (it)->hash; \ + uint count = (fib)->hash_size; \ + uint hpos = (it)->hash; \ for(;;) { \ if (!z) \ { \ @@ -174,7 +174,7 @@ struct hostentry { ip_addr addr; /* IP address of host, part of key */ ip_addr link; /* (link-local) IP address of host, used as gw if host is directly attached */ - struct rtable *tab; /* Dependent table, part of key*/ + struct rtable *tab; /* Dependent table, part of key */ struct hostentry *next; /* Next in hash chain */ unsigned hash_key; /* Hash key */ unsigned uc; /* Use count */ @@ -240,6 +240,7 @@ static inline int rte_is_filtered(rte *r) { return !!(r->flags & REF_FILTERED); #define RA_OPTIMAL 1 /* Announcement of optimal route change */ #define RA_ACCEPTED 2 /* Announcement of first accepted route */ #define RA_ANY 3 /* Announcement of any route change */ +#define RA_MERGED 4 /* Announcement of optimal route merged with next ones */ /* Return value of import_control() callback */ #define RIC_ACCEPT 1 /* Accepted by protocol */ @@ -263,12 +264,14 @@ void rte_update2(struct announce_hook *ah, net *net, rte *new, struct rte_src *s static inline void rte_update(struct proto *p, net *net, rte *new) { rte_update2(p->main_ahook, net, new, p->main_source); } void rte_discard(rtable *tab, rte *old); int rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter *filter); +rte *rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, struct ea_list **tmpa, int silent); void rt_refresh_begin(rtable *t, struct announce_hook *ah); void rt_refresh_end(rtable *t, struct announce_hook *ah); void rte_dump(rte *); void rte_free(rte *); rte *rte_do_cow(rte *); static inline rte * rte_cow(rte *r) { return (r->flags & REF_COW) ? rte_do_cow(r) : r; } +rte *rte_cow_rta(rte *r, linpool *lp); void rt_dump(rtable *); void rt_dump_all(void); int rt_feed_baby(struct proto *p); @@ -388,6 +391,12 @@ typedef struct rta { #define IGP_METRIC_UNKNOWN 0x80000000 /* Default igp_metric used when no other protocol-specific metric is availabe */ + +/* Route has regular, reachable nexthop (i.e. not RTD_UNREACHABLE and like) */ +static inline int rte_is_reachable(rte *r) +{ uint d = r->attrs->dest; return (d == RTD_ROUTER) || (d == RTD_DEVICE) || (d == RTD_MULTIPATH); } + + /* * Extended Route Attributes */ @@ -482,6 +491,7 @@ void ea_format_bitfield(struct eattr *a, byte *buf, int bufsize, const char **na int mpnh__same(struct mpnh *x, struct mpnh *y); /* Compare multipath nexthops */ static inline int mpnh_same(struct mpnh *x, struct mpnh *y) { return (x == y) || mpnh__same(x, y); } +struct mpnh *mpnh_merge(struct mpnh *x, struct mpnh *y, int rx, int ry, int max, linpool *lp); void rta_init(void); rta *rta_lookup(rta *); /* Get rta equivalent to this one, uc++ */ @@ -489,25 +499,33 @@ static inline int rta_is_cached(rta *r) { return r->aflags & RTAF_CACHED; } static inline rta *rta_clone(rta *r) { r->uc++; return r; } void rta__free(rta *r); static inline void rta_free(rta *r) { if (r && !--r->uc) rta__free(r); } +rta *rta_do_cow(rta *o, linpool *lp); +static inline rta * rta_cow(rta *r, linpool *lp) { return rta_is_cached(r) ? rta_do_cow(r, lp) : r; } void rta_dump(rta *); void rta_dump_all(void); void rta_show(struct cli *, rta *, ea_list *); void rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr *gw, ip_addr *ll); /* - * rta_set_recursive_next_hop() acquires hostentry from hostcache and - * fills rta->hostentry field. New hostentry has zero use - * count. Cached rta locks its hostentry (increases its use count), - * uncached rta does not lock it. Hostentry with zero use count is - * removed asynchronously during host cache update, therefore it is - * safe to hold such hostentry temorarily. Hostentry holds a lock for - * a 'source' rta, mainly to share multipath nexthops. There is no - * need to hold a lock for hostentry->dep table, because that table - * contains routes responsible for that hostentry, and therefore is - * non-empty if given hostentry has non-zero use count. The protocol - * responsible for routes with recursive next hops should also hold a - * lock for a table governing that routes (argument tab to - * rta_set_recursive_next_hop()). + * rta_set_recursive_next_hop() acquires hostentry from hostcache and fills + * rta->hostentry field. New hostentry has zero use count. Cached rta locks its + * hostentry (increases its use count), uncached rta does not lock it. Hostentry + * with zero use count is removed asynchronously during host cache update, + * therefore it is safe to hold such hostentry temorarily. Hostentry holds a + * lock for a 'source' rta, mainly to share multipath nexthops. + * + * There is no need to hold a lock for hostentry->dep table, because that table + * contains routes responsible for that hostentry, and therefore is non-empty if + * given hostentry has non-zero use count. If the hostentry has zero use count, + * the entry is removed before dep is referenced. + * + * The protocol responsible for routes with recursive next hops should hold a + * lock for a 'source' table governing that routes (argument tab to + * rta_set_recursive_next_hop()), because its routes reference hostentries + * (through rta) related to the governing table. When all such routes are + * removed, rtas are immediately removed achieving zero uc. Then the 'source' + * table lock could be immediately released, although hostentries may still + * exist - they will be freed together with the 'source' table. */ static inline void rt_lock_hostentry(struct hostentry *he) { if (he) he->uc++; } diff --git a/nest/rt-attr.c b/nest/rt-attr.c index 85a192c8..7fa05d6d 100644 --- a/nest/rt-attr.c +++ b/nest/rt-attr.c @@ -167,7 +167,7 @@ rt_get_source(struct proto *p, u32 id) src->private_id = id; src->global_id = rte_src_alloc_id(); src->uc = 0; - + HASH_INSERT2(src_hash, RSH, rta_pool, src); return src; @@ -215,6 +215,94 @@ mpnh__same(struct mpnh *x, struct mpnh *y) return x == y; } +static int +mpnh_compare_node(struct mpnh *x, struct mpnh *y) +{ + int r; + + if (!x) + return 1; + + if (!y) + return -1; + + r = ((int) y->weight) - ((int) x->weight); + if (r) + return r; + + r = ipa_compare(x->gw, y->gw); + if (r) + return r; + + return ((int) x->iface->index) - ((int) y->iface->index); +} + +static inline struct mpnh * +mpnh_copy_node(const struct mpnh *src, linpool *lp) +{ + struct mpnh *n = lp_alloc(lp, sizeof(struct mpnh)); + n->gw = src->gw; + n->iface = src->iface; + n->next = NULL; + n->weight = src->weight; + return n; +} + +/** + * mpnh_merge - merge nexthop lists + * @x: list 1 + * @y: list 2 + * @rx: reusability of list @x + * @ry: reusability of list @y + * @max: max number of nexthops + * @lp: linpool for allocating nexthops + * + * The mpnh_merge() function takes two nexthop lists @x and @y and merges them, + * eliminating possible duplicates. The input lists must be sorted and the + * result is sorted too. The number of nexthops in result is limited by @max. + * New nodes are allocated from linpool @lp. + * + * The arguments @rx and @ry specify whether corresponding input lists may be + * consumed by the function (i.e. their nodes reused in the resulting list), in + * that case the caller should not access these lists after that. To eliminate + * issues with deallocation of these lists, the caller should use some form of + * bulk deallocation (e.g. stack or linpool) to free these nodes when the + * resulting list is no longer needed. When reusability is not set, the + * corresponding lists are not modified nor linked from the resulting list. + */ +struct mpnh * +mpnh_merge(struct mpnh *x, struct mpnh *y, int rx, int ry, int max, linpool *lp) +{ + struct mpnh *root = NULL; + struct mpnh **n = &root; + + while ((x || y) && max--) + { + int cmp = mpnh_compare_node(x, y); + if (cmp < 0) + { + *n = rx ? x : mpnh_copy_node(x, lp); + x = x->next; + } + else if (cmp > 0) + { + *n = ry ? y : mpnh_copy_node(y, lp); + y = y->next; + } + else + { + *n = rx ? x : (ry ? y : mpnh_copy_node(x, lp)); + x = x->next; + y = y->next; + } + n = &((*n)->next); + } + *n = NULL; + + return root; +} + + static struct mpnh * mpnh_copy(struct mpnh *o) { @@ -635,7 +723,7 @@ get_generic_attr(eattr *a, byte **buf, int buflen UNUSED) *buf += bsprintf(*buf, "igp_metric"); return GA_NAME; } - + return GA_UNKNOWN; } @@ -741,7 +829,7 @@ ea_show(struct cli *c, eattr *e) } else if (EA_PROTO(e->id)) pos += bsprintf(pos, "%02x.", EA_PROTO(e->id)); - else + else status = get_generic_attr(e, &pos, end - pos); if (status < GA_NAME) @@ -1050,6 +1138,16 @@ rta__free(rta *a) sl_free(rta_slab, a); } +rta * +rta_do_cow(rta *o, linpool *lp) +{ + rta *r = lp_alloc(lp, sizeof(rta)); + memcpy(r, o, sizeof(rta)); + r->aflags = 0; + r->uc = 0; + return r; +} + /** * rta_dump - dump route attributes * @a: attribute structure to dump diff --git a/nest/rt-dev.c b/nest/rt-dev.c index 87ffc5ec..f6bc1432 100644 --- a/nest/rt-dev.c +++ b/nest/rt-dev.c @@ -51,7 +51,10 @@ dev_ifa_notify(struct proto *p, unsigned c, struct ifa *ad) DBG("dev_if_notify: device shutdown: prefix not found\n"); return; } - rte_update(p, n, NULL); + + /* Use iface ID as local source ID */ + struct rte_src *src = rt_get_source(p, ad->iface->index); + rte_update2(p->main_ahook, n, NULL, src); } else if (c & IF_CHANGE_UP) { @@ -61,8 +64,11 @@ dev_ifa_notify(struct proto *p, unsigned c, struct ifa *ad) DBG("dev_if_notify: %s:%I going up\n", ad->iface->name, ad->ip); + /* Use iface ID as local source ID */ + struct rte_src *src = rt_get_source(p, ad->iface->index); + rta a0 = { - .src = p->main_source, + .src = src, .source = RTS_DEVICE, .scope = SCOPE_UNIVERSE, .cast = RTC_UNICAST, @@ -75,7 +81,7 @@ dev_ifa_notify(struct proto *p, unsigned c, struct ifa *ad) e = rte_get_temp(a); e->net = n; e->pflags = 0; - rte_update(p, n, e); + rte_update2(p->main_ahook, n, e, src); } } diff --git a/nest/rt-roa.c b/nest/rt-roa.c index aa453f16..0fd89667 100644 --- a/nest/rt-roa.c +++ b/nest/rt-roa.c @@ -311,7 +311,7 @@ roa_commit(struct config *new, struct config *old) if (old) WALK_LIST(t, roa_table_list) { - struct symbol *sym = cf_find_symbol(t->name); + struct symbol *sym = cf_find_symbol(new, t->name); if (sym && sym->class == SYM_ROA) { /* Found old table in new config */ diff --git a/nest/rt-table.c b/nest/rt-table.c index d5af41c7..10ce400a 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -144,6 +144,38 @@ rte_do_cow(rte *r) return e; } +/** + * rte_cow_rta - get a private writable copy of &rte with writable &rta + * @r: a route entry to be copied + * @lp: a linpool from which to allocate &rta + * + * rte_cow_rta() takes a &rte and prepares it and associated &rta for + * modification. There are three possibilities: First, both &rte and &rta are + * private copies, in that case they are returned unchanged. Second, &rte is + * private copy, but &rta is cached, in that case &rta is duplicated using + * rta_do_cow(). Third, both &rte is shared and &rta is cached, in that case + * both structures are duplicated by rte_do_cow() and rta_do_cow(). + * + * Note that in the second case, cached &rta loses one reference, while private + * copy created by rta_do_cow() is a shallow copy sharing indirect data (eattrs, + * nexthops, ...) with it. To work properly, original shared &rta should have + * another reference during the life of created private copy. + * + * Result: a pointer to the new writable &rte with writable &rta. + */ +rte * +rte_cow_rta(rte *r, linpool *lp) +{ + if (!rta_is_cached(r->attrs)) + return r; + + rte *e = rte_cow(r); + rta *a = rta_do_cow(r->attrs, lp); + rta_free(e->attrs); + e->attrs = a; + return e; +} + static int /* Actually better or at least as good as */ rte_better(rte *new, rte *old) { @@ -172,6 +204,26 @@ rte_better(rte *new, rte *old) return 0; } +static int +rte_mergable(rte *pri, rte *sec) +{ + int (*mergable)(rte *, rte *); + + if (!rte_is_valid(pri) || !rte_is_valid(sec)) + return 0; + + if (pri->pref != sec->pref) + return 0; + + if (pri->attrs->src->proto->proto != sec->attrs->src->proto->proto) + return 0; + + if (mergable = pri->attrs->src->proto->rte_mergable) + return mergable(pri, sec); + + return 0; +} + static void rte_trace(struct proto *p, rte *e, int dir, char *msg) { @@ -208,12 +260,10 @@ export_filter(struct announce_hook *ah, rte *rt0, rte **rt_free, ea_list **tmpa, rt = rt0; *rt_free = NULL; - /* If called does not care for eattrs, we prepare one internally */ if (!tmpa) - { - tmpb = make_tmp_attrs(rt, rte_update_pool); - tmpa = &tmpb; - } + tmpa = &tmpb; + + *tmpa = make_tmp_attrs(rt, rte_update_pool); v = p->import_control ? p->import_control(p, &rt, tmpa, rte_update_pool) : 0; if (v < 0) @@ -347,7 +397,7 @@ do_rt_notify(struct announce_hook *ah, net *net, rte *new, rte *old, ea_list *tm } static void -rt_notify_basic(struct announce_hook *ah, net *net, rte *new0, rte *old0, ea_list *tmpa, int refeed) +rt_notify_basic(struct announce_hook *ah, net *net, rte *new0, rte *old0, int refeed) { struct proto *p = ah->proto; struct proto_stats *stats = ah->stats; @@ -356,6 +406,7 @@ rt_notify_basic(struct announce_hook *ah, net *net, rte *new0, rte *old0, ea_lis rte *old = old0; rte *new_free = NULL; rte *old_free = NULL; + ea_list *tmpa = NULL; if (new) stats->exp_updates_received++; @@ -419,17 +470,17 @@ rt_notify_basic(struct announce_hook *ah, net *net, rte *new0, rte *old0, ea_lis } static void -rt_notify_accepted(struct announce_hook *ah, net *net, rte *new_changed, rte *old_changed, rte *before_old, - ea_list *tmpa, int feed) +rt_notify_accepted(struct announce_hook *ah, net *net, rte *new_changed, rte *old_changed, rte *before_old, int feed) { // struct proto *p = ah->proto; struct proto_stats *stats = ah->stats; + rte *r; rte *new_best = NULL; rte *old_best = NULL; rte *new_free = NULL; rte *old_free = NULL; - rte *r; + ea_list *tmpa = NULL; /* Used to track whether we met old_changed position. If before_old is NULL old_changed was the first and we met it implicitly before current best route. */ @@ -536,6 +587,129 @@ rt_notify_accepted(struct announce_hook *ah, net *net, rte *new_changed, rte *ol rte_free(old_free); } + +static struct mpnh * +mpnh_merge_rta(struct mpnh *nhs, rta *a, int max) +{ + struct mpnh nh = { .gw = a->gw, .iface = a->iface }; + struct mpnh *nh2 = (a->dest == RTD_MULTIPATH) ? a->nexthops : &nh; + return mpnh_merge(nhs, nh2, 1, 0, max, rte_update_pool); +} + +rte * +rt_export_merged(struct announce_hook *ah, net *net, rte **rt_free, ea_list **tmpa, int silent) +{ + // struct proto *p = ah->proto; + struct mpnh *nhs = NULL; + rte *best0, *best, *rt0, *rt, *tmp; + + best0 = net->routes; + *rt_free = NULL; + + if (!rte_is_valid(best0)) + return NULL; + + best = export_filter(ah, best0, rt_free, tmpa, silent); + + if (!best || !rte_is_reachable(best)) + return best; + + for (rt0 = best0->next; rt0; rt0 = rt0->next) + { + if (!rte_mergable(best0, rt0)) + continue; + + rt = export_filter(ah, rt0, &tmp, NULL, 1); + + if (!rt) + continue; + + if (rte_is_reachable(rt)) + nhs = mpnh_merge_rta(nhs, rt->attrs, ah->proto->merge_limit); + + if (tmp) + rte_free(tmp); + } + + if (nhs) + { + nhs = mpnh_merge_rta(nhs, best->attrs, ah->proto->merge_limit); + + if (nhs->next) + { + best = rte_cow_rta(best, rte_update_pool); + best->attrs->dest = RTD_MULTIPATH; + best->attrs->nexthops = nhs; + } + } + + if (best != best0) + *rt_free = best; + + return best; +} + + +static void +rt_notify_merged(struct announce_hook *ah, net *net, rte *new_changed, rte *old_changed, + rte *new_best, rte*old_best, int refeed) +{ + // struct proto *p = ah->proto; + + rte *new_best_free = NULL; + rte *old_best_free = NULL; + rte *new_changed_free = NULL; + rte *old_changed_free = NULL; + ea_list *tmpa = NULL; + + /* We assume that all rte arguments are either NULL or rte_is_valid() */ + + /* This check should be done by the caller */ + if (!new_best && !old_best) + return; + + /* Check whether the change is relevant to the merged route */ + if ((new_best == old_best) && !refeed) + { + new_changed = rte_mergable(new_best, new_changed) ? + export_filter(ah, new_changed, &new_changed_free, NULL, 1) : NULL; + + old_changed = rte_mergable(old_best, old_changed) ? + export_filter(ah, old_changed, &old_changed_free, NULL, 1) : NULL; + + if (!new_changed && !old_changed) + return; + } + + if (new_best) + ah->stats->exp_updates_received++; + else + ah->stats->exp_withdraws_received++; + + /* Prepare new merged route */ + if (new_best) + new_best = rt_export_merged(ah, net, &new_best_free, &tmpa, 0); + + /* Prepare old merged route (without proper merged next hops) */ + /* There are some issues with running filter on old route - see rt_notify_basic() */ + if (old_best && !refeed) + old_best = export_filter(ah, old_best, &old_best_free, NULL, 1); + + if (new_best || old_best) + do_rt_notify(ah, net, new_best, old_best, tmpa, refeed); + + /* Discard temporary rte's */ + if (new_best_free) + rte_free(new_best_free); + if (old_best_free) + rte_free(old_best_free); + if (new_changed_free) + rte_free(new_changed_free); + if (old_changed_free) + rte_free(old_changed_free); +} + + /** * rte_announce - announce a routing table change * @tab: table the route has been added to @@ -543,7 +717,6 @@ rt_notify_accepted(struct announce_hook *ah, net *net, rte *new_changed, rte *ol * @net: network in question * @new: the new route to be announced * @old: the previous route for the same network - * @tmpa: a list of temporary attributes belonging to the new route * * This function gets a routing table update and announces it * to all protocols that acccepts given type of route announcement @@ -566,13 +739,20 @@ rt_notify_accepted(struct announce_hook *ah, net *net, rte *new_changed, rte *ol * the protocol gets called. */ static void -rte_announce(rtable *tab, unsigned type, net *net, rte *new, rte *old, rte *before_old, ea_list *tmpa) +rte_announce(rtable *tab, unsigned type, net *net, rte *new, rte *old, + rte *new_best, rte *old_best, rte *before_old) { + if (!rte_is_valid(new)) + new = NULL; + if (!rte_is_valid(old)) old = before_old = NULL; - if (!rte_is_valid(new)) - new = NULL; + if (!rte_is_valid(new_best)) + new_best = NULL; + + if (!rte_is_valid(old_best)) + old_best = NULL; if (!old && !new) return; @@ -594,9 +774,11 @@ rte_announce(rtable *tab, unsigned type, net *net, rte *new, rte *old, rte *befo ASSERT(a->proto->export_state != ES_DOWN); if (a->proto->accept_ra_types == type) if (type == RA_ACCEPTED) - rt_notify_accepted(a, net, new, old, before_old, tmpa, 0); + rt_notify_accepted(a, net, new, old, before_old, 0); + else if (type == RA_MERGED) + rt_notify_merged(a, net, new, old, new_best, old_best, 0); else - rt_notify_basic(a, net, new, old, tmpa, 0); + rt_notify_basic(a, net, new, old, 0); } } @@ -659,7 +841,7 @@ rte_same(rte *x, rte *y) static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); } static void -rte_recalculate(struct announce_hook *ah, net *net, rte *new, ea_list *tmpa, struct rte_src *src) +rte_recalculate(struct announce_hook *ah, net *net, rte *new, struct rte_src *src) { struct proto *p = ah->proto; struct rtable *table = ah->table; @@ -900,11 +1082,12 @@ rte_recalculate(struct announce_hook *ah, net *net, rte *new, ea_list *tmpa, str } /* Propagate the route change */ - rte_announce(table, RA_ANY, net, new, old, NULL, tmpa); + rte_announce(table, RA_ANY, net, new, old, NULL, NULL, NULL); if (net->routes != old_best) - rte_announce(table, RA_OPTIMAL, net, net->routes, old_best, NULL, tmpa); + rte_announce(table, RA_OPTIMAL, net, net->routes, old_best, NULL, NULL, NULL); if (table->config->sorted) - rte_announce(table, RA_ACCEPTED, net, new, old, before_old, tmpa); + rte_announce(table, RA_ACCEPTED, net, new, old, NULL, NULL, before_old); + rte_announce(table, RA_MERGED, net, new, old, net->routes, old_best, NULL); if (!net->routes && (table->gc_counter++ >= table->config->gc_max_ops) && @@ -1069,7 +1252,7 @@ rte_update2(struct announce_hook *ah, net *net, rte *new, struct rte_src *src) recalc: rte_hide_dummy_routes(net, &dummy); - rte_recalculate(ah, net, new, tmpa, src); + rte_recalculate(ah, net, new, src); rte_unhide_dummy_routes(net, &dummy); rte_update_unlock(); return; @@ -1077,20 +1260,17 @@ rte_update2(struct announce_hook *ah, net *net, rte *new, struct rte_src *src) drop: rte_free(new); new = NULL; - tmpa = NULL; goto recalc; } /* Independent call to rte_announce(), used from next hop recalculation, outside of rte_update(). new must be non-NULL */ static inline void -rte_announce_i(rtable *tab, unsigned type, net *n, rte *new, rte *old) +rte_announce_i(rtable *tab, unsigned type, net *net, rte *new, rte *old, + rte *new_best, rte *old_best) { - ea_list *tmpa; - rte_update_lock(); - tmpa = make_tmp_attrs(new, rte_update_pool); - rte_announce(tab, type, n, new, old, NULL, tmpa); + rte_announce(tab, type, net, new, old, new_best, old_best, NULL); rte_update_unlock(); } @@ -1098,7 +1278,7 @@ void rte_discard(rtable *t, rte *old) /* Non-filtered route deletion, used during garbage collection */ { rte_update_lock(); - rte_recalculate(old->sender, old->net, NULL, NULL, old->attrs->src); + rte_recalculate(old->sender, old->net, NULL, old->attrs->src); rte_update_unlock(); } @@ -1483,7 +1663,7 @@ rt_prune_loop(void) void rt_preconfig(struct config *c) { - struct symbol *s = cf_find_symbol("master"); + struct symbol *s = cf_get_symbol("master"); init_list(&c->tables); c->master_rtc = rt_new_table(s); @@ -1554,7 +1734,7 @@ rt_next_hop_update_net(rtable *tab, net *n) new = rt_next_hop_update_rte(tab, e); *k = new; - rte_announce_i(tab, RA_ANY, n, new, e); + rte_announce_i(tab, RA_ANY, n, new, e, NULL, NULL); rte_trace_in(D_ROUTES, new->sender->proto, new, "updated"); /* Call a pre-comparison hook */ @@ -1594,10 +1774,13 @@ rt_next_hop_update_net(rtable *tab, net *n) /* Announce the new best route */ if (new != old_best) { - rte_announce_i(tab, RA_OPTIMAL, n, new, old_best); + rte_announce_i(tab, RA_OPTIMAL, n, new, old_best, NULL, NULL); rte_trace_in(D_ROUTES, new->sender->proto, new, "updated [best]"); } + /* FIXME: Better announcement of merged routes */ + rte_announce_i(tab, RA_MERGED, n, new, old_best, new, old_best); + if (free_old_best) rte_free_quick(old_best); @@ -1685,6 +1868,7 @@ rt_unlock_table(rtable *r) { struct config *conf = r->deleted; DBG("Deleting routing table %s\n", r->name); + r->config->table = NULL; if (r->hostcache) rt_free_hostcache(r); rem_node(&r->n); @@ -1720,7 +1904,7 @@ rt_commit(struct config *new, struct config *old) rtable *ot = o->table; if (!ot->deleted) { - struct symbol *sym = cf_find_symbol(o->name); + struct symbol *sym = cf_find_symbol(new, o->name); if (sym && sym->class == SYM_TABLE && !new->shutdown) { DBG("\t%s: same\n", o->name); @@ -1758,14 +1942,13 @@ rt_commit(struct config *new, struct config *old) static inline void do_feed_baby(struct proto *p, int type, struct announce_hook *h, net *n, rte *e) { - ea_list *tmpa; - rte_update_lock(); - tmpa = make_tmp_attrs(e, rte_update_pool); if (type == RA_ACCEPTED) - rt_notify_accepted(h, n, e, NULL, NULL, tmpa, p->refeeding ? 2 : 1); + rt_notify_accepted(h, n, e, NULL, NULL, p->refeeding ? 2 : 1); + else if (type == RA_MERGED) + rt_notify_merged(h, n, NULL, NULL, e, p->refeeding ? e : NULL, p->refeeding); else - rt_notify_basic(h, n, e, p->refeeding ? e : NULL, tmpa, p->refeeding); + rt_notify_basic(h, n, e, p->refeeding ? e : NULL, p->refeeding); rte_update_unlock(); } @@ -1811,20 +1994,26 @@ again: /* XXXX perhaps we should change feed for RA_ACCEPTED to not use 'new' */ if ((p->accept_ra_types == RA_OPTIMAL) || - (p->accept_ra_types == RA_ACCEPTED)) + (p->accept_ra_types == RA_ACCEPTED) || + (p->accept_ra_types == RA_MERGED)) if (rte_is_valid(e)) { if (p->export_state != ES_FEEDING) return 1; /* In the meantime, the protocol fell down. */ + do_feed_baby(p, p->accept_ra_types, h, n, e); max_feed--; } if (p->accept_ra_types == RA_ANY) - for(e = n->routes; rte_is_valid(e); e = e->next) + for(e = n->routes; e; e = e->next) { if (p->export_state != ES_FEEDING) return 1; /* In the meantime, the protocol fell down. */ + + if (!rte_is_valid(e)) + continue; + do_feed_baby(p, RA_ANY, h, n, e); max_feed--; } @@ -2271,12 +2460,22 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) rte_update_lock(); /* We use the update buffer for filtering */ tmpa = make_tmp_attrs(e, rte_update_pool); - if (d->export_mode) + /* Special case for merged export */ + if ((d->export_mode == RSEM_EXPORT) && (d->export_protocol->accept_ra_types == RA_MERGED)) + { + rte *rt_free; + e = rt_export_merged(a, n, &rt_free, &tmpa, 1); + pass = 1; + + if (!e) + { e = ee; goto skip; } + } + else if (d->export_mode) { struct proto *ep = d->export_protocol; int ic = ep->import_control ? ep->import_control(ep, &e, &tmpa, rte_update_pool) : 0; - if (ep->accept_ra_types == RA_OPTIMAL) + if (ep->accept_ra_types == RA_OPTIMAL || ep->accept_ra_types == RA_MERGED) pass = 1; if (ic < 0) diff --git a/proto/bfd/bfd.c b/proto/bfd/bfd.c index 5f089846..7a085791 100644 --- a/proto/bfd/bfd.c +++ b/proto/bfd/bfd.c @@ -43,7 +43,7 @@ * the needs of BFD sessions. When a new session is created, it requests a * proper BFD interface by function bfd_get_iface(), which either finds an * existing one in &iface_list (from &bfd_proto) or allocates a new one. When a - * session is removed, an associated iface is dicharged by bfd_free_iface(). + * session is removed, an associated iface is discharged by bfd_free_iface(). * * BFD requests are the external API for the other protocols. When a protocol * wants a BFD session, it calls bfd_request_session(), which creates a @@ -62,7 +62,7 @@ * configuration (like static routes in the static protocol). BFD neighbors are * handled by BFD protocol like it is a BFD client -- when a BFD neighbor is * ready, the protocol just creates a BFD request like any other protocol. - * + * * The protocol uses a new generic event loop (structure &birdloop) from |io.c|, * which supports sockets, timers and events like the main loop. Timers * (structure &timer2) are new microsecond based timers, while sockets and @@ -129,11 +129,11 @@ static inline void bfd_notify_kick(struct bfd_proto *p); * BFD sessions */ -static void +static void bfd_session_update_state(struct bfd_session *s, uint state, uint diag) { struct bfd_proto *p = s->ifa->bfd; - uint old_state = s->loc_state; + uint old_state = s->loc_state; int notify; if (state == old_state) @@ -201,8 +201,8 @@ bfd_session_control_tx_timer(struct bfd_session *s, int reset) if (s->passive && (s->rem_id == 0)) goto stop; - if (s->rem_demand_mode && - !s->poll_active && + if (s->rem_demand_mode && + !s->poll_active && (s->loc_state == BFD_STATE_UP) && (s->rem_state == BFD_STATE_UP)) goto stop; @@ -303,7 +303,7 @@ bfd_session_process_ctl(struct bfd_session *s, u8 flags, u32 old_tx_int, u32 old bfd_send_ctl(s->ifa->bfd, s, 1); } -static void +static void bfd_session_timeout(struct bfd_session *s) { struct bfd_proto *p = s->ifa->bfd; @@ -353,7 +353,7 @@ bfd_session_set_min_rx(struct bfd_session *s, u32 val) if (val == s->req_min_rx_new) return; - s->req_min_rx_new = val; + s->req_min_rx_new = val; /* Postpone timer update if req_min_rx_int decreases and the session is up */ if ((s->loc_state != BFD_STATE_UP) || (val > s->req_min_rx_int)) @@ -575,9 +575,13 @@ bfd_free_iface(struct bfd_iface *ifa) if (!ifa || --ifa->uc) return; + if (ifa->sk) + { + sk_stop(ifa->sk); + rfree(ifa->sk); + } + rem_node(&ifa->n); - sk_stop(ifa->sk); - rfree(ifa->sk); mb_free(ifa); } @@ -873,7 +877,7 @@ bfd_notify_hook(sock *sk, int len) diag = s->loc_diag; bfd_unlock_sessions(p); - /* FIXME: convert to btime and move to bfd_session_update_state() */ + /* FIXME: convert to btime and move to bfd_session_update_state() */ s->last_state_change = now; s->notify_running = 1; @@ -1092,7 +1096,7 @@ bfd_show_sessions(struct proto *P) /* FIXME: this is thread-unsafe, but perhaps harmless */ state = s->loc_state; diag = s->loc_diag; - ifname = (s->ifa && s->ifa->sk->iface) ? s->ifa->sk->iface->name : "---"; + ifname = (s->ifa && s->ifa->iface) ? s->ifa->iface->name : "---"; tx_int = s->last_tx ? (MAX(s->des_min_tx_int, s->rem_min_rx_int) TO_MS) : 0; timeout = (MAX(s->req_min_rx_int, s->rem_min_tx_int) TO_MS) * s->rem_detect_mult; diff --git a/proto/bfd/packets.c b/proto/bfd/packets.c index b5fd6782..cb40bcda 100644 --- a/proto/bfd/packets.c +++ b/proto/bfd/packets.c @@ -63,9 +63,13 @@ void bfd_send_ctl(struct bfd_proto *p, struct bfd_session *s, int final) { sock *sk = s->ifa->sk; - struct bfd_ctl_packet *pkt = (struct bfd_ctl_packet *) sk->tbuf; + struct bfd_ctl_packet *pkt; char fb[8]; + if (!sk) + return; + + pkt = (struct bfd_ctl_packet *) sk->tbuf; pkt->vdiag = bfd_pack_vdiag(1, s->loc_diag); pkt->flags = bfd_pack_flags(s->loc_state, 0); pkt->detect_mult = s->detect_mult; @@ -139,7 +143,7 @@ bfd_rx_hook(sock *sk, int len) u8 ps = bfd_pkt_get_state(pkt); if (ps > BFD_STATE_DOWN) DROP("invalid init state", ps); - + s = bfd_find_session_by_addr(p, sk->faddr); /* FIXME: better session matching and message */ diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index d56c017d..d85afa8f 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -1312,6 +1312,82 @@ bgp_rte_better(rte *new, rte *old) } +int +bgp_rte_mergable(rte *pri, rte *sec) +{ + struct bgp_proto *pri_bgp = (struct bgp_proto *) pri->attrs->src->proto; + struct bgp_proto *sec_bgp = (struct bgp_proto *) sec->attrs->src->proto; + eattr *x, *y; + u32 p, s; + + /* Skip suppressed routes (see bgp_rte_recalculate()) */ + if (pri->u.bgp.suppressed != sec->u.bgp.suppressed) + return 0; + + /* RFC 4271 9.1.2.1. Route resolvability test */ + if (!rte_resolvable(sec)) + return 0; + + /* Start with local preferences */ + x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF)); + y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF)); + p = x ? x->u.data : pri_bgp->cf->default_local_pref; + s = y ? y->u.data : sec_bgp->cf->default_local_pref; + if (p != s) + return 0; + + /* RFC 4271 9.1.2.2. a) Use AS path lengths */ + if (pri_bgp->cf->compare_path_lengths || sec_bgp->cf->compare_path_lengths) + { + x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); + y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); + p = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; + s = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; + + if (p != s) + return 0; + +// if (DELTA(p, s) > pri_bgp->cf->relax_multipath) +// return 0; + } + + /* RFC 4271 9.1.2.2. b) Use origins */ + x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN)); + y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN)); + p = x ? x->u.data : ORIGIN_INCOMPLETE; + s = y ? y->u.data : ORIGIN_INCOMPLETE; + if (p != s) + return 0; + + /* RFC 4271 9.1.2.2. c) Compare MED's */ + if (pri_bgp->cf->med_metric || sec_bgp->cf->med_metric || + (bgp_get_neighbor(pri) == bgp_get_neighbor(sec))) + { + x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); + y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); + p = x ? x->u.data : pri_bgp->cf->default_med; + s = y ? y->u.data : sec_bgp->cf->default_med; + if (p != s) + return 0; + } + + /* RFC 4271 9.1.2.2. d) Prefer external peers */ + if (pri_bgp->is_internal != sec_bgp->is_internal) + return 0; + + /* RFC 4271 9.1.2.2. e) Compare IGP metrics */ + p = pri_bgp->cf->igp_metric ? pri->attrs->igp_metric : 0; + s = sec_bgp->cf->igp_metric ? sec->attrs->igp_metric : 0; + if (p != s) + return 0; + + /* Remaining criteria are ignored */ + + return 1; +} + + + static inline int same_group(rte *r, u32 lpref, u32 lasn) { diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index e48b643b..f549b0ed 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -569,6 +569,7 @@ bgp_send_open(struct bgp_conn *conn) conn->peer_gr_time = 0; conn->peer_gr_flags = 0; conn->peer_gr_aflags = 0; + conn->peer_ext_messages_support = 0; DBG("BGP: Sending open\n"); conn->sk->rx_hook = bgp_rx; @@ -733,8 +734,8 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c s->dport = p->cf->remote_port; s->iface = p->neigh ? p->neigh->iface : NULL; s->ttl = p->cf->ttl_security ? 255 : hops; - s->rbsize = BGP_RX_BUFFER_SIZE; - s->tbsize = BGP_TX_BUFFER_SIZE; + s->rbsize = p->cf->enable_extended_messages ? BGP_RX_BUFFER_EXT_SIZE : BGP_RX_BUFFER_SIZE; + s->tbsize = p->cf->enable_extended_messages ? BGP_TX_BUFFER_EXT_SIZE : BGP_TX_BUFFER_SIZE; s->tos = IP_PREC_INTERNET_CONTROL; s->password = p->cf->password; s->tx_hook = bgp_connected; @@ -843,6 +844,13 @@ bgp_incoming_connection(sock *sk, int dummy UNUSED) if (sk_set_min_ttl(sk, 256 - hops) < 0) goto err; + if (p->cf->enable_extended_messages) + { + sk->rbsize = BGP_RX_BUFFER_EXT_SIZE; + sk->tbsize = BGP_TX_BUFFER_EXT_SIZE; + sk_reallocate(sk); + } + bgp_setup_conn(p, &p->incoming_conn); bgp_setup_sk(&p->incoming_conn, sk); bgp_send_open(&p->incoming_conn); @@ -1243,6 +1251,7 @@ bgp_init(struct proto_config *C) P->feed_begin = bgp_feed_begin; P->feed_end = bgp_feed_end; P->rte_better = bgp_rte_better; + P->rte_mergable = bgp_rte_mergable; P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL; p->cf = c; @@ -1517,21 +1526,23 @@ bgp_show_proto_info(struct proto *P) else if (P->proto_state == PS_UP) { cli_msg(-1006, " Neighbor ID: %R", p->remote_id); - cli_msg(-1006, " Neighbor caps: %s%s%s%s%s%s", + cli_msg(-1006, " Neighbor caps: %s%s%s%s%s%s%s", c->peer_refresh_support ? " refresh" : "", c->peer_enhanced_refresh_support ? " enhanced-refresh" : "", c->peer_gr_able ? " restart-able" : (c->peer_gr_aware ? " restart-aware" : ""), c->peer_as4_support ? " AS4" : "", (c->peer_add_path & ADD_PATH_RX) ? " add-path-rx" : "", - (c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : ""); - cli_msg(-1006, " Session: %s%s%s%s%s%s%s", + (c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : "", + c->peer_ext_messages_support ? " ext-messages" : ""); + cli_msg(-1006, " Session: %s%s%s%s%s%s%s%s", p->is_internal ? "internal" : "external", p->cf->multihop ? " multihop" : "", p->rr_client ? " route-reflector" : "", p->rs_client ? " route-server" : "", p->as4_session ? " AS4" : "", p->add_path_rx ? " add-path-rx" : "", - p->add_path_tx ? " add-path-tx" : ""); + p->add_path_tx ? " add-path-tx" : "", + p->ext_messages ? " ext-messages" : ""); cli_msg(-1006, " Source address: %I", p->source_addr); if (P->cf->in_limit) cli_msg(-1006, " Route limit: %d/%d", diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 446fc857..274794f1 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -40,6 +40,7 @@ struct bgp_config { int capabilities; /* Enable capability handshake [RFC3392] */ int enable_refresh; /* Enable local support for route refresh [RFC2918] */ int enable_as4; /* Enable local support for 4B AS numbers [RFC4893] */ + int enable_extended_messages; /* Enable local support for extended messages [draft] */ u32 rr_cluster_id; /* Route reflector cluster ID, if different from local ID */ int rr_client; /* Whether neighbor is RR client of me */ int rs_client; /* Whether neighbor is RS client of me */ @@ -109,6 +110,7 @@ struct bgp_conn { u16 peer_gr_time; u8 peer_gr_flags; u8 peer_gr_aflags; + u8 peer_ext_messages_support; /* Peer supports extended message length [draft] */ unsigned hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */ }; @@ -121,6 +123,7 @@ struct bgp_proto { u8 as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */ u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */ u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */ + u8 ext_messages; /* Session allows to use extended messages (both sides support it) */ u32 local_id; /* BGP identifier of this router */ u32 remote_id; /* BGP identifier of the neighbor */ u32 rr_cluster_id; /* Route reflector cluster ID */ @@ -180,9 +183,15 @@ struct bgp_bucket { #define BGP_PORT 179 #define BGP_VERSION 4 #define BGP_HEADER_LENGTH 19 -#define BGP_MAX_PACKET_LENGTH 4096 +#define BGP_MAX_MESSAGE_LENGTH 4096 +#define BGP_MAX_EXT_MSG_LENGTH 65535 #define BGP_RX_BUFFER_SIZE 4096 -#define BGP_TX_BUFFER_SIZE BGP_MAX_PACKET_LENGTH +#define BGP_TX_BUFFER_SIZE 4096 +#define BGP_RX_BUFFER_EXT_SIZE 65535 +#define BGP_TX_BUFFER_EXT_SIZE 65535 + +static inline int bgp_max_packet_length(struct bgp_proto *p) +{ return p->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; } extern struct linpool *bgp_linpool; @@ -238,6 +247,7 @@ byte *bgp_attach_attr_wa(struct ea_list **to, struct linpool *pool, unsigned att struct rta *bgp_decode_attrs(struct bgp_conn *conn, byte *a, uint len, struct linpool *pool, int mandatory); int bgp_get_attr(struct eattr *e, byte *buf, int buflen); int bgp_rte_better(struct rte *, struct rte *); +int bgp_rte_mergable(rte *pri, rte *sec); int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best); void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs); int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *); diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 49afe5ae..85b93a6b 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -27,7 +27,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC, SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX, GRACEFUL, RESTART, AWARE, - CHECK, LINK, PORT) + CHECK, LINK, PORT, EXTENDED, MESSAGES) CF_GRAMMAR @@ -108,6 +108,7 @@ bgp_proto: | bgp_proto DISABLE AFTER ERROR bool ';' { BGP_CFG->disable_after_error = $5; } | bgp_proto ENABLE ROUTE REFRESH bool ';' { BGP_CFG->enable_refresh = $5; } | bgp_proto ENABLE AS4 bool ';' { BGP_CFG->enable_as4 = $4; } + | bgp_proto ENABLE EXTENDED MESSAGES bool ';' { BGP_CFG->enable_extended_messages = $5; } | bgp_proto CAPABILITIES bool ';' { BGP_CFG->capabilities = $3; } | bgp_proto ADVERTISE IPV4 bool ';' { BGP_CFG->advertise_ipv4 = $4; } | bgp_proto PASSWORD text ';' { BGP_CFG->password = $3; } diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index 378f5ab1..ed99f623 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -84,7 +84,7 @@ mrt_put_bgp4_hdr(byte *buf, struct bgp_conn *conn, int as4) static void mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len) { - byte buf[BGP_MAX_PACKET_LENGTH + 128]; + byte *buf = alloca(128+len); /* 128 is enough for MRT headers */ byte *bp = buf + MRTDUMP_HDR_LENGTH; int as4 = conn->bgp->as4_session; @@ -223,6 +223,14 @@ bgp_put_cap_err(struct bgp_proto *p UNUSED, byte *buf) return buf; } +static byte * +bgp_put_cap_ext_msg(struct bgp_proto *p UNUSED, byte *buf) +{ + *buf++ = 230; /* Capability TBD: Support for extended messages */ + *buf++ = 0; /* Capability data length */ + return buf; +} + static byte * bgp_create_open(struct bgp_conn *conn, byte *buf) @@ -274,6 +282,9 @@ bgp_create_open(struct bgp_conn *conn, byte *buf) if (p->cf->enable_refresh) cap = bgp_put_cap_err(p, cap); + if (p->cf->enable_extended_messages) + cap = bgp_put_cap_ext_msg(p, cap); + cap_len = cap - buf - 12; if (cap_len > 0) { @@ -342,7 +353,7 @@ bgp_create_update(struct bgp_conn *conn, byte *buf) { struct bgp_proto *p = conn->bgp; struct bgp_bucket *buck; - int remains = BGP_MAX_PACKET_LENGTH - BGP_HEADER_LENGTH - 4; + int remains = bgp_max_packet_length(p) - BGP_HEADER_LENGTH - 4; byte *w; int wd_size = 0; int r_size = 0; @@ -428,7 +439,7 @@ bgp_create_update(struct bgp_conn *conn, byte *buf) struct bgp_proto *p = conn->bgp; struct bgp_bucket *buck; int size, second, rem_stored; - int remains = BGP_MAX_PACKET_LENGTH - BGP_HEADER_LENGTH - 4; + int remains = bgp_max_packet_length(p) - BGP_HEADER_LENGTH - 4; byte *w, *w_stored, *tmp, *tstart; ip_addr *ipp, ip, ip_ll; ea_list *ea; @@ -856,6 +867,12 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) conn->peer_enhanced_refresh_support = 1; break; + case 230: /* Extended message length capability, draft, cap number TBD */ + if (cl != 0) + goto err; + conn->peer_ext_messages_support = 1; + break; + /* We can safely ignore all other capabilities */ } len -= 2 + cl; @@ -1019,6 +1036,7 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX); p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX); p->gr_ready = p->cf->gr_mode && conn->peer_gr_able; + p->ext_messages = p->cf->enable_extended_messages && conn->peer_ext_messages_support; if (p->add_path_tx) p->p.accept_ra_types = RA_ANY; @@ -1418,7 +1436,7 @@ static struct { { 2, 4, "Unsupported optional parameter" }, { 2, 5, "Authentication failure" }, { 2, 6, "Unacceptable hold time" }, - { 2, 7, "Required capability missing" }, /* [RFC3392] */ + { 2, 7, "Required capability missing" }, /* [RFC5492] */ { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */ { 3, 0, "Invalid UPDATE message" }, { 3, 1, "Malformed attribute list" }, @@ -1666,6 +1684,7 @@ int bgp_rx(sock *sk, int size) { struct bgp_conn *conn = sk->data; + struct bgp_proto *p = conn->bgp; byte *pkt_start = sk->rbuf; byte *end = pkt_start + size; unsigned i, len; @@ -1682,7 +1701,7 @@ bgp_rx(sock *sk, int size) break; } len = get_u16(pkt_start+16); - if (len < BGP_HEADER_LENGTH || len > BGP_MAX_PACKET_LENGTH) + if (len < BGP_HEADER_LENGTH || len > bgp_max_packet_length(p)) { bgp_error(conn, 1, 2, pkt_start+16, 2); break; diff --git a/proto/ospf/iface.c b/proto/ospf/iface.c index 656184c6..77ce839a 100644 --- a/proto/ospf/iface.c +++ b/proto/ospf/iface.c @@ -493,8 +493,11 @@ ospf_iface_add(struct object_lock *lock) ifa->flood_queue = mb_allocz(ifa->pool, ifa->flood_queue_size * sizeof(void *)); } - /* Do iface UP, unless there is no link and we use link detection */ - ospf_iface_sm(ifa, (ifa->check_link && !(ifa->iface->flags & IF_LINK_UP)) ? ISM_LOOP : ISM_UP); + /* Do iface UP, unless there is no link (then wait in LOOP state) */ + if (!ifa->check_link || (ifa->iface->flags & IF_LINK_UP)) + ospf_iface_sm(ifa, ISM_UP); + else + ospf_iface_chstate(ifa, OSPF_IS_LOOP); } static inline void @@ -1344,9 +1347,9 @@ ospf_iface_info(struct ospf_iface *ifa) cli_msg(-1015, "\tRetransmit timer: %u", ifa->rxmtint); if ((ifa->type == OSPF_IT_BCAST) || (ifa->type == OSPF_IT_NBMA)) { - cli_msg(-1015, "\tDesigned router (ID): %R", ifa->drid); - cli_msg(-1015, "\tDesigned router (IP): %I", ifa->drip); - cli_msg(-1015, "\tBackup designed router (ID): %R", ifa->bdrid); - cli_msg(-1015, "\tBackup designed router (IP): %I", ifa->bdrip); + cli_msg(-1015, "\tDesignated router (ID): %R", ifa->drid); + cli_msg(-1015, "\tDesignated router (IP): %I", ifa->drip); + cli_msg(-1015, "\tBackup designated router (ID): %R", ifa->bdrid); + cli_msg(-1015, "\tBackup designated router (IP): %I", ifa->bdrip); } } diff --git a/proto/ospf/ospf.c b/proto/ospf/ospf.c index 1bc4e077..d5d5d354 100644 --- a/proto/ospf/ospf.c +++ b/proto/ospf/ospf.c @@ -450,10 +450,21 @@ ospf_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool if (oa_is_stub(oa)) return -1; /* Do not export routes to stub areas */ - eattr *ea = ea_find(e->attrs->eattrs, EA_GEN_IGP_METRIC); - u32 m1 = (ea && (ea->u.data < LSINFINITY)) ? ea->u.data : LSINFINITY; + ea_list *ea = e->attrs->eattrs; + u32 m0 = ea_get_int(ea, EA_GEN_IGP_METRIC, LSINFINITY); + u32 m1 = MIN(m0, LSINFINITY); + u32 m2 = 10000; + u32 tag = 0; - *attrs = ospf_build_attrs(*attrs, pool, m1, 10000, 0, 0); + /* Hack for setting attributes directly in static protocol */ + if (e->attrs->source == RTS_STATIC) + { + m1 = ea_get_int(ea, EA_OSPF_METRIC1, m1); + m2 = ea_get_int(ea, EA_OSPF_METRIC2, 10000); + tag = ea_get_int(ea, EA_OSPF_TAG, 0); + } + + *attrs = ospf_build_attrs(*attrs, pool, m1, m2, tag, 0); return 0; /* Leave decision to the filters */ } diff --git a/proto/ospf/packet.c b/proto/ospf/packet.c index 0e232017..bfabceb4 100644 --- a/proto/ospf/packet.c +++ b/proto/ospf/packet.c @@ -232,6 +232,10 @@ ospf_rx_hook(sock *sk, int len) const char *err_dsc = NULL; uint err_val = 0; + /* Should not happen */ + if (ifa->state <= OSPF_IS_LOOP) + return 1; + int src_local, dst_local, dst_mcast; src_local = ipa_in_net(sk->faddr, ifa->addr->prefix, ifa->addr->pxlen); dst_local = ipa_equal(sk->laddr, ifa->addr->ip); diff --git a/proto/ospf/rt.c b/proto/ospf/rt.c index e68ee0f4..fc52f631 100644 --- a/proto/ospf/rt.c +++ b/proto/ospf/rt.c @@ -53,88 +53,6 @@ new_nexthop(struct ospf_proto *p, ip_addr gw, struct iface *iface, byte weight) return nh; } -static inline struct mpnh * -copy_nexthop(struct ospf_proto *p, const struct mpnh *src) -{ - struct mpnh *nh = lp_alloc(p->nhpool, sizeof(struct mpnh)); - nh->gw = src->gw; - nh->iface = src->iface; - nh->next = NULL; - nh->weight = src->weight; - return nh; -} - -/* Compare nexthops during merge. - We need to maintain nhs sorted to eliminate duplicities */ -static int -cmp_nhs(struct mpnh *s1, struct mpnh *s2) -{ - int r; - - if (!s1) - return 1; - - if (!s2) - return -1; - - r = ((int) s2->weight) - ((int) s1->weight); - if (r) - return r; - - r = ipa_compare(s1->gw, s2->gw); - if (r) - return r; - - return ((int) s1->iface->index) - ((int) s2->iface->index); -} - -static struct mpnh * -merge_nexthops(struct ospf_proto *p, struct mpnh *s1, struct mpnh *s2, int r1, int r2) -{ - struct mpnh *root = NULL; - struct mpnh **n = &root; - int count = p->ecmp; - - ASSERT(p->ecmp); - - /* - * r1, r2 signalize whether we can reuse nexthops from s1, s2. - * New nexthops (s2, new) can be reused if they are not inherited - * from the parent (i.e. it is allocated in calc_next_hop()). - * Current nexthops (s1, en->nhs) can be reused if they weren't - * inherited in previous steps (that is stored in nhs_reuse, - * i.e. created by merging or allocated in calc_next_hop()). - * - * Generally, a node first inherits shared nexthops from its - * parent and later possibly gets reusable copy during merging. - */ - - while ((s1 || s2) && count--) - { - int cmp = cmp_nhs(s1, s2); - if (cmp < 0) - { - *n = r1 ? s1 : copy_nexthop(p, s1); - s1 = s1->next; - } - else if (cmp > 0) - { - *n = r2 ? s2 : copy_nexthop(p, s2); - s2 = s2->next; - } - else - { - *n = r1 ? s1 : (r2 ? s2 : copy_nexthop(p, s1)); - s1 = s1->next; - s2 = s2->next; - } - n = &((*n)->next); - } - *n = NULL; - - return root; -} - /* Returns true if there are device nexthops in n */ static inline int has_device_nexthops(const struct mpnh *n) @@ -178,7 +96,7 @@ fix_device_nexthops(struct ospf_proto *p, const struct mpnh *n, ip_addr gw) } } - return merge_nexthops(p, root1, root2, 1, 1); + return mpnh_merge(root1, root2, 1, 1, p->ecmp, p->nhpool); } @@ -374,7 +292,8 @@ ort_merge(struct ospf_proto *p, ort *o, const orta *new) if (old->nhs != new->nhs) { - old->nhs = merge_nexthops(p, old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse); + old->nhs = mpnh_merge(old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse, + p->ecmp, p->nhpool); old->nhs_reuse = 1; } @@ -389,7 +308,8 @@ ort_merge_ext(struct ospf_proto *p, ort *o, const orta *new) if (old->nhs != new->nhs) { - old->nhs = merge_nexthops(p, old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse); + old->nhs = mpnh_merge(old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse, + p->ecmp, p->nhpool); old->nhs_reuse = 1; } @@ -1885,8 +1805,7 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, return; } - /* We know that en->color == CANDIDATE and en->nhs is defined. */ - + /* If en->dist > 0, we know that en->color == CANDIDATE and en->nhs is defined. */ if ((dist == en->dist) && !nh_is_vlink(en->nhs)) { /* @@ -1900,7 +1819,14 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, * allocated in calc_next_hop()). * * Generally, a node first inherits shared nexthops from its parent and - * later possibly gets reusable copy during merging. + * later possibly gets reusable (private) copy during merging. This is more + * or less same for both top_hash_entry nodes and orta nodes. + * + * Note that when a child inherits a private nexthop from its parent, it + * should make the nexthop shared for both parent and child, while we only + * update nhs_reuse for the child node. This makes nhs_reuse field for the + * parent technically incorrect, but it is not a problem as parent's nhs + * will not be modified (and nhs_reuse examined) afterwards. */ /* Keep old ones */ @@ -1909,7 +1835,7 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, /* Merge old and new */ int new_reuse = (par->nhs != nhs); - en->nhs = merge_nexthops(p, en->nhs, nhs, en->nhs_reuse, new_reuse); + en->nhs = mpnh_merge(en->nhs, nhs, en->nhs_reuse, new_reuse, p->ecmp, p->nhpool); en->nhs_reuse = 1; return; } diff --git a/proto/ospf/rt.h b/proto/ospf/rt.h index 61936f3c..30332f3b 100644 --- a/proto/ospf/rt.h +++ b/proto/ospf/rt.h @@ -18,7 +18,8 @@ typedef struct orta { u8 type; /* RTS_OSPF_* */ - u8 nhs_reuse; /* Whether nhs nodes can be reused during merging */ + u8 nhs_reuse; /* Whether nhs nodes can be reused during merging. + See a note in rt.c:add_cand() */ u32 options; /* * For ORT_ROUTER routes, options field are router-LSA style diff --git a/proto/ospf/topology.h b/proto/ospf/topology.h index e2d6c773..5652ced0 100644 --- a/proto/ospf/topology.h +++ b/proto/ospf/topology.h @@ -39,7 +39,7 @@ struct top_hash_entry #define INSPF 2 u8 mode; /* LSA generated during RT calculation (LSA_RTCALC or LSA_STALE)*/ u8 nhs_reuse; /* Whether nhs nodes can be reused during merging. - See a note in rt.c:merge_nexthops() */ + See a note in rt.c:add_cand() */ }; diff --git a/proto/static/config.Y b/proto/static/config.Y index a8bfa36f..182721b3 100644 --- a/proto/static/config.Y +++ b/proto/static/config.Y @@ -14,11 +14,24 @@ CF_DEFINES #define STATIC_CFG ((struct static_config *) this_proto) static struct static_route *this_srt, *this_srt_nh, *last_srt_nh; +static struct f_inst **this_srt_last_cmd; + +static void +static_route_finish(void) +{ + struct static_route *r; + + /* Update undefined use_bfd entries in multipath nexthops */ + if (this_srt->dest == RTD_MULTIPATH) + for (r = this_srt->mp_next; r; r = r->mp_next) + if (r->use_bfd < 0) + r->use_bfd = this_srt->use_bfd; +} CF_DECLS CF_KEYWORDS(STATIC, ROUTE, VIA, DROP, REJECT, PROHIBIT, PREFERENCE, CHECK, LINK) -CF_KEYWORDS(MULTIPATH, WEIGHT, RECURSIVE, IGP, TABLE, BLACKHOLE, UNREACHABLE) +CF_KEYWORDS(MULTIPATH, WEIGHT, RECURSIVE, IGP, TABLE, BLACKHOLE, UNREACHABLE, BFD) CF_GRAMMAR @@ -36,7 +49,7 @@ static_proto: | static_proto proto_item ';' | static_proto CHECK LINK bool ';' { STATIC_CFG->check_link = $4; } | static_proto IGP TABLE rtable ';' { STATIC_CFG->igp_table = $4; } - | static_proto stat_route ';' + | static_proto stat_route stat_route_opt_list ';' { static_route_finish(); } ; stat_route0: ROUTE prefix { @@ -44,6 +57,7 @@ stat_route0: ROUTE prefix { add_tail(&STATIC_CFG->other_routes, &this_srt->n); this_srt->net = $2.addr; this_srt->masklen = $2.len; + this_srt_last_cmd = &(this_srt->cmds); } ; @@ -55,11 +69,15 @@ stat_multipath1: this_srt_nh->via = $2; this_srt_nh->via_if = $3; this_srt_nh->if_name = (void *) this_srt; /* really */ + this_srt_nh->use_bfd = -1; /* undefined */ } | stat_multipath1 WEIGHT expr { this_srt_nh->masklen = $3 - 1; /* really */ if (($3<1) || ($3>256)) cf_error("Weight must be in range 1-256"); } + | stat_multipath1 BFD bool { + this_srt_nh->use_bfd = $3; cf_check_bfd($3); + } ; stat_multipath: @@ -94,6 +112,22 @@ stat_route: | stat_route0 PROHIBIT { this_srt->dest = RTD_PROHIBIT; } ; +stat_route_item: + cmd { *this_srt_last_cmd = $1; this_srt_last_cmd = &($1->next); } + | BFD bool ';' { this_srt->use_bfd = $2; cf_check_bfd($2); } + ; + +stat_route_opts: + /* empty */ + | stat_route_opts stat_route_item + ; + +stat_route_opt_list: + /* empty */ + | '{' stat_route_opts '}' + ; + + CF_CLI(SHOW STATIC, optsym, [], [[Show details of static protocol]]) { static_show(proto_get_named($3, &proto_static)); } ; diff --git a/proto/static/static.c b/proto/static/static.c index 4b72fa9d..be808593 100644 --- a/proto/static/static.c +++ b/proto/static/static.c @@ -42,11 +42,14 @@ #include "nest/route.h" #include "nest/cli.h" #include "conf/conf.h" +#include "filter/filter.h" #include "lib/string.h" #include "lib/alloca.h" #include "static.h" +static linpool *static_lp; + static inline rtable * p_igp_table(struct proto *p) { @@ -54,12 +57,11 @@ p_igp_table(struct proto *p) return cf->igp_table ? cf->igp_table->table : p->table; } - static void static_install(struct proto *p, struct static_route *r, struct iface *ifa) { net *n; - rta a, *aa; + rta a; rte *e; if (r->installed > 0) @@ -108,13 +110,21 @@ static_install(struct proto *p, struct static_route *r, struct iface *ifa) if (r->dest == RTDX_RECURSIVE) rta_set_recursive_next_hop(p->table, &a, p_igp_table(p), &r->via, &r->via); - aa = rta_lookup(&a); + /* We skip rta_lookup() here */ + n = net_get(p->table, r->net, r->masklen); - e = rte_get_temp(aa); + e = rte_get_temp(&a); e->net = n; e->pflags = 0; + + if (r->cmds) + f_eval_rte(r->cmds, &e, static_lp); + rte_update(p, n, e); r->installed = 1; + + if (r->cmds) + lp_flush(static_lp); } static void @@ -131,6 +141,29 @@ static_remove(struct proto *p, struct static_route *r) r->installed = 0; } +static void +static_bfd_notify(struct bfd_request *req); + +static void +static_update_bfd(struct proto *p, struct static_route *r) +{ + struct neighbor *nb = r->neigh; + int bfd_up = (nb->scope > 0) && r->use_bfd; + + if (bfd_up && !r->bfd_req) + { + // ip_addr local = ipa_nonzero(r->local) ? r->local : nb->ifa->ip; + r->bfd_req = bfd_request_session(p->pool, r->via, nb->ifa->ip, nb->iface, + static_bfd_notify, r); + } + + if (!bfd_up && r->bfd_req) + { + rfree(r->bfd_req); + r->bfd_req = NULL; + } +} + static int static_decide(struct static_config *cf, struct static_route *r) { @@ -143,6 +176,9 @@ static_decide(struct static_config *cf, struct static_route *r) if (cf->check_link && !(r->neigh->iface->flags & IF_LINK_UP)) return 0; + if (r->bfd_req && r->bfd_req->state != BFD_STATE_UP) + return 0; + return 1; } @@ -161,6 +197,8 @@ static_add(struct proto *p, struct static_config *cf, struct static_route *r) r->chain = n->data; n->data = r; r->neigh = n; + + static_update_bfd(p, r); if (static_decide(cf, r)) static_install(p, r, n->iface); else @@ -190,6 +228,8 @@ static_add(struct proto *p, struct static_config *cf, struct static_route *r) r2->chain = n->data; n->data = r2; r2->neigh = n; + + static_update_bfd(p, r2); r2->installed = static_decide(cf, r2); count += r2->installed; } @@ -212,6 +252,26 @@ static_add(struct proto *p, struct static_config *cf, struct static_route *r) } } +static void +static_rte_cleanup(struct proto *p, struct static_route *r) +{ + struct static_route *r2; + + if (r->bfd_req) + { + rfree(r->bfd_req); + r->bfd_req = NULL; + } + + if (r->dest == RTD_MULTIPATH) + for (r2 = r->mp_next; r2; r2 = r2->mp_next) + if (r2->bfd_req) + { + rfree(r2->bfd_req); + r2->bfd_req = NULL; + } +} + static int static_start(struct proto *p) { @@ -220,6 +280,9 @@ static_start(struct proto *p) DBG("Static: take off!\n"); + if (!static_lp) + static_lp = lp_new(&root_pool, 1008); + if (cf->igp_table) rt_lock_table(cf->igp_table->table); @@ -241,7 +304,10 @@ static_shutdown(struct proto *p) WALK_LIST(r, cf->iface_routes) r->installed = 0; WALK_LIST(r, cf->other_routes) + { + static_rte_cleanup(p, r); r->installed = 0; + } return PS_DOWN; } @@ -255,6 +321,44 @@ static_cleanup(struct proto *p) rt_unlock_table(cf->igp_table->table); } +static void +static_update_rte(struct proto *p, struct static_route *r) +{ + switch (r->dest) + { + case RTD_ROUTER: + if (static_decide((struct static_config *) p->cf, r)) + static_install(p, r, r->neigh->iface); + else + static_remove(p, r); + break; + + case RTD_NONE: /* a part of multipath route */ + { + int decision = static_decide((struct static_config *) p->cf, r); + if (decision == r->installed) + break; /* no change */ + r->installed = decision; + + struct static_route *r1, *r2; + int count = 0; + r1 = (void *) r->if_name; /* really */ + for (r2 = r1->mp_next; r2; r2 = r2->mp_next) + count += r2->installed; + + if (count) + { + /* Set of nexthops changed - force reinstall */ + r1->installed = 0; + static_install(p, r1, NULL); + } + else + static_remove(p, r1); + + break; + } + } +} static void static_neigh_notify(struct neighbor *n) @@ -264,40 +368,21 @@ static_neigh_notify(struct neighbor *n) DBG("Static: neighbor notify for %I: iface %p\n", n->addr, n->iface); for(r=n->data; r; r=r->chain) - switch (r->dest) - { - case RTD_ROUTER: - if (static_decide((struct static_config *) p->cf, r)) - static_install(p, r, n->iface); - else - static_remove(p, r); - break; + { + static_update_bfd(p, r); + static_update_rte(p, r); + } +} - case RTD_NONE: /* a part of multipath route */ - { - int decision = static_decide((struct static_config *) p->cf, r); - if (decision == r->installed) - break; /* no change */ - r->installed = decision; +static void +static_bfd_notify(struct bfd_request *req) +{ + struct static_route *r = req->data; + struct proto *p = r->neigh->proto; - struct static_route *r1, *r2; - int count = 0; - r1 = (void *) r->if_name; /* really */ - for (r2 = r1->mp_next; r2; r2 = r2->mp_next) - count += r2->installed; + // if (req->down) TRACE(D_EVENTS, "BFD session down for nbr %I on %s", XXXX); - if (count) - { - /* Set of nexthops changed - force reinstall */ - r1->installed = 0; - static_install(p, r1, NULL); - } - else - static_remove(p, r1); - - break; - } - } + static_update_rte(p, r); } static void @@ -352,6 +437,12 @@ static_if_notify(struct proto *p, unsigned flags, struct iface *i) } } +int +static_rte_mergable(rte *pri, rte *sec) +{ + return 1; +} + void static_init_config(struct static_config *c) { @@ -366,6 +457,7 @@ static_init(struct proto_config *c) p->neigh_notify = static_neigh_notify; p->if_notify = static_if_notify; + p->rte_mergable = static_rte_mergable; return p; } @@ -394,7 +486,7 @@ static_same_dest(struct static_route *x, struct static_route *y) for (x = x->mp_next, y = y->mp_next; x && y; x = x->mp_next, y = y->mp_next) - if (!ipa_equal(x->via, y->via) || (x->via_if != y->via_if)) + if (!ipa_equal(x->via, y->via) || (x->via_if != y->via_if) || (x->use_bfd != y->use_bfd)) return 0; return !x && !y; @@ -406,6 +498,13 @@ static_same_dest(struct static_route *x, struct static_route *y) } } +static inline int +static_same_rte(struct static_route *x, struct static_route *y) +{ + return static_same_dest(x, y) && i_same(x->cmds, y->cmds); +} + + static void static_match(struct proto *p, struct static_route *r, struct static_config *n) { @@ -434,7 +533,7 @@ static_match(struct proto *p, struct static_route *r, struct static_config *n) found: /* If destination is different, force reinstall */ - if ((r->installed > 0) && !static_same_dest(r, t)) + if ((r->installed > 0) && !static_same_rte(r, t)) t->installed = -1; else t->installed = r->installed; @@ -472,6 +571,9 @@ static_reconfigure(struct proto *p, struct proto_config *new) WALK_LIST(r, n->other_routes) static_add(p, n, r); + WALK_LIST(r, o->other_routes) + static_rte_cleanup(p, r); + return 1; } @@ -557,13 +659,14 @@ static_show_rt(struct static_route *r) case RTDX_RECURSIVE: bsprintf(via, "recursive %I", r->via); break; default: bsprintf(via, "???"); } - cli_msg(-1009, "%I/%d %s%s", r->net, r->masklen, via, r->installed ? "" : " (dormant)"); + cli_msg(-1009, "%I/%d %s%s%s", r->net, r->masklen, via, + r->bfd_req ? " (bfd)" : "", r->installed ? "" : " (dormant)"); struct static_route *r2; if (r->dest == RTD_MULTIPATH) for (r2 = r->mp_next; r2; r2 = r2->mp_next) - cli_msg(-1009, "\tvia %I%J weight %d%s", r2->via, r2->via_if, r2->masklen + 1, /* really */ - r2->installed ? "" : " (dormant)"); + cli_msg(-1009, "\tvia %I%J weight %d%s%s", r2->via, r2->via_if, r2->masklen + 1, /* really */ + r2->bfd_req ? " (bfd)" : "", r2->installed ? "" : " (dormant)"); } void diff --git a/proto/static/static.h b/proto/static/static.h index 99a0e68b..6b047234 100644 --- a/proto/static/static.h +++ b/proto/static/static.h @@ -9,6 +9,9 @@ #ifndef _BIRD_STATIC_H_ #define _BIRD_STATIC_H_ +#include "nest/route.h" +#include "nest/bfd.h" + struct static_config { struct proto_config c; list iface_routes; /* Routes to search on interface events */ @@ -31,7 +34,10 @@ struct static_route { struct neighbor *neigh; byte *if_name; /* Name for RTD_DEVICE routes */ struct static_route *mp_next; /* Nexthops for RTD_MULTIPATH routes */ + struct f_inst *cmds; /* List of commands for setting attributes */ int installed; /* Installed in rt table, -1 for reinstall */ + int use_bfd; /* Configured to use BFD */ + struct bfd_request *bfd_req; /* BFD request, if BFD is used */ }; /* Dummy nodes (parts of multipath route) abuses masklen field for weight diff --git a/sysdep/bsd/krt-sock.c b/sysdep/bsd/krt-sock.c index 064bae18..29203d1b 100644 --- a/sysdep/bsd/krt-sock.c +++ b/sysdep/bsd/krt-sock.c @@ -970,13 +970,15 @@ krt_sock_close_shared(void) } } -void +int krt_sys_start(struct krt_proto *p) { krt_table_map[KRT_CF->sys.table_id] = p; krt_sock_open_shared(); p->sys.sk = krt_sock; + + return 1; } void @@ -992,10 +994,11 @@ krt_sys_shutdown(struct krt_proto *p) #else -void +int krt_sys_start(struct krt_proto *p) { p->sys.sk = krt_sock_open(p->p.pool, p, KRT_CF->sys.table_id); + return 1; } void diff --git a/sysdep/bsd/krt-sys.h b/sysdep/bsd/krt-sys.h index 2c6e35c5..a63f8caf 100644 --- a/sysdep/bsd/krt-sys.h +++ b/sysdep/bsd/krt-sys.h @@ -42,6 +42,7 @@ struct krt_state { }; +static inline void krt_sys_io_init(void) { } static inline void krt_sys_init(struct krt_proto *p UNUSED) { } static inline int krt_sys_get_attr(eattr *a UNUSED, byte *buf UNUSED, int buflen UNUSED) { } diff --git a/sysdep/linux/krt-sys.h b/sysdep/linux/krt-sys.h index e32e4fe1..7fd5f139 100644 --- a/sysdep/linux/krt-sys.h +++ b/sysdep/linux/krt-sys.h @@ -84,18 +84,18 @@ static inline struct ifa * kif_get_primary_ip(struct iface *i) { return NULL; } #define EA_KRT_FEATURE_ALLFRAG EA_KRT_FEATURES | EA_BIT(0x3) - -#define NL_NUM_TABLES 256 - struct krt_params { - int table_id; /* Kernel table ID we sync with */ + u32 table_id; /* Kernel table ID we sync with */ }; struct krt_state { + struct krt_proto *hash_next; }; static inline void krt_sys_init(struct krt_proto *p UNUSED) { } +static inline void krt_sys_preconfig(struct config *c UNUSED) { } +static inline void krt_sys_postconfig(struct krt_config *x UNUSED) { } #endif diff --git a/sysdep/linux/netlink.Y b/sysdep/linux/netlink.Y index f8137e23..e9c225a2 100644 --- a/sysdep/linux/netlink.Y +++ b/sysdep/linux/netlink.Y @@ -23,8 +23,6 @@ CF_ADDTO(kern_proto, kern_proto kern_sys_item ';') kern_sys_item: KERNEL TABLE expr { - if ($3 <= 0 || $3 >= NL_NUM_TABLES) - cf_error("Kernel routing table number out of range"); THIS_KRT->sys.table_id = $3; } ; diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 7de15e40..f2f60100 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -25,6 +25,7 @@ #include "lib/krt.h" #include "lib/socket.h" #include "lib/string.h" +#include "lib/hash.h" #include "conf/conf.h" #include @@ -32,6 +33,7 @@ #include #include + #ifndef MSG_TRUNC /* Hack: Several versions of glibc miss this one :( */ #define MSG_TRUNC 0x20 #endif @@ -40,6 +42,11 @@ #define IFF_LOWER_UP 0x10000 #endif +#ifndef RTA_TABLE +#define RTA_TABLE 15 +#endif + + /* * Synchronous Netlink interface */ @@ -239,6 +246,16 @@ nl_parse_attrs(struct rtattr *a, struct rtattr **k, int ksize) return 1; } +static inline u32 rta_get_u32(struct rtattr *a) +{ return *(u32 *) RTA_DATA(a); } + +static inline ip4_addr rta_get_ip4(struct rtattr *a) +{ return ip4_ntoh(*(ip4_addr *) RTA_DATA(a)); } + +static inline ip6_addr rta_get_ip6(struct rtattr *a) +{ return ip6_ntoh(*(ip6_addr *) RTA_DATA(a)); } + + struct rtattr * nl_add_attr(struct nlmsghdr *h, uint bufsize, uint code, const void *data, uint dlen) { @@ -420,7 +437,7 @@ nl_parse_metrics(struct rtattr *hdr, u32 *metrics, int max) return -1; metrics[0] |= 1 << a->rta_type; - metrics[a->rta_type] = *(u32 *)RTA_DATA(a); + metrics[a->rta_type] = rta_get_u32(a); } if (len > 0) @@ -456,7 +473,7 @@ nl_parse_link(struct nlmsghdr *h, int scan) return; } name = RTA_DATA(a[IFLA_IFNAME]); - memcpy(&mtu, RTA_DATA(a[IFLA_MTU]), sizeof(u32)); + mtu = rta_get_u32(a[IFLA_MTU]); ifi = if_find_by_index(i->ifi_index); if (!new) @@ -640,7 +657,23 @@ kif_do_scan(struct kif_proto *p UNUSED) * Routes */ -static struct krt_proto *nl_table_map[NL_NUM_TABLES]; +static inline u32 +krt_table_id(struct krt_proto *p) +{ + return KRT_CF->sys.table_id; +} + +static HASH(struct krt_proto) nl_table_map; + +#define RTH_FN(k) u32_hash(k) +#define RTH_EQ(k1,k2) k1 == k2 +#define RTH_KEY(p) krt_table_id(p) +#define RTH_NEXT(p) p->sys.hash_next + +#define RTH_REHASH rth_rehash +#define RTH_PARAMS /8, *2, 2, 2, 6, 20 + +HASH_DEFINE_REHASH_FN(RTH, struct krt_proto) int krt_capable(rte *e) @@ -698,12 +731,20 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new) r.r.rtm_family = BIRD_AF; r.r.rtm_dst_len = net->n.pxlen; - r.r.rtm_tos = 0; - r.r.rtm_table = KRT_CF->sys.table_id; r.r.rtm_protocol = RTPROT_BIRD; r.r.rtm_scope = RT_SCOPE_UNIVERSE; nl_add_attr_ipa(&r.h, sizeof(r), RTA_DST, net->n.prefix); + if (krt_table_id(p) < 256) + r.r.rtm_table = krt_table_id(p); + else + nl_add_attr_u32(&r.h, sizeof(r), RTA_TABLE, krt_table_id(p)); + + /* For route delete, we do not specify route attributes */ + if (!new) + return nl_exchange(&r.h); + + if (ea = ea_find(eattrs, EA_KRT_METRIC)) nl_add_attr_u32(&r.h, sizeof(r), RTA_PRIORITY, ea->u.data); @@ -794,11 +835,12 @@ nl_parse_route(struct nlmsghdr *h, int scan) { struct krt_proto *p; struct rtmsg *i; - struct rtattr *a[RTA_CACHEINFO+1]; + struct rtattr *a[RTA_TABLE+1]; int new = h->nlmsg_type == RTM_NEWROUTE; ip_addr dst = IPA_NONE; u32 oif = ~0; + u32 table; int src; if (!(i = nl_checkin(h, sizeof(*i))) || !nl_parse_attrs(RTM_RTA(i), a, sizeof(a))) @@ -810,6 +852,7 @@ nl_parse_route(struct nlmsghdr *h, int scan) (a[RTA_IIF] && RTA_PAYLOAD(a[RTA_IIF]) != 4) || #endif (a[RTA_OIF] && RTA_PAYLOAD(a[RTA_OIF]) != 4) || + (a[RTA_TABLE] && RTA_PAYLOAD(a[RTA_TABLE]) != 4) || (a[RTA_GATEWAY] && RTA_PAYLOAD(a[RTA_GATEWAY]) != sizeof(ip_addr)) || (a[RTA_PRIORITY] && RTA_PAYLOAD(a[RTA_PRIORITY]) != 4) || (a[RTA_PREFSRC] && RTA_PAYLOAD(a[RTA_PREFSRC]) != sizeof(ip_addr)) || @@ -826,12 +869,17 @@ nl_parse_route(struct nlmsghdr *h, int scan) } if (a[RTA_OIF]) - memcpy(&oif, RTA_DATA(a[RTA_OIF]), sizeof(oif)); + oif = rta_get_u32(a[RTA_OIF]); - p = nl_table_map[i->rtm_table]; /* Do we know this table? */ - DBG("KRT: Got %I/%d, type=%d, oif=%d, table=%d, prid=%d, proto=%s\n", dst, i->rtm_dst_len, i->rtm_type, oif, i->rtm_table, i->rtm_protocol, p ? p->p.name : "(none)"); + if (a[RTA_TABLE]) + table = rta_get_u32(a[RTA_TABLE]); + else + table = i->rtm_table; + + p = HASH_FIND(nl_table_map, RTH, table); /* Do we know this table? */ + DBG("KRT: Got %I/%d, type=%d, oif=%d, table=%d, prid=%d, proto=%s\n", dst, i->rtm_dst_len, i->rtm_type, oif, table, i->rtm_protocol, p ? p->p.name : "(none)"); if (!p) - SKIP("unknown table %d\n", i->rtm_table); + SKIP("unknown table %d\n", table); #ifdef IPV6 @@ -960,11 +1008,10 @@ nl_parse_route(struct nlmsghdr *h, int scan) e->u.krt.src = src; e->u.krt.proto = i->rtm_protocol; e->u.krt.type = i->rtm_type; + e->u.krt.metric = 0; if (a[RTA_PRIORITY]) - memcpy(&e->u.krt.metric, RTA_DATA(a[RTA_PRIORITY]), sizeof(e->u.krt.metric)); - else - e->u.krt.metric = 0; + e->u.krt.metric = rta_get_u32(a[RTA_PRIORITY]); if (a[RTA_PREFSRC]) { @@ -995,7 +1042,7 @@ nl_parse_route(struct nlmsghdr *h, int scan) ea->attrs[0].id = EA_KRT_REALM; ea->attrs[0].flags = 0; ea->attrs[0].type = EAF_TYPE_INT; - memcpy(&ea->attrs[0].u.data, RTA_DATA(a[RTA_FLOW]), 4); + ea->attrs[0].u.data = rta_get_u32(a[RTA_FLOW]); } if (a[RTA_METRICS]) @@ -1172,25 +1219,41 @@ nl_open_async(void) bug("Netlink: sk_open failed"); } + /* * Interface to the UNIX krt module */ -static u8 nl_cf_table[(NL_NUM_TABLES+7) / 8]; - void +krt_sys_io_init(void) +{ + HASH_INIT(nl_table_map, krt_pool, 6); +} + +int krt_sys_start(struct krt_proto *p) { - nl_table_map[KRT_CF->sys.table_id] = p; + struct krt_proto *old = HASH_FIND(nl_table_map, RTH, krt_table_id(p)); + + if (old) + { + log(L_ERR "%s: Kernel table %u already registered by %s", + p->p.name, krt_table_id(p), old->p.name); + return 0; + } + + HASH_INSERT2(nl_table_map, RTH, krt_pool, p); nl_open(); nl_open_async(); + + return 1; } void -krt_sys_shutdown(struct krt_proto *p UNUSED) +krt_sys_shutdown(struct krt_proto *p) { - nl_table_map[KRT_CF->sys.table_id] = NULL; + HASH_REMOVE2(nl_table_map, RTH, krt_pool, p); } int @@ -1199,23 +1262,6 @@ krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt return n->sys.table_id == o->sys.table_id; } - -void -krt_sys_preconfig(struct config *c UNUSED) -{ - bzero(&nl_cf_table, sizeof(nl_cf_table)); -} - -void -krt_sys_postconfig(struct krt_config *x) -{ - int id = x->sys.table_id; - - if (nl_cf_table[id/8] & (1 << (id%8))) - cf_error("Multiple kernel syncers defined for table #%d", id); - nl_cf_table[id/8] |= (1 << (id%8)); -} - void krt_sys_init_config(struct krt_config *cf) { diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index 20a40547..8a431481 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -1330,6 +1330,18 @@ sk_passive_connected(sock *s, int type) log(L_WARN "SOCK: Cannot get remote IP address for TCP<"); } + if (fd >= FD_SETSIZE) + { + /* FIXME: Call err_hook instead ? */ + log(L_ERR "SOCK: Incoming connection from %I%J (port %d) %s", + t->daddr, ipa_is_link_local(t->daddr) ? t->iface : NULL, + t->dport, "rejected due to FD_SETSIZE limit"); + close(fd); + t->fd = -1; + rfree(t); + return 1; + } + if (sk_setup(t) < 0) { /* FIXME: Call err_hook instead ? */ @@ -1406,6 +1418,9 @@ sk_open(sock *s) if (fd < 0) ERR("socket"); + if (fd >= FD_SETSIZE) + ERR2("FD_SETSIZE limit reached"); + s->af = af; s->fd = fd; diff --git a/sysdep/unix/krt.Y b/sysdep/unix/krt.Y index 630cda38..e036081d 100644 --- a/sysdep/unix/krt.Y +++ b/sysdep/unix/krt.Y @@ -17,7 +17,7 @@ CF_DEFINES CF_DECLS -CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, KRT_SOURCE, KRT_METRIC) +CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, KRT_SOURCE, KRT_METRIC, MERGE, PATHS) CF_GRAMMAR @@ -47,6 +47,8 @@ kern_item: } | DEVICE ROUTES bool { THIS_KRT->devroutes = $3; } | GRACEFUL RESTART bool { THIS_KRT->graceful_restart = $3; } + | MERGE PATHS bool { THIS_KRT->merge_paths = $3 ? KRT_DEFAULT_ECMP_LIMIT : 0; } + | MERGE PATHS bool LIMIT expr { THIS_KRT->merge_paths = $3 ? $5 : 0; if (($5 <= 0) || ($5 > 255)) cf_error("Merge paths limit must be in range 1-255"); } ; /* Kernel interface protocol */ diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index cfb623ce..49bf9519 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -77,6 +77,7 @@ krt_io_init(void) krt_pool = rp_new(&root_pool, "Kernel Syncer"); krt_filter_lp = lp_new(krt_pool, 4080); init_list(&krt_proto_list); + krt_sys_io_init(); } /* @@ -592,6 +593,48 @@ krt_flush_routes(struct krt_proto *p) FIB_WALK_END; } +static struct rte * +krt_export_net(struct krt_proto *p, net *net, rte **rt_free, ea_list **tmpa) +{ + struct announce_hook *ah = p->p.main_ahook; + struct filter *filter = ah->out_filter; + rte *rt; + + if (p->p.accept_ra_types == RA_MERGED) + return rt_export_merged(ah, net, rt_free, tmpa, 1); + + rt = net->routes; + *rt_free = NULL; + + if (!rte_is_valid(rt)) + return NULL; + + if (filter == FILTER_REJECT) + return NULL; + + struct proto *src = rt->attrs->src->proto; + *tmpa = src->make_tmp_attrs ? src->make_tmp_attrs(rt, krt_filter_lp) : NULL; + + /* We could run krt_import_control() here, but it is already handled by KRF_INSTALLED */ + + if (filter == FILTER_ACCEPT) + goto accept; + + if (f_run(filter, &rt, tmpa, krt_filter_lp, FF_FORCE_TMPATTR) > F_ACCEPT) + goto reject; + + +accept: + if (rt != net->routes) + *rt_free = rt; + return rt; + +reject: + if (rt != net->routes) + rte_free(rt); + return NULL; +} + static int krt_same_dest(rte *k, rte *e) { @@ -620,7 +663,6 @@ krt_same_dest(rte *k, rte *e) void krt_got_route(struct krt_proto *p, rte *e) { - rte *old; net *net = e->net; int verdict; @@ -663,15 +705,26 @@ krt_got_route(struct krt_proto *p, rte *e) goto sentenced; } - old = net->routes; - if ((net->n.flags & KRF_INSTALLED) && rte_is_valid(old)) + if (net->n.flags & KRF_INSTALLED) { - /* There may be changes in route attributes, we ignore that. - Also, this does not work well if gw is changed in export filter */ - if ((net->n.flags & KRF_SYNC_ERROR) || ! krt_same_dest(e, old)) + rte *new, *rt_free; + ea_list *tmpa; + + new = krt_export_net(p, net, &rt_free, &tmpa); + + /* TODO: There also may be changes in route eattrs, we ignore that for now. */ + + if (!new) + verdict = KRF_DELETE; + else if ((net->n.flags & KRF_SYNC_ERROR) || !krt_same_dest(e, new)) verdict = KRF_UPDATE; else verdict = KRF_SEEN; + + if (rt_free) + rte_free(rt_free); + + lp_flush(krt_filter_lp); } else verdict = KRF_DELETE; @@ -692,25 +745,6 @@ krt_got_route(struct krt_proto *p, rte *e) rte_free(e); } -static inline int -krt_export_rte(struct krt_proto *p, rte **new, ea_list **tmpa) -{ - struct filter *filter = p->p.main_ahook->out_filter; - - if (! *new) - return 0; - - if (filter == FILTER_REJECT) - return 0; - - if (filter == FILTER_ACCEPT) - return 1; - - struct proto *src = (*new)->attrs->src->proto; - *tmpa = src->make_tmp_attrs ? src->make_tmp_attrs(*new, krt_filter_lp) : NULL; - return f_run(filter, new, tmpa, krt_filter_lp, FF_FORCE_TMPATTR) <= F_ACCEPT; -} - static void krt_prune(struct krt_proto *p) { @@ -721,7 +755,7 @@ krt_prune(struct krt_proto *p) { net *n = (net *) f; int verdict = f->flags & KRF_VERDICT_MASK; - rte *new, *new0, *old; + rte *new, *old, *rt_free = NULL; ea_list *tmpa = NULL; if (verdict == KRF_UPDATE || verdict == KRF_DELETE) @@ -733,23 +767,18 @@ krt_prune(struct krt_proto *p) else old = NULL; - new = new0 = n->routes; if (verdict == KRF_CREATE || verdict == KRF_UPDATE) { /* We have to run export filter to get proper 'new' route */ - if (! krt_export_rte(p, &new, &tmpa)) - { - /* Route rejected, should not happen (KRF_INSTALLED) but to be sure .. */ - verdict = (verdict == KRF_CREATE) ? KRF_IGNORE : KRF_DELETE; - } + new = krt_export_net(p, n, &rt_free, &tmpa); + + if (!new) + verdict = (verdict == KRF_CREATE) ? KRF_IGNORE : KRF_DELETE; else - { - ea_list **x = &tmpa; - while (*x) - x = &((*x)->next); - *x = new ? new->attrs->eattrs : NULL; - } + tmpa = ea_append(tmpa, new->attrs->eattrs); } + else + new = NULL; switch (verdict) { @@ -778,8 +807,8 @@ krt_prune(struct krt_proto *p) if (old) rte_free(old); - if (new != new0) - rte_free(new); + if (rt_free) + rte_free(rt_free); lp_flush(krt_filter_lp); f->flags &= ~KRF_VERDICT_MASK; } @@ -974,7 +1003,8 @@ krt_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool * * We will remove KRT_INSTALLED flag, which stops such withdraw to be * processed in krt_rt_notify() and krt_replace_rte(). */ - e->net->n.flags &= ~KRF_INSTALLED; + if (e == e->net->routes) + e->net->n.flags &= ~KRF_INSTALLED; #endif return -1; } @@ -1066,11 +1096,13 @@ krt_rte_same(rte *a, rte *b) struct krt_config *krt_cf; static struct proto * -krt_init(struct proto_config *c) +krt_init(struct proto_config *C) { - struct krt_proto *p = proto_new(c, sizeof(struct krt_proto)); + struct krt_proto *p = proto_new(C, sizeof(struct krt_proto)); + struct krt_config *c = (struct krt_config *) C; - p->p.accept_ra_types = RA_OPTIMAL; + p->p.accept_ra_types = c->merge_paths ? RA_MERGED : RA_OPTIMAL; + p->p.merge_limit = c->merge_paths; p->p.import_control = krt_import_control; p->p.rt_notify = krt_rt_notify; p->p.if_notify = krt_if_notify; @@ -1095,7 +1127,11 @@ krt_start(struct proto *P) krt_learn_init(p); #endif - krt_sys_start(p); + if (!krt_sys_start(p)) + { + rem_node(&p->krt_node); + return PS_START; + } krt_scan_timer_start(p); @@ -1119,8 +1155,10 @@ krt_shutdown(struct proto *P) p->ready = 0; p->initialized = 0; - krt_sys_shutdown(p); + if (p->p.proto_state == PS_START) + return PS_DOWN; + krt_sys_shutdown(p); rem_node(&p->krt_node); return PS_DOWN; @@ -1136,7 +1174,8 @@ krt_reconfigure(struct proto *p, struct proto_config *new) return 0; /* persist, graceful restart need not be the same */ - return o->scan_time == n->scan_time && o->learn == n->learn && o->devroutes == n->devroutes; + return o->scan_time == n->scan_time && o->learn == n->learn && + o->devroutes == n->devroutes && o->merge_paths == n->merge_paths; } static void diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index 1940cbcd..aea20102 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -26,6 +26,8 @@ struct kif_proto; #define KRF_DELETE 3 /* Should be deleted */ #define KRF_IGNORE 4 /* To be ignored */ +#define KRT_DEFAULT_ECMP_LIMIT 16 + #define EA_KRT_SOURCE EA_CODE(EAP_KRT, 0) #define EA_KRT_METRIC EA_CODE(EAP_KRT, 1) @@ -47,6 +49,7 @@ struct krt_config { int learn; /* Learn routes from other sources */ int devroutes; /* Allow export of device routes */ int graceful_restart; /* Regard graceful restart recovery */ + int merge_paths; /* Exported routes are merged for ECMP */ }; struct krt_proto { @@ -116,8 +119,9 @@ struct proto_config * krt_init_config(int class); /* krt sysdep */ +void krt_sys_io_init(void); void krt_sys_init(struct krt_proto *); -void krt_sys_start(struct krt_proto *); +int krt_sys_start(struct krt_proto *); void krt_sys_shutdown(struct krt_proto *); int krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt_config *o); diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index eb33a566..1f3a91d1 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -39,7 +39,6 @@ #include "lib/main_helper.h" - /* * Hic Est main() */ diff --git a/sysdep/unix/main_helper.c b/sysdep/unix/main_helper.c index 7b71bed1..42a9e6cb 100644 --- a/sysdep/unix/main_helper.c +++ b/sysdep/unix/main_helper.c @@ -100,7 +100,7 @@ drop_gid(gid_t gid) void add_num_const(char *name, int val) { - struct symbol *s = cf_find_symbol(name); + struct symbol *s = cf_get_symbol(name); s->class = SYM_CONSTANT | T_INT; s->def = cfg_allocz(sizeof(struct f_val)); SYM_TYPE(s) = T_INT; diff --git a/tools/Makefile.in b/tools/Makefile.in index 3c4573b3..46c7d112 100644 --- a/tools/Makefile.in +++ b/tools/Makefile.in @@ -56,10 +56,12 @@ $(exedir)/bird: $(bird-dep) lib/main.o $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) $(exedir)/birdc: $(birdc-dep) - $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) $(CLIENT_LIBS) + @echo LD $(LDFLAGS) -o $@ $^ $(LIBS) $(CLIENT_LIBS) + @$(CC) $(LDFLAGS) -o $@ $^ $(LIBS) $(CLIENT_LIBS) $(exedir)/birdcl: $(birdcl-dep) - $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) + @echo LD $(LDFLAGS) -o $@ $^ $(LIBS) + @$(CC) $(LDFLAGS) -o $@ $^ $(LIBS) .dir-stamp: sysdep/paths.h mkdir -p $(static-dirs) $(client-dirs) $(doc-dirs) diff --git a/tools/Rules.in b/tools/Rules.in index c383dad8..f2d2dd0b 100644 --- a/tools/Rules.in +++ b/tools/Rules.in @@ -83,12 +83,14 @@ subdir: all.o all.o: $(objs) # $(LD) -r -o $@ $^ # Changed to $(CC) because $(LD) has problems with crosscompiling - $(CC) -nostdlib -r -o $@ $^ + @echo LD -r -o $@ $^ + @$(CC) -nostdlib -r -o $@ $^ endif %.o: $(src-path)%.c - $(CC) $(CFLAGS) -o $@ -c $< + @echo CC -o $@ -c $< + @$(CC) $(CFLAGS) -o $@ -c $< ifndef source-dep source-dep := $(source)