diff --git a/NEWS b/NEWS index 117f4d7b..f7e384b4 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,32 @@ -Version 1.5.0pre (2014-11-05) - Not for production - o Major OSPF protocol redesign - o RFC 6549 - OSPFv2 multi-instance extension +Version 1.5.0 (2015-04-20) + o Major OSPF protocol redesign. + o OSPFv2 multi-instance extension (RFC 6549). + o BGP AS-wide unique router ID (RFC 6286). + o BGP enhanced route refresh (RFC 7313). + o Link state support in BGP. + o Latency tracking and internal watchdog. + o Uses high port range for BFD on BSD. + o Increase max symbol length to 64. + o Allows to define unnamed protocols from templates. + o Fixes two serious bugs in BGP. + o Several bugfixes and minor improvements. + o Several minor option changes: + - OSPF: Protocol-wide 'instance id' option added. + - BGP: Parameters to option 'neighbor' extended. + - BGP: Separate option 'interface' added. + - BGP: Option 'start delay time' renamed to 'connect delay time'. + - BGP: Option 'route limit' deprecated. + + Upgrade notes: + + For OSPF, there are deep internal changes, but user-visible changes + are limited to log messages and minor changes in formatting of command + output. + + For BGP, version 1.5.0 is essentially a minor release. There are two + deprecated options ('start delay time' and 'route limit') and some + minor formatting changes. + Version 1.4.5 (2014-10-06) o New 'show route noexport' command option. 
diff --git a/conf/conf.h b/conf/conf.h index 00a8c8f2..6ab53e25 100644 --- a/conf/conf.h +++ b/conf/conf.h @@ -41,6 +41,10 @@ struct config { u32 gr_wait; /* Graceful restart wait timeout */ int cli_debug; /* Tracing of CLI connections and commands */ + int latency_debug; /* I/O loop tracks duration of each event */ + u32 latency_limit; /* Events with longer duration are logged (us) */ + u32 watchdog_warning; /* I/O loop watchdog limit for warning (us) */ + u32 watchdog_timeout; /* Watchdog timeout (in seconds, 0 = disabled) */ char *err_msg; /* Parser error message */ int err_lino; /* Line containing error */ char *err_file_name; /* File name containing error */ diff --git a/doc/bird.conf.example b/doc/bird.conf.example index dcc62e29..bbfe0020 100644 --- a/doc/bird.conf.example +++ b/doc/bird.conf.example @@ -33,6 +33,10 @@ # Turn on global debugging of all protocols #debug protocols all; +# Turn on internal watchdog +#watchdog warning 5 s; +#watchdog timeout 30 s; + # The direct protocol automatically generates device routes to # all network interfaces. Can exist in as many instances as you wish # if you want to populate multiple routing tables with device routes. 
@@ -162,7 +166,7 @@ protocol static { # }; # }; #} - + #protocol bgp { # disabled; @@ -186,7 +190,7 @@ protocol static { # source address 198.51.100.14; # What local address we use for the TCP connection # password "secret"; # Password used for MD5 authentication # rr client; # I am a route reflector and the neighor is my client -# rr cluster id 1.0.0.1; # Use this value for cluster id instead of my router id +# rr cluster id 1.0.0.1; # Use this value for cluster id instead of my router id # export where source=RTS_STATIC; # export filter { # if source = RTS_STATIC then { @@ -202,7 +206,7 @@ protocol static { # reject; # }; #} -# +# # Template usage example #template bgp rr_client { # disabled; diff --git a/doc/bird.sgml b/doc/bird.sgml index 04d8284f..fc5fc9ae 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -344,6 +344,23 @@ protocol rip { of connects and disconnects, 2 and higher for logging of all client commands). Default: 0. + debug latency + Activate tracking of elapsed time for internal events. Recent events + could be examined using 'dump events' command. + + debug latency limit + If 'debug latency' is enabled, events whose elapsed time exceeds this + limit are logged. + + watchdog warning + Set time limit for I/O loop cycle. If one iteration took more time to + complete, a warning is logged. Default: 5 s. + + watchdog timeout + Set time limit for I/O loop cycle. If the limit is breached, BIRD is + killed by abort signal. The timeout has effective granularity of + seconds, zero means disabled. Default: disabled (0). + mrtdump " Set MRTdump file name. This option must be specified to allow MRTdump feature. Default: no dump file. @@ -1787,13 +1804,17 @@ using the following configuration parameters: other means. Default: 0 (no local AS number allowed). enable route refresh - When BGP speaker changes its import filter, it has to re-examine all - routes received from its neighbor against the new filter. 
As these - routes might not be available, there is a BGP protocol extension Route - Refresh (specified in RFC 2918) that allows BGP speaker to request - re-advertisement of all routes from its neighbor. This option specifies - whether BIRD advertises this capability and accepts such requests. Even - when disabled, BIRD can send route refresh requests. Default: on. + After the initial route exchange, BGP protocol uses incremental updates + to keep BGP speakers synchronized. Sometimes (e.g., if BGP speaker + changes its import filter, or if there is suspicion of inconsistency) it + is necessary to do a new complete route exchange. BGP protocol extension + Route Refresh (RFC 2918) allows BGP speaker to request re-advertisement + of all routes from its neighbor. BGP protocol extension Enhanced Route + Refresh (RFC 7313) specifies explicit begin and end for such exchanges, + therefore the receiver can remove stale routes that were not advertised + during the exchange. This option specifies whether BIRD advertises these + capabilities and supports related procedures. Note that even when + disabled, BIRD can send route refresh requests. Default: on. 
graceful restart When a BGP speaker restarts or crashes, neighbors will discard all diff --git a/lib/event.c b/lib/event.c index b429c205..c33e0ffc 100644 --- a/lib/event.c +++ b/lib/event.c @@ -114,6 +114,8 @@ ev_schedule(event *e) ev_enqueue(&global_event_list, e); } +void io_log_event(void *hook, void *data); + /** * ev_run_list - run an event list * @l: an event list @@ -132,6 +134,11 @@ ev_run_list(event_list *l) WALK_LIST_FIRST(n, tmp_list) { event *e = SKIP_BACK(event, n, n); + + /* This is ugly hack, we want to log just events executed from the main I/O loop */ + if (l == &global_event_list) + io_log_event(e->hook, e->data); + ev_run(e); } return !EMPTY_LIST(*l); diff --git a/misc/bird.spec b/misc/bird.spec index 30601a91..e6b699a0 100644 --- a/misc/bird.spec +++ b/misc/bird.spec @@ -1,6 +1,6 @@ Summary: BIRD Internet Routing Daemon Name: bird -Version: 1.4.5 +Version: 1.5.0 Release: 1 Copyright: GPL Group: Networking/Daemons @@ -41,11 +41,11 @@ install $RPM_SOURCE_DIR/birdc6 usr/sbin/birdc6 %post /sbin/ldconfig /sbin/chkconfig --add bird - + %preun if [ $1 = 0 ] ; then /sbin/chkconfig --del bird -fi +fi %files %attr(755,root,root) /usr/sbin/bird diff --git a/nest/config.Y b/nest/config.Y index 8b697292..939bed6a 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -181,6 +181,12 @@ proto_name: cf_define_symbol($1, this_proto->class, this_proto); this_proto->name = $1->name; } + | FROM SYM { + struct symbol *s = cf_default_name(this_proto->protocol->template, &this_proto->protocol->name_counter); + this_proto->name = s->name; + if (($2->class != SYM_TEMPLATE) && ($2->class != SYM_PROTO)) cf_error("Template or protocol name expected"); + proto_copy_config(this_proto, $2->def); + } | SYM FROM SYM { if (($3->class != SYM_TEMPLATE) && ($3->class != SYM_PROTO)) cf_error("Template or protocol name expected"); @@ -621,6 +627,8 @@ CF_CLI(DUMP RESOURCES,,, [[Dump all allocated resource]]) { rdump(&root_pool); cli_msg(0, ""); } ; CF_CLI(DUMP SOCKETS,,, [[Dump open 
sockets]]) { sk_dump_all(); cli_msg(0, ""); } ; +CF_CLI(DUMP EVENTS,,, [[Dump event log]]) +{ io_log_dump(); cli_msg(0, ""); } ; CF_CLI(DUMP INTERFACES,,, [[Dump interface information]]) { if_dump_all(); cli_msg(0, ""); } ; CF_CLI(DUMP NEIGHBORS,,, [[Dump neighbor cache]]) diff --git a/nest/proto.c b/nest/proto.c index 7339e4f4..44cfb637 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -942,8 +942,8 @@ proto_feed_more(void *P) p->export_state = ES_READY; proto_log_state_change(p); - if (p->feed_done) - p->feed_done(p); + if (p->feed_end) + p->feed_end(p); } else { @@ -976,6 +976,9 @@ proto_schedule_feed(struct proto *p, int initial) p->attn->hook = initial ? proto_feed_initial : proto_feed_more; ev_schedule(p->attn); + + if (p->feed_begin) + p->feed_begin(p, initial); } /* diff --git a/nest/protocol.h b/nest/protocol.h index f46e0b13..8660cc2c 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -179,7 +179,8 @@ struct proto { * reload_routes Request protocol to reload all its routes to the core * (using rte_update()). Returns: 0=reload cannot be done, * 1= reload is scheduled and will happen (asynchronously). - * feed_done Notify protocol about finish of route feeding. + * feed_begin Notify protocol about beginning of route feeding. + * feed_end Notify protocol about finish of route feeding. 
*/ void (*if_notify)(struct proto *, unsigned flags, struct iface *i); @@ -190,7 +191,8 @@ struct proto { void (*store_tmp_attrs)(struct rte *rt, struct ea_list *attrs); int (*import_control)(struct proto *, struct rte **rt, struct ea_list **attrs, struct linpool *pool); int (*reload_routes)(struct proto *); - void (*feed_done)(struct proto *); + void (*feed_begin)(struct proto *, int initial); + void (*feed_end)(struct proto *); /* * Routing entry hooks (called only for routes belonging to this protocol): diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 050f737f..e48b643b 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -377,6 +377,8 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) p->conn = conn; p->last_error_class = 0; p->last_error_code = 0; + p->feed_state = BFS_NONE; + p->load_state = BFS_NONE; bgp_init_bucket_table(p); bgp_init_prefix_table(p, 8); @@ -394,6 +396,12 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) if (p->gr_active && (!conn->peer_gr_able || !(conn->peer_gr_aflags & BGP_GRF_FORWARDING))) bgp_graceful_restart_done(p); + /* GR capability implies that neighbor will send End-of-RIB */ + if (conn->peer_gr_aware) + p->load_state = BFS_LOADING; + + /* proto_notify_state() will likely call bgp_feed_begin(), setting p->feed_state */ + bgp_conn_set_state(conn, BS_ESTABLISHED); proto_notify_state(&p->p, PS_UP); } @@ -504,6 +512,47 @@ bgp_graceful_restart_timeout(timer *t) bgp_stop(p, 0); } + +/** + * bgp_refresh_begin - start incoming enhanced route refresh sequence + * @p: BGP instance + * + * This function is called when an incoming enhanced route refresh sequence is + * started by the neighbor, demarcated by the BoRR packet. The function updates + * the load state and starts the routing table refresh cycle. Note that graceful + * restart also uses routing table refresh cycle, but RFC 7313 and load states + * ensure that these two sequences do not overlap. 
+ */ +void +bgp_refresh_begin(struct bgp_proto *p) +{ + if (p->load_state == BFS_LOADING) + { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } + + p->load_state = BFS_REFRESHING; + rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook); +} + +/** + * bgp_refresh_end - finish incoming enhanced route refresh sequence + * @p: BGP instance + * + * This function is called when an incoming enhanced route refresh sequence is + * finished by the neighbor, demarcated by the EoRR packet. The function updates + * the load state and ends the routing table refresh cycle. Routes not received + * during the sequence are removed by the nest. + */ +void +bgp_refresh_end(struct bgp_proto *p) +{ + if (p->load_state != BFS_REFRESHING) + { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } + + p->load_state = BFS_NONE; + rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); +} + + static void bgp_send_open(struct bgp_conn *conn) { @@ -514,6 +563,7 @@ bgp_send_open(struct bgp_conn *conn) conn->peer_refresh_support = 0; conn->peer_as4_support = 0; conn->peer_add_path = 0; + conn->peer_enhanced_refresh_support = 0; conn->peer_gr_aware = 0; conn->peer_gr_able = 0; conn->peer_gr_time = 0; @@ -959,16 +1009,56 @@ bgp_reload_routes(struct proto *P) } static void -bgp_feed_done(struct proto *P) +bgp_feed_begin(struct proto *P, int initial) { struct bgp_proto *p = (struct bgp_proto *) P; - if (!p->conn || !p->cf->gr_mode || p->p.refeeding) + + /* This should not happen */ + if (!p->conn) return; - p->send_end_mark = 1; + if (initial && p->cf->gr_mode) + p->feed_state = BFS_LOADING; + + /* It is refeed and both sides support enhanced route refresh */ + if (!initial && p->cf->enable_refresh && + p->conn->peer_enhanced_refresh_support) + { + /* BoRR must not be sent before End-of-RIB */ + if (p->feed_state == BFS_LOADING || p->feed_state == BFS_LOADED) + return; + + p->feed_state = BFS_REFRESHING; + 
bgp_schedule_packet(p->conn, PKT_BEGIN_REFRESH); + } +} + +static void +bgp_feed_end(struct proto *P) +{ + struct bgp_proto *p = (struct bgp_proto *) P; + + /* This should not happen */ + if (!p->conn) + return; + + /* Non-demarcated feed ended, nothing to do */ + if (p->feed_state == BFS_NONE) + return; + + /* Schedule End-of-RIB packet */ + if (p->feed_state == BFS_LOADING) + p->feed_state = BFS_LOADED; + + /* Schedule EoRR packet */ + if (p->feed_state == BFS_REFRESHING) + p->feed_state = BFS_REFRESHED; + + /* Kick TX hook */ bgp_schedule_packet(p->conn, PKT_UPDATE); } + static void bgp_start_locked(struct object_lock *lock) { @@ -1150,7 +1240,8 @@ bgp_init(struct proto_config *C) P->import_control = bgp_import_control; P->neigh_notify = bgp_neigh_notify; P->reload_routes = bgp_reload_routes; - P->feed_done = bgp_feed_done; + P->feed_begin = bgp_feed_begin; + P->feed_end = bgp_feed_end; P->rte_better = bgp_rte_better; P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL; @@ -1426,8 +1517,9 @@ bgp_show_proto_info(struct proto *P) else if (P->proto_state == PS_UP) { cli_msg(-1006, " Neighbor ID: %R", p->remote_id); - cli_msg(-1006, " Neighbor caps: %s%s%s%s%s", + cli_msg(-1006, " Neighbor caps: %s%s%s%s%s%s", c->peer_refresh_support ? " refresh" : "", + c->peer_enhanced_refresh_support ? " enhanced-refresh" : "", c->peer_gr_able ? " restart-able" : (c->peer_gr_aware ? " restart-aware" : ""), c->peer_as4_support ? " AS4" : "", (c->peer_add_path & ADD_PATH_RX) ? 
" add-path-rx" : "", diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 2c2b02b8..f4f21226 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -103,6 +103,7 @@ struct bgp_conn { u8 peer_refresh_support; /* Peer supports route refresh [RFC2918] */ u8 peer_as4_support; /* Peer supports 4B AS numbers [RFC4893] */ u8 peer_add_path; /* Peer supports ADD-PATH [draft] */ + u8 peer_enhanced_refresh_support; /* Peer supports enhanced refresh [RFC7313] */ u8 peer_gr_aware; u8 peer_gr_able; u16 peer_gr_time; @@ -127,6 +128,8 @@ struct bgp_proto { int rs_client; /* Whether neighbor is RS client of me */ u8 gr_ready; /* Neighbor could do graceful restart */ u8 gr_active; /* Neighbor is doing graceful restart */ + u8 feed_state; /* Feed state (TX) for EoR, RR packets, see BFS_* */ + u8 load_state; /* Load state (RX) for EoR, RR packets, see BFS_* */ struct bgp_conn *conn; /* Connection we have established */ struct bgp_conn outgoing_conn; /* Outgoing connection we're working with */ struct bgp_conn incoming_conn; /* Incoming connection we have neither accepted nor rejected yet */ @@ -144,7 +147,6 @@ struct bgp_proto { slab *prefix_slab; /* Slab holding prefix nodes */ list bucket_queue; /* Queue of buckets to send */ struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ - unsigned send_end_mark; /* End-of-RIB mark scheduled for transmit */ unsigned startup_delay; /* Time to delay protocol startup by due to errors */ bird_clock_t last_proto_error; /* Time of last error that leads to protocol stop */ u8 last_error_class; /* Error class of last error */ @@ -196,6 +198,8 @@ void bgp_conn_enter_close_state(struct bgp_conn *conn); void bgp_conn_enter_idle_state(struct bgp_conn *conn); void bgp_handle_graceful_restart(struct bgp_proto *p); void bgp_graceful_restart_done(struct bgp_proto *p); +void bgp_refresh_begin(struct bgp_proto *p); +void bgp_refresh_end(struct bgp_proto *p); void bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code); void 
bgp_stop(struct bgp_proto *p, unsigned subcode); @@ -263,7 +267,8 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi #define PKT_UPDATE 0x02 #define PKT_NOTIFICATION 0x03 #define PKT_KEEPALIVE 0x04 -#define PKT_ROUTE_REFRESH 0x05 +#define PKT_ROUTE_REFRESH 0x05 /* [RFC2918] */ +#define PKT_BEGIN_REFRESH 0x1e /* Dummy type for BoRR packet [RFC7313] */ #define PKT_SCHEDULE_CLOSE 0x1f /* Used internally to schedule socket close */ /* Attributes */ @@ -306,13 +311,13 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi #define BS_MAX 7 /* BGP start states - * + * * Used in PS_START for fine-grained specification of starting state. * - * When BGP protocol is started by core, it goes to BSS_PREPARE. When BGP protocol - * done what is neccessary to start itself (like acquiring the lock), it goes to BSS_CONNECT. - * When some connection attempt failed because of option or capability error, it goes to - * BSS_CONNECT_NOCAP. + * When BGP protocol is started by core, it goes to BSS_PREPARE. When BGP + * protocol has done what is necessary to start itself (like acquiring the lock), + * it goes to BSS_CONNECT. When some connection attempt failed because of + * option or capability error, it goes to BSS_CONNECT_NOCAP. */ #define BSS_PREPARE 0 /* Used before ordinary BGP started, i. e. waiting for lock */ @@ -320,6 +325,33 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi #define BSS_CONNECT 2 /* Ordinary BGP connecting */ #define BSS_CONNECT_NOCAP 3 /* Legacy BGP connecting (without capabilities) */ + +/* BGP feed states (TX) + * + * RFC 4724 specifies that an initial feed should end with End-of-RIB mark. + * + * RFC 7313 specifies that a route refresh should be demarcated by BoRR and EoRR packets. + * + * These states (stored in p->feed_state) are used to keep track of these + * requirements. When such feed is started, BFS_LOADING / BFS_REFRESHING is + * set. 
When it ends, BFS_LOADED / BFS_REFRESHED is set to schedule End-of-RIB + * or EoRR packet. When the packet is sent, the state returns to BFS_NONE. + * + * Note that when a non-demarcated feed (e.g. plain RFC 4271 initial load + * without End-of-RIB or plain RFC 2918 route refresh without BoRR/EoRR + * demarcation) is active, BFS_NONE is set. + * + * BFS_NONE, BFS_LOADING and BFS_REFRESHING are also used as load states (RX) + * with corresponding semantics (-, expecting End-of-RIB, expecting EoRR). + */ + +#define BFS_NONE 0 /* No feed or original non-demarcated feed */ +#define BFS_LOADING 1 /* Initial feed active, End-of-RIB planned */ +#define BFS_LOADED 2 /* Loading done, End-of-RIB marker scheduled */ +#define BFS_REFRESHING 3 /* Route refresh (introduced by BoRR) active */ +#define BFS_REFRESHED 4 /* Refresh done, EoRR packet scheduled */ + + /* Error classes */ #define BE_NONE 0 diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index 69646c7d..4bd68f52 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -22,6 +22,12 @@ #include "bgp.h" + +#define BGP_RR_REQUEST 0 +#define BGP_RR_BEGIN 1 +#define BGP_RR_END 2 + + static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS; static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS; @@ -209,6 +215,15 @@ bgp_put_cap_add_path(struct bgp_proto *p, byte *buf) return buf; } +static byte * +bgp_put_cap_err(struct bgp_proto *p UNUSED, byte *buf) +{ + *buf++ = 70; /* Capability 70: Support for enhanced route refresh */ + *buf++ = 0; /* Capability data length */ + return buf; +} + + static byte * bgp_create_open(struct bgp_conn *conn, byte *buf) { @@ -256,6 +271,9 @@ bgp_create_open(struct bgp_conn *conn, byte *buf) if (p->cf->add_path) cap = bgp_put_cap_add_path(p, cap); + if (p->cf->enable_refresh) + cap = bgp_put_cap_err(p, cap); + cap_len = cap - buf - 12; if (cap_len > 0) { @@ -389,7 +407,7 @@ static byte * bgp_create_end_mark(struct bgp_conn *conn, byte *buf) { struct bgp_proto *p = conn->bgp; - 
BGP_TRACE(D_PACKETS, "Sending End-of-RIB"); + BGP_TRACE(D_PACKETS, "Sending END-OF-RIB"); put_u32(buf, 0); return buf+4; @@ -568,7 +586,7 @@ static byte * bgp_create_end_mark(struct bgp_conn *conn, byte *buf) { struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending End-of-RIB"); + BGP_TRACE(D_PACKETS, "Sending END-OF-RIB"); put_u16(buf+0, 0); put_u16(buf+2, 6); /* length 4-9 */ @@ -586,19 +604,49 @@ bgp_create_end_mark(struct bgp_conn *conn, byte *buf) #endif -static byte * +static inline byte * bgp_create_route_refresh(struct bgp_conn *conn, byte *buf) { struct bgp_proto *p = conn->bgp; BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH"); + /* Original route refresh request, RFC 2918 */ *buf++ = 0; *buf++ = BGP_AF; - *buf++ = 0; /* RFU */ - *buf++ = 1; /* and SAFI 1 */ + *buf++ = BGP_RR_REQUEST; + *buf++ = 1; /* SAFI */ return buf; } +static inline byte * +bgp_create_begin_refresh(struct bgp_conn *conn, byte *buf) +{ + struct bgp_proto *p = conn->bgp; + BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR"); + + /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */ + *buf++ = 0; + *buf++ = BGP_AF; + *buf++ = BGP_RR_BEGIN; + *buf++ = 1; /* SAFI */ + return buf; +} + +static inline byte * +bgp_create_end_refresh(struct bgp_conn *conn, byte *buf) +{ + struct bgp_proto *p = conn->bgp; + BGP_TRACE(D_PACKETS, "Sending END-OF-RR"); + + /* Demarcation of ending of route refresh (EoRR), RFC 7313 */ + *buf++ = 0; + *buf++ = BGP_AF; + *buf++ = BGP_RR_END; + *buf++ = 1; /* SAFI */ + return buf; +} + + static void bgp_create_header(byte *buf, unsigned int len, unsigned int type) { @@ -666,24 +714,44 @@ bgp_fire_tx(struct bgp_conn *conn) type = PKT_ROUTE_REFRESH; end = bgp_create_route_refresh(conn, pkt); } + else if (s & (1 << PKT_BEGIN_REFRESH)) + { + s &= ~(1 << PKT_BEGIN_REFRESH); + type = PKT_ROUTE_REFRESH; /* BoRR is a subtype of RR */ + end = bgp_create_begin_refresh(conn, pkt); + } else if (s & (1 << PKT_UPDATE)) { - end = bgp_create_update(conn, pkt); type 
= PKT_UPDATE; + end = bgp_create_update(conn, pkt); if (!end) - { + { + /* No update to send, perhaps we need to send End-of-RIB or EoRR */ + conn->packets_to_send = 0; - if (!p->send_end_mark) + if (p->feed_state == BFS_LOADED) + { + type = PKT_UPDATE; + end = bgp_create_end_mark(conn, pkt); + } + + else if (p->feed_state == BFS_REFRESHED) + { + type = PKT_ROUTE_REFRESH; + end = bgp_create_end_refresh(conn, pkt); + } + + else /* Really nothing to send */ return 0; - p->send_end_mark = 0; - end = bgp_create_end_mark(conn, pkt); + p->feed_state = BFS_NONE; } } else return 0; + conn->packets_to_send = s; bgp_create_header(buf, end - buf, type); return sk_send(sk, end - buf); @@ -701,7 +769,7 @@ bgp_schedule_packet(struct bgp_conn *conn, int type) { DBG("BGP: Scheduling packet type %d\n", type); conn->packets_to_send |= 1 << type; - if (conn->sk && conn->sk->tpos == conn->sk->tbuf) + if (conn->sk && conn->sk->tpos == conn->sk->tbuf && !ev_active(conn->tx_ev)) ev_schedule(conn->tx_ev); } @@ -737,7 +805,7 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) { if (len < 2 || len < 2 + opt[1]) goto err; - + cl = opt[1]; switch (opt[0]) @@ -780,7 +848,12 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) conn->peer_add_path = opt[2+i+3]; if (conn->peer_add_path > ADD_PATH_FULL) goto err; + break; + case 70: /* Enhanced route refresh capability, RFC 7313 */ + if (cl != 0) + goto err; + conn->peer_enhanced_refresh_support = 1; break; /* We can safely ignore all other capabilities */ @@ -870,7 +943,8 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) if (hold > 0 && hold < 3) { bgp_error(conn, 2, 6, pkt+22, 2); return; } - if (!id || id == 0xffffffff || id == p->local_id) + /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */ + if (!id || (p->is_internal && id == p->local_id)) { bgp_error(conn, 2, 3, pkt+24, -4); return; } if ((conn->advertised_as != base_as) && (base_as != AS_TRANS)) @@ -905,8 +979,23 @@ bgp_rx_open(struct 
bgp_conn *conn, byte *pkt, int len) break; case BS_OPENCONFIRM: - if ((p->local_id < id) == (conn == &p->incoming_conn)) - { + /* + * Description of collision detection rules in RFC 4271 is confusing and + * contradictory, but it is essentially: + * + * 1. Router with higher ID is dominant + * 2. If both have the same ID, router with higher ASN is dominant [RFC6286] + * 3. When both connections are in OpenConfirm state, one initiated by + * the dominant router is kept. + * + * The first line in the expression below evaluates whether the neighbor + * is dominant, the second line whether the new connection was initiated + * by the neighbor. If both are true (or both are false), we keep the new + * connection, otherwise we keep the old one. + */ + if (((p->local_id < id) || ((p->local_id == id) && (p->local_as < p->remote_as))) + == (conn == &p->incoming_conn)) + { /* Should close the other connection */ BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection"); bgp_error(other, 6, 7, NULL, 0); @@ -945,7 +1034,10 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) static inline void bgp_rx_end_mark(struct bgp_proto *p) { - BGP_TRACE(D_PACKETS, "Got End-of-RIB"); + BGP_TRACE(D_PACKETS, "Got END-OF-RIB"); + + if (p->load_state == BFS_LOADING) + p->load_state = BFS_NONE; if (p->p.gr_recovery) proto_graceful_restart_unlock(&p->p); @@ -999,7 +1091,11 @@ bgp_rte_update(struct bgp_proto *p, ip_addr prefix, int pxlen, if (!*a) { a0->src = *src; + + /* Workaround for rta_lookup() breaking eattrs */ + ea_list *ea = a0->eattrs; *a = rta_lookup(a0); + a0->eattrs = ea; } net *n = net_get(p->p.table, prefix, pxlen); @@ -1349,7 +1445,9 @@ static struct { { 6, 5, "Connection rejected" }, { 6, 6, "Other configuration change" }, { 6, 7, "Connection collision resolution" }, - { 6, 8, "Out of Resources" } + { 6, 8, "Out of Resources" }, + { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */ + { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */ 
}; /** @@ -1480,22 +1578,47 @@ bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, int len) { struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH"); - if (conn->state != BS_ESTABLISHED) { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } if (!p->cf->enable_refresh) { bgp_error(conn, 1, 3, pkt+18, 1); return; } - if (len != (BGP_HEADER_LENGTH + 4)) + if (len < (BGP_HEADER_LENGTH + 4)) { bgp_error(conn, 1, 2, pkt+16, 2); return; } + if (len > (BGP_HEADER_LENGTH + 4)) + { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; } + /* FIXME - we ignore AFI/SAFI values, as we support just one value and even an error code for an invalid request is not defined */ - proto_request_feeding(&p->p); + /* RFC 7313 redefined reserved field as RR message subtype */ + uint subtype = conn->peer_enhanced_refresh_support ? pkt[21] : BGP_RR_REQUEST; + + switch (subtype) + { + case BGP_RR_REQUEST: + BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH"); + proto_request_feeding(&p->p); + break; + + case BGP_RR_BEGIN: + BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR"); + bgp_refresh_begin(p); + break; + + case BGP_RR_END: + BGP_TRACE(D_PACKETS, "Got END-OF-RR"); + bgp_refresh_end(p); + break; + + default: + log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring", + p->p.name, subtype); + break; + } } diff --git a/proto/ospf/ospf.h b/proto/ospf/ospf.h index c324f431..a4e525ec 100644 --- a/proto/ospf/ospf.h +++ b/proto/ospf/ospf.h @@ -916,9 +916,11 @@ static inline void ospf_send_to_des(struct ospf_iface *ifa) ospf_send_to_bdr(ifa); } +#ifndef PARSER #define DROP(DSC,VAL) do { err_dsc = DSC; err_val = VAL; goto drop; } while(0) #define DROP1(DSC) do { err_dsc = DSC; goto drop; } while(0) #define SKIP(DSC) do { err_dsc = DSC; goto skip; } while(0) +#endif static inline uint ospf_pkt_hdrlen(struct ospf_proto *p) { return ospf_is_v2(p) ? 
(sizeof(struct ospf_packet) + sizeof(union ospf_auth)) : sizeof(struct ospf_packet); } diff --git a/proto/ospf/rt.c b/proto/ospf/rt.c index b616c0d1..74d10c7b 100644 --- a/proto/ospf/rt.c +++ b/proto/ospf/rt.c @@ -95,6 +95,8 @@ merge_nexthops(struct ospf_proto *p, struct mpnh *s1, struct mpnh *s2, int r1, i struct mpnh **n = &root; int count = p->ecmp; + ASSERT(p->ecmp); + /* * r1, r2 signalize whether we can reuse nexthops from s1, s2. * New nexthops (s2, new) can be reused if they are not inherited @@ -153,6 +155,9 @@ fix_device_nexthops(struct ospf_proto *p, const struct mpnh *n, ip_addr gw) struct mpnh **nn1 = &root1; struct mpnh **nn2 = &root2; + if (!p->ecmp) + return new_nexthop(p, gw, n->iface, n->weight); + /* This is a bit tricky. We cannot just copy the list and update n->gw, because the list should stay sorted, so we create two lists, one with new gateways and one with old ones, and then merge them. */ diff --git a/sysdep/bsd/krt-sock.c b/sysdep/bsd/krt-sock.c index 0e65c51c..73f69df5 100644 --- a/sysdep/bsd/krt-sock.c +++ b/sysdep/bsd/krt-sock.c @@ -247,7 +247,7 @@ krt_send_route(struct krt_proto *p, int cmd, rte *e) #ifdef IPV6 /* Embed interface ID to link-local address */ - if (ipa_has_link_scope(gw)) + if (ipa_is_link_local(gw)) _I0(gw) = 0xfe800000 | (i->index & 0x0000ffff); #endif @@ -468,7 +468,7 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan) #ifdef IPV6 /* Clean up embedded interface ID returned in link-local address */ - if (ipa_has_link_scope(a.gw)) + if (ipa_is_link_local(a.gw)) _I0(a.gw) = 0xfe800000; #endif @@ -662,10 +662,10 @@ krt_read_addr(struct ks_msg *msg, int scan) #ifdef IPV6 /* Clean up embedded interface ID returned in link-local address */ - if (ipa_has_link_scope(iaddr)) + if (ipa_is_link_local(iaddr)) _I0(iaddr) = 0xfe800000; - if (ipa_has_link_scope(ibrd)) + if (ipa_is_link_local(ibrd)) _I0(ibrd) = 0xfe800000; #endif diff --git a/sysdep/config.h b/sysdep/config.h index 36cf8391..08c15fe9 100644 --- 
a/sysdep/config.h +++ b/sysdep/config.h @@ -7,7 +7,7 @@ #define _BIRD_CONFIG_H_ /* BIRD version */ -#define BIRD_VERSION "1.5.0pre" +#define BIRD_VERSION "1.5.0" /* Include parameters determined by configure script */ #include "sysdep/autoconf.h" diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 860c8601..48dd8bab 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -436,6 +436,9 @@ nl_parse_link(struct nlmsghdr *h, int scan) else f.flags |= IF_MULTIACCESS; /* NBMA */ + if (fl & IFF_MULTICAST) + f.flags |= IF_MULTICAST; + ifi = if_update(&f); if (!scan) diff --git a/sysdep/unix/config.Y b/sysdep/unix/config.Y index 7fd0ad2d..d6ab8cab 100644 --- a/sysdep/unix/config.Y +++ b/sysdep/unix/config.Y @@ -15,6 +15,7 @@ CF_DECLS CF_KEYWORDS(LOG, SYSLOG, ALL, DEBUG, TRACE, INFO, REMOTE, WARNING, ERROR, AUTH, FATAL, BUG, STDERR, SOFT) CF_KEYWORDS(TIMEFORMAT, ISO, OLD, SHORT, LONG, BASE, NAME, CONFIRM, UNDO, CHECK, TIMEOUT) +CF_KEYWORDS(DEBUG, LATENCY, LIMIT, WATCHDOG, WARNING, TIMEOUT) %type log_mask log_mask_list log_cat cfg_timeout %type log_file @@ -83,6 +84,7 @@ mrtdump_base: } ; + CF_ADDTO(conf, timeformat_base) timeformat_which: @@ -104,6 +106,17 @@ timeformat_base: TIMEFORMAT timeformat_spec ';' ; + +CF_ADDTO(conf, debug_unix) + +debug_unix: + DEBUG LATENCY bool { new_config->latency_debug = $3; } + | DEBUG LATENCY LIMIT expr_us { new_config->latency_limit = $4; } + | WATCHDOG WARNING expr_us { new_config->watchdog_warning = $3; } + | WATCHDOG TIMEOUT expr_us { new_config->watchdog_timeout = ($3 + 999999) TO_S; } + ; + + /* Unix specific commands */ CF_CLI_HELP(CONFIGURE, ..., [[Reload configuration]]) diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index daf9d054..0724667d 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -332,6 +332,8 @@ tm_first_shot(void) return x; } +void io_log_event(void *hook, void *data); + static void tm_shot(void) { @@ -372,6 +374,7 @@ tm_shot(void) i = 0; tm_start(t, i); } + io_log_event(t->hook, 
t->data); t->hook(t); } } @@ -764,6 +767,32 @@ sk_set_tos6(sock *s, int tos) return 0; } +static inline int +sk_set_high_port(sock *s) +{ + /* Port range setting is optional, ignore it if not supported */ + +#ifdef IP_PORTRANGE + if (sk_is_ipv4(s)) + { + int range = IP_PORTRANGE_HIGH; + if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0) + ERR("IP_PORTRANGE"); + } +#endif + +#ifdef IPV6_PORTRANGE + if (sk_is_ipv6(s)) + { + int range = IPV6_PORTRANGE_HIGH; + if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0) + ERR("IPV6_PORTRANGE"); + } +#endif + + return 0; +} + static inline byte * sk_skip_ip_header(byte *pkt, int *len) { @@ -1103,7 +1132,7 @@ sk_dump(resource *r) sock *s = (sock *) r; static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "DEL!" }; - debug("(%s, ud=%p, sa=%08x, sp=%d, da=%08x, dp=%d, tos=%d, ttl=%d, if=%s)\n", + debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n", sk_type_names[s->type], s->data, s->saddr, @@ -1399,14 +1428,10 @@ sk_open(sock *s) } #endif } -#ifdef IP_PORTRANGE - else if (s->flags & SKF_HIGH_PORT) - { - int range = IP_PORTRANGE_HIGH; - if (setsockopt(fd, IPPROTO_IP, IP_PORTRANGE, &range, sizeof(range)) < 0) - log(L_WARN "Socket error: %s%#m", "IP_PORTRANGE"); - } -#endif + else + if (s->flags & SKF_HIGH_PORT) + if (sk_set_high_port(s) < 0) + log(L_WARN "Socket error: %s%#m", s->err); sockaddr_fill(&sa, af, bind_addr, s->iface, bind_port); if (bind(fd, &sa.sa, SA_LEN(sa)) < 0) @@ -1839,6 +1864,162 @@ sk_dump_all(void) } +/* + * Internal event log and watchdog + */ + +#define EVENT_LOG_LENGTH 32 + +struct event_log_entry +{ + void *hook; + void *data; + btime timestamp; + btime duration; +}; + +static struct event_log_entry event_log[EVENT_LOG_LENGTH]; +static struct event_log_entry *event_open; +static int event_log_pos, event_log_num, watchdog_active; +static btime last_time; +static btime loop_time; + +static 
void +io_update_time(void) +{ + struct timespec ts; + int rv; + + if (!clock_monotonic_available) + return; /* last_time stays unchanged and no open event is closed */ + + /* + * This is the third time-tracking procedure (after update_times() above and + * times_update() in BFD), dedicated to the internal event log and latency + * tracking. Hopefully, we will consolidate these sometime. + */ + + rv = clock_gettime(CLOCK_MONOTONIC, &ts); + if (rv < 0) + die("clock_gettime: %m"); + + /* tv_nsec / 1000 yields microseconds; the S suffix presumably scales seconds to the same unit */ last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000); + + /* Close the open event, if any: record its duration and warn when it exceeds latency_limit */ if (event_open) + { + event_open->duration = last_time - event_open->timestamp; + + if (event_open->duration > config->latency_limit) + log(L_WARN "Event 0x%p 0x%p took %d ms", + event_open->hook, event_open->data, (int) (event_open->duration TO_MS)); + + event_open = NULL; + } +} + +/** + * io_log_event - record an upcoming event in the event log + * @hook: event hook address + * @data: event data address + * + * Store info (hook, data, timestamp) about the following internal event into + * a circular event log (@event_log). When latency tracking is enabled, the log + * entry is kept open (in @event_open) so the duration can be filled later. + * When latency tracking is disabled, the time is not refreshed here, so the + * stored timestamp is the value left by the previous io_update_time() call. + */ +void +io_log_event(void *hook, void *data) +{ + if (config->latency_debug) + io_update_time(); /* refreshes last_time and closes the previous event if it is still open */ + + struct event_log_entry *en = event_log + event_log_pos; + + en->hook = hook; + en->data = data; + en->timestamp = last_time; + en->duration = 0; + + event_log_num++; + event_log_pos++; + event_log_pos %= EVENT_LOG_LENGTH; /* wrap around, the log is circular */ + + event_open = config->latency_debug ? 
en : NULL; +} + +static inline void +io_close_event(void) +{ + /* Closing is done by io_update_time(), which fills in the duration of the open event */ if (event_open) + io_update_time(); +} + +void +io_log_dump(void) +{ + int i; + + log(L_DEBUG "Event log:"); + /* Walk the circular log starting at event_log_pos, i.e. from the oldest slot; slots with a NULL hook are skipped. The first number printed is the age of the entry relative to last_time. */ for (i = 0; i < EVENT_LOG_LENGTH; i++) + { + struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH; + if (en->hook) + log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data, + (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS)); + } +} + +void +watchdog_sigalrm(int sig UNUSED) +{ + /* + * Update last_time and the duration of the open event, but skip the latency + * check: with latency_limit raised to the maximum value, io_update_time() + * does not log its warning from inside the signal handler. + */ + config->latency_limit = 0xffffffff; + io_update_time(); + + /* We want a core dump */ + abort(); +} + +static inline void +watchdog_start1(void) +{ + /* Variant used once before the I/O loop starts: sets loop_time only, without resetting event_log_num or arming the alarm */ io_update_time(); + + loop_time = last_time; +} + +static inline void +watchdog_start(void) +{ + io_update_time(); + + loop_time = last_time; + event_log_num = 0; + + /* The alarm is armed only when a timeout is configured */ if (config->watchdog_timeout) + { + alarm(config->watchdog_timeout); + watchdog_active = 1; + } +} + +static inline void +watchdog_stop(void) +{ + io_update_time(); + + /* Cancel the pending alarm, if one was armed by watchdog_start() */ if (watchdog_active) + { + alarm(0); + watchdog_active = 0; + } + + /* Warn about loop cycles that took longer than watchdog_warning */ btime duration = last_time - loop_time; + if (duration > config->watchdog_warning) + log(L_WARN "I/O loop cycle took %d ms for %d events", + (int) (duration TO_MS), event_log_num); +} + + /* * Main I/O Loop */ @@ -1873,6 +2054,7 @@ io_loop(void) sock *s; node *n; + watchdog_start1(); sock_recalc_fdsets_p = 1; for(;;) { @@ -1887,6 +2069,8 @@ io_loop(void) timo.tv_sec = events ? 
0 : MIN(tout - now, 3); timo.tv_usec = 0; + /* Fill in the duration of the last logged event if it is still open */ io_close_event(); + if (sock_recalc_fdsets_p) { sock_recalc_fdsets_p = 0; @@ -1923,25 +2107,30 @@ io_loop(void) if (async_config_flag) { + io_log_event(async_config, NULL); async_config(); async_config_flag = 0; continue; } if (async_dump_flag) { + io_log_event(async_dump, NULL); async_dump(); async_dump_flag = 0; continue; } if (async_shutdown_flag) { + io_log_event(async_shutdown, NULL); async_shutdown(); async_shutdown_flag = 0; continue; } /* And finally enter select() to find active sockets */ + /* The watchdog is stopped across select() and restarted afterwards, so the wait inside select() is not counted as loop cycle time */ watchdog_stop(); hi = select(hi+1, &rd, &wr, NULL, &timo); + watchdog_start(); if (hi < 0) { @@ -1965,6 +2154,7 @@ io_loop(void) do { steps--; + io_log_event(s->rx_hook, s->data); e = sk_read(s); if (s != current_sock) goto next; @@ -1976,6 +2166,7 @@ io_loop(void) do { steps--; + io_log_event(s->tx_hook, s->data); e = sk_write(s); if (s != current_sock) goto next; @@ -2003,6 +2194,7 @@ io_loop(void) if ((s->type < SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook) { count++; + io_log_event(s->rx_hook, s->data); e = sk_read(s); if (s != current_sock) goto next2; diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index 78514cf5..0a223a4f 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -1023,7 +1023,7 @@ krt_reload_routes(struct proto *P) } static void -krt_feed_done(struct proto *P) +krt_feed_end(struct proto *P) { struct krt_proto *p = (struct krt_proto *) P; @@ -1056,7 +1056,7 @@ krt_init(struct proto_config *c) p->p.rt_notify = krt_rt_notify; p->p.if_notify = krt_if_notify; p->p.reload_routes = krt_reload_routes; - p->p.feed_done = krt_feed_done; + p->p.feed_end = krt_feed_end; p->p.make_tmp_attrs = krt_make_tmp_attrs; p->p.store_tmp_attrs = krt_store_tmp_attrs; p->p.rte_same = krt_rte_same; diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index 61b306dc..05f7560d 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -168,6 +168,9 @@ sysdep_preconfig(struct config *c) { init_list(&c->logfiles); + 
c->latency_limit = UNIX_DEFAULT_LATENCY_LIMIT; /* defaults for latency tracking and the watchdog warning; watchdog_timeout is not assigned here */ + c->watchdog_warning = UNIX_DEFAULT_WATCHDOG_WARNING; + #ifdef PATH_IPROUTE_DIR read_iproute_table(PATH_IPROUTE_DIR "/rt_protos", "ipp_", 256); read_iproute_table(PATH_IPROUTE_DIR "/rt_realms", "ipr_", 256); @@ -585,6 +588,8 @@ handle_sigterm(int sig UNUSED) async_shutdown_flag = 1; } +void watchdog_sigalrm(int sig UNUSED); + static void signal_init(void) { @@ -600,6 +605,9 @@ signal_init(void) sa.sa_handler = handle_sigterm; sa.sa_flags = SA_RESTART; sigaction(SIGTERM, &sa, NULL); + /* SIGALRM comes from the alarm() armed by the I/O loop watchdog; its handler calls abort() */ sa.sa_handler = watchdog_sigalrm; + sa.sa_flags = 0; + sigaction(SIGALRM, &sa, NULL); signal(SIGPIPE, SIG_IGN); } diff --git a/sysdep/unix/unix.h b/sysdep/unix/unix.h index 3cee96b4..593978cc 100644 --- a/sysdep/unix/unix.h +++ b/sysdep/unix/unix.h @@ -27,8 +27,10 @@ void cmd_reconfig_confirm(void); void cmd_reconfig_undo(void); void cmd_shutdown(void); -#define UNIX_DEFAULT_CONFIGURE_TIMEOUT 300 +#define UNIX_DEFAULT_CONFIGURE_TIMEOUT 300 +#define UNIX_DEFAULT_LATENCY_LIMIT (1 S_) /* NOTE(review): presumably one second; S_ is defined outside this hunk - confirm */ +#define UNIX_DEFAULT_WATCHDOG_WARNING (5 S_) /* NOTE(review): presumably five seconds - confirm */ /* io.c */ @@ -99,6 +101,7 @@ volatile int async_shutdown_flag; void io_init(void); void io_loop(void); +void io_log_dump(void); int sk_open_unix(struct birdsock *s, char *name); void *tracked_fopen(struct pool *, char *name, char *mode); void test_old_bird(char *path);