0
0
mirror of https://gitlab.nic.cz/labs/bird.git synced 2025-01-18 15:01:53 +00:00
bird/proto/bgp/bgp.c

3007 lines
84 KiB
C
Raw Normal View History

2000-03-19 22:09:07 +00:00
/*
* BIRD -- The Border Gateway Protocol
*
* (c) 2000 Martin Mares <mj@ucw.cz>
* (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
* (c) 2008--2016 CZ.NIC z.s.p.o.
2000-03-19 22:09:07 +00:00
*
* Can be freely distributed and used under the terms of the GNU GPL.
*/
2000-06-04 17:06:18 +00:00
/**
* DOC: Border Gateway Protocol
*
* The BGP protocol is implemented in three parts: |bgp.c| which takes care of
* the connection and most of the interface with BIRD core, |packets.c| handling
2000-06-04 17:06:18 +00:00
* both incoming and outgoing BGP packets and |attrs.c| containing functions for
* manipulation with BGP attribute lists.
*
* As opposed to the other existing routing daemons, BIRD has a sophisticated
* core architecture which is able to keep all the information needed by BGP in
* the primary routing table, therefore no complex data structures like a
* central BGP table are needed. This increases memory footprint of a BGP router
* with many connections, but not too much and, which is more important, it
* makes BGP much easier to implement.
2000-06-04 17:06:18 +00:00
*
* Each instance of BGP (corresponding to a single BGP peer) is described by a
* &bgp_proto structure to which are attached individual connections represented
* by &bgp_connection (usually, there exists only one connection, but during BGP
* session setup, there can be more of them). The connections are handled
* according to the BGP state machine defined in the RFC with all the timers and
* all the parameters configurable.
2000-06-04 17:06:18 +00:00
*
* In incoming direction, we listen on the connection's socket and each time we
* receive some input, we pass it to bgp_rx(). It decodes packet headers and the
* markers and passes complete packets to bgp_rx_packet() which distributes the
* packet according to its type.
2000-06-04 17:06:18 +00:00
*
* In outgoing direction, we gather all the routing updates and sort them to
* buckets (&bgp_bucket) according to their attributes (we keep a hash table for
* fast comparison of &rta's and a &fib which helps us to find if we already
* have another route for the same destination queued for sending, so that we
* can replace it with the new one immediately instead of sending both
* updates). There also exists a special bucket holding all the route
* withdrawals which cannot be queued anywhere else as they don't have any
* attributes. If we have any packet to send (due to either new routes or the
* connection tracking code wanting to send an Open, Keepalive or Notification
* message), we call bgp_schedule_packet() which sets the corresponding bit in a
* @packets_to_send bit field in &bgp_conn and as soon as the transmit socket
* buffer becomes empty, we call bgp_fire_tx(). It inspects state of all the
* packet type bits and calls the corresponding bgp_create_xx() functions,
* eventually rescheduling the same packet type if we have more data of the same
* type to send.
2000-06-04 17:06:18 +00:00
*
* The processing of attributes consists of two functions: bgp_decode_attrs()
* for checking of the attribute blocks and translating them to the language of
* BIRD's extended attributes and bgp_encode_attrs() which does the
* converse. Both functions are built around a @bgp_attr_table array describing
* all important characteristics of all known attributes. Unknown transitive
* attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
*
* BGP protocol implements graceful restart in both restarting (local restart)
* and receiving (neighbor restart) roles. The first is handled mostly by the
* graceful restart code in the nest, BGP protocol just handles capabilities,
* sets @gr_wait and locks graceful restart until end-of-RIB mark is received.
* The second is implemented by internal restart of the BGP state to %BS_IDLE
* and protocol state to %PS_START, but keeping the protocol up from the core
* point of view and therefore maintaining received routes. Routing table
* refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing
* stale routes after reestablishment of BGP session during graceful restart.
*
* Supported standards:
* RFC 4271 - Border Gateway Protocol 4 (BGP)
* RFC 1997 - BGP Communities Attribute
* RFC 2385 - Protection of BGP Sessions via TCP MD5 Signature
* RFC 2545 - Use of BGP Multiprotocol Extensions for IPv6
* RFC 2918 - Route Refresh Capability
* RFC 3107 - Carrying Label Information in BGP
* RFC 4360 - BGP Extended Communities Attribute
* RFC 4364 - BGP/MPLS IPv4 Virtual Private Networks
* RFC 4456 - BGP Route Reflection
* RFC 4486 - Subcodes for BGP Cease Notification Message
* RFC 4659 - BGP/MPLS IPv6 Virtual Private Networks
* RFC 4724 - Graceful Restart Mechanism for BGP
* RFC 4760 - Multiprotocol extensions for BGP
* RFC 4798 - Connecting IPv6 Islands over IPv4 MPLS
* RFC 5065 - AS confederations for BGP
* RFC 5082 - Generalized TTL Security Mechanism
* RFC 5492 - Capabilities Advertisement with BGP
* RFC 5575 - Dissemination of Flow Specification Rules
* RFC 5668 - 4-Octet AS Specific BGP Extended Community
* RFC 6286 - AS-Wide Unique BGP Identifier
* RFC 6608 - Subcodes for BGP Finite State Machine Error
* RFC 6793 - BGP Support for 4-Octet AS Numbers
2019-09-28 12:17:20 +00:00
* RFC 7311 - Accumulated IGP Metric Attribute for BGP
* RFC 7313 - Enhanced Route Refresh Capability for BGP
* RFC 7606 - Revised Error Handling for BGP UPDATE Messages
* RFC 7911 - Advertisement of Multiple Paths in BGP
* RFC 7947 - Internet Exchange BGP Route Server
* RFC 8092 - BGP Large Communities Attribute
* RFC 8203 - BGP Administrative Shutdown Communication
* RFC 8212 - Default EBGP Route Propagation Behavior without Policies
2019-10-25 23:28:38 +00:00
* RFC 8654 - Extended Message Support for BGP
* RFC 8950 - Advertising IPv4 NLRI with an IPv6 Next Hop
2023-02-19 14:35:07 +00:00
* RFC 9072 - Extended Optional Parameters Length for BGP OPEN Message
* RFC 9117 - Revised Validation Procedure for BGP Flow Specifications
* RFC 9234 - Route Leak Prevention and Detection Using Roles
* draft-uttaro-idr-bgp-persistence-04
* draft-walton-bgp-hostname-capability-02
*/
2000-06-04 17:06:18 +00:00
#undef LOCAL_DEBUG
2000-03-19 22:09:07 +00:00
#include <stdlib.h>
2000-03-19 22:09:07 +00:00
#include "nest/bird.h"
#include "nest/iface.h"
#include "nest/protocol.h"
#include "nest/route.h"
#include "nest/mpls.h"
#include "nest/cli.h"
#include "nest/locks.h"
2000-03-19 22:09:07 +00:00
#include "conf/conf.h"
#include "filter/filter.h"
#include "lib/socket.h"
2000-03-30 18:44:23 +00:00
#include "lib/resource.h"
2002-11-13 08:47:06 +00:00
#include "lib/string.h"
2000-03-19 22:09:07 +00:00
#include "bgp.h"
#ifdef CONFIG_BMP
#include "proto/bmp/bmp.h"
#endif
2000-03-19 22:09:07 +00:00
/*
 * Shared state for listening sockets. The lists below are accessed with
 * bgp_listen_domain locked (see bgp_open(), bgp_close(), bgp_listen_create()).
 */
/* Hook of bgp_listen_event; serves pending requests and cleans up unused sockets */
static void bgp_listen_create(void *);
static list STATIC_LIST_INIT(bgp_sockets); /* Global list of listening sockets */
static list STATIC_LIST_INIT(bgp_listen_pending); /* Global list of listening socket open requests */
/* Event sent to global_event_list whenever a request is queued or a socket may have become unused */
static event bgp_listen_event = { .hook = bgp_listen_create };
/* Lock domain guarding bgp_sockets and bgp_listen_pending */
static DOMAIN(rtable) bgp_listen_domain;
/* Pool from which listening sockets and their struct bgp_socket are allocated */
static pool *bgp_listen_pool;
/* Forward declarations of functions defined later in this file */
static void bgp_connect(struct bgp_proto *p);
static void bgp_active(struct bgp_proto *p);
static void bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn);
static void bgp_setup_sk(struct bgp_conn *conn, sock *s);
static void bgp_send_open(struct bgp_conn *conn);
static void bgp_update_bfd(struct bgp_proto *p, const struct bfd_options *bfd);
2000-03-19 22:09:07 +00:00
/* Hooks installed on listening sockets by bgp_listen_create() */
static int bgp_incoming_connection(sock *sk, uint dummy UNUSED);
static void bgp_listen_sock_err(sock *sk UNUSED, int err);
/* Further forward declarations of functions used before their definition */
static void bgp_initiate_disable(struct bgp_proto *p, int err_val);
static void bgp_graceful_restart_feed(struct bgp_channel *c);
static inline int
bgp_setup_auth(struct bgp_proto *p, int enable)
{
2023-04-02 17:35:35 +00:00
/* Beware. This is done from main_birdloop and protocol birdloop is NOT ENTERED.
* Anyway, we are only accessing:
* - protocol config which can be changed only from main_birdloop (reconfig)
* - protocol listen socket which is always driven by main_birdloop
* - protocol name which is set on reconfig
*/
if (p->cf->password && p->listen.sock)
{
ip_addr prefix = p->cf->remote_ip;
int pxlen = -1;
if (p->cf->remote_range)
{
prefix = net_prefix(p->cf->remote_range);
pxlen = net_pxlen(p->cf->remote_range);
}
int rv = sk_set_md5_auth(p->listen.sock->sk,
p->cf->local_ip, prefix, pxlen, p->cf->iface,
enable ? p->cf->password : NULL, p->cf->setkey);
if (rv < 0)
sk_log_error(p->listen.sock->sk, p->p.name);
return rv;
}
else
return 0;
}
/**
* bgp_close - close a BGP instance
* @p: BGP instance
*
* This function frees and deconfigures shared BGP resources.
*/
static void
bgp_close(struct bgp_proto *p)
{
2023-04-02 17:35:35 +00:00
LOCK_DOMAIN(rtable, bgp_listen_domain);
struct bgp_listen_request *req = &p->listen;
struct bgp_socket *bs = req->sock;
2023-05-08 16:57:32 +00:00
if (enlisted(&req->n))
2023-04-02 17:35:35 +00:00
{
2023-05-08 16:57:32 +00:00
/* Remove listen request from listen socket or pending list */
2023-04-02 17:35:35 +00:00
rem_node(&req->n);
2023-05-08 16:57:32 +00:00
if (bs)
{
/* Already had a socket. */
req->sock = NULL;
/* Request listen socket cleanup */
if (bs && EMPTY_LIST(bs->requests))
ev_send(&global_event_list, &bgp_listen_event);
}
2023-04-02 17:35:35 +00:00
}
2023-04-02 17:35:35 +00:00
UNLOCK_DOMAIN(rtable, bgp_listen_domain);
}
/**
* bgp_open - open a BGP instance
* @p: BGP instance
*
* This function allocates and configures shared BGP resources, mainly listening
* sockets. Should be called as the last step during initialization (when lock
* is acquired and neighbor is ready). When error, caller should change state to
* PS_DOWN and return immediately.
*/
static void
bgp_open(struct bgp_proto *p)
{
2023-04-02 17:35:35 +00:00
LOCK_DOMAIN(rtable, bgp_listen_domain);
struct bgp_listen_request *req = &p->listen;
/* We assume that cf->iface is defined iff cf->local_ip is link-local */
req->iface = p->cf->strict_bind ? p->cf->iface : NULL;
req->vrf = p->p.vrf;
req->addr = p->cf->strict_bind ? p->cf->local_ip :
(p->ipv4 ? IPA_NONE4 : IPA_NONE6);
req->port = p->cf->local_port;
req->flags = p->cf->free_bind ? SKF_FREEBIND : 0;
BGP_TRACE(D_EVENTS, "Requesting listen socket at %I%J port %u", req->addr, req->iface, req->port);
add_tail(&bgp_listen_pending, &req->n);
2023-04-02 17:35:35 +00:00
ev_send(&global_event_list, &bgp_listen_event);
UNLOCK_DOMAIN(rtable, bgp_listen_domain);
}
/*
 * bgp_listen_create - serve pending listen requests and drop unused sockets
 *
 * Hook of bgp_listen_event; asserted to run inside main_birdloop. It takes
 * requests off bgp_listen_pending one by one, attaches each to a matching
 * existing listening socket or to a newly opened one, and sets up MD5
 * authentication for it. On failure the owning protocol is disabled via
 * bgp_initiate_disable(). Finally, every listening socket left without
 * requests is closed and freed.
 *
 * bgp_listen_domain is locked separately for each loop iteration and is
 * always unlocked before bgp_initiate_disable() is called.
 */
static void
bgp_listen_create(void *_ UNUSED)
{
2023-04-02 17:35:35 +00:00
ASSERT_DIE(birdloop_inside(&main_birdloop));
/* Only these socket flags are compared when looking for a reusable socket */
uint flag_mask = SKF_FREEBIND;
2023-04-02 17:35:35 +00:00
while (1) {
LOCK_DOMAIN(rtable, bgp_listen_domain);
if (EMPTY_LIST(bgp_listen_pending))
{
UNLOCK_DOMAIN(rtable, bgp_listen_domain);
break;
}
/* Get the first request to match */
struct bgp_listen_request *req = HEAD(bgp_listen_pending);
/* Declares p, the protocol instance containing req as its listen member */
SKIP_BACK_DECLARE(struct bgp_proto, p, listen, req);
rem_node(&req->n);
/* First try to find existing socket */
struct bgp_socket *bs;
WALK_LIST(bs, bgp_sockets)
if (ipa_equal(bs->sk->saddr, req->addr) &&
(bs->sk->sport == req->port) &&
(bs->sk->iface == req->iface) &&
(bs->sk->vrf == req->vrf) &&
((bs->sk->flags & flag_mask) == req->flags))
break;
/* Reuse the matching socket if the walk found one, otherwise create a new one */
if (NODE_VALID(bs))
BGP_TRACE(D_EVENTS, "Found a listening socket: %p", bs);
else
{
2023-04-02 17:35:35 +00:00
/* Allocating new socket from global protocol pool.
* We can do this in main_birdloop. */
sock *sk = sk_new(bgp_listen_pool);
sk->type = SK_TCP_PASSIVE;
sk->ttl = 255;
sk->saddr = req->addr;
sk->sport = req->port;
sk->iface = req->iface;
sk->vrf = req->vrf;
sk->flags = req->flags;
sk->tos = IP_PREC_INTERNET_CONTROL;
sk->rbsize = BGP_RX_BUFFER_SIZE;
sk->tbsize = BGP_TX_BUFFER_SIZE;
sk->rx_hook = bgp_incoming_connection;
sk->err_hook = bgp_listen_sock_err;
if (sk_open(sk, &main_birdloop) < 0)
{
/* The request was already unlinked above, so it is simply dropped here */
sk_log_error(sk, p->p.name);
log(L_ERR "%s: Cannot open listening socket", p->p.name);
sk_close(sk);
2023-04-02 17:35:35 +00:00
UNLOCK_DOMAIN(rtable, bgp_listen_domain);
2023-04-02 17:35:35 +00:00
bgp_initiate_disable(p, BEM_NO_SOCKET);
continue;
}
bs = mb_allocz(bgp_listen_pool, sizeof(struct bgp_socket));
bs->sk = sk;
sk->data = bs;
init_list(&bs->requests);
add_tail(&bgp_sockets, &bs->n);
BGP_TRACE(D_EVENTS, "Created new listening socket: %p", bs);
}
/* Attach the request to the socket before setting up authentication,
* as bgp_setup_auth() reads the socket through p->listen.sock */
req->sock = bs;
2023-04-02 17:35:35 +00:00
add_tail(&bs->requests, &req->n);
if (bgp_setup_auth(p, 1) < 0)
{
2023-04-02 17:35:35 +00:00
/* Detach again; a socket left without requests is closed by the cleanup below */
rem_node(&req->n);
req->sock = NULL;
UNLOCK_DOMAIN(rtable, bgp_listen_domain);
bgp_initiate_disable(p, BEM_INVALID_MD5);
2023-04-02 17:35:35 +00:00
continue;
}
2023-04-02 17:35:35 +00:00
UNLOCK_DOMAIN(rtable, bgp_listen_domain);
}
/* Cleanup leftover listening sockets */
2023-04-02 17:35:35 +00:00
LOCK_DOMAIN(rtable, bgp_listen_domain);
struct bgp_socket *bs;
node *nxt;
WALK_LIST_DELSAFE(bs, nxt, bgp_sockets)
if (EMPTY_LIST(bs->requests))
{
sk_close(bs->sk);
rem_node(&bs->n);
mb_free(bs);
}
2023-04-02 17:35:35 +00:00
UNLOCK_DOMAIN(rtable, bgp_listen_domain);
}
/*
 * bgp_find_channel - look up a channel of a BGP instance by address family
 * @p: BGP instance
 * @afi: address family identifier to match against each channel's afi
 *
 * Returns the first channel walked whose afi equals @afi, or NULL.
 */
static inline struct bgp_channel *
bgp_find_channel(struct bgp_proto *p, u32 afi)
{
  struct bgp_channel *c;

  BGP_WALK_CHANNELS(p, c)
  {
    if (c->afi == afi)
      return c;
  }

  return NULL;
}
/*
 * bgp_startup - move a BGP instance to the connect phase
 * @p: BGP instance
 *
 * Sets the start state to BSS_CONNECT, starts the active side unless the
 * instance is passive, and then applies an incoming connection that was
 * postponed earlier, if there is one.
 */
static void
bgp_startup(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "Started");
  bgp_set_start_state(p, BSS_CONNECT);

  if (!p->passive)
    bgp_active(p);

  if (!p->postponed_sk)
    return;

  /* Apply postponed incoming connection */
  sk_reloop(p->postponed_sk, p->p.loop);
  bgp_setup_conn(p, &p->incoming_conn);
  bgp_setup_sk(&p->incoming_conn, p->postponed_sk);
  bgp_send_open(&p->incoming_conn);
  p->postponed_sk = NULL;
}
/* Timer hook: the timer's data pointer carries the BGP instance to start up */
static void
bgp_startup_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

  bgp_startup(p);
}
/*
 * bgp_initiate - begin operation of a BGP instance
 * @p: BGP instance
 *
 * Requests the listening socket, applies the configured BFD options (if any)
 * and then either starts up immediately or, when a startup delay is pending,
 * enters BSS_DELAY and arms the startup timer.
 */
static void
bgp_initiate(struct bgp_proto *p)
{
  bgp_open(p);

  if (p->cf->bfd)
    bgp_update_bfd(p, p->cf->bfd);

  if (!p->startup_delay)
  {
    bgp_startup(p);
    return;
  }

  bgp_set_start_state(p, BSS_DELAY);
  BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay);
  bgp_start_timer(p, p->startup_timer, p->startup_delay);
}
static void
bgp_initiate_disable(struct bgp_proto *p, int err_val)
{
2023-04-02 17:35:35 +00:00
PROTO_LOCKED_FROM_MAIN(&p->p)
{
/* The protocol may be already down for another reason.
* Shutdown the protocol only if it isn't already shutting down. */
switch (p->p.proto_state)
{
case PS_START:
case PS_UP:
p->p.disabled = 1;
bgp_store_error(p, NULL, BE_MISC, err_val);
bgp_stop(p, err_val, NULL, 0);
}
}
}
2000-06-04 17:06:18 +00:00
/**
* bgp_start_timer - start a BGP timer
* @t: timer
2017-06-21 12:14:51 +00:00
* @value: time (in seconds) to fire (0 to disable the timer)
2000-06-04 17:06:18 +00:00
*
* This functions calls tm_start() on @t with time @value and the amount of
* randomization suggested by the BGP standard. Please use it for all BGP
* timers.
2000-06-04 17:06:18 +00:00
*/
void
2023-04-02 17:35:35 +00:00
bgp_start_timer(struct bgp_proto *p, timer *t, uint value)
{
if (value)
{
2017-06-21 12:14:51 +00:00
/* The randomization procedure is specified in RFC 4271 section 10 */
btime time = value S;
btime randomize = random() % ((time / 4) + 1);
2023-04-02 17:35:35 +00:00
tm_start_in(t, time - randomize, p->p.loop);
}
2000-03-30 17:39:48 +00:00
else
tm_stop(t);
2000-03-30 17:39:48 +00:00
}
2000-06-04 17:06:18 +00:00
/**
* bgp_close_conn - close a BGP connection
* @conn: connection to close
*
* This function takes a connection described by the &bgp_conn structure, closes
* its socket and frees all resources associated with it.
2000-06-04 17:06:18 +00:00
*/
2000-03-30 17:39:48 +00:00
void
bgp_close_conn(struct bgp_conn *conn)
{
2000-03-30 17:39:48 +00:00
DBG("BGP: Closing connection\n");
/* Nothing more is to be sent on this connection */
conn->packets_to_send = 0;
conn->channels_to_send = 0;
/* Release all per-connection timers */
rfree(conn->connect_timer);
conn->connect_timer = NULL;
2000-03-30 17:39:48 +00:00
rfree(conn->keepalive_timer);
conn->keepalive_timer = NULL;
rfree(conn->hold_timer);
conn->hold_timer = NULL;
rfree(conn->send_hold_timer);
conn->send_hold_timer = NULL;
2023-04-02 17:35:35 +00:00
sk_close(conn->sk);
conn->sk = NULL;
/* Drop stored copies of both OPEN messages */
mb_free(conn->local_open_msg);
conn->local_open_msg = NULL;
mb_free(conn->remote_open_msg);
conn->remote_open_msg = NULL;
conn->local_open_length = 0;
conn->remote_open_length = 0;
/* Clear the published OPEN-message and socket attributes of this connection
* (incoming or outgoing) in the protocol state attribute list.
* NOTE(review): the attributes are set on the local list `attr`, while
* ea_lookup() below is applied to the unchanged p.ea_state and `attr` is what
* gets announced. bgp_conn_set_state() modifies p.ea_state directly instead;
* confirm that this difference is intended. */
ea_list *attr = conn->bgp->p.ea_state;
if (conn == &conn->bgp->incoming_conn)
{
ea_set_attr(&attr, EA_LITERAL_STORE_ADATA(&ea_bgp_in_conn_local_open_msg, 0, NULL, 0));
ea_set_attr(&attr, EA_LITERAL_STORE_ADATA(&ea_bgp_in_conn_remote_open_msg, 0, NULL, 0));
ea_set_attr(&attr, EA_LITERAL_STORE_ADATA(&ea_bgp_in_conn_sk, 0, NULL, 0));
}
else
{
ASSERT_DIE(conn == &conn->bgp->outgoing_conn);
ea_set_attr(&attr, EA_LITERAL_STORE_ADATA(&ea_bgp_out_conn_local_open_msg, 0, NULL, 0));
ea_set_attr(&attr, EA_LITERAL_STORE_ADATA(&ea_bgp_out_conn_remote_open_msg, 0, NULL, 0));
ea_set_attr(&attr, EA_LITERAL_STORE_ADATA(&ea_bgp_out_conn_sk, 0, NULL, 0));
}
conn->bgp->p.ea_state = ea_lookup(conn->bgp->p.ea_state, 0, EALS_CUSTOM);
proto_announce_state_later(&conn->bgp->p, attr);
/* Free negotiated capabilities of both sides */
mb_free(conn->local_caps);
conn->local_caps = NULL;
mb_free(conn->remote_caps);
conn->remote_caps = NULL;
/* Forget notification data; the pointer is only reset here, not freed */
conn->notify_data = NULL;
conn->notify_size = 0;
}
/**
* bgp_update_startup_delay - update a startup delay
* @p: BGP instance
*
* This function updates a startup delay that is used to postpone next BGP
* connect. It also handles disable_after_error and might stop BGP instance
* when error happened and disable_after_error is on.
*
* It should be called when BGP protocol error happened.
*/
void
bgp_update_startup_delay(struct bgp_proto *p)
{
const struct bgp_config *cf = p->cf;
DBG("BGP: Updating startup delay\n");
2017-06-21 12:14:51 +00:00
if (p->last_proto_error && ((current_time() - p->last_proto_error) >= cf->error_amnesia_time S))
p->startup_delay = 0;
2017-06-21 12:14:51 +00:00
p->last_proto_error = current_time();
if (cf->disable_after_error)
{
p->startup_delay = 0;
p->p.disabled = 1;
return;
}
if (!p->startup_delay)
p->startup_delay = cf->error_delay_time_min;
else
p->startup_delay = MIN(2 * p->startup_delay, cf->error_delay_time_max);
}
/*
 * bgp_graceful_close_conn - shut down one connection according to its state
 * @conn: connection to close
 * @subcode: subcode for the notification with error code 6; a negative value
 *   means the connection enters the close state and PKT_SCHEDULE_CLOSE is
 *   scheduled instead of calling bgp_error()
 * @data: data passed to bgp_error()
 * @len: length of @data
 *
 * Idle and closing connections are left alone, connections without an open
 * exchange go straight to idle, and the remaining states are closed as
 * described for @subcode.
 */
static void
bgp_graceful_close_conn(struct bgp_conn *conn, int subcode, byte *data, uint len)
{
  switch (conn->state)
  {
  case BS_IDLE:
  case BS_CLOSE:
    /* Nothing to do */
    break;

  case BS_CONNECT:
  case BS_ACTIVE:
    bgp_conn_enter_idle_state(conn);
    break;

  case BS_OPENSENT:
  case BS_OPENCONFIRM:
  case BS_ESTABLISHED:
    if (subcode >= 0)
      bgp_error(conn, 6, subcode, data, len);
    else
    {
      bgp_conn_enter_close_state(conn);
      bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
    }
    break;

  default:
    bug("bgp_graceful_close_conn: Unknown state %d", conn->state);
  }
}
/*
 * bgp_down - finish shutdown of a BGP instance
 * @p: BGP instance
 *
 * Removes authentication and releases the listen request if the instance got
 * past BSS_PREPARE, unlinks the neighbor entry and notifies PS_FLUSH.
 */
static void
bgp_down(struct bgp_proto *p)
{
  const int past_prepare = (bgp_start_state(p) > BSS_PREPARE);

  if (past_prepare)
  {
    bgp_setup_auth(p, 0);
    bgp_close(p);
  }

  if (p->neigh)
  {
    neigh_unlink(p->neigh);
    p->neigh = NULL;
  }

  BGP_TRACE(D_EVENTS, "Down");
  proto_notify_state(&p->p, PS_FLUSH);
}
/*
 * bgp_decision - event hook deciding what to do after a connection change
 * @vp: BGP instance (passed as an opaque pointer)
 *
 * While starting, restarts the active side when the outgoing connection is
 * idle, the incoming one is not in OpenConfirm and the instance is not
 * passive. While stopping, brings the instance down once both connections
 * are idle.
 */
static void
bgp_decision(void *vp)
{
  struct bgp_proto *p = vp;

  DBG("BGP: Decision start\n");

  if (p->p.proto_state == PS_START)
  {
    if ((p->outgoing_conn.state == BS_IDLE) &&
	(p->incoming_conn.state != BS_OPENCONFIRM) &&
	!p->passive)
      bgp_active(p);
  }

  /* State is re-read here on purpose, the call above may have changed it */
  if (p->p.proto_state == PS_STOP)
  {
    if ((p->outgoing_conn.state == BS_IDLE) &&
	(p->incoming_conn.state == BS_IDLE))
      bgp_down(p);
  }
}
/*
 * bgp_spawn - create a new BGP instance cloned from a parent instance
 * @pp: parent BGP instance whose configuration is cloned
 * @remote_ip: neighbor address stored in the cloned configuration
 *
 * Returns the result of proto_spawn() for the cloned configuration.
 */
static struct bgp_proto *
bgp_spawn(struct bgp_proto *pp, ip_addr remote_ip)
{
struct symbol *sym;
char fmt[SYM_MAX_LEN];
/* Build a name pattern: the dynamic name followed by a zero-padded %d of the
* configured width.
* NOTE(review): bsprintf() is not bounded by sizeof(fmt); presumably the
* length of dynamic_name is limited when the configuration is parsed - confirm. */
bsprintf(fmt, "%s%%0%dd", pp->cf->dynamic_name, pp->cf->dynamic_name_digits);
/* This is hack, we would like to share config, but we need to copy it now */
/* The globals new_config and cfg_mem are pointed at the running config for
* the two calls below and reset to NULL right after */
new_config = OBSREF_GET(config);
cfg_mem = new_config->mem;
new_config->current_scope = new_config->root_scope;
sym = cf_default_name(new_config, fmt, &(pp->dynamic_name_counter));
proto_clone_config(sym, pp->p.cf);
new_config = NULL;
cfg_mem = NULL;
/* Just pass remote_ip to bgp_init() */
((struct bgp_config *) sym->proto)->remote_ip = remote_ip;
return (void *) proto_spawn(sym->proto, 0);
}
/*
 * bgp_stop - start shutting down a BGP instance
 * @p: BGP instance
 * @subcode: passed to bgp_graceful_close_conn() for both connections
 * @data: data passed along with @subcode
 * @len: length of @data
 *
 * Notifies PS_STOP, gracefully closes the outgoing and then the incoming
 * connection, frees pending TX data of all channels and schedules the
 * protocol event.
 */
void
bgp_stop(struct bgp_proto *p, int subcode, byte *data, uint len)
{
  proto_notify_state(&p->p, PS_STOP);

  /* Outgoing connection first, then the incoming one */
  struct bgp_conn *conns[] = { &p->outgoing_conn, &p->incoming_conn };
  for (uint i = 0; i < 2; i++)
    bgp_graceful_close_conn(conns[i], subcode, data, len);

  struct bgp_channel *c;
  BGP_WALK_CHANNELS(p, c)
  {
    bgp_free_pending_tx(c);
  }

  proto_send_event(&p->p, p->event);
}
2010-01-03 11:17:52 +00:00
/*
 * bgp_conn_set_state - change the state of a connection and publish it
 * @conn: connection (must be the instance's incoming or outgoing one)
 * @new_state: new BS_* state
 *
 * Dumps the state change when MD_STATES is enabled in mrtdump, stores the new
 * state, and updates the matching connection-state attribute in the protocol
 * state attribute list, which is then announced.
 */
static inline void
bgp_conn_set_state(struct bgp_conn *conn, uint new_state)
{
  struct bgp_proto *p = conn->bgp;

  if (p->p.mrtdump & MD_STATES)
    bgp_dump_state_change(conn, conn->state, new_state);

  conn->state = new_state;

  if (conn != &p->incoming_conn)
  {
    ASSERT_DIE(conn == &p->outgoing_conn);
    ea_set_attr(&p->p.ea_state, EA_LITERAL_EMBEDDED(&ea_bgp_out_conn_state, 0, new_state));
  }
  else
    ea_set_attr(&p->p.ea_state, EA_LITERAL_EMBEDDED(&ea_bgp_in_conn_state, 0, new_state));

  p->p.ea_state = ea_lookup(p->p.ea_state, 0, EALS_CUSTOM);
  proto_announce_state_later(&p->p, p->p.ea_state);
}
/*
 * bgp_conn_enter_openconfirm_state - switch a connection to OpenConfirm
 * @conn: connection
 *
 * Only the state is changed here.
 */
void
bgp_conn_enter_openconfirm_state(struct bgp_conn *conn)
{
  /* Really, most of the work is done in bgp_rx_open(). */
  bgp_conn_set_state(conn, BS_OPENCONFIRM);
}
/*
 * bgp_conn_enter_established_state - switch a connection to Established
 * @conn: connection whose OPEN exchange has completed
 *
 * Makes @conn the instance's active connection, derives session-wide and
 * per-channel properties (activity, graceful restart, ADD-PATH, extended next
 * hop, RA mode) from the local and remote capabilities, builds the maps from
 * channel index to AFI and channel, and finally sets the connection state to
 * BS_ESTABLISHED and notifies PS_UP.
 */
void
bgp_conn_enter_established_state(struct bgp_conn *conn)
{
struct bgp_proto *p = conn->bgp;
struct bgp_caps *local = conn->local_caps;
struct bgp_caps *peer = conn->remote_caps;
struct bgp_channel *c;
BGP_TRACE(D_EVENTS, "BGP session established");
p->last_established = current_time();
p->stats.fsm_established_transitions++;
/* For multi-hop BGP sessions */
if (ipa_zero(p->local_ip))
p->local_ip = conn->sk->saddr;
/* For promiscuous sessions */
if (!p->remote_as)
p->remote_as = conn->received_as;
/* In case of LLv6 is not valid during BGP start */
if (ipa_zero(p->link_addr) && p->neigh && p->neigh->iface && p->neigh->iface->llv6)
p->link_addr = p->neigh->iface->llv6->ip;
conn->sk->fast_rx = 0;
p->conn = conn;
/* A freshly established session clears the last recorded error */
p->last_error_class = 0;
p->last_error_code = 0;
p->as4_session = conn->as4_session;
/* Publish the 4-octet AS session flag in the protocol state attributes */
ea_set_attr(&p->p.ea_state, EA_LITERAL_EMBEDDED(&ea_bgp_as4_session, 0, conn->as4_session));
p->p.ea_state = ea_lookup(p->p.ea_state, 0, EALS_CUSTOM);
proto_announce_state_later(&conn->bgp->p, conn->bgp->p.ea_state);
p->route_refresh = peer->route_refresh;
/* Enhanced refresh needs both sides */
p->enhanced_refresh = local->enhanced_refresh && peer->enhanced_refresh;
/* Whether we may handle possible GR/LLGR of peer (it has some AF GR-able) */
p->gr_ready = p->llgr_ready = 0; /* Updated later */
/* Whether peer is ready to handle our GR recovery */
int peer_gr_ready = peer->gr_aware && !(peer->gr_flags & BGP_GRF_RESTART);
/* A restart of the neighbor was in progress; its timer is no longer needed */
if (p->gr_active_num)
tm_stop(p->gr_timer);
/* Number of active channels */
int num = 0;
/* Summary state of ADD_PATH RX for active channels */
uint summary_add_path_rx = 0;
BGP_WALK_CHANNELS(p, c)
{
/* NOTE(review): loc and rem are dereferenced without a NULL check; this
* assumes bgp_find_af_caps() returns an entry for every channel - confirm. */
const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi);
const struct bgp_af_caps *rem = bgp_find_af_caps(peer, c->afi);
/* A channel is active only when both sides marked the address family ready */
int active = loc->ready && rem->ready;
c->c.disabled = !active;
c->c.reloadable = p->route_refresh || ((c->c.in_keep & RIK_PREFILTER) == RIK_PREFILTER);
/* Active channels get consecutive indices; inactive ones get 0 and are
* skipped when the maps are filled below */
c->index = active ? num++ : 0;
c->feed_state = BFS_NONE;
c->load_state = BFS_NONE;
/* Channels where peer may do GR */
uint gr_ready = active && local->gr_aware && rem->gr_able;
uint llgr_ready = active && local->llgr_aware && rem->llgr_able;
c->gr_ready = gr_ready || llgr_ready;
p->gr_ready = p->gr_ready || c->gr_ready;
p->llgr_ready = p->llgr_ready || llgr_ready;
/* Remember last LLGR stale time */
c->stale_time = local->llgr_aware ? rem->llgr_time : 0;
/* Channels not able to recover gracefully */
if (p->p.gr_recovery && (!active || !peer_gr_ready))
channel_graceful_restart_unlock(&c->c);
/* Channels waiting for local convergence */
if (p->p.gr_recovery && loc->gr_able && peer_gr_ready)
c->c.gr_wait = 1;
/* Channels where regular graceful restart failed */
if ((c->gr_active == BGP_GRS_ACTIVE) &&
!(active && rem->gr_able && (rem->gr_af_flags & BGP_GRF_FORWARDING)))
bgp_graceful_restart_done(c);
/* Channels where regular long-lived restart failed */
/* NOTE(review): this tests rem->gr_af_flags against BGP_LLGRF_FORWARDING;
* confirm that the GR flags field (rather than an LLGR-specific one) is
* the intended source. */
if ((c->gr_active == BGP_GRS_LLGR) &&
!(active && rem->llgr_able && (rem->gr_af_flags & BGP_LLGRF_FORWARDING)))
bgp_graceful_restart_done(c);
/* GR capability implies that neighbor will send End-of-RIB */
if (peer->gr_aware)
c->load_state = BFS_LOADING;
/* We'll also send End-of-RIB */
if (p->cf->gr_mode)
c->feed_state = BFS_LOADING;
c->ext_next_hop = c->cf->ext_next_hop && (bgp_channel_is_ipv6(c) || rem->ext_next_hop);
/* ADD-PATH works in a direction only if we offered it and the peer offered the opposite role */
c->add_path_rx = (loc->add_path & BGP_ADD_PATH_RX) && (rem->add_path & BGP_ADD_PATH_TX);
c->add_path_tx = (loc->add_path & BGP_ADD_PATH_TX) && (rem->add_path & BGP_ADD_PATH_RX);
/* Value 1 is set when some active channel lacks ADD-PATH RX, value 2 when some has it */
if (active)
summary_add_path_rx |= !c->add_path_rx ? 1 : 2;
2016-12-23 22:03:26 +00:00
/* Update RA mode */
if (c->add_path_tx)
c->c.ra_mode = RA_ANY;
2016-12-23 22:03:26 +00:00
else if (c->cf->secondary)
c->c.ra_mode = RA_ACCEPTED;
else
c->c.ra_mode = RA_OPTIMAL;
}
/* Build maps from channel index to AFI and to channel.
* NOTE(review): both maps are allocated from p->p.pool on every call and
* nothing in this function frees earlier ones - confirm their lifetime. */
p->afi_map = mb_alloc(p->p.pool, num * sizeof(u32));
p->channel_map = mb_alloc(p->p.pool, num * sizeof(void *));
p->channel_count = num;
p->summary_add_path_rx = summary_add_path_rx;
BGP_WALK_CHANNELS(p, c)
{
if (c->c.disabled)
continue;
p->afi_map[c->index] = c->afi;
p->channel_map[c->index] = c;
}
/* Breaking rx_hook for simulating receive problem */
if (p->cf->disable_rx)
{
conn->sk->rx_hook = NULL;
tm_stop(conn->hold_timer);
}
/* proto_notify_state() will likely call bgp_feed_begin(), setting c->feed_state */
2010-01-03 11:17:52 +00:00
bgp_conn_set_state(conn, BS_ESTABLISHED);
proto_notify_state(&p->p, PS_UP);
}
/*
 * bgp_conn_leave_established_state - handle a connection leaving Established
 * @conn: connection that was established
 * @p: BGP instance owning @conn
 *
 * Records the time, detaches the active connection from the instance and
 * stops the protocol if it is still up. Then it publishes a session-close
 * record (last error class, notification code, subcode and data) in the
 * protocol state attribute list and announces it.
 */
static void
bgp_conn_leave_established_state(struct bgp_conn *conn, struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "BGP session closed");
  p->last_established = current_time();
  p->conn = NULL;

  if (p->p.proto_state == PS_UP)
    bgp_stop(p, 0, NULL, 0);

  /* The record is a fixed header followed by notify_size bytes of data.
   * The size is read only now, after the bgp_stop() call above. */
  uint adsz = sizeof(struct bgp_session_close_ad) + conn->notify_size;
  struct bgp_session_close_ad *bscad = alloca(adsz);

  *bscad = (struct bgp_session_close_ad) {
    .ad.length = adsz - sizeof(adata),
    .last_error_class = p->last_error_class,
    .notify_code = conn->notify_code,
    .notify_subcode = conn->notify_subcode,
  };

  /* notify_data is NULL with notify_size zero once bgp_close_conn() has run
   * (as it has when coming from bgp_conn_enter_idle_state()). Passing a NULL
   * pointer to memcpy() is undefined even for zero length, so skip the call. */
  if (conn->notify_size)
    memcpy(bscad->data, conn->notify_data, conn->notify_size);

  ea_set_attr(&p->p.ea_state, EA_LITERAL_DIRECT_ADATA(&ea_bgp_close_bmp, 0, &bscad->ad));
  p->p.ea_state = ea_lookup(p->p.ea_state, 0, EALS_CUSTOM);
  proto_announce_state_later(&p->p, p->p.ea_state);
}
void
bgp_conn_enter_close_state(struct bgp_conn *conn)
{
struct bgp_proto *p = conn->bgp;
int os = conn->state;
2010-01-03 11:17:52 +00:00
bgp_conn_set_state(conn, BS_CLOSE);
tm_stop(conn->keepalive_timer);
conn->sk->rx_hook = NULL;
/* Timeout for CLOSE state, if we cannot send notification soon then we just hangup */
2023-04-02 17:35:35 +00:00
bgp_start_timer(p, conn->hold_timer, 10);
if (os == BS_ESTABLISHED)
bgp_conn_leave_established_state(conn, p);
}
void
bgp_conn_enter_idle_state(struct bgp_conn *conn)
{
struct bgp_proto *p = conn->bgp;
int os = conn->state;
bgp_close_conn(conn);
2010-01-03 11:17:52 +00:00
bgp_conn_set_state(conn, BS_IDLE);
2023-04-02 17:35:35 +00:00
proto_send_event(&p->p, p->event);
if (os == BS_ESTABLISHED)
bgp_conn_leave_established_state(conn, p);
}
/**
* bgp_handle_graceful_restart - handle detected BGP graceful restart
* @p: BGP instance
*
* This function is called when a BGP graceful restart of the neighbor is
* detected (when the TCP connection fails or when a new TCP connection
* appears). The function activates processing of the restart - starts routing
* table refresh cycle and activates BGP restart timer. The protocol state goes
* back to %PS_START, but changing BGP state back to %BS_IDLE is left for the
* caller.
*/
void
bgp_handle_graceful_restart(struct bgp_proto *p)
{
ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready);
BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s",
p->gr_active_num ? " - already pending" : "");
p->gr_active_num = 0;
struct bgp_channel *c;
BGP_WALK_CHANNELS(p, c)
{
2017-12-09 23:55:34 +00:00
/* FIXME: perhaps check for channel state instead of disabled flag? */
if (c->c.disabled)
continue;
if (c->gr_ready)
{
p->gr_active_num++;
switch (c->gr_active)
{
case BGP_GRS_NONE:
c->gr_active = BGP_GRS_ACTIVE;
/* fall through */
case BGP_GRS_ACTIVE:
Route refresh in tables uses a stale counter. Until now, we were marking routes as REF_STALE and REF_DISCARD to cleanup old routes after route refresh. This needed a synchronous route table walk at both beginning and the end of route refresh routine, marking the routes by the flags. We avoid these walks by using a stale counter. Every route contains: u8 stale_cycle; Every import hook contains: u8 stale_set; u8 stale_valid; u8 stale_pruned; u8 stale_pruning; In base_state, stale_set == stale_valid == stale_pruned == stale_pruning and all routes' stale_cycle also have the same value. The route refresh looks like follows: + ----------- + --------- + ----------- + ------------- + ------------ + | | stale_set | stale_valid | stale_pruning | stale_pruned | | Base | x | x | x | x | | Begin | x+1 | x | x | x | ... now routes are being inserted with stale_cycle == (x+1) | End | x+1 | x+1 | x | x | ... now table pruning routine is scheduled | Prune begin | x+1 | x+1 | x+1 | x | ... now routes with stale_cycle not between stale_set and stale_valid are deleted | Prune end | x+1 | x+1 | x+1 | x+1 | + ----------- + --------- + ----------- + ------------- + ------------ + The pruning routine is asynchronous and may have high latency in high-load environments. Therefore, multiple route refresh requests may happen before the pruning routine starts, leading to this situation: | Prune begin | x+k | x+k | x -> x+k | x | ... or even | Prune begin | x+k+1 | x+k | x -> x+k | x | ... if the prune event starts while another route refresh is running. In such a case, the pruning routine still deletes routes not fitting between stale_set and and stale_valid, effectively pruning the remnants of all unpruned route refreshes from before: | Prune end | x+k | x+k | x+k | x+k | In extremely rare cases, there may happen too many route refreshes before any route prune routine finishes. 
If the difference between stale_valid and stale_pruned becomes more than 128 when requesting for another route refresh, the routine walks the table synchronously and resets all the stale values to a base state, while logging a warning.
2022-07-12 08:36:10 +00:00
rt_refresh_begin(&c->c.in_req);
break;
case BGP_GRS_LLGR:
Route refresh in tables uses a stale counter. Until now, we were marking routes as REF_STALE and REF_DISCARD to cleanup old routes after route refresh. This needed a synchronous route table walk at both beginning and the end of route refresh routine, marking the routes by the flags. We avoid these walks by using a stale counter. Every route contains: u8 stale_cycle; Every import hook contains: u8 stale_set; u8 stale_valid; u8 stale_pruned; u8 stale_pruning; In base_state, stale_set == stale_valid == stale_pruned == stale_pruning and all routes' stale_cycle also have the same value. The route refresh looks like follows: + ----------- + --------- + ----------- + ------------- + ------------ + | | stale_set | stale_valid | stale_pruning | stale_pruned | | Base | x | x | x | x | | Begin | x+1 | x | x | x | ... now routes are being inserted with stale_cycle == (x+1) | End | x+1 | x+1 | x | x | ... now table pruning routine is scheduled | Prune begin | x+1 | x+1 | x+1 | x | ... now routes with stale_cycle not between stale_set and stale_valid are deleted | Prune end | x+1 | x+1 | x+1 | x+1 | + ----------- + --------- + ----------- + ------------- + ------------ + The pruning routine is asynchronous and may have high latency in high-load environments. Therefore, multiple route refresh requests may happen before the pruning routine starts, leading to this situation: | Prune begin | x+k | x+k | x -> x+k | x | ... or even | Prune begin | x+k+1 | x+k | x -> x+k | x | ... if the prune event starts while another route refresh is running. In such a case, the pruning routine still deletes routes not fitting between stale_set and and stale_valid, effectively pruning the remnants of all unpruned route refreshes from before: | Prune end | x+k | x+k | x+k | x+k | In extremely rare cases, there may happen too many route refreshes before any route prune routine finishes. 
If the difference between stale_valid and stale_pruned becomes more than 128 when requesting for another route refresh, the routine walks the table synchronously and resets all the stale values to a base state, while logging a warning.
2022-07-12 08:36:10 +00:00
rt_refresh_begin(&c->c.in_req);
bgp_graceful_restart_feed(c);
break;
}
}
else
{
/* Just flush the routes */
Route refresh in tables uses a stale counter. Until now, we were marking routes as REF_STALE and REF_DISCARD to cleanup old routes after route refresh. This needed a synchronous route table walk at both beginning and the end of route refresh routine, marking the routes by the flags. We avoid these walks by using a stale counter. Every route contains: u8 stale_cycle; Every import hook contains: u8 stale_set; u8 stale_valid; u8 stale_pruned; u8 stale_pruning; In base_state, stale_set == stale_valid == stale_pruned == stale_pruning and all routes' stale_cycle also have the same value. The route refresh looks like follows: + ----------- + --------- + ----------- + ------------- + ------------ + | | stale_set | stale_valid | stale_pruning | stale_pruned | | Base | x | x | x | x | | Begin | x+1 | x | x | x | ... now routes are being inserted with stale_cycle == (x+1) | End | x+1 | x+1 | x | x | ... now table pruning routine is scheduled | Prune begin | x+1 | x+1 | x+1 | x | ... now routes with stale_cycle not between stale_set and stale_valid are deleted | Prune end | x+1 | x+1 | x+1 | x+1 | + ----------- + --------- + ----------- + ------------- + ------------ + The pruning routine is asynchronous and may have high latency in high-load environments. Therefore, multiple route refresh requests may happen before the pruning routine starts, leading to this situation: | Prune begin | x+k | x+k | x -> x+k | x | ... or even | Prune begin | x+k+1 | x+k | x -> x+k | x | ... if the prune event starts while another route refresh is running. In such a case, the pruning routine still deletes routes not fitting between stale_set and and stale_valid, effectively pruning the remnants of all unpruned route refreshes from before: | Prune end | x+k | x+k | x+k | x+k | In extremely rare cases, there may happen too many route refreshes before any route prune routine finishes. 
If the difference between stale_valid and stale_pruned becomes more than 128 when requesting for another route refresh, the routine walks the table synchronously and resets all the stale values to a base state, while logging a warning.
2022-07-12 08:36:10 +00:00
rt_refresh_begin(&c->c.in_req);
rt_refresh_end(&c->c.in_req);
}
2017-12-09 23:55:34 +00:00
/* Reset bucket and prefix tables */
bgp_free_pending_tx(c);
bgp_init_pending_tx(c);
2017-12-09 23:55:34 +00:00
c->packets_to_send = 0;
}
/* p->gr_ready -> at least one active channel is c->gr_ready */
ASSERT(p->gr_active_num > 0);
proto_notify_state(&p->p, PS_START);
2023-04-02 17:35:35 +00:00
tm_start_in(p->gr_timer, p->conn->remote_caps->gr_time S, p->p.loop);
}
static void
bgp_graceful_restart_feed(struct bgp_channel *c)
{
c->stale_feed = (struct rt_export_feeder) {
.name = mb_sprintf(c->c.proto->pool, "%s.%s.llgr", c->c.proto->name, c->c.name),
.trace_routes = c->c.debug,
};
c->stale_event = (event) {
.hook = bgp_rte_modify_stale,
.data = c,
};
rt_feeder_subscribe(&c->c.table->export_all, &c->stale_feed);
proto_send_event(c->c.proto, &c->stale_event);
}
/**
 * bgp_graceful_restart_done - finish active BGP graceful restart
 * @c: BGP channel
 *
 * This function is called when the active BGP graceful restart of the neighbor
 * should be finished for channel @c - either successfully (the neighbor sends
 * all paths and reports end-of-RIB for given AFI/SAFI on the new session) or
 * unsuccessfully (the neighbor does not support BGP graceful restart on the new
 * session). The function clears the restart state of the channel, stops its
 * stale timer and ends the routing table refresh cycle.
 */
void
bgp_graceful_restart_done(struct bgp_channel *c)
{
  struct bgp_proto *p = (void *) c->c.proto;

  ASSERT(c->gr_active);
  c->gr_active = 0;

  /* Log once no channel of the protocol remains in graceful restart */
  if (--p->gr_active_num == 0)
  {
    BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");
  }

  tm_stop(c->stale_timer);
  rt_refresh_end(&c->c.in_req);
}
/**
 * bgp_graceful_restart_timeout - timeout of graceful restart 'restart timer'
 * @t: timer
 *
 * This function is a timeout hook for @gr_timer, implementing BGP restart time
 * limit for reestablishment of the BGP session after the graceful restart.
 * When fired and the protocol is not ready for long-lived graceful restart,
 * we just proceed with the usual protocol restart. Otherwise, every channel in
 * active graceful restart either finishes it (when it does not support LLGR)
 * or enters the LLGR state and gets its stale timer started.
 */
static void
bgp_graceful_restart_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

  BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout");

  if (!p->llgr_ready)
  {
    bgp_stop(p, 0, NULL, 0);
    return;
  }

  struct bgp_channel *c;
  BGP_WALK_CHANNELS(p, c)
  {
    /*
     * Nothing to do when the channel is not in GR (and is already flushed)
     * or when it is already in LLGR from a past restart.
     */
    if (!c->gr_active || (c->gr_active == BGP_GRS_LLGR))
      continue;

    /* Channel is in GR, but does not support LLGR -> stop GR */
    if (!c->stale_time)
    {
      bgp_graceful_restart_done(c);
      continue;
    }

    /* Channel is in GR, and supports LLGR -> start LLGR */
    c->gr_active = BGP_GRS_LLGR;
    tm_start_in(c->stale_timer, c->stale_time S, p->p.loop);
    bgp_graceful_restart_feed(c);
  }
}
/*
 * Timer hook with a BGP channel as its data: logs the expiry of the
 * long-lived stale time and finishes the graceful restart of that channel.
 */
static void
bgp_long_lived_stale_timeout(timer *t)
{
  struct bgp_channel *c = t->data;
  struct bgp_proto *p = (struct bgp_proto *) c->c.proto;

  BGP_TRACE(D_EVENTS, "Long-lived stale timeout");

  bgp_graceful_restart_done(c);
}
/**
 * bgp_refresh_begin - start incoming enhanced route refresh sequence
 * @c: BGP channel
 *
 * This function is called when an incoming enhanced route refresh sequence is
 * started by the neighbor, demarcated by the BoRR packet. Unless the channel
 * is still in the initial loading state (in which case the request is logged
 * and ignored), the function switches the load state to refreshing and starts
 * the routing table refresh cycle. Note that graceful restart also uses
 * routing table refresh cycle, but RFC 7313 and load states ensure that these
 * two sequences do not overlap.
 */
void
bgp_refresh_begin(struct bgp_channel *c)
{
  struct bgp_proto *p = (void *) c->c.proto;

  if (c->load_state == BFS_LOADING)
  {
    log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name);
    return;
  }

  c->load_state = BFS_REFRESHING;
  rt_refresh_begin(&c->c.in_req);
}
/**
 * bgp_refresh_end - finish incoming enhanced route refresh sequence
 * @c: BGP channel
 *
 * This function is called when an incoming enhanced route refresh sequence is
 * finished by the neighbor, demarcated by the EoRR packet. Unless no sequence
 * is in progress (in which case the request is logged and ignored), the
 * function resets the load state and ends the routing table refresh cycle.
 * Routes not received during the sequence are removed by the nest.
 */
void
bgp_refresh_end(struct bgp_channel *c)
{
  struct bgp_proto *p = (void *) c->c.proto;

  if (c->load_state != BFS_REFRESHING)
  {
    log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name);
    return;
  }

  c->load_state = BFS_NONE;
  rt_refresh_end(&c->c.in_req);
}
static void
bgp_send_open(struct bgp_conn *conn)
{
DBG("BGP: Sending open\n");
conn->sk->rx_hook = bgp_rx;
2000-03-30 17:39:48 +00:00
conn->sk->tx_hook = bgp_tx;
tm_stop(conn->connect_timer);
bgp_prepare_capabilities(conn);
bgp_schedule_packet(conn, NULL, PKT_OPEN);
2010-01-03 11:17:52 +00:00
bgp_conn_set_state(conn, BS_OPENSENT);
2023-04-02 17:35:35 +00:00
bgp_start_timer(conn->bgp, conn->hold_timer, conn->bgp->cf->initial_hold_time);
}
/*
 * Socket hook installed by bgp_connect() as the initial TX hook of an
 * outgoing socket. It logs the event and starts the session by sending OPEN.
 */
static void
bgp_connected(sock *sk)
{
  struct bgp_conn *conn = sk->data;
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_EVENTS, "Connected");

  bgp_send_open(conn);
}
/*
 * Hook of the connect timer. While the protocol is still in PS_START, the
 * connection is closed and a new connection attempt is started; in any other
 * protocol state the connection just goes idle.
 */
static void
bgp_connect_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: connect_timeout\n");

  if (p->p.proto_state != PS_START)
  {
    bgp_conn_enter_idle_state(conn);
    return;
  }

  bgp_close_conn(conn);
  bgp_connect(p);
}
/*
 * Socket error hook of BGP connections. The error is stored and logged
 * (@err == 0 is reported as a closed connection), a graceful restart is
 * started if the connection was established and the protocol is ready for
 * it, and finally the connection goes idle.
 */
static void
bgp_sock_err(sock *sk, int err)
{
  struct bgp_conn *conn = sk->data;
  struct bgp_proto *p = conn->bgp;

  /*
   * This error hook may be called either asynchronously from main
   * loop, or synchronously from sk_send(). But sk_send() is called
   * only from bgp_tx() and bgp_kick_tx(), which are both called
   * asynchronously from main loop. Moreover, they end if err hook is
   * called. Therefore, we could suppose that it is always called
   * asynchronously.
   */

  bgp_store_error(p, conn, BE_SOCKET, err);

  if (!err)
  {
    BGP_TRACE(D_EVENTS, "Connection closed");
  }
  else
  {
    BGP_TRACE(D_EVENTS, "Connection lost (%M)", err);
  }

  if ((conn->state == BS_ESTABLISHED) && p->gr_ready)
  {
    bgp_handle_graceful_restart(p);
  }

  bgp_conn_enter_idle_state(conn);
}
/*
 * Hook of the hold timer. Depending on the situation it hangs up a connection
 * that is already closing, postpones the timeout when unprocessed input is
 * waiting, starts a graceful restart (established session of a protocol ready
 * for long-lived graceful restart), or raises BGP error 4/0.
 */
static void
bgp_hold_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: Hold timeout\n");

  /* We are already closing the connection - just do hangup */
  if (conn->state == BS_CLOSE)
  {
    BGP_TRACE(D_EVENTS, "Connection stalled");
    bgp_conn_enter_idle_state(conn);
    return;
  }

  /*
   * If there is something in the input queue, we are probably congested and
   * perhaps just have not processed BGP packets in time.
   */
  if (sk_rx_ready(conn->sk) > 0)
  {
    bgp_start_timer(p, conn->hold_timer, 10);
    return;
  }

  if ((conn->state != BS_ESTABLISHED) || !p->llgr_ready)
  {
    bgp_error(conn, 4, 0, NULL, 0);
    return;
  }

  BGP_TRACE(D_EVENTS, "Hold timer expired");
  bgp_handle_graceful_restart(p);
  bgp_conn_enter_idle_state(conn);
}
/* Hook of the keepalive timer: schedules a KEEPALIVE packet on the connection */
static void
bgp_keepalive_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;

  DBG("BGP: Keepalive timer\n");

  bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
}
/*
 * Hook of the send hold timer. Unless the connection is already closing, the
 * error (code 8, subcode 0) is logged and stored, the connection goes idle,
 * the startup delay is updated and the protocol is stopped.
 */
void
bgp_send_hold_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: Send hold timeout\n");

  if (conn->state == BS_CLOSE)
    return;

  const uint code = 8, subcode = 0;

  /* Like bgp_error() but without NOTIFICATION */
  bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, NULL, 0);
  bgp_store_error(p, conn, BE_BGP_TX, (code << 16) | subcode);

  bgp_conn_enter_idle_state(conn);
  bgp_update_startup_delay(p);
  bgp_stop(p, 0, NULL, 0);
}
/*
 * Initialize connection @conn of protocol @p: reset the socket pointer and
 * the TX bookkeeping, allocate the four connection timers from the protocol
 * pool and announce a protocol state in which the socket attribute of this
 * connection (incoming or outgoing) is set with empty data.
 */
static void
bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn)
{
  conn->sk = NULL;
  conn->bgp = p;

  conn->packets_to_send = 0;
  conn->channels_to_send = 0;
  conn->last_channel = 0;
  conn->last_channel_count = 0;

  conn->connect_timer = tm_new_init(p->p.pool, bgp_connect_timeout, conn, 0, 0);
  conn->hold_timer = tm_new_init(p->p.pool, bgp_hold_timeout, conn, 0, 0);
  conn->keepalive_timer = tm_new_init(p->p.pool, bgp_keepalive_timeout, conn, 0, 0);
  conn->send_hold_timer = tm_new_init(p->p.pool, bgp_send_hold_timeout, conn, 0, 0);

  /*
   * NOTE(review): the attribute is set on the local list @attr, while
   * ea_lookup() is applied to the stored state; elsewhere in this file the
   * attribute is set directly on the stored state. Confirm this is intended.
   */
  ea_list *attr = p->p.ea_state;

  if (conn != &p->incoming_conn)
  {
    ASSERT_DIE(conn == &conn->bgp->outgoing_conn);
    ea_set_attr(&attr, EA_LITERAL_STORE_ADATA(&ea_bgp_out_conn_sk, 0, NULL, 0));
  }
  else
    ea_set_attr(&attr, EA_LITERAL_STORE_ADATA(&ea_bgp_in_conn_sk, 0, NULL, 0));

  p->p.ea_state = ea_lookup(p->p.ea_state, 0, EALS_CUSTOM);
  proto_announce_state_later(&p->p, attr);
}
/*
 * Attach socket @s to connection @conn: set the socket data and error hook,
 * enable fast RX, and announce a protocol state in which the socket attribute
 * of this connection (incoming or outgoing) carries the addresses and ports
 * of the socket.
 */
static void
bgp_setup_sk(struct bgp_conn *conn, sock *s)
{
  struct bgp_proto *p = conn->bgp;

  s->data = conn;
  s->err_hook = bgp_sock_err;
  s->fast_rx = 1;
  conn->sk = s;

  struct bgp_conn_sk_ad sk_ad = {
    .ad = { .length = sizeof sk_ad - sizeof sk_ad.ad },
    .saddr = s->saddr,
    .daddr = s->daddr,
    .sport = s->sport,
    .dport = s->dport,
  };

  /*
   * NOTE(review): the attribute is set on the local list @attr, while
   * ea_lookup() is applied to the stored state; elsewhere in this file the
   * attribute is set directly on the stored state. Confirm this is intended.
   */
  ea_list *attr = p->p.ea_state;

  if (conn != &p->incoming_conn)
  {
    ASSERT_DIE(conn == &conn->bgp->outgoing_conn);
    ea_set_attr(&attr, EA_LITERAL_DIRECT_ADATA(&ea_bgp_out_conn_sk, 0, &sk_ad.ad));
  }
  else
    ea_set_attr(&attr, EA_LITERAL_DIRECT_ADATA(&ea_bgp_in_conn_sk, 0, &sk_ad.ad));

  p->p.ea_state = ea_lookup(p->p.ea_state, 0, EALS_CUSTOM);
  proto_announce_state_later(&p->p, attr);
}
static void
bgp_active(struct bgp_proto *p)
{
int delay = MAX(1, p->cf->connect_delay_time);
struct bgp_conn *conn = &p->outgoing_conn;
BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay);
bgp_setup_conn(p, conn);
2010-01-03 11:17:52 +00:00
bgp_conn_set_state(conn, BS_ACTIVE);
2023-04-02 17:35:35 +00:00
bgp_start_timer(p, conn->connect_timer, delay);
}
2000-06-04 17:06:18 +00:00
/**
* bgp_connect - initiate an outgoing connection
* @p: BGP instance
*
* The bgp_connect() function creates a new &bgp_conn and initiates
* a TCP connection to the peer. The rest of connection setup is governed
* by the BGP state machine as described in the standard.
*/
static void
bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing connection */
{
2000-03-30 17:39:48 +00:00
struct bgp_conn *conn = &p->outgoing_conn;
2024-01-22 05:29:14 +00:00
int hops = p->cf->multihop ?: 1;
DBG("BGP: Connecting\n");
sock *s = sk_new(p->p.pool);
s->type = SK_TCP_ACTIVE;
s->saddr = p->local_ip;
s->daddr = p->remote_ip;
s->dport = p->cf->remote_port;
s->iface = p->neigh ? p->neigh->iface : NULL;
Basic VRF support Add basic VRF (virtual routing and forwarding) support. Protocols can be associated with VRFs, such protocols will be restricted to interfaces assigned to the VRF (as reported by Linux kernel) and will use sockets bound to the VRF. E.g., different multihop BGP instances can use diffent kernel routing tables to handle BGP TCP connections. The VRF support is preliminary, currently there are several limitations: - Recent Linux kernels (4.11) do not handle correctly sockets bound to interaces that are part of VRF, so most protocols other than multihop BGP do not work. This will be fixed by future kernel versions. - Neighbor cache ignores VRFs. Breaks config with the same prefix on local interfaces in different VRFs. Not much problem as single hop protocols do not work anyways. - Olock code ignores VRFs. Breaks config with multiple BGP peers with the same IP address in different VRFs. - Incoming BGP connections are not dispatched according to VRFs. Breaks config with multiple BGP peers with the same IP address in different VRFs. Perhaps we would need some kernel API to read VRF of incoming connection? Or probably use multiple listening sockets in int-new branch. - We should handle master VRF interface up/down events and perhaps disable associated protocols when VRF goes down. Or at least disable associated interfaces. - Also we should check if the master iface is really VRF iface and not some other kind of master iface. - BFD session request dispatch should be aware of VRFs. - Perhaps kernel protocol should read default kernel table ID from VRF iface so it is not necessary to configure it. - Perhaps we should have per-VRF default table.
2017-09-06 15:38:48 +00:00
s->vrf = p->p.vrf;
s->ttl = p->cf->ttl_security ? 255 : hops;
s->rbsize = p->cf->enable_extended_messages ? BGP_RX_BUFFER_EXT_SIZE : BGP_RX_BUFFER_SIZE;
s->tbsize = p->cf->enable_extended_messages ? BGP_TX_BUFFER_EXT_SIZE : BGP_TX_BUFFER_SIZE;
s->tos = IP_PREC_INTERNET_CONTROL;
s->password = p->cf->password;
s->tx_hook = bgp_connected;
s->flags = p->cf->free_bind ? SKF_FREEBIND : 0;
BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J",
s->daddr, ipa_is_link_local(s->daddr) ? p->cf->iface : NULL,
2014-10-24 09:11:43 +00:00
s->saddr, ipa_is_link_local(s->saddr) ? s->iface : NULL);
bgp_setup_conn(p, conn);
bgp_setup_sk(conn, s);
2010-01-03 11:17:52 +00:00
bgp_conn_set_state(conn, BS_CONNECT);
if (sk_open(s, p->p.loop) < 0)
2014-05-18 09:42:26 +00:00
goto err;
/* Set minimal receive TTL if needed */
if (p->cf->ttl_security)
if (sk_set_min_ttl(s, 256 - hops) < 0)
2014-05-18 09:42:26 +00:00
goto err;
DBG("BGP: Waiting for connect success\n");
2023-04-02 17:35:35 +00:00
bgp_start_timer(p, conn->connect_timer, p->cf->connect_retry_time);
2014-05-18 09:42:26 +00:00
return;
err:
2014-05-18 09:42:26 +00:00
sk_log_error(s, p->p.name);
bgp_sock_err(s, 0);
return;
}
/* A BGP instance is dynamic when it has no (zero) remote address */
static inline int
bgp_is_dynamic(struct bgp_proto *p)
{
  return ipa_zero(p->remote_ip);
}
/**
 * bgp_find_proto - find existing proto for incoming connection
 * @sk: TCP socket
 *
 * Walk the listen requests attached to the listening socket data of @sk and
 * return the BGP instance matching the connection by remote address (and
 * remote range, if configured), VRF, local port, interface (for link-local
 * addresses only) and local address. An instance with an explicit remote
 * address ends the search immediately; a dynamic instance is only remembered
 * as a candidate. Returns %NULL when nothing matches. The walk is done with
 * bgp_listen_domain locked.
 */
static struct bgp_proto *
bgp_find_proto(sock *sk)
{
  struct bgp_socket *bs = sk->data;
  struct bgp_listen_request *req;
  struct bgp_proto *match = NULL;

  /* sk->iface is valid only if src or dst address is link-local */
  int link_local = ipa_is_link_local(sk->saddr) || ipa_is_link_local(sk->daddr);

  LOCK_DOMAIN(rtable, bgp_listen_domain);

  WALK_LIST(req, bs->requests)
  {
    SKIP_BACK_DECLARE(struct bgp_proto, p, listen, req);

    if (p->p.proto != &proto_bgp)
      continue;

    if (!ipa_equal(p->remote_ip, sk->daddr) && !bgp_is_dynamic(p))
      continue;

    if (p->cf->remote_range && !ipa_in_netX(sk->daddr, p->cf->remote_range))
      continue;

    if (p->p.vrf != sk->vrf)
      continue;

    if (p->cf->local_port != sk->sport)
      continue;

    if (link_local && (p->cf->iface != sk->iface))
      continue;

    if (!ipa_zero(p->cf->local_ip) && !ipa_equal(p->cf->local_ip, sk->saddr))
      continue;

    match = p;

    /* A non-dynamic instance is final; keep looking past dynamic ones */
    if (!bgp_is_dynamic(p))
      break;
  }

  UNLOCK_DOMAIN(rtable, bgp_listen_domain);

  return match;
}
2000-06-04 17:06:18 +00:00
/**
 * bgp_incoming_connection - handle an incoming connection
 * @sk: TCP socket
 * @dummy: unused
 *
 * This function serves as a socket hook for accepting of new BGP
 * connections. It searches a BGP instance corresponding to the peer
 * which has connected and if such an instance exists, it creates a
 * &bgp_conn structure, attaches it to the instance and sends an Open
 * message. Connections from unknown peers and connections that cannot
 * be accepted (e.g. there already is an incoming connection) are closed.
 * For a dynamic BGP instance, a new instance is spawned and the socket
 * is postponed in it. The function must be called from the main birdloop
 * and always returns 0.
 */
static int
bgp_incoming_connection(sock *sk, uint dummy UNUSED)
{
  ASSERT_DIE(birdloop_inside(&main_birdloop));

  struct bgp_proto *p;
  int acc, hops;

  DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport);

  p = bgp_find_proto(sk);
  if (!p)
  {
    log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)",
        sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport);
    LOCK_DOMAIN(rtable, bgp_listen_domain);
    sk_close(sk);
    UNLOCK_DOMAIN(rtable, bgp_listen_domain);
    return 0;
  }

  /* From now on, every exit path must go through the leave label */
  birdloop_enter(p->p.loop);

  /*
   * BIRD should keep multiple incoming connections in OpenSent state (for
   * details RFC 4271 8.2.1 par 3), but it keeps just one. Duplicate incoming
   * connections are rejected instead. The exception is the case where an
   * incoming connection triggers a graceful restart.
   */

  acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) &&
    (bgp_start_state(p) >= BSS_CONNECT) && (!p->incoming_conn.sk);

  if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
  {
    bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART);
    bgp_handle_graceful_restart(p);
    bgp_conn_enter_idle_state(p->conn);
    acc = 1;

    /* There might be separate incoming connection in OpenSent state */
    if (p->incoming_conn.state > BS_ACTIVE)
      bgp_close_conn(&p->incoming_conn);
  }

  LOCK_DOMAIN(rtable, bgp_listen_domain);

  BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
            sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL,
            sk->dport, acc ? "accepted" : "rejected");

  if (!acc)
  {
    sk_close(sk);
    goto leave;
  }

  hops = p->cf->multihop ?: 1;

  if (sk_set_ttl(sk, p->cf->ttl_security ? 255 : hops) < 0)
    goto err;

  if (p->cf->ttl_security)
    if (sk_set_min_ttl(sk, 256 - hops) < 0)
      goto err;

  if (p->cf->enable_extended_messages)
  {
    sk->rbsize = BGP_RX_BUFFER_EXT_SIZE;
    sk->tbsize = BGP_TX_BUFFER_EXT_SIZE;
    sk_reallocate(sk);
  }

  /* For dynamic BGP, spawn new instance and postpone the socket */
  if (bgp_is_dynamic(p))
  {
    /*
     * The spawned instance is kept in its own variable. @p must still refer
     * to the instance whose loop was entered above, as that is the loop
     * which has to be left at the leave label.
     */
    struct bgp_proto *spawned = bgp_spawn(p, sk->daddr);
    spawned->postponed_sk = sk;
    rmove(sk, spawned->p.pool);
    goto leave;
  }

  rmove(sk, p->p.pool);
  sk_reloop(sk, p->p.loop);
  bgp_setup_conn(p, &p->incoming_conn);
  bgp_setup_sk(&p->incoming_conn, sk);
  bgp_send_open(&p->incoming_conn);
  goto leave;

err:
  sk_log_error(sk, p->p.name);
  log(L_ERR "%s: Incoming connection aborted", p->p.name);
  sk_close(sk);

leave:
  UNLOCK_DOMAIN(rtable, bgp_listen_domain);
  birdloop_leave(p->p.loop);
  return 0;
}
/*
 * Error hook of the listening socket. An aborted incoming connection
 * (ECONNABORTED) is logged as a warning, anything else as an error.
 */
static void
bgp_listen_sock_err(sock *sk UNUSED, int err)
{
  if (err != ECONNABORTED)
  {
    log(L_ERR "BGP: Error on listening socket: %M", err);
    return;
  }

  log(L_WARN "BGP: Incoming connection aborted");
}
static void
bgp_start_neighbor(struct bgp_proto *p)
{
/* Called only for single-hop BGP sessions */
if (ipa_zero(p->local_ip))
p->local_ip = p->neigh->ifa->ip;
if (ipa_is_link_local(p->local_ip))
p->link_addr = p->local_ip;
else if (p->neigh->iface->llv6)
p->link_addr = p->neigh->iface->llv6->ip;
bgp_initiate(p);
}
/*
 * React to a change of neighbor entry @n. Notifications for a neighbor other
 * than the one the protocol uses, or arriving while the protocol is flushing
 * or stopping, are ignored. A lost neighbor or (with link checking enabled) a
 * link going down stops the protocol unless it is still in the prepare start
 * state; a usable neighbor in the prepare start state starts the session.
 */
static void
bgp_neigh_notify(neighbor *n)
{
  struct bgp_proto *p = (struct bgp_proto *) n->proto;
  int ps = p->p.proto_state;

  if (n != p->neigh)
    return;

  if ((ps == PS_FLUSH) || (ps == PS_STOP))
    return;

  int prepare = (ps == PS_START) && (bgp_start_state(p) == BSS_PREPARE);

  /* Neighbor is gone */
  if (n->scope <= 0)
  {
    if (prepare)
      return;

    BGP_TRACE(D_EVENTS, "Neighbor lost");
    bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST);
    /* Perhaps also run bgp_update_startup_delay(p)? */
    bgp_stop(p, 0, NULL, 0);
    return;
  }

  /* Neighbor is there, but the link is down and we care about it */
  if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
  {
    if (prepare)
      return;

    BGP_TRACE(D_EVENTS, "Link down");
    bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN);
    if (ps == PS_UP)
      bgp_update_startup_delay(p);
    bgp_stop(p, 0, NULL, 0);
    return;
  }

  /* Neighbor is usable */
  if (prepare)
  {
    BGP_TRACE(D_EVENTS, "Neighbor ready");
    bgp_start_neighbor(p);
  }
}
/*
 * BFD notification hook (passed to bfd_request_session() in bgp_update_bfd()).
 * When the BFD session goes down while the protocol is in PS_START or PS_UP,
 * either trigger a graceful restart and drop both connections to idle
 * (BGP_BFD_GRACEFUL mode), or stop the session.
 */
static void
bgp_bfd_notify(struct bfd_request *req)
{
  struct bgp_proto *p = req->data;
  int ps = p->p.proto_state;

  /* Only a BFD-down event on a starting or running protocol is handled */
  if (!req->down || ((ps != PS_START) && (ps != PS_UP)))
    return;

  BGP_TRACE(D_EVENTS, "BFD session down");
  bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN);

  if (req->opts.mode != BGP_BFD_GRACEFUL)
  {
    /* Trigger session down */
    if (ps == PS_UP)
      bgp_update_startup_delay(p);

    bgp_stop(p, 0, NULL, 0);
    return;
  }

  /* Trigger graceful restart */
  if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
    bgp_handle_graceful_restart(p);

  if (p->incoming_conn.state > BS_IDLE)
    bgp_conn_enter_idle_state(&p->incoming_conn);

  if (p->outgoing_conn.state > BS_IDLE)
    bgp_conn_enter_idle_state(&p->outgoing_conn);
}
/*
 * Synchronize the BFD request of @p with the BFD options @bfd:
 *  - no options, request exists:   retract the request;
 *  - options, request exists:      update the request;
 *  - options, no request:          create a new one (not for dynamic BGP).
 */
static void
bgp_update_bfd(struct bgp_proto *p, const struct bfd_options *bfd)
{
  if (!bfd)
  {
    if (p->bfd_req)
    {
      BGP_TRACE(D_EVENTS, "Retracting the BFD request");
      rfree(p->bfd_req);
      p->bfd_req = NULL;
    }

    return;
  }

  if (p->bfd_req)
  {
    BGP_TRACE(D_EVENTS, "Updating existing BFD request");
    bfd_update_request(p->bfd_req, bfd);
    return;
  }

  if (bgp_is_dynamic(p))
    return;

  /* Single-hop sessions bind the BFD session to the neighbor's interface */
  p->bfd_req = bfd_request_session(p->p.pool, p->remote_ip, p->local_ip,
				   p->cf->multihop ? NULL : p->neigh->iface,
				   p->p.vrf, bgp_bfd_notify, p, p->p.loop, bfd);
  BGP_TRACE(D_EVENTS, "Requesting a new BFD session");
}
/*
 * Reload of imported routes: when the protocol is up, schedule a ROUTE-REFRESH
 * packet on every BGP channel except the MPLS channel and report it on the CLI.
 * The two trailing arguments are unused.
 */
void
bgp_reload_in(struct proto *P, uintptr_t _ UNUSED, int __ UNUSED)
{
  SKIP_BACK_DECLARE(struct bgp_proto, p, p, P);

  if (P->proto_state != PS_UP)
  {
    cli_msg(-8006, "%s: not reloading, not up", P->name);
    return;
  }

  struct bgp_channel *c;
  BGP_WALK_CHANNELS(p, c)
  {
    if (&c->c != P->mpls_channel)
    {
      cli_msg(-15, "%s.%s: reloading", P->name, c->c.name);
      bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH);
    }
  }
}
/*
 * Reload of exported routes: when the protocol is up, walk all BGP channels
 * except the MPLS channel. Channels with tx_keep set resend from what
 * bgp_tx_resend() has, the others ask for a refeed of the export request.
 * The two trailing arguments are unused.
 */
void
bgp_reload_out(struct proto *P, uintptr_t _ UNUSED, int __ UNUSED)
{
  SKIP_BACK_DECLARE(struct bgp_proto, p, p, P);

  if (P->proto_state != PS_UP)
  {
    cli_msg(-8006, "%s: not reloading, not up", P->name);
    return;
  }

  struct bgp_channel *c;
  BGP_WALK_CHANNELS(p, c)
  {
    if (&c->c != P->mpls_channel)
    {
      if (c->tx_keep)
      {
	bgp_tx_resend(p, c);
	cli_msg(-15, "%s.%s: reloading", P->name, c->c.name);
      }
      else
      {
	rt_export_refeed(&c->c.out_req, NULL);
	cli_msg(-15, "%s.%s: reloading by table refeed", P->name, c->c.name);
      }
    }
  }
}
/*
 * Feeding request wrapper used by bgp_done_route_refresh(): that function gets
 * a pointer to the embedded @rfr, recovers this structure from it, reads the
 * channel from @c and finally frees the structure with mb_free().
 */
struct bgp_enhanced_refresh_request {
struct rt_feeding_request rfr;	/* Embedded feeding request */
struct bgp_channel *c;		/* BGP channel the request belongs to */
};
void
bgp_done_route_refresh(struct rt_feeding_request *rfr)
{
SKIP_BACK_DECLARE(struct bgp_enhanced_refresh_request, berr, rfr, rfr);
struct bgp_channel *c = berr->c;
SKIP_BACK_DECLARE(struct bgp_proto, p, p, c->c.proto);
/* Schedule EoRR packet */
ASSERT_DIE(c->feed_state == BFS_REFRESHING);
c->feed_state = BFS_REFRESHED;
bgp_schedule_packet(p->conn, c, PKT_UPDATE);
mb_free(berr);
}
/*
 * Export-fed hook (installed as P->export_fed in bgp_init()). If the channel
 * was in BFS_LOADING state, mark it BFS_LOADED and schedule an UPDATE packet
 * (the End-of-RIB); otherwise do nothing.
 */
static void
bgp_export_fed(struct channel *C)
{
  SKIP_BACK_DECLARE(struct bgp_channel, c, c, C);
  SKIP_BACK_DECLARE(struct bgp_proto, p, p, c->c.proto);

  if (c->feed_state != BFS_LOADING)
    return;

  /* Schedule End-of-RIB packet */
  c->feed_state = BFS_LOADED;
  bgp_schedule_packet(p->conn, c, PKT_UPDATE);
}
/*
 * Hook of the object lock event set up in bgp_start(); @_p is the BGP
 * instance. Multihop and dynamic sessions are initiated directly. Single-hop
 * sessions look up a sticky neighbor entry first and start immediately only
 * if the neighbor is usable (and the link is up, when check_link is set);
 * otherwise they wait for bgp_neigh_notify().
 */
static void
bgp_start_locked(void *_p)
{
  struct bgp_proto *p = _p;
  const struct bgp_config *cf = p->cf;

  if (p->p.proto_state != PS_START)
  {
    DBG("BGP: Got lock in different state %d\n", p->p.proto_state);
    return;
  }

  DBG("BGP: Got lock\n");

  /* Multi-hop sessions do not use neighbor entries */
  if (cf->multihop || bgp_is_dynamic(p))
  {
    bgp_initiate(p);
    return;
  }

  neighbor *n = neigh_find(&p->p, p->remote_ip, cf->iface, NEF_STICKY);
  if (!n)
  {
    log(L_ERR "%s: Invalid remote address %I%J", p->p.name, p->remote_ip, cf->iface);

    /* As we do not start yet, we can just disable protocol */
    p->p.disabled = 1;
    bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP);
    proto_notify_state(&p->p, PS_FLUSH);
    return;
  }

  p->neigh = n;
  neigh_link(n);

  if (n->scope <= 0)
  {
    BGP_TRACE(D_EVENTS, "Waiting for %I%J to become my neighbor", p->remote_ip, cf->iface);
    return;
  }

  if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
  {
    BGP_TRACE(D_EVENTS, "Waiting for link on %s", n->iface->name);
    return;
  }

  bgp_start_neighbor(p);
}
static int
bgp_start(struct proto *P)
{
struct bgp_proto *p = (struct bgp_proto *) P;
const struct bgp_config *cf = p->cf;
p->local_ip = cf->local_ip;
p->local_as = cf->local_as;
p->remote_as = cf->remote_as;
p->public_as = cf->local_as;
/* For dynamic BGP childs, remote_ip is already set */
if (ipa_nonzero(cf->remote_ip))
p->remote_ip = cf->remote_ip;
/* Confederation ID is used for truly external peers */
if (p->cf->confederation && !p->is_interior)
p->public_as = cf->confederation;
p->passive = cf->passive || bgp_is_dynamic(p);
bgp_set_start_state(p, BSS_PREPARE);
2000-03-30 17:39:48 +00:00
p->outgoing_conn.state = BS_IDLE;
p->incoming_conn.state = BS_IDLE;
p->neigh = NULL;
p->bfd_req = NULL;
p->postponed_sk = NULL;
p->gr_ready = 0;
p->gr_active_num = 0;
/* Reset some stats */
p->stats.rx_messages = p->stats.tx_messages = 0;
p->stats.rx_updates = p->stats.tx_updates = 0;
p->stats.rx_bytes = p->stats.tx_bytes = 0;
p->last_rx_update = 0;
2018-10-01 13:55:23 +00:00
p->event = ev_new_init(p->p.pool, bgp_decision, p);
callback_init(&p->uncork.cb, bgp_do_uncork, p->p.loop);
2022-08-02 10:54:11 +00:00
p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0);
p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0);
p->local_id = proto_get_router_id(P->cf);
if (p->rr_client)
p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;
p->remote_id = 0;
p->link_addr = IPA_NONE;
ea_list *eal = p->p.ea_state;
ea_set_attr(&eal, EA_LITERAL_EMBEDDED(&ea_bgp_rem_id, 0, p->remote_id));
ea_set_attr(&eal, EA_LITERAL_EMBEDDED(&ea_bgp_loc_as, 0, p->local_as));
ea_set_attr(&eal, EA_LITERAL_EMBEDDED(&ea_bgp_rem_as, 0, p->remote_as));
ea_set_attr(&eal, EA_LITERAL_STORE_ADATA(&ea_bgp_rem_ip, 0, &p->remote_ip, sizeof(ip_addr)));
ea_set_attr(&eal, EA_LITERAL_EMBEDDED(&ea_bgp_out_conn_state, 0, BS_IDLE));
ea_set_attr(&eal, EA_LITERAL_EMBEDDED(&ea_bgp_in_conn_state, 0, BS_IDLE));
proto_announce_state(&p->p, eal);
2017-12-09 23:55:34 +00:00
/* Lock all channels when in GR recovery mode */
if (p->p.gr_recovery && p->cf->gr_mode)
{
struct bgp_channel *c;
BGP_WALK_CHANNELS(p, c)
channel_graceful_restart_lock(&c->c);
}
/*
* Before attempting to create the connection, we need to lock the port,
* so that we are the only instance attempting to talk with that neighbor.
*/
struct object_lock *lock;
lock = p->lock = olock_new(P->pool_inloop);
lock->addr = p->remote_ip;
lock->addr_local = p->cf->local_ip;
lock->port = p->cf->remote_port;
lock->iface = p->cf->iface;
lock->vrf = p->cf->iface ? NULL : p->p.vrf;
lock->type = OBJLOCK_TCP;
lock->event = (event) {
.hook = bgp_start_locked,
.data = p,
};
2023-04-02 17:35:35 +00:00
lock->target = proto_event_list(P);
/* For dynamic BGP, we use inst 1 to avoid collisions with regular BGP */
if (bgp_is_dynamic(p))
{
lock->addr = net_prefix(p->cf->remote_range);
lock->inst = 1;
}
olock_acquire(lock);
return PS_START;
2000-03-19 22:09:07 +00:00
}
extern int proto_restart;
2000-03-19 22:09:07 +00:00
static int
bgp_shutdown(struct proto *P)
{
struct bgp_proto *p = (struct bgp_proto *) P;
int subcode = 0;
char *message = NULL;
byte *data = NULL;
uint len = 0;
2000-05-02 16:07:41 +00:00
BGP_TRACE(D_EVENTS, "Shutdown requested");
switch (P->down_code)
{
case PDC_CF_REMOVE:
case PDC_CF_DISABLE:
subcode = 3; // Errcode 6, 3 - peer de-configured
break;
case PDC_CF_RESTART:
subcode = 6; // Errcode 6, 6 - other configuration change
break;
case PDC_CMD_DISABLE:
case PDC_CMD_SHUTDOWN:
shutdown:
subcode = 2; // Errcode 6, 2 - administrative shutdown
message = P->message;
break;
case PDC_CMD_RESTART:
subcode = 4; // Errcode 6, 4 - administrative reset
message = P->message;
break;
case PDC_CMD_GR_DOWN:
if ((p->cf->gr_mode != BGP_GR_ABLE) &&
(p->cf->llgr_mode != BGP_LLGR_ABLE))
goto shutdown;
subcode = -1; // Do not send NOTIFICATION, just close the connection
break;
case PDC_RX_LIMIT_HIT:
case PDC_IN_LIMIT_HIT:
subcode = 1; // Errcode 6, 1 - max number of prefixes reached
/* log message for compatibility */
log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name);
goto limit;
case PDC_OUT_LIMIT_HIT:
subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown
limit:
bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED);
if (proto_restart)
bgp_update_startup_delay(p);
else
p->startup_delay = 0;
goto done;
}
bgp_store_error(p, NULL, BE_MAN_DOWN, 0);
p->startup_delay = 0;
/* RFC 8203 - shutdown communication */
if (message)
{
uint msg_len = strlen(message);
2019-04-30 11:55:43 +00:00
msg_len = MIN(msg_len, 255);
/* Buffer will be freed automatically by protocol shutdown */
data = mb_alloc(p->p.pool, msg_len + 1);
len = msg_len + 1;
data[0] = msg_len;
memcpy(data+1, message, msg_len);
}
done:
bgp_stop(p, subcode, data, len);
return p->p.proto_state;
2000-03-19 22:09:07 +00:00
}
/*
 * Route owner callbacks of BGP. bgp_init() installs this structure as
 * P->sources.class of every BGP protocol instance.
 */
struct rte_owner_class bgp_rte_owner_class = {
.get_route_info = bgp_get_route_info,
.rte_better = bgp_rte_better,
.rte_mergable = bgp_rte_mergable,
.rte_igp_metric = bgp_rte_igp_metric,
};
static struct proto *
bgp_init(struct proto_config *CF)
{
struct proto *P = proto_new(CF);
struct bgp_proto *p = (struct bgp_proto *) P;
struct bgp_config *cf = (struct bgp_config *) CF;
P->rt_notify = bgp_rt_notify;
Terminology cleanup: The import_control hook is now called preexport. Once upon a time, far far away, there were the old Bird developers discussing what direction of route flow shall be called import and export. They decided to say "import to protocol" and "export to table" when speaking about a protocol. When speaking about a table, they spoke about "importing to table" and "exporting to protocol". The latter terminology was adopted in configuration, then also the bird CLI in commit ea2ae6dd0 started to use it (in year 2009). Now it's 2018 and the terminology is the latter. Import is from protocol to table, export is from table to protocol. Anyway, there was still an import_control hook which executed right before route export. One thing is funny. There are two commits in April 1999 with just two minutes between them. The older announces the final settlement on config terminology, the newer uses the other definition. Let's see their commit messages as the git-log tool shows them (the newer first): commit 9e0e485e50ea74c4f1c5cb65bdfe6ce819c2cee2 Author: Martin Mares <mj@ucw.cz> Date: Mon Apr 5 20:17:59 1999 +0000 Added some new protocol hooks (look at the comments for better explanation): make_tmp_attrs Convert inline attributes to ea_list store_tmp_attrs Convert ea_list to inline attributes import_control Pre-import decisions commit 5056c559c4eb253a4eee10cf35b694faec5265eb Author: Martin Mares <mj@ucw.cz> Date: Mon Apr 5 20:15:31 1999 +0000 Changed syntax of attaching filters to protocols to hopefully the final version: EXPORT <filter-spec> for outbound routes (i.e., those announced by BIRD to the rest of the world). IMPORT <filter-spec> for inbound routes (i.e., those imported by BIRD from the rest of the world). where <filter-spec> is one of: ALL pass all routes NONE drop all routes FILTER <name> use named filter FILTER { <filter> } use explicitly defined filter For all protocols, the default is IMPORT ALL, EXPORT NONE. 
This includes the kernel protocol, so that you need to add EXPORT ALL to get the previous configuration of kernel syncer (as usually, see doc/bird.conf.example for a bird.conf example :)). Let's say RIP to this almost 19-years-old inconsistency. For now, if you import a route, it is always from protocol to table. If you export a route, it is always from table to protocol. And they lived happily ever after.
2018-02-14 12:42:53 +00:00
P->preexport = bgp_preexport;
P->iface_sub.neigh_notify = bgp_neigh_notify;
P->export_fed = bgp_export_fed;
P->sources.class = &bgp_rte_owner_class;
P->sources.rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL;
p->cf = cf;
p->is_internal = (cf->local_as == cf->remote_as);
p->is_interior = p->is_internal || cf->confederation_member;
p->rs_client = cf->rs_client;
p->rr_client = cf->rr_client;
p->ipv4 = ipa_nonzero(cf->remote_ip) ?
ipa_is_ip4(cf->remote_ip) :
(cf->remote_range && (cf->remote_range->type == NET_IP4));
p->remote_ip = cf->remote_ip;
p->remote_as = cf->remote_as;
/* Hack: We use cf->remote_ip just to pass remote_ip from bgp_spawn() */
if (cf->c.parent)
cf->remote_ip = IPA_NONE;
/* Add all BGP channels */
struct bgp_channel_config *cc;
BGP_CF_WALK_CHANNELS(cf, cc)
proto_add_channel(P, &cc->c);
/* Add MPLS channel */
proto_configure_mpls_channel(P, CF, RTS_BGP);
ea_set_attr(&p->p.ea_state, EA_LITERAL_STORE_ADATA(&ea_bgp_rem_ip, 0, &cf->remote_ip, sizeof(ip_addr)));
ea_set_attr(&p->p.ea_state, EA_LITERAL_EMBEDDED(&ea_bgp_peer_type, 0, cf->peer_type));
ea_set_attr(&p->p.ea_state, EA_LITERAL_EMBEDDED(&ea_bgp_loc_as, 0, cf->local_as));
ea_set_attr(&p->p.ea_state, EA_LITERAL_EMBEDDED(&ea_bgp_rem_as, 0, cf->remote_as));
proto_announce_state_later(&p->p, p->p.ea_state);
return P;
}
/*
 * Channel init hook: bind the BGP channel @C to its configuration @CF,
 * resolve the configured IGP and base tables, and store the channel's AFI as
 * the ea_bgp_afi attribute in the channel state kept in ts->channels[].
 */
static void
bgp_channel_init(struct channel *C, struct channel_config *CF)
{
  struct bgp_channel *c = (struct bgp_channel *) C;
  struct bgp_channel_config *cf = (struct bgp_channel_config *) CF;

  c->cf = cf;
  c->afi = cf->afi;
  c->desc = cf->desc;

  /* Tables are optional; unset ones leave the channel fields untouched */
  if (cf->igp_table_ip4)
  {
    c->igp_table_ip4 = cf->igp_table_ip4->table;
  }

  if (cf->igp_table_ip6)
  {
    c->igp_table_ip6 = cf->igp_table_ip6->table;
  }

  if (cf->base_table)
  {
    c->base_table = cf->base_table->table;
  }

  PST_LOCKED(ts)
  {
    ea_list *eal = ea_free_later(ts->channels[c->c.id]);
    ea_set_attr(&eal, EA_LITERAL_EMBEDDED(&ea_bgp_afi, 0, c->afi));
    ts->channels[c->c.id] = ea_lookup_slow(eal, 0, EALS_IN_TABLE);
  }
}
/*
 * Channel start hook: lock the auxiliary tables, set up the TX state and the
 * stale timer, and select the next hop and link-local addresses the channel
 * will use. The next hop is taken, in this order, from the channel
 * configuration, from the session's source address, and from the preferred
 * address of the relevant interface. Returns 0 in all cases.
 */
static int
bgp_channel_start(struct channel *C)
{
  struct bgp_proto *p = (void *) C->proto;
  struct bgp_channel *c = (void *) C;
  ip_addr src = p->local_ip;

  if (c->igp_table_ip4)
    rt_lock_table(c->igp_table_ip4);

  if (c->igp_table_ip6)
    rt_lock_table(c->igp_table_ip6);

  if (c->base_table)
  {
    rt_lock_table(c->base_table);
    rt_flowspec_link(c->base_table, c->c.table);
  }

  c->pool = p->p.pool; // XXXX
  bgp_init_pending_tx(c);
  c->tx_keep = c->cf->export_table;

  c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0);

  c->next_hop_addr = c->cf->next_hop_addr;
  c->link_addr = IPA_NONE;
  c->packets_to_send = 0;

  /* Try to use source address as next hop address */
  if (ipa_zero(c->next_hop_addr))
  {
    if ((bgp_channel_is_ipv4(c) && (ipa_is_ip4(src) || c->ext_next_hop)) ||
	(bgp_channel_is_ipv6(c) && (ipa_is_ip6(src) || c->ext_next_hop)))
      c->next_hop_addr = src;
  }

  /* Use preferred addresses associated with interface / source address */
  if (ipa_zero(c->next_hop_addr))
  {
    /* We know the iface for single-hop, we make lookup for multihop */
    struct neighbor *nbr = p->neigh;
    if (!nbr)
      nbr = neigh_find(&p->p, src, NULL, 0);

    struct iface *iface = nbr ? nbr->iface : NULL;

    if (iface)
    {
      if (bgp_channel_is_ipv4(c) && iface->addr4)
	c->next_hop_addr = iface->addr4->ip;

      if (bgp_channel_is_ipv6(c) && iface->addr6)
	c->next_hop_addr = iface->addr6->ip;
    }
  }

  /* Exit if no feasible next hop address is found */
  if (ipa_zero(c->next_hop_addr))
  {
    log(L_WARN "%s: Missing next hop address", p->p.name);
    return 0;
  }

  /* Set link-local address for IPv6 single-hop BGP */
  if (ipa_is_ip6(c->next_hop_addr) && p->neigh)
  {
    c->link_addr = p->link_addr;

    if (ipa_zero(c->link_addr))
      log(L_WARN "%s: Missing link-local address", p->p.name);
  }

  /* Link local address is already in c->link_addr */
  if (ipa_is_link_local(c->next_hop_addr))
    c->next_hop_addr = IPA_NONE;

  return 0; /* XXXX: Currently undefined */
}
/*
 * Channel shutdown hook: forget the selected next hop and link-local
 * addresses and clear the pending packet flags of the channel.
 */
static void
bgp_channel_shutdown(struct channel *C)
{
  struct bgp_channel *chan = (struct bgp_channel *) C;

  chan->packets_to_send = 0;
  chan->link_addr = IPA_NONE;
  chan->next_hop_addr = IPA_NONE;
}
/*
 * Channel cleanup hook: release the table locks taken in bgp_channel_start()
 * and zero the runtime part of struct bgp_channel, i.e. everything from its
 * pool field up to the end of the structure.
 */
static void
bgp_channel_cleanup(struct channel *C)
{
  struct bgp_channel *c = (void *) C;

  if (c->igp_table_ip4)
    rt_unlock_table(c->igp_table_ip4);

  if (c->igp_table_ip6)
    rt_unlock_table(c->igp_table_ip6);

  if (c->base_table)
  {
    rt_flowspec_unlink(c->base_table, c->c.table);
    rt_unlock_table(c->base_table);
  }

  c->index = 0;

  /* Cleanup rest of bgp_channel starting at pool field */
  const size_t tail_offset = OFFSETOF(struct bgp_channel, pool);
  memset(&(c->pool), 0, sizeof(*c) - tail_offset);
}
/*
 * Find the channel configuration with address family @afi among the BGP
 * channels of @cf. Returns NULL when there is none.
 */
static inline struct bgp_channel_config *
bgp_find_channel_config(struct bgp_config *cf, u32 afi)
{
  struct bgp_channel_config *cc;

  BGP_CF_WALK_CHANNELS(cf, cc)
  {
    if (cc->afi == afi)
      return cc;
  }

  return NULL;
}
/*
 * Choose the default IGP table of network type @type for channel @cc.
 * Candidates are tried in this order: the channel's own table, the IGP table
 * configured in the paired channel, the paired channel's own table, and the
 * global default table of that type. Fails with cf_error() if none fits.
 */
struct rtable_config *
bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 type)
{
  /* First, try table connected by the channel */
  struct rtable_config *own = cc->c.table;
  if (own->addr_type == type)
    return own;

  /* Find paired channel with the same SAFI but the other AFI */
  struct bgp_channel_config *paired = bgp_find_channel_config(cf, cc->afi ^ 0x30000);

  if (paired)
  {
    /* Second, try IGP table configured in the paired channel */
    struct rtable_config *igp = (type == NET_IP4) ? paired->igp_table_ip4 : paired->igp_table_ip6;
    if (igp)
      return igp;

    /* Third, try table connected by the paired channel */
    if (paired->c.table->addr_type == type)
      return paired->c.table;
  }

  /* Last, try default table of given type */
  struct rtable_config *fallback = rt_get_default_table(cf->c.global, type);
  if (fallback)
    return fallback;

  cf_error("Undefined IGP table");
}
/*
 * Choose the default base table for channel @cc: the table of the unicast
 * channel with the same AFI if it has the expected network type, otherwise
 * the global default table of that type. Fails with cf_error() if neither
 * exists.
 */
static struct rtable_config *
bgp_default_base_table(struct bgp_config *cf, struct bgp_channel_config *cc)
{
  /* Expected table type */
  u32 type;
  if (cc->afi == BGP_AF_FLOW4)
    type = NET_IP4;
  else
    type = NET_IP6;

  /* First, try appropriate IP channel */
  struct bgp_channel_config *ip_cc =
    bgp_find_channel_config(cf, BGP_AF(BGP_AFI(cc->afi), BGP_SAFI_UNICAST));

  if (ip_cc && (ip_cc->c.table->addr_type == type))
    return ip_cc->c.table;

  /* Last, try default table of given type */
  struct rtable_config *fallback = rt_get_default_table(cf->c.global, type);
  if (!fallback)
    cf_error("Undefined base table");

  return fallback;
}
/*
 * Post-configuration hook of the BGP protocol. Fills in defaults that depend
 * on other options (multihop, LLGR mode, link check, per-channel settings)
 * and validates the resulting configuration, reporting problems by
 * cf_error(), cf_warn() or a logged warning. Templates are not checked.
 */
void
bgp_postconfig(struct proto_config *CF)
{
  struct bgp_config *cf = (void *) CF;

  /* Do not check templates at all */
  if (cf->c.class == SYM_TEMPLATE)
    return;

  /* Handle undefined remote_as, zero should mean unspecified external */
  if (!cf->remote_as && (cf->peer_type == BGP_PT_INTERNAL))
    cf->remote_as = cf->local_as;

  int is_internal = (cf->local_as == cf->remote_as);
  int is_interior = is_internal || cf->confederation_member;

  /* EBGP direct by default, IBGP multihop by default */
  if (cf->multihop < 0)
    cf->multihop = is_internal ? 64 : 0;

  /* LLGR mode default based on GR mode */
  if (cf->llgr_mode < 0)
    cf->llgr_mode = cf->gr_mode ? BGP_LLGR_AWARE : 0;

  /* Link check for single-hop BGP by default */
  if (cf->check_link < 0)
    cf->check_link = !cf->multihop;


  /* Mandatory options and session type */

  if (!cf->local_as)
    cf_error("Local AS number must be set");

  if (ipa_zero(cf->remote_ip) && !cf->remote_range)
    cf_error("Neighbor must be configured");

  if (ipa_zero(cf->local_ip) && cf->strict_bind)
    cf_error("Local address must be configured for strict bind");

  if (!cf->remote_as && !cf->peer_type)
    cf_error("Remote AS number (or peer type) must be set");

  if ((cf->peer_type == BGP_PT_INTERNAL) && !is_internal)
    cf_error("IBGP cannot have different ASNs");

  if ((cf->peer_type == BGP_PT_EXTERNAL) && is_internal)
    cf_error("EBGP cannot have the same ASNs");

  if (!cf->iface && (ipa_is_link_local(cf->local_ip) ||
		     ipa_is_link_local(cf->remote_ip)))
    cf_error("Link-local addresses require defined interface");

  if (!(cf->capabilities && cf->enable_as4) && (cf->remote_as > 0xFFFF))
    cf_error("Neighbor AS number out of range (AS4 not available)");

  if (!is_internal && cf->rr_client)
    cf_error("Only internal neighbor can be RR client");

  if (is_internal && cf->rs_client)
    cf_error("Only external neighbor can be RS client");

  if (is_internal && (cf->local_role != BGP_ROLE_UNDEFINED))
    cf_error("Local role cannot be set on IBGP sessions");

  if (is_interior && (cf->local_role != BGP_ROLE_UNDEFINED))
    log(L_WARN "BGP roles are not recommended to be used within AS confederations");


  /* Required capabilities versus enabled ones */

  if (cf->require_enhanced_refresh && !(cf->enable_refresh && cf->enable_enhanced_refresh))
    cf_warn("Enhanced refresh required but disabled");

  if (cf->require_as4 && !cf->enable_as4)
    cf_warn("AS4 support required but disabled");

  if (cf->require_extended_messages && !cf->enable_extended_messages)
    cf_warn("Extended messages required but not enabled");

  if (cf->require_gr && !cf->gr_mode)
    cf_warn("Graceful restart required but not enabled");

  if (cf->require_llgr && !cf->llgr_mode)
    cf_warn("Long-lived graceful restart required but not enabled");

  if (cf->require_roles && (cf->local_role == BGP_ROLE_UNDEFINED))
    cf_error("Local role must be set if roles are required");


  /* Confederations, multihop, graceful restart */

  if (!cf->confederation && cf->confederation_member)
    cf_error("Confederation ID must be set for member sessions");

  if (cf->multihop && (ipa_is_link_local(cf->local_ip) ||
		       ipa_is_link_local(cf->remote_ip)))
    cf_error("Multihop BGP cannot be used with link-local addresses");

  if (cf->multihop && cf->iface)
    cf_error("Multihop BGP cannot be bound to interface");

  if (cf->multihop && cf->check_link)
    cf_error("Multihop BGP cannot depend on link state");

  if (cf->multihop && cf->bfd && ipa_zero(cf->local_ip))
    cf_error("Multihop BGP with BFD requires specified local address");

  if (!cf->gr_mode && cf->llgr_mode)
    cf_error("Long-lived graceful restart requires basic graceful restart");

  if (is_internal && cf->enforce_first_as)
    cf_error("Enforce first AS check is requires EBGP sessions");


  /* Timers */

  if (cf->keepalive_time > cf->hold_time)
    cf_error("Keepalive time must be at most hold time");

  if (cf->keepalive_time > (cf->hold_time / 2))
    log(L_WARN "Keepalive time should be at most 1/2 of hold time");

  if (cf->min_hold_time > cf->hold_time)
    cf_error("Min hold time (%u) exceeds hold time (%u)",
	     cf->min_hold_time, cf->hold_time);

  /* Unset keepalive time defaults to one third of the hold time */
  uint eff_keepalive = cf->keepalive_time ?: cf->hold_time / 3;
  if (cf->min_keepalive_time > eff_keepalive)
    cf_error("Min keepalive time (%u) exceeds keepalive time (%u)",
	     cf->min_keepalive_time, eff_keepalive);


  struct bgp_channel_config *cc;
  BGP_CF_WALK_CHANNELS(cf, cc)
  {
    /* Handle undefined import filter */
    if (cc->c.in_filter == FILTER_UNDEF)
    {
      if (!is_interior)
	cf_error("EBGP requires explicit import policy");

      cc->c.in_filter = FILTER_ACCEPT;
    }

    /* Handle undefined export filter */
    if (cc->c.out_filter == FILTER_UNDEF)
    {
      if (!is_interior)
	cf_error("EBGP requires explicit export policy");

      cc->c.out_filter = FILTER_REJECT;
    }

    /* Disable after error incompatible with restart limit action */
    if ((cc->c.in_limit.action == PLA_RESTART) && cf->disable_after_error)
      cc->c.in_limit.action = PLA_DISABLE;

    /* Different default based on rr_client, rs_client */
    if (cc->next_hop_keep == 0xff)
    {
      if (cf->rr_client)
	cc->next_hop_keep = NH_IBGP;
      else if (cf->rs_client)
	cc->next_hop_keep = NH_ALL;
      else
	cc->next_hop_keep = NH_NO;
    }

    /* Different default for gw_mode */
    if (!cc->gw_mode)
      cc->gw_mode = cf->multihop ? GW_RECURSIVE : GW_DIRECT;

    /* Different default for next_hop_prefer */
    if (!cc->next_hop_prefer)
      cc->next_hop_prefer = (cc->gw_mode == GW_DIRECT) ? NHP_GLOBAL : NHP_LOCAL;

    /* Defaults based on proto config */
    if (cc->gr_able == 0xff)
      cc->gr_able = (cf->gr_mode == BGP_GR_ABLE);

    if (cc->llgr_able == 0xff)
      cc->llgr_able = (cf->llgr_mode == BGP_LLGR_ABLE);

    if (cc->llgr_time == ~0U)
      cc->llgr_time = cf->llgr_time;

    /* AIGP enabled by default on interior sessions */
    if (cc->aigp == 0xff)
      cc->aigp = is_interior;

    /* Default values of IGP tables */
    if ((cc->gw_mode == GW_RECURSIVE) && !cc->desc->no_igp)
    {
      if (!cc->igp_table_ip4 && (bgp_cc_is_ipv4(cc) || cc->ext_next_hop))
	cc->igp_table_ip4 = bgp_default_igp_table(cf, cc, NET_IP4);

      if (!cc->igp_table_ip6 && (bgp_cc_is_ipv6(cc) || cc->ext_next_hop))
	cc->igp_table_ip6 = bgp_default_igp_table(cf, cc, NET_IP6);

      if (cc->igp_table_ip4 && bgp_cc_is_ipv6(cc) && !cc->ext_next_hop)
	cf_error("Mismatched IGP table type");

      if (cc->igp_table_ip6 && bgp_cc_is_ipv4(cc) && !cc->ext_next_hop)
	cf_error("Mismatched IGP table type");
    }

    /* Default value of base table */
    if ((BGP_SAFI(cc->afi) == BGP_SAFI_FLOW) && cc->validate && !cc->base_table)
      cc->base_table = bgp_default_base_table(cf, cc);

    if (cc->base_table && !cc->base_table->trie_used)
      cf_error("Flowspec validation requires base table (%s) with trie",
	       cc->base_table->name);

    if (cf->multihop && (cc->gw_mode == GW_DIRECT))
      cf_error("Multihop BGP cannot use direct gateway mode");

    if ((cc->gw_mode == GW_RECURSIVE) && cc->c.table->sorted)
      cf_error("BGP in recursive mode prohibits sorted table");

    if (cf->deterministic_med && cc->c.table->sorted)
      cf_error("BGP with deterministic MED prohibits sorted table");

    if (cc->secondary && !cc->c.table->sorted)
      cf_error("BGP with secondary option requires sorted table");

    if (cc->require_ext_next_hop && !cc->ext_next_hop)
      cf_warn("Extended next hop required but not enabled");

    if (cc->require_add_path && !cc->add_path)
      cf_warn("ADD-PATH required but not enabled");
  }
}
static int
bgp_reconfigure(struct proto *P, struct proto_config *CF)
{
struct bgp_proto *p = (void *) P;
const struct bgp_config *new = (void *) CF;
const struct bgp_config *old = p->cf;
if (proto_get_router_id(CF) != p->local_id)
return 0;
int same = !memcmp(((byte *) old) + sizeof(struct proto_config),
((byte *) new) + sizeof(struct proto_config),
// password item is last and must be checked separately
OFFSETOF(struct bgp_config, password) - sizeof(struct proto_config))
&& !bstrcmp(old->password, new->password)
&& ((!old->remote_range && !new->remote_range)
|| (old->remote_range && new->remote_range && net_equal(old->remote_range, new->remote_range)))
&& !bstrcmp(old->dynamic_name, new->dynamic_name)
&& (old->dynamic_name_digits == new->dynamic_name_digits);
/* FIXME: Move channel reconfiguration to generic protocol code ? */
struct channel *C, *C2;
struct bgp_channel_config *cc;
WALK_LIST(C, p->p.channels)
C->stale = 1;
/* Reconfigure BGP channels */
BGP_CF_WALK_CHANNELS(new, cc)
{
C = (struct channel *) bgp_find_channel(p, cc->afi);
same = proto_configure_channel(P, &C, &cc->c) && same;
}
/* Reconfigure MPLS channel */
same = proto_configure_mpls_channel(P, CF, RTS_BGP) && same;
WALK_LIST_DELSAFE(C, C2, p->p.channels)
if (C->stale)
same = proto_configure_channel(P, &C, NULL) && same;
/* Reset name counter */
p->dynamic_name_counter = 0;
if (!same)
return 0;
/* We should update our copy of configuration ptr as old configuration will be freed */
p->cf = new;
/* Check whether existing connections are compatible with required capabilities */
struct bgp_conn *ci = &p->incoming_conn;
if (((ci->state == BS_OPENCONFIRM) || (ci->state == BS_ESTABLISHED)) && !bgp_check_capabilities(ci))
return 0;
struct bgp_conn *co = &p->outgoing_conn;
if (((co->state == BS_OPENCONFIRM) || (co->state == BS_ESTABLISHED)) && !bgp_check_capabilities(co))
return 0;
if (bgp_start_state(p) > BSS_PREPARE)
bgp_update_bfd(p, new->bfd);
return 1;
}
2024-01-22 05:29:14 +00:00
/* Table referenced by the config item NAME of cf, or NULL when the item is unset */
#define TABLE(cf, NAME) ((cf)->NAME ? (cf)->NAME->table : NULL)
static int
bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *import_changed, int *export_changed)
{
struct bgp_proto *p = (void *) C->proto;
struct bgp_channel *c = (void *) C;
struct bgp_channel_config *new = (void *) CC;
struct bgp_channel_config *old = c->cf;
if ((new->secondary != old->secondary) ||
(new->validate != old->validate) ||
(new->gr_able != old->gr_able) ||
(new->llgr_able != old->llgr_able) ||
(new->llgr_time != old->llgr_time) ||
(new->ext_next_hop != old->ext_next_hop) ||
(new->add_path != old->add_path) ||
(new->export_table != old->export_table) ||
(TABLE(new, igp_table_ip4) != TABLE(old, igp_table_ip4)) ||
(TABLE(new, igp_table_ip6) != TABLE(old, igp_table_ip6)) ||
(TABLE(new, base_table) != TABLE(old, base_table)))
return 0;
if (new->mandatory && !old->mandatory && (C->channel_state != CS_UP))
return 0;
2019-09-28 12:17:20 +00:00
if ((new->gw_mode != old->gw_mode) ||
(new->next_hop_prefer != old->next_hop_prefer) ||
2019-09-28 12:17:20 +00:00
(new->aigp != old->aigp) ||
(new->cost != old->cost))
{
/* If import table is active and route refresh is possible, we just ask for route refresh */
if ((c->c.in_keep & RIK_PREFILTER) && (c->c.channel_state == CS_UP) && p->route_refresh)
bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH);
/* Otherwise we do complete reload */
else
*import_changed = 1;
}
if (!ipa_equal(new->next_hop_addr, old->next_hop_addr) ||
(new->next_hop_self != old->next_hop_self) ||
(new->next_hop_keep != old->next_hop_keep) ||
2019-09-28 12:17:20 +00:00
(new->aigp != old->aigp) ||
(new->aigp_originate != old->aigp_originate))
*export_changed = 1;
/* Update prefix exporter settle timer */
if (c->tx)
c->tx->exporter.journal.announce_timer.cf = c->cf->ptx_exporter_settle;
c->cf = new;
return 1;
}
/*
 * Copy-config hook: give @dest its own copy of the BFD options of @src,
 * allocated by cfg_alloc(). Nothing is done when @src has no BFD options.
 */
static void
bgp_copy_config(struct proto_config *dest, struct proto_config *src)
{
  struct bgp_config *d = (void *) dest;
  struct bgp_config *s = (void *) src;

  if (!s->bfd)
    return;

  /* Copy BFD options */
  struct bfd_options *copy = cfg_alloc(sizeof(struct bfd_options));
  memcpy(copy, s->bfd, sizeof(struct bfd_options));
  d->bfd = copy;
}
/**
 * bgp_error - report a protocol error
 * @c: connection
 * @code: error code (according to the RFC)
 * @subcode: error sub-code
 * @data: data to be passed in the Notification message
 * @len: length of the data
 *
 * bgp_error() sends a notification packet to tell the other side that a protocol
 * error has occurred (including the data considered erroneous if possible) and
 * closes the connection.
 *
 * When @len is not positive, no data are attached to the notification, but
 * the absolute value of @len is still used as the data length for logging.
 * Nothing is done if the connection is already in the Close state. Unless
 * @code is 6, the startup delay is updated and the whole session is stopped.
 */
void
bgp_error(struct bgp_conn *c, uint code, uint subcode, byte *data, int len)
{
  struct bgp_proto *p = c->bgp;

  if (c->state == BS_CLOSE)
    return;

  bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, ABS(len));
  bgp_store_error(p, c, BE_BGP_TX, (code << 16) | subcode);

  /* Remember what to put into the NOTIFICATION packet */
  c->notify_code = code;
  c->notify_subcode = subcode;
  c->notify_data = data;
  if (len > 0)
    c->notify_size = len;
  else
    c->notify_size = 0;

  bgp_conn_enter_close_state(c);
  bgp_schedule_packet(c, NULL, PKT_NOTIFICATION);

  if (code == 6)
    return;

  bgp_update_startup_delay(p);
  bgp_stop(p, 0, NULL, 0);
}
/**
 * bgp_store_error - store last error for status report
 * @p: BGP instance
 * @c: connection
 * @class: error class (BE_xxx constants)
 * @code: error code (class specific)
 *
 * bgp_store_error() decides whether the given error is interesting enough
 * and, if so, stores it to the last_error variables of @p.
 */
void
bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code)
{
  const int ps = p->p.proto_state;

  /* During PS_UP, we ignore errors on secondary connection */
  const int secondary = (ps == PS_UP) && c && (c != p->conn);

  /*
   * During PS_STOP, we ignore any errors, as we want to report
   * the error that caused transition to PS_STOP
   */
  if (secondary || (ps == PS_STOP))
    return;

  p->last_error_class = class;
  p->last_error_code = code;
}
/* Names of connection states, indexed by the state number in bgp_state_dsc() */
static char *bgp_state_names[] = {
  "Idle", "Connect", "Active", "OpenSent", "OpenConfirm", "Established", "Close"
};

/* Status prefixes, indexed by last_error_class in bgp_get_status() */
static char *bgp_err_classes[] = {
  "", "Error: ", "Socket: ", "Received: ", "BGP Error: ", "Automatic shutdown: ", ""
};

/* Messages for BE_MISC errors, indexed by last_error_code in bgp_last_errmsg() */
static char *bgp_misc_errors[] = {
  "", "Neighbor lost", "Invalid next hop", "Kernel MD5 auth failed",
  "No listening socket", "Link down", "BFD session down", "Graceful restart"
};

/* Messages for BE_AUTO_DOWN errors, indexed by last_error_code in bgp_last_errmsg() */
static char *bgp_auto_errors[] = {
  "", "Route limit exceeded"
};

static char *bgp_gr_states[] = {
  "None", "Regular", "Long-lived"
};
static const char *
bgp_last_errmsg(struct bgp_proto *p)
2000-03-30 18:44:23 +00:00
{
switch (p->last_error_class)
{
case BE_MISC:
return bgp_misc_errors[p->last_error_code];
case BE_SOCKET:
return (p->last_error_code == 0) ? "Connection closed" : strerror(p->last_error_code);
case BE_BGP_RX:
case BE_BGP_TX:
return bgp_error_dsc(p->last_error_code >> 16, p->last_error_code & 0xFF);
case BE_AUTO_DOWN:
return bgp_auto_errors[p->last_error_code];
default:
return "";
}
}
/*
 * Return a short name of the session state of @p: "Down" or "Flush" for the
 * corresponding protocol states, "Passive" for an idle passive session that
 * already reached BSS_CONNECT, otherwise the name of the more advanced of the
 * two connection states.
 */
static const char *
bgp_state_dsc(struct bgp_proto *p)
{
  switch (p->p.proto_state)
  {
  case PS_DOWN_XX:
    return "Down";

  case PS_FLUSH:
    return "Flush";

  default:
    break;
  }

  int state = p->incoming_conn.state;
  if (p->outgoing_conn.state > state)
    state = p->outgoing_conn.state;

  if ((state == BS_IDLE) && (bgp_start_state(p) >= BSS_CONNECT) && p->passive)
    return "Passive";

  return bgp_state_names[state];
}
/*
 * get_status hook: format a one-line status of protocol @P into @buf.
 *
 * The line consists of the error class prefix and the last error message.
 * Unless the protocol is in PS_DOWN_XX, it is preceded by the state name
 * left-aligned in a 14-character column.
 */
static void
bgp_get_status(struct proto *P, byte *buf)
{
  struct bgp_proto *p = (struct bgp_proto *) P;
  const char *class_prefix = bgp_err_classes[p->last_error_class];
  const char *message = bgp_last_errmsg(p);

  if (P->proto_state != PS_DOWN_XX)
  {
    bsprintf(buf, "%-14s%s%s", bgp_state_dsc(p), class_prefix, message);
    return;
  }

  bsprintf(buf, "%s%s", class_prefix, message);
}
/*
 * Print one CLI line consisting of label @s followed by the names of the
 * @count address families listed in @afis.
 *
 * @code: CLI reply code passed to cli_msg()
 * @s: text printed at the start of the line
 * @afis: array of AFI/SAFI identifiers
 * @count: number of entries in @afis
 *
 * Families without a descriptor are printed numerically as <afi/safi>.
 */
static void
bgp_show_afis(int code, char *s, u32 *afis, uint count)
{
  buffer b;
  STACK_BUFFER_INIT(b, CLI_MSG_SIZE);

  buffer_puts(&b, s);

  for (u32 *af = afis; af < (afis + count); af++)
  {
    const struct bgp_af_desc *desc = bgp_get_af_desc(*af);

    if (desc)
      buffer_print(&b, " %s", desc->name);
    else
      buffer_print(&b, " <%u/%u>", BGP_AFI(*af), BGP_SAFI(*af));
  }

  /*
   * A write position at the very end of the buffer is treated as "did not
   * fit"; the tail of the line is then overwritten with a marker.
   */
  if (b.pos == b.end)
    strcpy(b.end - 32, " ... <too long>");

  /*
   * The assembled text is data, not a format string: print it through "%s"
   * so that a '%' in it can never be interpreted as a conversion.
   */
  cli_msg(code, "%s", b.start);
}
/*
 * Return a printable name of BGP role @role.
 *
 * BGP_ROLE_UNDEFINED is reported as "undefined"; a value that has no entry
 * in the name table is reported as "?".
 */
const char *
bgp_format_role_name(u8 role)
{
  static const char *role_names[] = { "provider", "rs_server", "rs_client", "customer", "peer" };

  if (role == BGP_ROLE_UNDEFINED)
    return "undefined";

  return (role < ARRAY_SIZE(role_names)) ? role_names[role] : "?";
}
/*
 * Print the capability set @caps to the CLI, one line per capability,
 * with per-address-family details where the capability has them.
 *
 * @p: BGP instance (unused)
 * @caps: capabilities to print
 */
static void
bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)
{
  struct bgp_af_caps *ac;

  /* Two scratch AF lists, each large enough for every AF present in @caps */
  u32 *list_a = alloca(caps->af_count * sizeof(u32));
  u32 *list_b = alloca(caps->af_count * sizeof(u32));
  uint len_a, len_b;

  /* First pass: find out which per-AF capabilities are set on any AF */
  uint have_mp_bgp = 0;
  uint have_gr_able = 0;
  uint have_add_path = 0;
  uint have_ext_next_hop = 0;
  uint have_llgr_able = 0;

  WALK_AF_CAPS(caps, ac)
  {
    have_mp_bgp |= ac->ready;
    have_gr_able |= ac->gr_able;
    have_add_path |= ac->add_path;
    have_ext_next_hop |= ac->ext_next_hop;
    have_llgr_able |= ac->llgr_able;
  }

  if (have_mp_bgp)
  {
    cli_msg(-1006, " Multiprotocol");

    len_a = 0;
    WALK_AF_CAPS(caps, ac)
    {
      if (ac->ready)
        list_a[len_a++] = ac->afi;
    }

    bgp_show_afis(-1006, " AF announced:", list_a, len_a);
  }

  if (caps->route_refresh)
    cli_msg(-1006, " Route refresh");

  if (have_ext_next_hop)
  {
    cli_msg(-1006, " Extended next hop");

    len_a = 0;
    WALK_AF_CAPS(caps, ac)
    {
      if (ac->ext_next_hop)
        list_a[len_a++] = ac->afi;
    }

    bgp_show_afis(-1006, " IPv6 nexthop:", list_a, len_a);
  }

  if (caps->ext_messages)
    cli_msg(-1006, " Extended message");

  if (caps->gr_aware)
    cli_msg(-1006, " Graceful restart");

  if (have_gr_able)
  {
    /* These lines follow the "Graceful restart" line printed above */
    cli_msg(-1006, " Restart time: %u", caps->gr_time);

    if (caps->gr_flags & BGP_GRF_RESTART)
      cli_msg(-1006, " Restart recovery");

    len_a = 0;
    len_b = 0;
    WALK_AF_CAPS(caps, ac)
    {
      if (ac->gr_able)
        list_a[len_a++] = ac->afi;

      if (ac->gr_af_flags & BGP_GRF_FORWARDING)
        list_b[len_b++] = ac->afi;
    }

    bgp_show_afis(-1006, " AF supported:", list_a, len_a);
    bgp_show_afis(-1006, " AF preserved:", list_b, len_b);
  }

  if (caps->as4_support)
    cli_msg(-1006, " 4-octet AS numbers");

  if (have_add_path)
  {
    cli_msg(-1006, " ADD-PATH");

    len_a = 0;
    len_b = 0;
    WALK_AF_CAPS(caps, ac)
    {
      if (ac->add_path & BGP_ADD_PATH_RX)
        list_a[len_a++] = ac->afi;

      if (ac->add_path & BGP_ADD_PATH_TX)
        list_b[len_b++] = ac->afi;
    }

    bgp_show_afis(-1006, " RX:", list_a, len_a);
    bgp_show_afis(-1006, " TX:", list_b, len_b);
  }

  if (caps->enhanced_refresh)
    cli_msg(-1006, " Enhanced refresh");

  if (caps->llgr_aware)
    cli_msg(-1006, " Long-lived graceful restart");

  if (have_llgr_able)
  {
    /* The reported stale time is the largest llgr_time over all AFs */
    u32 max_stale_time = 0;

    len_a = 0;
    len_b = 0;
    WALK_AF_CAPS(caps, ac)
    {
      max_stale_time = MAX(max_stale_time, ac->llgr_time);

      if (ac->llgr_able && ac->llgr_time)
        list_a[len_a++] = ac->afi;

      if (ac->llgr_flags & BGP_GRF_FORWARDING)
        list_b[len_b++] = ac->afi;
    }

    /* These lines follow the "Long-lived graceful restart" line printed above */
    cli_msg(-1006, " LL stale time: %u", max_stale_time);

    bgp_show_afis(-1006, " AF supported:", list_a, len_a);
    bgp_show_afis(-1006, " AF preserved:", list_b, len_b);
  }

  if (caps->hostname)
    cli_msg(-1006, " Hostname: %s", caps->hostname);

  if (caps->role != BGP_ROLE_UNDEFINED)
    cli_msg(-1006, " Role: %s", bgp_format_role_name(caps->role));
}
static void
bgp_show_proto_info(struct proto *P)
{
struct bgp_proto *p = (struct bgp_proto *) P;
cli_msg(-1006, " BGP state: %s", bgp_state_dsc(p));
if (bgp_is_dynamic(p) && p->cf->remote_range)
cli_msg(-1006, " Neighbor range: %N", p->cf->remote_range);
else
cli_msg(-1006, " Neighbor address: %I%J", p->remote_ip, p->cf->iface);
if ((p->conn == &p->outgoing_conn) && (p->cf->remote_port != BGP_PORT))
cli_msg(-1006, " Neighbor port: %u", p->cf->remote_port);
cli_msg(-1006, " Neighbor AS: %u", p->remote_as);
cli_msg(-1006, " Local AS: %u", p->cf->local_as);
if (p->gr_active_num)
cli_msg(-1006, " Neighbor graceful restart active");
if (P->proto_state == PS_START)
{
struct bgp_conn *oc = &p->outgoing_conn;
if ((bgp_start_state(p) < BSS_CONNECT) &&
(tm_active(p->startup_timer)))
cli_msg(-1006, " Error wait: %t/%u",
tm_remains(p->startup_timer), p->startup_delay);
if ((oc->state == BS_ACTIVE) &&
(tm_active(oc->connect_timer)))
cli_msg(-1006, " Connect delay: %t/%u",
tm_remains(oc->connect_timer), p->cf->connect_delay_time);
if (p->gr_active_num && tm_active(p->gr_timer))
cli_msg(-1006, " Restart timer: %t/-",
tm_remains(p->gr_timer));
}
else if (P->proto_state == PS_UP)
{
cli_msg(-1006, " Neighbor ID: %R", p->remote_id);
cli_msg(-1006, " Local capabilities");
bgp_show_capabilities(p, p->conn->local_caps);
cli_msg(-1006, " Neighbor capabilities");
bgp_show_capabilities(p, p->conn->remote_caps);
2017-12-09 23:55:34 +00:00
cli_msg(-1006, " Session: %s%s%s%s%s",
p->is_internal ? "internal" : "external",
p->cf->multihop ? " multihop" : "",
p->rr_client ? " route-reflector" : "",
p->rs_client ? " route-server" : "",
p->as4_session ? " AS4" : "");
cli_msg(-1006, " Source address: %I", p->local_ip);
cli_msg(-1006, " Hold timer: %t/%u",
tm_remains(p->conn->hold_timer), p->conn->hold_time);
cli_msg(-1006, " Keepalive timer: %t/%u",
tm_remains(p->conn->keepalive_timer), p->conn->keepalive_time);
cli_msg(-1006, " TX pending: %d bytes",
p->conn->sk->tpos - p->conn->sk->ttx);
cli_msg(-1006, " Send hold timer: %t/%u",
tm_remains(p->conn->send_hold_timer), p->conn->send_hold_time);
}
#if 0
struct bgp_stats *s = &p->stats;
cli_msg(-1006, " FSM established transitions: %u",
s->fsm_established_transitions);
cli_msg(-1006, " Rcvd messages: %u total / %u updates / %lu bytes",
s->rx_messages, s->rx_updates, s->rx_bytes);
cli_msg(-1006, " Sent messages: %u total / %u updates / %lu bytes",
s->tx_messages, s->tx_updates, s->tx_bytes);
cli_msg(-1006, " Last rcvd update elapsed time: %t s",
p->last_rx_update ? (current_time() - p->last_rx_update) : 0);
#endif
if ((p->last_error_class != BE_NONE) &&
(p->last_error_class != BE_MAN_DOWN))
{
const char *err1 = bgp_err_classes[p->last_error_class];
const char *err2 = bgp_last_errmsg(p);
cli_msg(-1006, " Last error: %s%s", err1, err2);
}
{
struct bgp_channel *c;
WALK_LIST(c, p->p.channels)
{
channel_show_info(&c->c);
if (c->c.class != &channel_bgp)
continue;
if (p->gr_active_num)
cli_msg(-1006, " Neighbor GR: %s", bgp_gr_states[c->gr_active]);
if (c->stale_timer && tm_active(c->stale_timer))
cli_msg(-1006, " LL stale timer: %t/-", tm_remains(c->stale_timer));
2017-12-09 23:55:34 +00:00
if (c->c.channel_state == CS_UP)
{
if (ipa_zero(c->link_addr))
cli_msg(-1006, " BGP Next hop: %I", c->next_hop_addr);
else
cli_msg(-1006, " BGP Next hop: %I %I", c->next_hop_addr, c->link_addr);
}
if (c->igp_table_ip4)
cli_msg(-1006, " IGP IPv4 table: %s", c->igp_table_ip4->name);
if (c->igp_table_ip6)
cli_msg(-1006, " IGP IPv6 table: %s", c->igp_table_ip6->name);
if (c->base_table)
cli_msg(-1006, " Base table: %s", c->base_table->name);
if (!c->tx)
continue;
BGP_PTX_LOCK(c->tx, tx);
uint bucket_cnt = 0;
uint prefix_cnt = 0;
struct bgp_bucket *buck;
struct bgp_prefix *px;
WALK_LIST(buck, tx->bucket_queue)
{
bucket_cnt++;
WALK_LIST(px, buck->prefixes)
if (px->cur)
prefix_cnt++;
}
cli_msg(-1006, " Pending %u attribute sets with total %u prefixes to send",
bucket_cnt, prefix_cnt);
}
}
2000-03-30 18:44:23 +00:00
}
/* Channel class of BGP channels: structure sizes and channel hooks */
const struct channel_class channel_bgp = {
  .channel_size = sizeof(struct bgp_channel),
  .config_size  = sizeof(struct bgp_channel_config),
  .init         = bgp_channel_init,
  .start        = bgp_channel_start,
  .shutdown     = bgp_channel_shutdown,
  .cleanup      = bgp_channel_cleanup,
  .reconfigure  = bgp_channel_reconfigure,
};
2000-03-19 22:09:07 +00:00
/* Protocol description of BGP: naming, defaults, structure sizes and hooks */
struct protocol proto_bgp = {
  .name = "BGP",
  .template = "bgp%d",
  .preference = DEF_PREF_BGP,
  .channel_mask = NB_IP | NB_VPN | NB_FLOW | NB_MPLS,
  .proto_size = sizeof(struct bgp_proto),
  .config_size = sizeof(struct bgp_config),
  .postconfig = bgp_postconfig,
  .init = bgp_init,
  .start = bgp_start,
  .shutdown = bgp_shutdown,
  .reconfigure = bgp_reconfigure,
  .copy_config = bgp_copy_config,
  .get_status = bgp_get_status,
  .show_proto_info = bgp_show_proto_info,
};
/*
 * Set up the BGP protocol: pass proto_bgp to proto_build(), register the
 * BGP attributes, and create the domain and resource pool used for BGP
 * listening sockets.
 */
void
bgp_build(void)
{
  proto_build(&proto_bgp);
  bgp_register_attrs();

  bgp_listen_domain = DOMAIN_NEW(rtable);

  /* The listen socket pool is created while its domain is locked */
  LOCK_DOMAIN(rtable, bgp_listen_domain);
  bgp_listen_pool = rp_new(proto_pool, bgp_listen_domain.rtable, "BGP Listen Sockets");
  UNLOCK_DOMAIN(rtable, bgp_listen_domain);
}