From 8bcb5fb1e8a0718f88f99cde2f5b5a3bae5c4451 Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Mon, 2 Mar 2015 09:41:14 +0100
Subject: [PATCH 01/12] Implement latency tracking, internal event log and
 watchdog

---
 conf/conf.h           |   4 +
 doc/bird.conf.example |  10 ++-
 doc/bird.sgml         |  17 +++++
 lib/event.c           |   7 ++
 nest/config.Y         |   2 +
 sysdep/unix/config.Y  |  13 ++++
 sysdep/unix/io.c      | 170 ++++++++++++++++++++++++++++++++++++++++++
 sysdep/unix/main.c    |   8 ++
 sysdep/unix/unix.h    |   5 +-
 9 files changed, 232 insertions(+), 4 deletions(-)
diff --git a/conf/conf.h b/conf/conf.h
index 00a8c8f2..6ab53e25 100644
--- a/conf/conf.h
+++ b/conf/conf.h
@@ -41,6 +41,10 @@ struct config {
   u32 gr_wait;				/* Graceful restart wait timeout */
 
   int cli_debug;			/* Tracing of CLI connections and commands */
+  int latency_debug;			/* I/O loop tracks duration of each event */
+  u32 latency_limit;			/* Events with longer duration are logged (us) */
+  u32 watchdog_warning;			/* I/O loop watchdog limit for warning (us) */
+  u32 watchdog_timeout;			/* Watchdog timeout (in seconds, 0 = disabled) */
   char *err_msg;			/* Parser error message */
   int err_lino;				/* Line containing error */
   char *err_file_name;			/* File name containing error */
diff --git a/doc/bird.conf.example b/doc/bird.conf.example
index dcc62e29..bbfe0020 100644
--- a/doc/bird.conf.example
+++ b/doc/bird.conf.example
@@ -33,6 +33,10 @@
 # Turn on global debugging of all protocols
 #debug protocols all;
 
+# Turn on internal watchdog
+#watchdog warning 5 s;
+#watchdog timeout 30 s;
+
 # The direct protocol automatically generates device routes to
 # all network interfaces. Can exist in as many instances as you wish
 # if you want to populate multiple routing tables with device routes.
@@ -162,7 +166,7 @@ protocol static {
 #               };
 #	};
 #}
-		
+
 
 #protocol bgp {
 #	disabled;
@@ -186,7 +190,7 @@ protocol static {
 #	source address 198.51.100.14;	# What local address we use for the TCP connection
 #	password "secret";	# Password used for MD5 authentication
 #	rr client;		# I am a route reflector and the neighor is my client
-#	rr cluster id 1.0.0.1;	# Use this value for cluster id instead of my router id 
+#	rr cluster id 1.0.0.1;	# Use this value for cluster id instead of my router id
 #	export where source=RTS_STATIC;
 #	export filter {
 #		if source = RTS_STATIC then {
@@ -202,7 +206,7 @@ protocol static {
 #		reject;
 #	};
 #}
-# 
+#
 # Template usage example
 #template bgp rr_client {
 #	disabled;
diff --git a/doc/bird.sgml b/doc/bird.sgml
index 04d8284f..bcf1c8fb 100644
--- a/doc/bird.sgml
+++ b/doc/bird.sgml
@@ -344,6 +344,23 @@ protocol rip {
 	of connects and disconnects, 2 and higher for logging of all client
 	commands). Default: 0.
 
+	<tag>debug latency <m/switch/</tag>
+	Activate tracking of elapsed time for internal events. Recent events
+	can be examined using the <cf/dump events/ command. Default: off.
+
+	<tag>debug latency limit <m/time/</tag>
+	If <cf/debug latency/ is enabled, this option specifies a limit for
+	elapsed time. Events exceeding the limit are logged. Default: 1 s.
+
+	<tag>watchdog warning <m/time/</tag>
+	Set time limit for I/O loop cycle. If one iteration takes more time to
+	complete, a warning is logged. Default: 5 s.
+
+	<tag>watchdog timeout <m/time/</tag>
+	Set time limit for I/O loop cycle. If the limit is breached, BIRD is
+	killed by the abort signal. The timeout has an effective granularity of
+	seconds; zero means disabled. Default: disabled (0).
+
 	<tag>mrtdump "<m/filename/"</tag>
 	Set MRTdump file name. This option must be specified to allow MRTdump
 	feature. Default: no dump file.
diff --git a/lib/event.c b/lib/event.c
index b429c205..c33e0ffc 100644
--- a/lib/event.c
+++ b/lib/event.c
@@ -114,6 +114,8 @@ ev_schedule(event *e)
   ev_enqueue(&global_event_list, e);
 }
 
+void io_log_event(void *hook, void *data);
+
 /**
  * ev_run_list - run an event list
  * @l: an event list
@@ -132,6 +134,11 @@ ev_run_list(event_list *l)
   WALK_LIST_FIRST(n, tmp_list)
     {
       event *e = SKIP_BACK(event, n, n);
+
+      /* This is an ugly hack: we want to log only events executed from the main I/O loop */
+      if (l == &global_event_list)
+	io_log_event(e->hook, e->data);
+
       ev_run(e);
     }
   return !EMPTY_LIST(*l);
diff --git a/nest/config.Y b/nest/config.Y
index 8b697292..8e1e9880 100644
--- a/nest/config.Y
+++ b/nest/config.Y
@@ -621,6 +621,8 @@ CF_CLI(DUMP RESOURCES,,, [[Dump all allocated resource]])
 { rdump(&root_pool); cli_msg(0, ""); } ;
 CF_CLI(DUMP SOCKETS,,, [[Dump open sockets]])
 { sk_dump_all(); cli_msg(0, ""); } ;
+CF_CLI(DUMP EVENTS,,, [[Dump event log]])
+{ io_log_dump(); cli_msg(0, ""); } ;
 CF_CLI(DUMP INTERFACES,,, [[Dump interface information]])
 { if_dump_all(); cli_msg(0, ""); } ;
 CF_CLI(DUMP NEIGHBORS,,, [[Dump neighbor cache]])
diff --git a/sysdep/unix/config.Y b/sysdep/unix/config.Y
index 7fd0ad2d..d6ab8cab 100644
--- a/sysdep/unix/config.Y
+++ b/sysdep/unix/config.Y
@@ -15,6 +15,7 @@ CF_DECLS
 
 CF_KEYWORDS(LOG, SYSLOG, ALL, DEBUG, TRACE, INFO, REMOTE, WARNING, ERROR, AUTH, FATAL, BUG, STDERR, SOFT)
 CF_KEYWORDS(TIMEFORMAT, ISO, OLD, SHORT, LONG, BASE, NAME, CONFIRM, UNDO, CHECK, TIMEOUT)
+CF_KEYWORDS(DEBUG, LATENCY, LIMIT, WATCHDOG, WARNING, TIMEOUT)
 
 %type <i> log_mask log_mask_list log_cat cfg_timeout
 %type <g> log_file
@@ -83,6 +84,7 @@ mrtdump_base:
    }
  ;
 
+
 CF_ADDTO(conf, timeformat_base)
 
 timeformat_which:
@@ -104,6 +106,17 @@ timeformat_base:
    TIMEFORMAT timeformat_spec ';'
  ;
 
+
+CF_ADDTO(conf, debug_unix)
+
+debug_unix:
+   DEBUG LATENCY bool { new_config->latency_debug = $3; }
+ | DEBUG LATENCY LIMIT expr_us { new_config->latency_limit = $4; }
+ | WATCHDOG WARNING expr_us { new_config->watchdog_warning = $3; }
+ | WATCHDOG TIMEOUT expr_us { new_config->watchdog_timeout = ($3 + 999999) TO_S; }
+ ;
+
+
 /* Unix specific commands */
 
 CF_CLI_HELP(CONFIGURE, ..., [[Reload configuration]])
diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c
index daf9d054..bbb87ca9 100644
--- a/sysdep/unix/io.c
+++ b/sysdep/unix/io.c
@@ -332,6 +332,8 @@ tm_first_shot(void)
   return x;
 }
 
+void io_log_event(void *hook, void *data);
+
 static void
 tm_shot(void)
 {
@@ -372,6 +374,7 @@ tm_shot(void)
 	    i = 0;
 	  tm_start(t, i);
 	}
+      io_log_event(t->hook, t->data);
       t->hook(t);
     }
 }
@@ -1839,6 +1842,162 @@ sk_dump_all(void)
 }
 
 
+/*
+ *	Internal event log and watchdog
+ */
+
+#define EVENT_LOG_LENGTH 32
+
+struct event_log_entry
+{
+  void *hook;
+  void *data;
+  btime timestamp;
+  btime duration;
+};
+
+static struct event_log_entry event_log[EVENT_LOG_LENGTH];
+static struct event_log_entry *event_open;
+static int event_log_pos, event_log_num, watchdog_active;
+static btime last_time;
+static btime loop_time;
+
+static void
+io_update_time(void)
+{
+  struct timespec ts;
+  int rv;
+
+  if (!clock_monotonic_available)
+    return;
+
+  /*
+   * This is the third time-tracking procedure (after update_times() above and
+   * times_update() in BFD), dedicated to the internal event log and latency
+   * tracking. Hopefully, we will consolidate these sometime.
+   */
+
+  rv = clock_gettime(CLOCK_MONOTONIC, &ts);
+  if (rv < 0)
+    die("clock_gettime: %m");
+
+  last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000);
+
+  if (event_open)
+  {
+    event_open->duration = last_time - event_open->timestamp;
+
+    if (event_open->duration > config->latency_limit)
+      log(L_WARN "Event 0x%p 0x%p took %d ms",
+	  event_open->hook, event_open->data, (int) (event_open->duration TO_MS));
+
+    event_open = NULL;
+  }
+}
+
+/**
+ * io_log_event - record an upcoming event in the event log
+ * @hook: event hook address
+ * @data: event data address
+ *
+ * Store info (hook, data, timestamp) about the following internal event into
+ * a circular event log (@event_log). When latency tracking is enabled, the log
+ * entry is kept open (in @event_open) so the duration can be filled later.
+ */
+void
+io_log_event(void *hook, void *data)
+{
+  if (config->latency_debug)
+    io_update_time();
+
+  struct event_log_entry *en = event_log + event_log_pos;
+
+  en->hook = hook;
+  en->data = data;
+  en->timestamp = last_time;
+  en->duration = 0;
+
+  event_log_num++;
+  event_log_pos++;
+  event_log_pos %= EVENT_LOG_LENGTH;
+
+  event_open = config->latency_debug ? en : NULL;
+}
+
+static inline void
+io_close_event(void)
+{
+  if (event_open)
+    io_update_time();
+}
+
+void
+io_log_dump(void)
+{
+  int i;
+
+  log(L_DEBUG "Event log:");
+  for (i = 0; i < EVENT_LOG_LENGTH; i++)
+  {
+    struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
+    if (en->hook)
+      log(L_DEBUG "  Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
+	  (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
+  }
+}
+
+void
+watchdog_sigalrm(int sig UNUSED)
+{
+  /* Update last_time and duration, but skip latency check */
+  config->latency_limit = 0xffffffff;
+  io_update_time();
+
+  /* We want core dump */
+  abort();
+}
+
+static inline void
+watchdog_start1(void)
+{
+  io_update_time();
+
+  loop_time = last_time;
+}
+
+static inline void
+watchdog_start(void)
+{
+  io_update_time();
+
+  loop_time = last_time;
+  event_log_num = 0;
+
+  if (config->watchdog_timeout)
+  {
+    alarm(config->watchdog_timeout);
+    watchdog_active = 1;
+  }
+}
+
+static inline void
+watchdog_stop(void)
+{
+  io_update_time();
+
+  if (watchdog_active)
+  {
+    alarm(0);
+    watchdog_active = 0;
+  }
+
+  btime duration = last_time - loop_time;
+  if (duration > config->watchdog_warning)
+    log(L_WARN "I/O loop cycle took %d ms for %d events",
+	(int) (duration TO_MS), event_log_num);
+}
+
+
 /*
  *	Main I/O Loop
  */
@@ -1873,6 +2032,7 @@ io_loop(void)
   sock *s;
   node *n;
 
+  watchdog_start1();
   sock_recalc_fdsets_p = 1;
   for(;;)
     {
@@ -1887,6 +2047,8 @@ io_loop(void)
       timo.tv_sec = events ? 0 : MIN(tout - now, 3);
       timo.tv_usec = 0;
 
+      io_close_event();
+
       if (sock_recalc_fdsets_p)
 	{
 	  sock_recalc_fdsets_p = 0;
@@ -1923,25 +2085,30 @@ io_loop(void)
 
       if (async_config_flag)
 	{
+	  io_log_event(async_config, NULL);
 	  async_config();
 	  async_config_flag = 0;
 	  continue;
 	}
       if (async_dump_flag)
 	{
+	  io_log_event(async_dump, NULL);
 	  async_dump();
 	  async_dump_flag = 0;
 	  continue;
 	}
       if (async_shutdown_flag)
 	{
+	  io_log_event(async_shutdown, NULL);
 	  async_shutdown();
 	  async_shutdown_flag = 0;
 	  continue;
 	}
 
       /* And finally enter select() to find active sockets */
+      watchdog_stop();
       hi = select(hi+1, &rd, &wr, NULL, &timo);
+      watchdog_start();
 
       if (hi < 0)
 	{
@@ -1965,6 +2132,7 @@ io_loop(void)
 		do
 		  {
 		    steps--;
+		    io_log_event(s->rx_hook, s->data);
 		    e = sk_read(s);
 		    if (s != current_sock)
 		      goto next;
@@ -1976,6 +2144,7 @@ io_loop(void)
 		do
 		  {
 		    steps--;
+		    io_log_event(s->tx_hook, s->data);
 		    e = sk_write(s);
 		    if (s != current_sock)
 		      goto next;
@@ -2003,6 +2172,7 @@ io_loop(void)
 	      if ((s->type < SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook)
 		{
 		  count++;
+		  io_log_event(s->rx_hook, s->data);
 		  e = sk_read(s);
 		  if (s != current_sock)
 		      goto next2;
diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c
index 61b306dc..05f7560d 100644
--- a/sysdep/unix/main.c
+++ b/sysdep/unix/main.c
@@ -168,6 +168,9 @@ sysdep_preconfig(struct config *c)
 {
   init_list(&c->logfiles);
 
+  c->latency_limit = UNIX_DEFAULT_LATENCY_LIMIT;
+  c->watchdog_warning = UNIX_DEFAULT_WATCHDOG_WARNING;
+
 #ifdef PATH_IPROUTE_DIR
   read_iproute_table(PATH_IPROUTE_DIR "/rt_protos", "ipp_", 256);
   read_iproute_table(PATH_IPROUTE_DIR "/rt_realms", "ipr_", 256);
@@ -585,6 +588,8 @@ handle_sigterm(int sig UNUSED)
   async_shutdown_flag = 1;
 }
 
+void watchdog_sigalrm(int sig UNUSED);
+
 static void
 signal_init(void)
 {
@@ -600,6 +605,9 @@ signal_init(void)
   sa.sa_handler = handle_sigterm;
   sa.sa_flags = SA_RESTART;
   sigaction(SIGTERM, &sa, NULL);
+  sa.sa_handler = watchdog_sigalrm;
+  sa.sa_flags = 0;
+  sigaction(SIGALRM, &sa, NULL);
   signal(SIGPIPE, SIG_IGN);
 }
 
diff --git a/sysdep/unix/unix.h b/sysdep/unix/unix.h
index 3cee96b4..593978cc 100644
--- a/sysdep/unix/unix.h
+++ b/sysdep/unix/unix.h
@@ -27,8 +27,10 @@ void cmd_reconfig_confirm(void);
 void cmd_reconfig_undo(void);
 void cmd_shutdown(void);
 
-#define UNIX_DEFAULT_CONFIGURE_TIMEOUT 300
+#define UNIX_DEFAULT_CONFIGURE_TIMEOUT	300
 
+#define UNIX_DEFAULT_LATENCY_LIMIT	(1 S_)
+#define UNIX_DEFAULT_WATCHDOG_WARNING	(5 S_)
 
 /* io.c */
 
@@ -99,6 +101,7 @@ volatile int async_shutdown_flag;
 
 void io_init(void);
 void io_loop(void);
+void io_log_dump(void);
 int sk_open_unix(struct birdsock *s, char *name);
 void *tracked_fopen(struct pool *, char *name, char *mode);
 void test_old_bird(char *path);

From af454f9b7c3930a7900e60a7fb608b7de11852aa Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Mon, 2 Mar 2015 09:42:44 +0100
Subject: [PATCH 02/12] Fixes bug in debug dumps

Using 'dump sockets' in IPv6 mode caused a crash due to a mismatched format string.

Thanks to Pavel Tvrdik for noticing it.
---
 sysdep/unix/io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c
index bbb87ca9..b4fec9cd 100644
--- a/sysdep/unix/io.c
+++ b/sysdep/unix/io.c
@@ -1106,7 +1106,7 @@ sk_dump(resource *r)
   sock *s = (sock *) r;
   static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "DEL!" };
 
-  debug("(%s, ud=%p, sa=%08x, sp=%d, da=%08x, dp=%d, tos=%d, ttl=%d, if=%s)\n",
+  debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
 	sk_type_names[s->type],
 	s->data,
 	s->saddr,

From 509aab5debef5b4710d8983da6ef076a226fd7ea Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Mon, 2 Mar 2015 10:58:20 +0100
Subject: [PATCH 03/12] Fixes serious bug in BGP add-path

Temporary rta is reused in BGP, while rta_lookup() breaks it.

Thanks to Alexander Chernikov for analysing the problem.
---
 proto/bgp/packets.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c
index 69646c7d..d34e7c56 100644
--- a/proto/bgp/packets.c
+++ b/proto/bgp/packets.c
@@ -999,7 +999,11 @@ bgp_rte_update(struct bgp_proto *p, ip_addr prefix, int pxlen,
   if (!*a)
     {
       a0->src = *src;
+
+      /* Workaround for rta_lookup() breaking eattrs */
+      ea_list *ea = a0->eattrs;
       *a = rta_lookup(a0);
+      a0->eattrs = ea;
     }
 
   net *n = net_get(p->p.table, prefix, pxlen);

From a5a5a41e2ee51ad6dfef0ab24e07d6d9b16a4215 Mon Sep 17 00:00:00 2001
From: Ondrej Filip <feela@network.cz>
Date: Mon, 9 Mar 2015 23:59:26 +0100
Subject: [PATCH 04/12] Possibility to define unnamed protocols from template
 added.

---
 nest/config.Y | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/nest/config.Y b/nest/config.Y
index 8e1e9880..939bed6a 100644
--- a/nest/config.Y
+++ b/nest/config.Y
@@ -181,6 +181,12 @@ proto_name:
      cf_define_symbol($1, this_proto->class, this_proto);
      this_proto->name = $1->name;
    }
+ | FROM SYM {
+     struct symbol *s = cf_default_name(this_proto->protocol->template, &this_proto->protocol->name_counter);
+     this_proto->name = s->name;
+     if (($2->class != SYM_TEMPLATE) && ($2->class != SYM_PROTO)) cf_error("Template or protocol name expected");
+     proto_copy_config(this_proto, $2->def);
+   }
  | SYM FROM SYM {
      if (($3->class != SYM_TEMPLATE) && ($3->class != SYM_PROTO)) cf_error("Template or protocol name expected");
 

From 9aed29e605334d34d0e6a90fc172ee83d0274ad3 Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Sun, 29 Mar 2015 18:27:13 +0200
Subject: [PATCH 05/12] BGP: Enhanced route refresh (RFC 7313) support

Also hook feed_done is renamed to feed_end.
---
 doc/bird.sgml       |  18 +++---
 nest/proto.c        |   7 ++-
 nest/protocol.h     |   6 +-
 proto/bgp/bgp.c     | 102 +++++++++++++++++++++++++++++++--
 proto/bgp/bgp.h     |  46 ++++++++++++---
 proto/bgp/packets.c | 137 ++++++++++++++++++++++++++++++++++++++------
 sysdep/unix/krt.c   |   4 +-
 7 files changed, 278 insertions(+), 42 deletions(-)

diff --git a/doc/bird.sgml b/doc/bird.sgml
index bcf1c8fb..fc5fc9ae 100644
--- a/doc/bird.sgml
+++ b/doc/bird.sgml
@@ -1804,13 +1804,17 @@ using the following configuration parameters:
 	other means. Default: 0 (no local AS number allowed).
 
 	<tag>enable route refresh <m/switch/</tag>
-	When BGP speaker changes its import filter, it has to re-examine all
-	routes received from its neighbor against the new filter. As these
-	routes might not be available, there is a BGP protocol extension Route
-	Refresh (specified in RFC 2918) that allows BGP speaker to request
-	re-advertisement of all routes from its neighbor. This option specifies
-	whether BIRD advertises this capability and accepts such requests. Even
-	when disabled, BIRD can send route refresh requests. Default: on.
+	After the initial route exchange, BGP protocol uses incremental updates
+	to keep BGP speakers synchronized. Sometimes (e.g., if BGP speaker
+	changes its import filter, or if there is suspicion of inconsistency) it
+	is necessary to do a new complete route exchange. BGP protocol extension
+	Route Refresh (RFC 2918) allows BGP speaker to request re-advertisement
+	of all routes from its neighbor. BGP protocol extension Enhanced Route
+	Refresh (RFC 7313) specifies explicit begin and end for such exchanges,
+	therefore the receiver can remove stale routes that were not advertised
+	during the exchange. This option specifies whether BIRD advertises these
+	capabilities and supports related procedures. Note that even when
+	disabled, BIRD can send route refresh requests. Default: on.
 
 	<tag>graceful restart <m/switch/|aware</tag>
 	When a BGP speaker restarts or crashes, neighbors will discard all
diff --git a/nest/proto.c b/nest/proto.c
index 7339e4f4..44cfb637 100644
--- a/nest/proto.c
+++ b/nest/proto.c
@@ -942,8 +942,8 @@ proto_feed_more(void *P)
       p->export_state = ES_READY;
       proto_log_state_change(p);
 
-      if (p->feed_done)
-	p->feed_done(p);
+      if (p->feed_end)
+	p->feed_end(p);
     }
   else
     {
@@ -976,6 +976,9 @@ proto_schedule_feed(struct proto *p, int initial)
 
   p->attn->hook = initial ? proto_feed_initial : proto_feed_more;
   ev_schedule(p->attn);
+
+  if (p->feed_begin)
+    p->feed_begin(p, initial);
 }
 
 /*
diff --git a/nest/protocol.h b/nest/protocol.h
index f46e0b13..8660cc2c 100644
--- a/nest/protocol.h
+++ b/nest/protocol.h
@@ -179,7 +179,8 @@ struct proto {
    *	   reload_routes   Request protocol to reload all its routes to the core
    *			(using rte_update()). Returns: 0=reload cannot be done,
    *			1= reload is scheduled and will happen (asynchronously).
-   *	   feed_done	Notify protocol about finish of route feeding.
+   *	   feed_begin	Notify protocol about beginning of route feeding.
+   *	   feed_end	Notify protocol about finish of route feeding.
    */
 
   void (*if_notify)(struct proto *, unsigned flags, struct iface *i);
@@ -190,7 +191,8 @@ struct proto {
   void (*store_tmp_attrs)(struct rte *rt, struct ea_list *attrs);
   int (*import_control)(struct proto *, struct rte **rt, struct ea_list **attrs, struct linpool *pool);
   int (*reload_routes)(struct proto *);
-  void (*feed_done)(struct proto *);
+  void (*feed_begin)(struct proto *, int initial);
+  void (*feed_end)(struct proto *);
 
   /*
    *	Routing entry hooks (called only for routes belonging to this protocol):
diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c
index 050f737f..e48b643b 100644
--- a/proto/bgp/bgp.c
+++ b/proto/bgp/bgp.c
@@ -377,6 +377,8 @@ bgp_conn_enter_established_state(struct bgp_conn *conn)
   p->conn = conn;
   p->last_error_class = 0;
   p->last_error_code = 0;
+  p->feed_state = BFS_NONE;
+  p->load_state = BFS_NONE;
   bgp_init_bucket_table(p);
   bgp_init_prefix_table(p, 8);
 
@@ -394,6 +396,12 @@ bgp_conn_enter_established_state(struct bgp_conn *conn)
   if (p->gr_active && (!conn->peer_gr_able || !(conn->peer_gr_aflags & BGP_GRF_FORWARDING)))
     bgp_graceful_restart_done(p);
 
+  /* GR capability implies that neighbor will send End-of-RIB */
+  if (conn->peer_gr_aware)
+    p->load_state = BFS_LOADING;
+
+  /* proto_notify_state() will likely call bgp_feed_begin(), setting p->feed_state */
+
   bgp_conn_set_state(conn, BS_ESTABLISHED);
   proto_notify_state(&p->p, PS_UP);
 }
@@ -504,6 +512,47 @@ bgp_graceful_restart_timeout(timer *t)
   bgp_stop(p, 0);
 }
 
+
+/**
+ * bgp_refresh_begin - start incoming enhanced route refresh sequence
+ * @p: BGP instance
+ *
+ * This function is called when an incoming enhanced route refresh sequence is
+ * started by the neighbor, demarcated by the BoRR packet. The function updates
+ * the load state and starts the routing table refresh cycle. Note that graceful
+ * restart also uses routing table refresh cycle, but RFC 7313 and load states
+ * ensure that these two sequences do not overlap.
+ */
+void
+bgp_refresh_begin(struct bgp_proto *p)
+{
+  if (p->load_state == BFS_LOADING)
+    { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; }
+
+  p->load_state = BFS_REFRESHING;
+  rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook);
+}
+
+/**
+ * bgp_refresh_end - finish incoming enhanced route refresh sequence
+ * @p: BGP instance
+ *
+ * This function is called when an incoming enhanced route refresh sequence is
+ * finished by the neighbor, demarcated by the EoRR packet. The function updates
+ * the load state and ends the routing table refresh cycle. Routes not received
+ * during the sequence are removed by the nest.
+ */
+void
+bgp_refresh_end(struct bgp_proto *p)
+{
+  if (p->load_state != BFS_REFRESHING)
+    { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; }
+
+  p->load_state = BFS_NONE;
+  rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
+}
+
+
 static void
 bgp_send_open(struct bgp_conn *conn)
 {
@@ -514,6 +563,7 @@ bgp_send_open(struct bgp_conn *conn)
   conn->peer_refresh_support = 0;
   conn->peer_as4_support = 0;
   conn->peer_add_path = 0;
+  conn->peer_enhanced_refresh_support = 0;
   conn->peer_gr_aware = 0;
   conn->peer_gr_able = 0;
   conn->peer_gr_time = 0;
@@ -959,16 +1009,56 @@ bgp_reload_routes(struct proto *P)
 }
 
 static void
-bgp_feed_done(struct proto *P)
+bgp_feed_begin(struct proto *P, int initial)
 {
   struct bgp_proto *p = (struct bgp_proto *) P;
-  if (!p->conn || !p->cf->gr_mode || p->p.refeeding)
+
+  /* This should not happen */
+  if (!p->conn)
     return;
 
-  p->send_end_mark = 1;
+  if (initial && p->cf->gr_mode)
+    p->feed_state = BFS_LOADING;
+
+  /* This is a refeed and both sides support enhanced route refresh */
+  if (!initial && p->cf->enable_refresh &&
+      p->conn->peer_enhanced_refresh_support)
+    {
+      /* BoRR must not be sent before End-of-RIB */
+      if (p->feed_state == BFS_LOADING || p->feed_state == BFS_LOADED)
+	return;
+
+      p->feed_state = BFS_REFRESHING;
+      bgp_schedule_packet(p->conn, PKT_BEGIN_REFRESH);
+    }
+}
+
+static void
+bgp_feed_end(struct proto *P)
+{
+  struct bgp_proto *p = (struct bgp_proto *) P;
+
+  /* This should not happen */
+  if (!p->conn)
+    return;
+
+  /* Non-demarcated feed ended, nothing to do */
+  if (p->feed_state == BFS_NONE)
+    return;
+
+  /* Schedule End-of-RIB packet */
+  if (p->feed_state == BFS_LOADING)
+    p->feed_state = BFS_LOADED;
+
+  /* Schedule EoRR packet */
+  if (p->feed_state == BFS_REFRESHING)
+    p->feed_state = BFS_REFRESHED;
+
+  /* Kick TX hook */
   bgp_schedule_packet(p->conn, PKT_UPDATE);
 }
 
+
 static void
 bgp_start_locked(struct object_lock *lock)
 {
@@ -1150,7 +1240,8 @@ bgp_init(struct proto_config *C)
   P->import_control = bgp_import_control;
   P->neigh_notify = bgp_neigh_notify;
   P->reload_routes = bgp_reload_routes;
-  P->feed_done = bgp_feed_done;
+  P->feed_begin = bgp_feed_begin;
+  P->feed_end = bgp_feed_end;
   P->rte_better = bgp_rte_better;
   P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL;
 
@@ -1426,8 +1517,9 @@ bgp_show_proto_info(struct proto *P)
   else if (P->proto_state == PS_UP)
     {
       cli_msg(-1006, "    Neighbor ID:      %R", p->remote_id);
-      cli_msg(-1006, "    Neighbor caps:   %s%s%s%s%s",
+      cli_msg(-1006, "    Neighbor caps:   %s%s%s%s%s%s",
 	      c->peer_refresh_support ? " refresh" : "",
+	      c->peer_enhanced_refresh_support ? " enhanced-refresh" : "",
 	      c->peer_gr_able ? " restart-able" : (c->peer_gr_aware ? " restart-aware" : ""),
 	      c->peer_as4_support ? " AS4" : "",
 	      (c->peer_add_path & ADD_PATH_RX) ? " add-path-rx" : "",
diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h
index 2c2b02b8..f4f21226 100644
--- a/proto/bgp/bgp.h
+++ b/proto/bgp/bgp.h
@@ -103,6 +103,7 @@ struct bgp_conn {
   u8 peer_refresh_support;		/* Peer supports route refresh [RFC2918] */
   u8 peer_as4_support;			/* Peer supports 4B AS numbers [RFC4893] */
   u8 peer_add_path;			/* Peer supports ADD-PATH [draft] */
+  u8 peer_enhanced_refresh_support;	/* Peer supports enhanced refresh [RFC7313] */
   u8 peer_gr_aware;
   u8 peer_gr_able;
   u16 peer_gr_time;
@@ -127,6 +128,8 @@ struct bgp_proto {
   int rs_client;			/* Whether neighbor is RS client of me */
   u8 gr_ready;				/* Neighbor could do graceful restart */
   u8 gr_active;				/* Neighbor is doing graceful restart */
+  u8 feed_state;			/* Feed state (TX) for EoR, RR packets, see BFS_* */
+  u8 load_state;			/* Load state (RX) for EoR, RR packets, see BFS_* */
   struct bgp_conn *conn;		/* Connection we have established */
   struct bgp_conn outgoing_conn;	/* Outgoing connection we're working with */
   struct bgp_conn incoming_conn;	/* Incoming connection we have neither accepted nor rejected yet */
@@ -144,7 +147,6 @@ struct bgp_proto {
   slab *prefix_slab;			/* Slab holding prefix nodes */
   list bucket_queue;			/* Queue of buckets to send */
   struct bgp_bucket *withdraw_bucket;	/* Withdrawn routes */
-  unsigned send_end_mark;		/* End-of-RIB mark scheduled for transmit */
   unsigned startup_delay;		/* Time to delay protocol startup by due to errors */
   bird_clock_t last_proto_error;	/* Time of last error that leads to protocol stop */
   u8 last_error_class; 			/* Error class of last error */
@@ -196,6 +198,8 @@ void bgp_conn_enter_close_state(struct bgp_conn *conn);
 void bgp_conn_enter_idle_state(struct bgp_conn *conn);
 void bgp_handle_graceful_restart(struct bgp_proto *p);
 void bgp_graceful_restart_done(struct bgp_proto *p);
+void bgp_refresh_begin(struct bgp_proto *p);
+void bgp_refresh_end(struct bgp_proto *p);
 void bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code);
 void bgp_stop(struct bgp_proto *p, unsigned subcode);
 
@@ -263,7 +267,8 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi
 #define PKT_UPDATE		0x02
 #define PKT_NOTIFICATION	0x03
 #define PKT_KEEPALIVE		0x04
-#define PKT_ROUTE_REFRESH	0x05
+#define PKT_ROUTE_REFRESH	0x05	/* [RFC2918] */
+#define PKT_BEGIN_REFRESH	0x1e	/* Dummy type for BoRR packet [RFC7313] */
 #define PKT_SCHEDULE_CLOSE	0x1f	/* Used internally to schedule socket close */
 
 /* Attributes */
@@ -306,13 +311,13 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi
 #define BS_MAX			7
 
 /* BGP start states
- * 
+ *
  * Used in PS_START for fine-grained specification of starting state.
  *
- * When BGP protocol is started by core, it goes to BSS_PREPARE. When BGP protocol
- * done what is neccessary to start itself (like acquiring the lock), it goes to BSS_CONNECT.
- * When some connection attempt failed because of option or capability error, it goes to
- * BSS_CONNECT_NOCAP.
+ * When BGP protocol is started by core, it goes to BSS_PREPARE. When BGP
+ * protocol has done what is necessary to start itself (like acquiring the lock),
+ * it goes to BSS_CONNECT.  When some connection attempt failed because of
+ * option or capability error, it goes to BSS_CONNECT_NOCAP.
  */
 
 #define BSS_PREPARE		0	/* Used before ordinary BGP started, i. e. waiting for lock */
@@ -320,6 +325,33 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi
 #define BSS_CONNECT		2	/* Ordinary BGP connecting */
 #define BSS_CONNECT_NOCAP	3	/* Legacy BGP connecting (without capabilities) */
 
+
+/* BGP feed states (TX)
+ *
+ * RFC 4724 specifies that an initial feed should end with End-of-RIB mark.
+ *
+ * RFC 7313 specifies that a route refresh should be demarcated by BoRR and EoRR packets.
+ *
+ * These states (stored in p->feed_state) are used to keep track of these
+ * requirements. When such feed is started, BFS_LOADING / BFS_REFRESHING is
+ * set. When it ends, BFS_LOADED / BFS_REFRESHED is set to schedule End-of-RIB
+ * or EoRR packet. When the packet is sent, the state returns to BFS_NONE.
+ *
+ * Note that when a non-demarcated feed (e.g. plain RFC 4271 initial load
+ * without End-of-RIB or plain RFC 2918 route refresh without BoRR/EoRR
+ * demarcation) is active, BFS_NONE is set.
+ *
+ * BFS_NONE, BFS_LOADING and BFS_REFRESHING are also used as load states (RX)
+ * with corresponding semantics (-, expecting End-of-RIB, expecting EoRR).
+ */
+
+#define BFS_NONE		0	/* No feed or original non-demarcated feed */
+#define BFS_LOADING		1	/* Initial feed active, End-of-RIB planned */
+#define BFS_LOADED		2	/* Loading done, End-of-RIB marker scheduled */
+#define BFS_REFRESHING		3	/* Route refresh (introduced by BoRR) active */
+#define BFS_REFRESHED		4	/* Refresh done, EoRR packet scheduled */
+
+
 /* Error classes */
 
 #define BE_NONE			0
diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c
index d34e7c56..2d2a84b3 100644
--- a/proto/bgp/packets.c
+++ b/proto/bgp/packets.c
@@ -22,6 +22,12 @@
 
 #include "bgp.h"
 
+
+#define BGP_RR_REQUEST		0
+#define BGP_RR_BEGIN		1
+#define BGP_RR_END		2
+
+
 static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
 static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
 
@@ -209,6 +215,15 @@ bgp_put_cap_add_path(struct bgp_proto *p, byte *buf)
   return buf;
 }
 
+static byte *
+bgp_put_cap_err(struct bgp_proto *p UNUSED, byte *buf)
+{
+  *buf++ = 70;		/* Capability 70: Support for enhanced route refresh */
+  *buf++ = 0;		/* Capability data length */
+  return buf;
+}
+
+
 static byte *
 bgp_create_open(struct bgp_conn *conn, byte *buf)
 {
@@ -256,6 +271,9 @@ bgp_create_open(struct bgp_conn *conn, byte *buf)
   if (p->cf->add_path)
     cap = bgp_put_cap_add_path(p, cap);
 
+  if (p->cf->enable_refresh)
+    cap = bgp_put_cap_err(p, cap);
+
   cap_len = cap - buf - 12;
   if (cap_len > 0)
     {
@@ -389,7 +407,7 @@ static byte *
 bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
 {
   struct bgp_proto *p = conn->bgp;
-  BGP_TRACE(D_PACKETS, "Sending End-of-RIB");
+  BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");
 
   put_u32(buf, 0);
   return buf+4;
@@ -568,7 +586,7 @@ static byte *
 bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
 {
   struct bgp_proto *p = conn->bgp;
-  BGP_TRACE(D_PACKETS, "Sending End-of-RIB");
+  BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");
 
   put_u16(buf+0, 0);
   put_u16(buf+2, 6);	/* length 4-9 */
@@ -586,19 +604,49 @@ bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
 
 #endif
 
-static byte *
+static inline byte *
 bgp_create_route_refresh(struct bgp_conn *conn, byte *buf)
 {
   struct bgp_proto *p = conn->bgp;
   BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");
 
+  /* Original route refresh request, RFC 2918 */
   *buf++ = 0;
   *buf++ = BGP_AF;
-  *buf++ = 0;		/* RFU */
-  *buf++ = 1;		/* and SAFI 1 */
+  *buf++ = BGP_RR_REQUEST;
+  *buf++ = 1;		/* SAFI */
   return buf;
 }
 
+static inline byte *
+bgp_create_begin_refresh(struct bgp_conn *conn, byte *buf)
+{
+  struct bgp_proto *p = conn->bgp;
+  BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");
+
+  /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */
+  *buf++ = 0;
+  *buf++ = BGP_AF;
+  *buf++ = BGP_RR_BEGIN;
+  *buf++ = 1;		/* SAFI */
+  return buf;
+}
+
+static inline byte *
+bgp_create_end_refresh(struct bgp_conn *conn, byte *buf)
+{
+  struct bgp_proto *p = conn->bgp;
+  BGP_TRACE(D_PACKETS, "Sending END-OF-RR");
+
+  /* Demarcation of ending of route refresh (EoRR), RFC 7313 */
+  *buf++ = 0;
+  *buf++ = BGP_AF;
+  *buf++ = BGP_RR_END;
+  *buf++ = 1;		/* SAFI */
+  return buf;
+}
+
+
 static void
 bgp_create_header(byte *buf, unsigned int len, unsigned int type)
 {
@@ -666,24 +714,44 @@ bgp_fire_tx(struct bgp_conn *conn)
       type = PKT_ROUTE_REFRESH;
       end = bgp_create_route_refresh(conn, pkt);
     }
+  else if (s & (1 << PKT_BEGIN_REFRESH))
+    {
+      s &= ~(1 << PKT_BEGIN_REFRESH);
+      type = PKT_ROUTE_REFRESH;	/* BoRR is a subtype of RR */
+      end = bgp_create_begin_refresh(conn, pkt);
+    }
   else if (s & (1 << PKT_UPDATE))
     {
-      end = bgp_create_update(conn, pkt);
       type = PKT_UPDATE;
+      end = bgp_create_update(conn, pkt);
 
       if (!end)
-	{
+        {
+	  /* No update to send, perhaps we need to send End-of-RIB or EoRR */
+
 	  conn->packets_to_send = 0;
 
-	  if (!p->send_end_mark)
+	  if (p->feed_state == BFS_LOADED)
+	  {
+	    type = PKT_UPDATE;
+	    end = bgp_create_end_mark(conn, pkt);
+	  }
+
+	  else if (p->feed_state == BFS_REFRESHED)
+	  {
+	    type = PKT_ROUTE_REFRESH;
+	    end = bgp_create_end_refresh(conn, pkt);
+	  }
+
+	  else /* Really nothing to send */
 	    return 0;
 
-	  p->send_end_mark = 0;
-	  end = bgp_create_end_mark(conn, pkt);
+	  p->feed_state = BFS_NONE;
 	}
     }
   else
     return 0;
+
   conn->packets_to_send = s;
   bgp_create_header(buf, end - buf, type);
   return sk_send(sk, end - buf);
@@ -737,7 +805,7 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len)
     {
       if (len < 2 || len < 2 + opt[1])
 	goto err;
-      
+
       cl = opt[1];
 
       switch (opt[0])
@@ -780,7 +848,12 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len)
 	      conn->peer_add_path = opt[2+i+3];
 	  if (conn->peer_add_path > ADD_PATH_FULL)
 	    goto err;
+	  break;
 
+	case 70: /* Enhanced route refresh capability, RFC 7313 */
+	  if (cl != 0)
+	    goto err;
+	  conn->peer_enhanced_refresh_support = 1;
 	  break;
 
 	  /* We can safely ignore all other capabilities */
@@ -945,7 +1018,10 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len)
 static inline void
 bgp_rx_end_mark(struct bgp_proto *p)
 {
-  BGP_TRACE(D_PACKETS, "Got End-of-RIB");
+  BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
+
+  if (p->load_state == BFS_LOADING)
+    p->load_state = BFS_NONE;
 
   if (p->p.gr_recovery)
     proto_graceful_restart_unlock(&p->p);
@@ -1353,7 +1429,9 @@ static struct {
   { 6, 5, "Connection rejected" },
   { 6, 6, "Other configuration change" },
   { 6, 7, "Connection collision resolution" },
-  { 6, 8, "Out of Resources" }
+  { 6, 8, "Out of Resources" },
+  { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
+  { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
 };
 
 /**
@@ -1484,22 +1562,47 @@ bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, int len)
 {
   struct bgp_proto *p = conn->bgp;
 
-  BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
-
   if (conn->state != BS_ESTABLISHED)
     { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
 
   if (!p->cf->enable_refresh)
     { bgp_error(conn, 1, 3, pkt+18, 1); return; }
 
-  if (len != (BGP_HEADER_LENGTH + 4))
+  if (len < (BGP_HEADER_LENGTH + 4))
     { bgp_error(conn, 1, 2, pkt+16, 2); return; }
 
+  if (len > (BGP_HEADER_LENGTH + 4))
+    { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
+
   /* FIXME - we ignore AFI/SAFI values, as we support
      just one value and even an error code for an invalid
      request is not defined */
 
-  proto_request_feeding(&p->p);
+  /* RFC 7313 redefined reserved field as RR message subtype */
+  uint subtype = conn->peer_enhanced_refresh_support ? pkt[21] : BGP_RR_REQUEST;
+
+  switch (subtype)
+  {
+  case BGP_RR_REQUEST:
+    BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
+    proto_request_feeding(&p->p);
+    break;
+
+  case BGP_RR_BEGIN:
+    BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
+    bgp_refresh_begin(p);
+    break;
+
+  case BGP_RR_END:
+    BGP_TRACE(D_PACKETS, "Got END-OF-RR");
+    bgp_refresh_end(p);
+    break;
+
+  default:
+    log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
+	p->p.name, subtype);
+    break;
+  }
 }
 
 
diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c
index 78514cf5..0a223a4f 100644
--- a/sysdep/unix/krt.c
+++ b/sysdep/unix/krt.c
@@ -1023,7 +1023,7 @@ krt_reload_routes(struct proto *P)
 }
 
 static void
-krt_feed_done(struct proto *P)
+krt_feed_end(struct proto *P)
 {
   struct krt_proto *p = (struct krt_proto *) P;
 
@@ -1056,7 +1056,7 @@ krt_init(struct proto_config *c)
   p->p.rt_notify = krt_rt_notify;
   p->p.if_notify = krt_if_notify;
   p->p.reload_routes = krt_reload_routes;
-  p->p.feed_done = krt_feed_done;
+  p->p.feed_end = krt_feed_end;
   p->p.make_tmp_attrs = krt_make_tmp_attrs;
   p->p.store_tmp_attrs = krt_store_tmp_attrs;
   p->p.rte_same = krt_rte_same;

From 2eadd36fa004d705a4003892d1639485eeaf8486 Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Sun, 29 Mar 2015 21:24:47 +0200
Subject: [PATCH 06/12] BGP: AS-wide unique router ID (RFC 6286) support

RFC 6286 relaxed rules for router IDs, allowing EBGP sessions between
routers with the same ID (but different ASN).
---
 proto/bgp/packets.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c
index 2d2a84b3..27d82729 100644
--- a/proto/bgp/packets.c
+++ b/proto/bgp/packets.c
@@ -943,7 +943,8 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len)
   if (hold > 0 && hold < 3)
     { bgp_error(conn, 2, 6, pkt+22, 2); return; }
 
-  if (!id || id == 0xffffffff || id == p->local_id)
+  /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
+  if (!id || (p->is_internal && id == p->local_id))
     { bgp_error(conn, 2, 3, pkt+24, -4); return; }
 
   if ((conn->advertised_as != base_as) && (base_as != AS_TRANS))
@@ -978,8 +979,23 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len)
       break;
 
     case BS_OPENCONFIRM:
-      if ((p->local_id < id) == (conn == &p->incoming_conn))
-	{
+      /*
+       * Description of collision detection rules in RFC 4271 is confusing and
+       * contradictory, but it is essentially:
+       *
+       * 1. Router with higher ID is dominant
+       * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
+       * 3. When both connections are in OpenConfirm state, one initiated by
+       *    the dominant router is kept.
+       *
+       * The first line in the expression below evaluates whether the neighbor
+       * is dominant, the second line whether the new connection was initiated
+       * by the neighbor. If both are true (or both are false), we keep the new
+       * connection, otherwise we keep the old one.
+       */
+      if (((p->local_id < id) || ((p->local_id == id) && (p->local_as < p->remote_as)))
+	  == (conn == &p->incoming_conn))
+        {
 	  /* Should close the other connection */
 	  BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
 	  bgp_error(other, 6, 7, NULL, 0);

From 16a3254c4cb592e7cfa3aea744e9fd58665d6367 Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Tue, 31 Mar 2015 23:59:40 +0200
Subject: [PATCH 07/12] Understand IFF_MULTICAST flag on ifaces in Linux

Unfortunately, some interfaces support multicast but do not have
this flag set, so we use it only as a positive hint.

Thanks to Clint Armstrong for noticing the problem.
---
 sysdep/linux/netlink.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c
index 860c8601..48dd8bab 100644
--- a/sysdep/linux/netlink.c
+++ b/sysdep/linux/netlink.c
@@ -436,6 +436,9 @@ nl_parse_link(struct nlmsghdr *h, int scan)
       else
 	f.flags |= IF_MULTIACCESS;	/* NBMA */
 
+      if (fl & IFF_MULTICAST)
+	f.flags |= IF_MULTICAST;
+
       ifi = if_update(&f);
 
       if (!scan)

From d924d5a5626397da7e71fddfb1c0fd22c2714f2c Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Wed, 1 Apr 2015 00:01:35 +0200
Subject: [PATCH 08/12] BGP: Fixes serious bug in TX handling

Under some circumstances and heavy load, TX could be postponed
until the session fails with hold timer expired.

Thanks to Javor Kliachev for making the bug reproducible.
---
 proto/bgp/packets.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c
index 27d82729..4bd68f52 100644
--- a/proto/bgp/packets.c
+++ b/proto/bgp/packets.c
@@ -769,7 +769,7 @@ bgp_schedule_packet(struct bgp_conn *conn, int type)
 {
   DBG("BGP: Scheduling packet type %d\n", type);
   conn->packets_to_send |= 1 << type;
-  if (conn->sk && conn->sk->tpos == conn->sk->tbuf)
+  if (conn->sk && conn->sk->tpos == conn->sk->tbuf && !ev_active(conn->tx_ev))
     ev_schedule(conn->tx_ev);
 }
 

From 304ac2e861a5ea28683489aff38ff37ff6873bb4 Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Sun, 12 Apr 2015 10:47:17 +0200
Subject: [PATCH 09/12] Minor fixes

---
 proto/ospf/ospf.h     | 2 ++
 sysdep/bsd/krt-sock.c | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/proto/ospf/ospf.h b/proto/ospf/ospf.h
index c324f431..a4e525ec 100644
--- a/proto/ospf/ospf.h
+++ b/proto/ospf/ospf.h
@@ -916,9 +916,11 @@ static inline void ospf_send_to_des(struct ospf_iface *ifa)
     ospf_send_to_bdr(ifa);
 }
 
+#ifndef PARSER
 #define DROP(DSC,VAL) do { err_dsc = DSC; err_val = VAL; goto drop; } while(0)
 #define DROP1(DSC) do { err_dsc = DSC; goto drop; } while(0)
 #define SKIP(DSC) do { err_dsc = DSC; goto skip; } while(0)
+#endif
 
 static inline uint ospf_pkt_hdrlen(struct ospf_proto *p)
 { return ospf_is_v2(p) ? (sizeof(struct ospf_packet) + sizeof(union ospf_auth)) : sizeof(struct ospf_packet); }
diff --git a/sysdep/bsd/krt-sock.c b/sysdep/bsd/krt-sock.c
index 0e65c51c..73f69df5 100644
--- a/sysdep/bsd/krt-sock.c
+++ b/sysdep/bsd/krt-sock.c
@@ -247,7 +247,7 @@ krt_send_route(struct krt_proto *p, int cmd, rte *e)
 
 #ifdef IPV6
   /* Embed interface ID to link-local address */
-  if (ipa_has_link_scope(gw))
+  if (ipa_is_link_local(gw))
     _I0(gw) = 0xfe800000 | (i->index & 0x0000ffff);
 #endif
 
@@ -468,7 +468,7 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan)
 
 #ifdef IPV6
     /* Clean up embedded interface ID returned in link-local address */
-    if (ipa_has_link_scope(a.gw))
+    if (ipa_is_link_local(a.gw))
       _I0(a.gw) = 0xfe800000;
 #endif
 
@@ -662,10 +662,10 @@ krt_read_addr(struct ks_msg *msg, int scan)
 #ifdef IPV6
   /* Clean up embedded interface ID returned in link-local address */
 
-  if (ipa_has_link_scope(iaddr))
+  if (ipa_is_link_local(iaddr))
     _I0(iaddr) = 0xfe800000;
 
-  if (ipa_has_link_scope(ibrd))
+  if (ipa_is_link_local(ibrd))
     _I0(ibrd) = 0xfe800000;
 #endif
 

From ef3cac669ca0f6f2b983e33ab6d553705c35f3df Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Sat, 18 Apr 2015 13:22:41 +0200
Subject: [PATCH 10/12] OSPF: Fixes handling of external routes with immediate
 gw

The bug caused received external LSAs with locally reachable next hops
to be ignored. I wonder why nobody noticed it sooner.
---
 proto/ospf/rt.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/proto/ospf/rt.c b/proto/ospf/rt.c
index b616c0d1..74d10c7b 100644
--- a/proto/ospf/rt.c
+++ b/proto/ospf/rt.c
@@ -95,6 +95,8 @@ merge_nexthops(struct ospf_proto *p, struct mpnh *s1, struct mpnh *s2, int r1, i
   struct mpnh **n = &root;
   int count = p->ecmp;
 
+  ASSERT(p->ecmp);
+
   /*
    * r1, r2 signalize whether we can reuse nexthops from s1, s2.
    * New nexthops (s2, new) can be reused if they are not inherited
@@ -153,6 +155,9 @@ fix_device_nexthops(struct ospf_proto *p, const struct mpnh *n, ip_addr gw)
   struct mpnh **nn1 = &root1;
   struct mpnh **nn2 = &root2;
 
+  if (!p->ecmp)
+    return new_nexthop(p, gw, n->iface, n->weight);
+
   /* This is a bit tricky. We cannot just copy the list and update n->gw,
      because the list should stay sorted, so we create two lists, one with new
      gateways and one with old ones, and then merge them. */

From b867a87c2fd694e6e690dc94da76754e89f03370 Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Sun, 19 Apr 2015 00:19:56 +0200
Subject: [PATCH 11/12] Fixes port range socket option

---
 sysdep/unix/io.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c
index b4fec9cd..0724667d 100644
--- a/sysdep/unix/io.c
+++ b/sysdep/unix/io.c
@@ -767,6 +767,32 @@ sk_set_tos6(sock *s, int tos)
   return 0;
 }
 
+static inline int
+sk_set_high_port(sock *s)
+{
+  /* Port range setting is optional, ignore it if not supported */
+
+#ifdef IP_PORTRANGE
+  if (sk_is_ipv4(s))
+  {
+    int range = IP_PORTRANGE_HIGH;
+    if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
+      ERR("IP_PORTRANGE");
+  }
+#endif
+
+#ifdef IPV6_PORTRANGE
+  if (sk_is_ipv6(s))
+  {
+    int range = IPV6_PORTRANGE_HIGH;
+    if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
+      ERR("IPV6_PORTRANGE");
+  }
+#endif
+
+  return 0;
+}
+
 static inline byte *
 sk_skip_ip_header(byte *pkt, int *len)
 {
@@ -1402,14 +1428,10 @@ sk_open(sock *s)
       }
 #endif
     }
-#ifdef IP_PORTRANGE
-    else if (s->flags & SKF_HIGH_PORT)
-    {
-      int range = IP_PORTRANGE_HIGH;
-      if (setsockopt(fd, IPPROTO_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
-        log(L_WARN "Socket error: %s%#m", "IP_PORTRANGE");
-    }
-#endif
+    else
+      if (s->flags & SKF_HIGH_PORT)
+	if (sk_set_high_port(s) < 0)
+	  log(L_WARN "Socket error: %s%#m", s->err);
 
     sockaddr_fill(&sa, af, bind_addr, s->iface, bind_port);
     if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)

From deec752ef941eef4c36c21c5c5426d08e98c7a44 Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Mon, 20 Apr 2015 12:27:00 +0200
Subject: [PATCH 12/12] NEWS and version update

---
 NEWS            | 32 +++++++++++++++++++++++++++++---
 misc/bird.spec  |  6 +++---
 sysdep/config.h |  2 +-
 3 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/NEWS b/NEWS
index 117f4d7b..f7e384b4 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,32 @@
-Version 1.5.0pre (2014-11-05) - Not for production
-  o Major OSPF protocol redesign
-  o RFC 6549 - OSPFv2 multi-instance extension
+Version 1.5.0 (2015-04-20)
+  o Major OSPF protocol redesign.
+  o OSPFv2 multi-instance extension (RFC 6549).
+  o BGP AS-wide unique router ID (RFC 6286).
+  o BGP enhanced route refresh (RFC 7313).
+  o Link state support in BGP.
+  o Latency tracking and internal watchdog.
+  o Uses high port range for BFD on BSD.
+  o Increase max symbol length to 64.
+  o Allows defining unnamed protocols from templates.
+  o Fixes two serious bugs in BGP.
+  o Several bugfixes and minor improvements.
+  o Several minor option changes:
+     - OSPF: Protocol-wide 'instance id' option added.
+     - BGP: Parameters to option 'neighbor' extended.
+     - BGP: Separate option 'interface' added.
+     - BGP: Option 'start delay time' renamed to 'connect delay time'.
+     - BGP: Option 'route limit' deprecated.
+
+  Upgrade notes:
+
+  For OSPF, there are deep internal changes, but user-visible changes
+  are limited to log messages and minor changes in formatting of command
+  output.
+
+  For BGP, version 1.5.0 is essentially a minor release. There are two
+  deprecated options ('start delay time' and 'route limit') and some
+  minor formatting changes.
+
 
 Version 1.4.5 (2014-10-06)
   o New 'show route noexport' command option.
diff --git a/misc/bird.spec b/misc/bird.spec
index 30601a91..e6b699a0 100644
--- a/misc/bird.spec
+++ b/misc/bird.spec
@@ -1,6 +1,6 @@
 Summary: BIRD Internet Routing Daemon
 Name: bird
-Version: 1.4.5
+Version: 1.5.0
 Release: 1
 Copyright: GPL
 Group: Networking/Daemons
@@ -41,11 +41,11 @@ install $RPM_SOURCE_DIR/birdc6 usr/sbin/birdc6
 %post
 /sbin/ldconfig
 /sbin/chkconfig --add bird
- 
+
 %preun
 if [ $1 = 0 ] ; then
         /sbin/chkconfig --del bird
-fi                                                                              
+fi
 
 %files
 %attr(755,root,root) /usr/sbin/bird
diff --git a/sysdep/config.h b/sysdep/config.h
index 36cf8391..08c15fe9 100644
--- a/sysdep/config.h
+++ b/sysdep/config.h
@@ -7,7 +7,7 @@
 #define _BIRD_CONFIG_H_
 
 /* BIRD version */
-#define BIRD_VERSION "1.5.0pre"
+#define BIRD_VERSION "1.5.0"
 
 /* Include parameters determined by configure script */
 #include "sysdep/autoconf.h"