Protocol restart timer reworked.

The restart timer was racy and didn't allow for immediate restarts from limits. Now the protocol stores the last restart time, and in case of too frequent autorestarts caused by exceeded limits, the protocol gets disabled with an error message. There is also a new configuration knob for this.
2025-04-18 04:54:38 +00:00 · 2024-11-25 12:02:13 +01:00 · 2024-11-25 12:02:13 +01:00 · 3f4332f0bd
commit 3f4332f0bd
parent 53431ff679
5 changed files with 30 additions and 36 deletions
--- a/doc/bird.sgml
+++ b/doc/bird.sgml
@ -839,6 +839,13 @@ agreement").
 	command line interface without needing to touch the configuration.
 	Disabled protocols are not activated. Default: protocol is enabled.

+	<tag><label id="proto-restart-limit">restart time <m/time/</tag>
+	Set time limit for subsequent automatic restarts of the protocol.
+        If the protocol hits the limit (with a restart action) before this time
+        elapses from starting the protocol, the protocol is disabled with
+	an error message in the log. This doesn't apply to manual
+	restarts or reconfiguration. Default: 5 s.
+
 	<tag><label id="proto-debug">debug all|off|{ states|routes|filters|interfaces|events|packets [, <m/.../] }</tag>
 	Set protocol debugging options. If asked, each protocol is capable of
 	writing trace messages about its work to the log (with category
--- a/doc/migration-bird3.md
+++ b/doc/migration-bird3.md
@ -72,6 +72,11 @@ how to implement it properly.

 The `scope` route attribute has been removed. Use custom route attributes instead.

+## Protocols common
+
+There is now a guard against too frequent restarts due to limits, called
+`restart time`, set by default to 5 seconds. To disable, set this to 1 us.
+
 ## Pipe

 It's now impossible to check immediately whether the route has entered a pipe
--- a/nest/config.Y
+++ b/nest/config.Y
@ -340,6 +340,7 @@ proto_name:
 proto_item:
   /* EMPTY */
 | DISABLED bool { this_proto->disabled = $2; }
+ | RESTART TIME expr_us { this_proto->restart_limit = $3; }
 | DEBUG debug_mask { this_proto->debug = $2; }
 | MRTDUMP mrtdump_mask { this_proto->mrtdump = $2; }
 | ROUTER ID idval { this_proto->router_id = $3; }
--- a/nest/proto.c
+++ b/nest/proto.c
@ -1335,6 +1335,9 @@ proto_new(struct proto_config *cf)
  p->hash_key = random_u32();
  cf->proto = p;

+  p->last_restart = current_time();
+  p->restart_limit = cf->restart_limit;
+
  PST_LOCKED(tp)
  {
    p->id = hmap_first_zero(&tp->proto_id_map);
@ -1454,6 +1457,8 @@ proto_config_new(struct protocol *pr, int class)
  cf->mrtdump = new_config->proto_default_mrtdump;
  cf->loop_order = DOMAIN_ORDER(the_bird);

+  cf->restart_limit = 5 S;
+
  init_list(&cf->channels);

  return cf;
@ -1561,7 +1566,7 @@ static int
 proto_reconfigure(struct proto *p, struct proto_config *oc, struct proto_config *nc, int type)
 {
  /* If the protocol is DOWN, we just restart it */
-  if (p->proto_state == PS_DOWN_XX)
+  if ((p->proto_state == PS_DOWN_XX) || (p->proto_state == PS_FLUSH))
    return 0;

  /* If there is a too big change in core attributes, ... */
@ -1574,6 +1579,7 @@ proto_reconfigure(struct proto *p, struct proto_config *oc, struct proto_config
  p->sources.name = p->name = nc->name;
  p->sources.debug = p->debug = nc->debug;
  p->mrtdump = nc->mrtdump;
+  p->restart_limit = nc->restart_limit;
  reconfigure_type = type;

  /* Execute protocol specific reconfigure hook */
@ -2167,29 +2173,15 @@ proto_restart_event_hook(void *_p)
  p->disabled = 1;
  proto_rethink_goal(p);

-  p->restart_event = NULL;
-  p->restart_timer = NULL;
-
  if (proto_restart)
+    if (current_time_now() - p->last_restart < p->restart_limit)
+      log(L_ERR "%s: too frequent restarts, disabling", p->name);
+    else
+      p->disabled = 0;
+
    /* No need to call proto_rethink_goal() here again as the proto_cleanup() routine will
     * call it after the protocol stops ... and both these routines are fixed to main_birdloop.
     */
-    p->disabled = 0;
-}
-
-static void
-proto_send_restart_event(struct proto *p)
-{
-  if (!p->restart_event)
-    p->restart_event = ev_new_init(p->pool, proto_restart_event_hook, p);
-
-  ev_send(&global_event_list, p->restart_event);
-}
-
-static void
-proto_send_restart_event_from_timer(struct timer *t)
-{
-  proto_send_restart_event((struct proto *) t->data);
 }

 static inline void
@ -2205,20 +2197,8 @@ proto_schedule_down(struct proto *p, byte restart, byte code)
  p->down_sched = restart ? PDS_RESTART : PDS_DISABLE;
  p->down_code = code;

-  if (!restart)
-  {
-    if (p->restart_timer && tm_active(p->restart_timer))
-      tm_stop(p->restart_timer);
-
-    proto_send_restart_event(p);
-  }
-  else
-  {
-    if (!p->restart_timer)
-      p->restart_timer = tm_new_init(p->pool, proto_send_restart_event_from_timer, p, 0, 0);
-
-    tm_start_max_in(p->restart_timer, 250 MS, p->loop);
-  }
+  /* Request protocol restart to be initiated from the mainloop */
+  ev_send(&global_event_list, ev_new_init(p->pool, proto_restart_event_hook, p));
 }

 /**
--- a/nest/protocol.h
+++ b/nest/protocol.h
@ -113,6 +113,7 @@ struct proto_config {
  u32 router_id;			/* Protocol specific router ID */
  uint loop_order;			/* Launch a birdloop on this locking level; use DOMAIN_ORDER(the_bird) for mainloop */
  btime loop_max_latency;		/* Request this specific maximum latency of loop; zero to default */
+  btime restart_limit;			/* Minimum allowed time between limit restarts */

  list channels;			/* List of channel configs (struct channel_config) */
  struct iface *vrf;			/* Related VRF instance, NULL if global */
@ -142,8 +143,6 @@ struct proto {
  pool *pool_inloop;			/* Pool containing local objects which need to be freed
 					   before the protocol's birdloop actually stops, like olocks */
  event *event;				/* Protocol event */
-  timer *restart_timer;			/* Timer to restart the protocol from limits */
-  event *restart_event;			/* Event to restart/shutdown the protocol from limits */
  struct birdloop *loop;		/* BIRDloop running this protocol */

  list channels;			/* List of channels to rtables (struct channel) */
@ -170,6 +169,8 @@ struct proto {
  byte down_code;			/* Reason for shutdown (PDC_* codes) */
  u32 hash_key;				/* Random key used for hashing of neighbors */
  btime last_state_change;		/* Time of last state transition */
+  btime last_restart;			/* Time of last restart */
+  btime restart_limit;			/* Minimum allowed time between limit restarts */
  char *last_state_name_announced;	/* Last state name we've announced to the user */
  char *message;			/* State-change message, allocated from proto_pool */
  u32 id;				/* Sequential ID used as index in proto_state_table */