0
0
mirror of https://gitlab.nic.cz/labs/bird.git synced 2025-01-08 18:11:54 +00:00

Protocol restart timer reworked.

The restart timer was racy and didn't allow for immediate restarts
from limits. Now the protocol stores the last restart time and in case
of too frequent autorestarts caused by exceeded limits, the protocol
gets disabled with an error message.

Also now there is a configuration knob for this.
This commit is contained in:
Maria Matejka 2024-11-25 12:02:13 +01:00
parent 53431ff679
commit 3f4332f0bd
5 changed files with 30 additions and 36 deletions

View File

@ -839,6 +839,13 @@ agreement").
command line interface without needing to touch the configuration. command line interface without needing to touch the configuration.
Disabled protocols are not activated. Default: protocol is enabled. Disabled protocols are not activated. Default: protocol is enabled.
<tag><label id="proto-restart-limit">restart time <m/time/</tag>
Set the minimum time between subsequent automatic restarts of the protocol.
If the protocol hits a limit (with a restart action) before this time
elapses from starting the protocol, the protocol is disabled and
an error message is written to the log. This doesn't apply to manual
restarts or reconfiguration. Default: 5 s.
<tag><label id="proto-debug">debug all|off|{ states|routes|filters|interfaces|events|packets [, <m/.../] }</tag> <tag><label id="proto-debug">debug all|off|{ states|routes|filters|interfaces|events|packets [, <m/.../] }</tag>
Set protocol debugging options. If asked, each protocol is capable of Set protocol debugging options. If asked, each protocol is capable of
writing trace messages about its work to the log (with category writing trace messages about its work to the log (with category

View File

@ -72,6 +72,11 @@ how to implement it properly.
The `scope` route attribute has been removed. Use custom route attributes instead. The `scope` route attribute has been removed. Use custom route attributes instead.
## Protocols common
There is now a guard against too frequent restarts due to limits, called
`restart time`, set by default to 5 seconds. To disable, set this to 1 us.
## Pipe ## Pipe
It's now impossible to check immediately whether the route has entered a pipe It's now impossible to check immediately whether the route has entered a pipe

View File

@ -340,6 +340,7 @@ proto_name:
proto_item: proto_item:
/* EMPTY */ /* EMPTY */
| DISABLED bool { this_proto->disabled = $2; } | DISABLED bool { this_proto->disabled = $2; }
| RESTART TIME expr_us { this_proto->restart_limit = $3; }
| DEBUG debug_mask { this_proto->debug = $2; } | DEBUG debug_mask { this_proto->debug = $2; }
| MRTDUMP mrtdump_mask { this_proto->mrtdump = $2; } | MRTDUMP mrtdump_mask { this_proto->mrtdump = $2; }
| ROUTER ID idval { this_proto->router_id = $3; } | ROUTER ID idval { this_proto->router_id = $3; }

View File

@ -1335,6 +1335,9 @@ proto_new(struct proto_config *cf)
p->hash_key = random_u32(); p->hash_key = random_u32();
cf->proto = p; cf->proto = p;
p->last_restart = current_time();
p->restart_limit = cf->restart_limit;
PST_LOCKED(tp) PST_LOCKED(tp)
{ {
p->id = hmap_first_zero(&tp->proto_id_map); p->id = hmap_first_zero(&tp->proto_id_map);
@ -1454,6 +1457,8 @@ proto_config_new(struct protocol *pr, int class)
cf->mrtdump = new_config->proto_default_mrtdump; cf->mrtdump = new_config->proto_default_mrtdump;
cf->loop_order = DOMAIN_ORDER(the_bird); cf->loop_order = DOMAIN_ORDER(the_bird);
cf->restart_limit = 5 S;
init_list(&cf->channels); init_list(&cf->channels);
return cf; return cf;
@ -1561,7 +1566,7 @@ static int
proto_reconfigure(struct proto *p, struct proto_config *oc, struct proto_config *nc, int type) proto_reconfigure(struct proto *p, struct proto_config *oc, struct proto_config *nc, int type)
{ {
/* If the protocol is DOWN, we just restart it */ /* If the protocol is DOWN, we just restart it */
if (p->proto_state == PS_DOWN_XX) if ((p->proto_state == PS_DOWN_XX) || (p->proto_state == PS_FLUSH))
return 0; return 0;
/* If there is a too big change in core attributes, ... */ /* If there is a too big change in core attributes, ... */
@ -1574,6 +1579,7 @@ proto_reconfigure(struct proto *p, struct proto_config *oc, struct proto_config
p->sources.name = p->name = nc->name; p->sources.name = p->name = nc->name;
p->sources.debug = p->debug = nc->debug; p->sources.debug = p->debug = nc->debug;
p->mrtdump = nc->mrtdump; p->mrtdump = nc->mrtdump;
p->restart_limit = nc->restart_limit;
reconfigure_type = type; reconfigure_type = type;
/* Execute protocol specific reconfigure hook */ /* Execute protocol specific reconfigure hook */
@ -2167,29 +2173,15 @@ proto_restart_event_hook(void *_p)
p->disabled = 1; p->disabled = 1;
proto_rethink_goal(p); proto_rethink_goal(p);
p->restart_event = NULL;
p->restart_timer = NULL;
if (proto_restart) if (proto_restart)
if (current_time_now() - p->last_restart < p->restart_limit)
log(L_ERR "%s: too frequent restarts, disabling", p->name);
else
p->disabled = 0;
/* No need to call proto_rethink_goal() here again as the proto_cleanup() routine will /* No need to call proto_rethink_goal() here again as the proto_cleanup() routine will
* call it after the protocol stops ... and both these routines are fixed to main_birdloop. * call it after the protocol stops ... and both these routines are fixed to main_birdloop.
*/ */
p->disabled = 0;
}
static void
proto_send_restart_event(struct proto *p)
{
if (!p->restart_event)
p->restart_event = ev_new_init(p->pool, proto_restart_event_hook, p);
ev_send(&global_event_list, p->restart_event);
}
static void
proto_send_restart_event_from_timer(struct timer *t)
{
proto_send_restart_event((struct proto *) t->data);
} }
static inline void static inline void
@ -2205,20 +2197,8 @@ proto_schedule_down(struct proto *p, byte restart, byte code)
p->down_sched = restart ? PDS_RESTART : PDS_DISABLE; p->down_sched = restart ? PDS_RESTART : PDS_DISABLE;
p->down_code = code; p->down_code = code;
if (!restart) /* Request protocol restart to be initiated from the mainloop */
{ ev_send(&global_event_list, ev_new_init(p->pool, proto_restart_event_hook, p));
if (p->restart_timer && tm_active(p->restart_timer))
tm_stop(p->restart_timer);
proto_send_restart_event(p);
}
else
{
if (!p->restart_timer)
p->restart_timer = tm_new_init(p->pool, proto_send_restart_event_from_timer, p, 0, 0);
tm_start_max_in(p->restart_timer, 250 MS, p->loop);
}
} }
/** /**

View File

@ -113,6 +113,7 @@ struct proto_config {
u32 router_id; /* Protocol specific router ID */ u32 router_id; /* Protocol specific router ID */
uint loop_order; /* Launch a birdloop on this locking level; use DOMAIN_ORDER(the_bird) for mainloop */ uint loop_order; /* Launch a birdloop on this locking level; use DOMAIN_ORDER(the_bird) for mainloop */
btime loop_max_latency; /* Request this specific maximum latency of loop; zero to default */ btime loop_max_latency; /* Request this specific maximum latency of loop; zero to default */
btime restart_limit; /* Minimum allowed time between limit restarts */
list channels; /* List of channel configs (struct channel_config) */ list channels; /* List of channel configs (struct channel_config) */
struct iface *vrf; /* Related VRF instance, NULL if global */ struct iface *vrf; /* Related VRF instance, NULL if global */
@ -142,8 +143,6 @@ struct proto {
pool *pool_inloop; /* Pool containing local objects which need to be freed pool *pool_inloop; /* Pool containing local objects which need to be freed
before the protocol's birdloop actually stops, like olocks */ before the protocol's birdloop actually stops, like olocks */
event *event; /* Protocol event */ event *event; /* Protocol event */
timer *restart_timer; /* Timer to restart the protocol from limits */
event *restart_event; /* Event to restart/shutdown the protocol from limits */
struct birdloop *loop; /* BIRDloop running this protocol */ struct birdloop *loop; /* BIRDloop running this protocol */
list channels; /* List of channels to rtables (struct channel) */ list channels; /* List of channels to rtables (struct channel) */
@ -170,6 +169,8 @@ struct proto {
byte down_code; /* Reason for shutdown (PDC_* codes) */ byte down_code; /* Reason for shutdown (PDC_* codes) */
u32 hash_key; /* Random key used for hashing of neighbors */ u32 hash_key; /* Random key used for hashing of neighbors */
btime last_state_change; /* Time of last state transition */ btime last_state_change; /* Time of last state transition */
btime last_restart; /* Time of last restart */
btime restart_limit; /* Minimum allowed time between limit restarts */
char *last_state_name_announced; /* Last state name we've announced to the user */ char *last_state_name_announced; /* Last state name we've announced to the user */
char *message; /* State-change message, allocated from proto_pool */ char *message; /* State-change message, allocated from proto_pool */
u32 id; /* Sequential ID used as index in proto_state_table */ u32 id; /* Sequential ID used as index in proto_state_table */