mirror of
https://gitlab.nic.cz/labs/bird.git
synced 2024-12-22 09:41:54 +00:00
Implement latency tracking, internal event log and watchdog
This commit is contained in:
parent
4e63974471
commit
8bcb5fb1e8
@ -41,6 +41,10 @@ struct config {
|
|||||||
u32 gr_wait; /* Graceful restart wait timeout */
|
u32 gr_wait; /* Graceful restart wait timeout */
|
||||||
|
|
||||||
int cli_debug; /* Tracing of CLI connections and commands */
|
int cli_debug; /* Tracing of CLI connections and commands */
|
||||||
|
int latency_debug; /* I/O loop tracks duration of each event */
|
||||||
|
u32 latency_limit; /* Events with longer duration are logged (us) */
|
||||||
|
u32 watchdog_warning; /* I/O loop watchdog limit for warning (us) */
|
||||||
|
u32 watchdog_timeout; /* Watchdog timeout (in seconds, 0 = disabled) */
|
||||||
char *err_msg; /* Parser error message */
|
char *err_msg; /* Parser error message */
|
||||||
int err_lino; /* Line containing error */
|
int err_lino; /* Line containing error */
|
||||||
char *err_file_name; /* File name containing error */
|
char *err_file_name; /* File name containing error */
|
||||||
|
@ -33,6 +33,10 @@
|
|||||||
# Turn on global debugging of all protocols
|
# Turn on global debugging of all protocols
|
||||||
#debug protocols all;
|
#debug protocols all;
|
||||||
|
|
||||||
|
# Turn on internal watchdog
|
||||||
|
#watchdog warning 5 s;
|
||||||
|
#watchdog timeout 30 s;
|
||||||
|
|
||||||
# The direct protocol automatically generates device routes to
|
# The direct protocol automatically generates device routes to
|
||||||
# all network interfaces. Can exist in as many instances as you wish
|
# all network interfaces. Can exist in as many instances as you wish
|
||||||
# if you want to populate multiple routing tables with device routes.
|
# if you want to populate multiple routing tables with device routes.
|
||||||
|
@ -344,6 +344,23 @@ protocol rip {
|
|||||||
of connects and disconnects, 2 and higher for logging of all client
|
of connects and disconnects, 2 and higher for logging of all client
|
||||||
commands). Default: 0.
|
commands). Default: 0.
|
||||||
|
|
||||||
|
<tag>debug latency <m/switch/</tag>
|
||||||
|
Activate tracking of elapsed time for internal events. Recent events
|
||||||
|
could be examined using <cf/dump events/ command. Default: off.
|
||||||
|
|
||||||
|
<tag>debug latency limit <m/time/</tag>
|
||||||
|
If <cf/debug latency/ is enabled, this option allows to specify a limit
|
||||||
|
for elapsed time. Events exceeding the limit are logged. Default: 1 s.
|
||||||
|
|
||||||
|
<tag>watchdog warning <m/time/</tag>
|
||||||
|
Set time limit for I/O loop cycle. If one iteration took more time to
|
||||||
|
complete, a warning is logged. Default: 5 s.
|
||||||
|
|
||||||
|
<tag>watchdog timeout <m/time/</tag>
|
||||||
|
Set time limit for I/O loop cycle. If the limit is breached, BIRD is
|
||||||
|
killed by abort signal. The timeout has effective granularity of
|
||||||
|
seconds, zero means disabled. Default: disabled (0).
|
||||||
|
|
||||||
<tag>mrtdump "<m/filename/"</tag>
|
<tag>mrtdump "<m/filename/"</tag>
|
||||||
Set MRTdump file name. This option must be specified to allow MRTdump
|
Set MRTdump file name. This option must be specified to allow MRTdump
|
||||||
feature. Default: no dump file.
|
feature. Default: no dump file.
|
||||||
|
@ -114,6 +114,8 @@ ev_schedule(event *e)
|
|||||||
ev_enqueue(&global_event_list, e);
|
ev_enqueue(&global_event_list, e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void io_log_event(void *hook, void *data);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* ev_run_list - run an event list
|
* ev_run_list - run an event list
|
||||||
* @l: an event list
|
* @l: an event list
|
||||||
@ -132,6 +134,11 @@ ev_run_list(event_list *l)
|
|||||||
WALK_LIST_FIRST(n, tmp_list)
|
WALK_LIST_FIRST(n, tmp_list)
|
||||||
{
|
{
|
||||||
event *e = SKIP_BACK(event, n, n);
|
event *e = SKIP_BACK(event, n, n);
|
||||||
|
|
||||||
|
/* This is ugly hack, we want to log just events executed from the main I/O loop */
|
||||||
|
if (l == &global_event_list)
|
||||||
|
io_log_event(e->hook, e->data);
|
||||||
|
|
||||||
ev_run(e);
|
ev_run(e);
|
||||||
}
|
}
|
||||||
return !EMPTY_LIST(*l);
|
return !EMPTY_LIST(*l);
|
||||||
|
@ -621,6 +621,8 @@ CF_CLI(DUMP RESOURCES,,, [[Dump all allocated resource]])
|
|||||||
{ rdump(&root_pool); cli_msg(0, ""); } ;
|
{ rdump(&root_pool); cli_msg(0, ""); } ;
|
||||||
CF_CLI(DUMP SOCKETS,,, [[Dump open sockets]])
|
CF_CLI(DUMP SOCKETS,,, [[Dump open sockets]])
|
||||||
{ sk_dump_all(); cli_msg(0, ""); } ;
|
{ sk_dump_all(); cli_msg(0, ""); } ;
|
||||||
|
CF_CLI(DUMP EVENTS,,, [[Dump event log]])
|
||||||
|
{ io_log_dump(); cli_msg(0, ""); } ;
|
||||||
CF_CLI(DUMP INTERFACES,,, [[Dump interface information]])
|
CF_CLI(DUMP INTERFACES,,, [[Dump interface information]])
|
||||||
{ if_dump_all(); cli_msg(0, ""); } ;
|
{ if_dump_all(); cli_msg(0, ""); } ;
|
||||||
CF_CLI(DUMP NEIGHBORS,,, [[Dump neighbor cache]])
|
CF_CLI(DUMP NEIGHBORS,,, [[Dump neighbor cache]])
|
||||||
|
@ -15,6 +15,7 @@ CF_DECLS
|
|||||||
|
|
||||||
CF_KEYWORDS(LOG, SYSLOG, ALL, DEBUG, TRACE, INFO, REMOTE, WARNING, ERROR, AUTH, FATAL, BUG, STDERR, SOFT)
|
CF_KEYWORDS(LOG, SYSLOG, ALL, DEBUG, TRACE, INFO, REMOTE, WARNING, ERROR, AUTH, FATAL, BUG, STDERR, SOFT)
|
||||||
CF_KEYWORDS(TIMEFORMAT, ISO, OLD, SHORT, LONG, BASE, NAME, CONFIRM, UNDO, CHECK, TIMEOUT)
|
CF_KEYWORDS(TIMEFORMAT, ISO, OLD, SHORT, LONG, BASE, NAME, CONFIRM, UNDO, CHECK, TIMEOUT)
|
||||||
|
CF_KEYWORDS(DEBUG, LATENCY, LIMIT, WATCHDOG, WARNING, TIMEOUT)
|
||||||
|
|
||||||
%type <i> log_mask log_mask_list log_cat cfg_timeout
|
%type <i> log_mask log_mask_list log_cat cfg_timeout
|
||||||
%type <g> log_file
|
%type <g> log_file
|
||||||
@ -83,6 +84,7 @@ mrtdump_base:
|
|||||||
}
|
}
|
||||||
;
|
;
|
||||||
|
|
||||||
|
|
||||||
CF_ADDTO(conf, timeformat_base)
|
CF_ADDTO(conf, timeformat_base)
|
||||||
|
|
||||||
timeformat_which:
|
timeformat_which:
|
||||||
@ -104,6 +106,17 @@ timeformat_base:
|
|||||||
TIMEFORMAT timeformat_spec ';'
|
TIMEFORMAT timeformat_spec ';'
|
||||||
;
|
;
|
||||||
|
|
||||||
|
|
||||||
|
CF_ADDTO(conf, debug_unix)
|
||||||
|
|
||||||
|
debug_unix:
|
||||||
|
DEBUG LATENCY bool { new_config->latency_debug = $3; }
|
||||||
|
| DEBUG LATENCY LIMIT expr_us { new_config->latency_limit = $4; }
|
||||||
|
| WATCHDOG WARNING expr_us { new_config->watchdog_warning = $3; }
|
||||||
|
| WATCHDOG TIMEOUT expr_us { new_config->watchdog_timeout = ($3 + 999999) TO_S; }
|
||||||
|
;
|
||||||
|
|
||||||
|
|
||||||
/* Unix specific commands */
|
/* Unix specific commands */
|
||||||
|
|
||||||
CF_CLI_HELP(CONFIGURE, ..., [[Reload configuration]])
|
CF_CLI_HELP(CONFIGURE, ..., [[Reload configuration]])
|
||||||
|
170
sysdep/unix/io.c
170
sysdep/unix/io.c
@ -332,6 +332,8 @@ tm_first_shot(void)
|
|||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void io_log_event(void *hook, void *data);
|
||||||
|
|
||||||
static void
|
static void
|
||||||
tm_shot(void)
|
tm_shot(void)
|
||||||
{
|
{
|
||||||
@ -372,6 +374,7 @@ tm_shot(void)
|
|||||||
i = 0;
|
i = 0;
|
||||||
tm_start(t, i);
|
tm_start(t, i);
|
||||||
}
|
}
|
||||||
|
io_log_event(t->hook, t->data);
|
||||||
t->hook(t);
|
t->hook(t);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1839,6 +1842,162 @@ sk_dump_all(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Internal event log and watchdog
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define EVENT_LOG_LENGTH 32
|
||||||
|
|
||||||
|
struct event_log_entry
|
||||||
|
{
|
||||||
|
void *hook;
|
||||||
|
void *data;
|
||||||
|
btime timestamp;
|
||||||
|
btime duration;
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct event_log_entry event_log[EVENT_LOG_LENGTH];
|
||||||
|
static struct event_log_entry *event_open;
|
||||||
|
static int event_log_pos, event_log_num, watchdog_active;
|
||||||
|
static btime last_time;
|
||||||
|
static btime loop_time;
|
||||||
|
|
||||||
|
static void
|
||||||
|
io_update_time(void)
|
||||||
|
{
|
||||||
|
struct timespec ts;
|
||||||
|
int rv;
|
||||||
|
|
||||||
|
if (!clock_monotonic_available)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is third time-tracking procedure (after update_times() above and
|
||||||
|
* times_update() in BFD), dedicated to internal event log and latency
|
||||||
|
* tracking. Hopefully, we consolidate these sometimes.
|
||||||
|
*/
|
||||||
|
|
||||||
|
rv = clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
|
if (rv < 0)
|
||||||
|
die("clock_gettime: %m");
|
||||||
|
|
||||||
|
last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000);
|
||||||
|
|
||||||
|
if (event_open)
|
||||||
|
{
|
||||||
|
event_open->duration = last_time - event_open->timestamp;
|
||||||
|
|
||||||
|
if (event_open->duration > config->latency_limit)
|
||||||
|
log(L_WARN "Event 0x%p 0x%p took %d ms",
|
||||||
|
event_open->hook, event_open->data, (int) (event_open->duration TO_MS));
|
||||||
|
|
||||||
|
event_open = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* io_log_event - mark approaching event into event log
|
||||||
|
* @hook: event hook address
|
||||||
|
* @data: event data address
|
||||||
|
*
|
||||||
|
* Store info (hook, data, timestamp) about the following internal event into
|
||||||
|
* a circular event log (@event_log). When latency tracking is enabled, the log
|
||||||
|
* entry is kept open (in @event_open) so the duration can be filled later.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
io_log_event(void *hook, void *data)
|
||||||
|
{
|
||||||
|
if (config->latency_debug)
|
||||||
|
io_update_time();
|
||||||
|
|
||||||
|
struct event_log_entry *en = event_log + event_log_pos;
|
||||||
|
|
||||||
|
en->hook = hook;
|
||||||
|
en->data = data;
|
||||||
|
en->timestamp = last_time;
|
||||||
|
en->duration = 0;
|
||||||
|
|
||||||
|
event_log_num++;
|
||||||
|
event_log_pos++;
|
||||||
|
event_log_pos %= EVENT_LOG_LENGTH;
|
||||||
|
|
||||||
|
event_open = config->latency_debug ? en : NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
io_close_event(void)
|
||||||
|
{
|
||||||
|
if (event_open)
|
||||||
|
io_update_time();
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
io_log_dump(void)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
log(L_DEBUG "Event log:");
|
||||||
|
for (i = 0; i < EVENT_LOG_LENGTH; i++)
|
||||||
|
{
|
||||||
|
struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
|
||||||
|
if (en->hook)
|
||||||
|
log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
|
||||||
|
(int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
watchdog_sigalrm(int sig UNUSED)
|
||||||
|
{
|
||||||
|
/* Update last_time and duration, but skip latency check */
|
||||||
|
config->latency_limit = 0xffffffff;
|
||||||
|
io_update_time();
|
||||||
|
|
||||||
|
/* We want core dump */
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
watchdog_start1(void)
|
||||||
|
{
|
||||||
|
io_update_time();
|
||||||
|
|
||||||
|
loop_time = last_time;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
watchdog_start(void)
|
||||||
|
{
|
||||||
|
io_update_time();
|
||||||
|
|
||||||
|
loop_time = last_time;
|
||||||
|
event_log_num = 0;
|
||||||
|
|
||||||
|
if (config->watchdog_timeout)
|
||||||
|
{
|
||||||
|
alarm(config->watchdog_timeout);
|
||||||
|
watchdog_active = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
watchdog_stop(void)
|
||||||
|
{
|
||||||
|
io_update_time();
|
||||||
|
|
||||||
|
if (watchdog_active)
|
||||||
|
{
|
||||||
|
alarm(0);
|
||||||
|
watchdog_active = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
btime duration = last_time - loop_time;
|
||||||
|
if (duration > config->watchdog_warning)
|
||||||
|
log(L_WARN "I/O loop cycle took %d ms for %d events",
|
||||||
|
(int) (duration TO_MS), event_log_num);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Main I/O Loop
|
* Main I/O Loop
|
||||||
*/
|
*/
|
||||||
@ -1873,6 +2032,7 @@ io_loop(void)
|
|||||||
sock *s;
|
sock *s;
|
||||||
node *n;
|
node *n;
|
||||||
|
|
||||||
|
watchdog_start1();
|
||||||
sock_recalc_fdsets_p = 1;
|
sock_recalc_fdsets_p = 1;
|
||||||
for(;;)
|
for(;;)
|
||||||
{
|
{
|
||||||
@ -1887,6 +2047,8 @@ io_loop(void)
|
|||||||
timo.tv_sec = events ? 0 : MIN(tout - now, 3);
|
timo.tv_sec = events ? 0 : MIN(tout - now, 3);
|
||||||
timo.tv_usec = 0;
|
timo.tv_usec = 0;
|
||||||
|
|
||||||
|
io_close_event();
|
||||||
|
|
||||||
if (sock_recalc_fdsets_p)
|
if (sock_recalc_fdsets_p)
|
||||||
{
|
{
|
||||||
sock_recalc_fdsets_p = 0;
|
sock_recalc_fdsets_p = 0;
|
||||||
@ -1923,25 +2085,30 @@ io_loop(void)
|
|||||||
|
|
||||||
if (async_config_flag)
|
if (async_config_flag)
|
||||||
{
|
{
|
||||||
|
io_log_event(async_config, NULL);
|
||||||
async_config();
|
async_config();
|
||||||
async_config_flag = 0;
|
async_config_flag = 0;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (async_dump_flag)
|
if (async_dump_flag)
|
||||||
{
|
{
|
||||||
|
io_log_event(async_dump, NULL);
|
||||||
async_dump();
|
async_dump();
|
||||||
async_dump_flag = 0;
|
async_dump_flag = 0;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (async_shutdown_flag)
|
if (async_shutdown_flag)
|
||||||
{
|
{
|
||||||
|
io_log_event(async_shutdown, NULL);
|
||||||
async_shutdown();
|
async_shutdown();
|
||||||
async_shutdown_flag = 0;
|
async_shutdown_flag = 0;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* And finally enter select() to find active sockets */
|
/* And finally enter select() to find active sockets */
|
||||||
|
watchdog_stop();
|
||||||
hi = select(hi+1, &rd, &wr, NULL, &timo);
|
hi = select(hi+1, &rd, &wr, NULL, &timo);
|
||||||
|
watchdog_start();
|
||||||
|
|
||||||
if (hi < 0)
|
if (hi < 0)
|
||||||
{
|
{
|
||||||
@ -1965,6 +2132,7 @@ io_loop(void)
|
|||||||
do
|
do
|
||||||
{
|
{
|
||||||
steps--;
|
steps--;
|
||||||
|
io_log_event(s->rx_hook, s->data);
|
||||||
e = sk_read(s);
|
e = sk_read(s);
|
||||||
if (s != current_sock)
|
if (s != current_sock)
|
||||||
goto next;
|
goto next;
|
||||||
@ -1976,6 +2144,7 @@ io_loop(void)
|
|||||||
do
|
do
|
||||||
{
|
{
|
||||||
steps--;
|
steps--;
|
||||||
|
io_log_event(s->tx_hook, s->data);
|
||||||
e = sk_write(s);
|
e = sk_write(s);
|
||||||
if (s != current_sock)
|
if (s != current_sock)
|
||||||
goto next;
|
goto next;
|
||||||
@ -2003,6 +2172,7 @@ io_loop(void)
|
|||||||
if ((s->type < SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook)
|
if ((s->type < SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook)
|
||||||
{
|
{
|
||||||
count++;
|
count++;
|
||||||
|
io_log_event(s->rx_hook, s->data);
|
||||||
e = sk_read(s);
|
e = sk_read(s);
|
||||||
if (s != current_sock)
|
if (s != current_sock)
|
||||||
goto next2;
|
goto next2;
|
||||||
|
@ -168,6 +168,9 @@ sysdep_preconfig(struct config *c)
|
|||||||
{
|
{
|
||||||
init_list(&c->logfiles);
|
init_list(&c->logfiles);
|
||||||
|
|
||||||
|
c->latency_limit = UNIX_DEFAULT_LATENCY_LIMIT;
|
||||||
|
c->watchdog_warning = UNIX_DEFAULT_WATCHDOG_WARNING;
|
||||||
|
|
||||||
#ifdef PATH_IPROUTE_DIR
|
#ifdef PATH_IPROUTE_DIR
|
||||||
read_iproute_table(PATH_IPROUTE_DIR "/rt_protos", "ipp_", 256);
|
read_iproute_table(PATH_IPROUTE_DIR "/rt_protos", "ipp_", 256);
|
||||||
read_iproute_table(PATH_IPROUTE_DIR "/rt_realms", "ipr_", 256);
|
read_iproute_table(PATH_IPROUTE_DIR "/rt_realms", "ipr_", 256);
|
||||||
@ -585,6 +588,8 @@ handle_sigterm(int sig UNUSED)
|
|||||||
async_shutdown_flag = 1;
|
async_shutdown_flag = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void watchdog_sigalrm(int sig UNUSED);
|
||||||
|
|
||||||
static void
|
static void
|
||||||
signal_init(void)
|
signal_init(void)
|
||||||
{
|
{
|
||||||
@ -600,6 +605,9 @@ signal_init(void)
|
|||||||
sa.sa_handler = handle_sigterm;
|
sa.sa_handler = handle_sigterm;
|
||||||
sa.sa_flags = SA_RESTART;
|
sa.sa_flags = SA_RESTART;
|
||||||
sigaction(SIGTERM, &sa, NULL);
|
sigaction(SIGTERM, &sa, NULL);
|
||||||
|
sa.sa_handler = watchdog_sigalrm;
|
||||||
|
sa.sa_flags = 0;
|
||||||
|
sigaction(SIGALRM, &sa, NULL);
|
||||||
signal(SIGPIPE, SIG_IGN);
|
signal(SIGPIPE, SIG_IGN);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -27,8 +27,10 @@ void cmd_reconfig_confirm(void);
|
|||||||
void cmd_reconfig_undo(void);
|
void cmd_reconfig_undo(void);
|
||||||
void cmd_shutdown(void);
|
void cmd_shutdown(void);
|
||||||
|
|
||||||
#define UNIX_DEFAULT_CONFIGURE_TIMEOUT 300
|
#define UNIX_DEFAULT_CONFIGURE_TIMEOUT 300
|
||||||
|
|
||||||
|
#define UNIX_DEFAULT_LATENCY_LIMIT (1 S_)
|
||||||
|
#define UNIX_DEFAULT_WATCHDOG_WARNING (5 S_)
|
||||||
|
|
||||||
/* io.c */
|
/* io.c */
|
||||||
|
|
||||||
@ -99,6 +101,7 @@ volatile int async_shutdown_flag;
|
|||||||
|
|
||||||
void io_init(void);
|
void io_init(void);
|
||||||
void io_loop(void);
|
void io_loop(void);
|
||||||
|
void io_log_dump(void);
|
||||||
int sk_open_unix(struct birdsock *s, char *name);
|
int sk_open_unix(struct birdsock *s, char *name);
|
||||||
void *tracked_fopen(struct pool *, char *name, char *mode);
|
void *tracked_fopen(struct pool *, char *name, char *mode);
|
||||||
void test_old_bird(char *path);
|
void test_old_bird(char *path);
|
||||||
|
Loading…
Reference in New Issue
Block a user