mirror of
https://gitlab.nic.cz/labs/bird.git
synced 2025-01-03 07:31:54 +00:00
943478b00f
Add basic VRF (virtual routing and forwarding) support. Protocols can be associated with VRFs, such protocols will be restricted to interfaces assigned to the VRF (as reported by Linux kernel) and will use sockets bound to the VRF. E.g., different multihop BGP instances can use different kernel routing tables to handle BGP TCP connections. The VRF support is preliminary, currently there are several limitations: - Recent Linux kernels (4.11) do not handle correctly sockets bound to interfaces that are part of VRF, so most protocols other than multihop BGP do not work. This will be fixed by future kernel versions. - Neighbor cache ignores VRFs. Breaks config with the same prefix on local interfaces in different VRFs. Not much problem as single hop protocols do not work anyways. - Olock code ignores VRFs. Breaks config with multiple BGP peers with the same IP address in different VRFs. - Incoming BGP connections are not dispatched according to VRFs. Breaks config with multiple BGP peers with the same IP address in different VRFs. Perhaps we would need some kernel API to read VRF of incoming connection? Or probably use multiple listening sockets in int-new branch. - We should handle master VRF interface up/down events and perhaps disable associated protocols when VRF goes down. Or at least disable associated interfaces. - Also we should check if the master iface is really VRF iface and not some other kind of master iface. - BFD session request dispatch should be aware of VRFs. - Perhaps kernel protocol should read default kernel table ID from VRF iface so it is not necessary to configure it. - Perhaps we should have per-VRF default table.
2285 lines
47 KiB
C
2285 lines
47 KiB
C
/*
|
|
* BIRD Internet Routing Daemon -- Unix I/O
|
|
*
|
|
* (c) 1998--2004 Martin Mares <mj@ucw.cz>
|
|
* (c) 2004 Ondrej Filip <feela@network.cz>
|
|
*
|
|
* Can be freely distributed and used under the terms of the GNU GPL.
|
|
*/
|
|
|
|
/* Unfortunately, some glibc versions hide parts of RFC 3542 API
|
|
if _GNU_SOURCE is not defined. */
|
|
#ifndef _GNU_SOURCE
|
|
#define _GNU_SOURCE
|
|
#endif
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <time.h>
|
|
#include <sys/time.h>
|
|
#include <sys/types.h>
|
|
#include <sys/socket.h>
|
|
#include <sys/uio.h>
|
|
#include <sys/un.h>
|
|
#include <poll.h>
|
|
#include <unistd.h>
|
|
#include <fcntl.h>
|
|
#include <errno.h>
|
|
#include <net/if.h>
|
|
#include <netinet/in.h>
|
|
#include <netinet/tcp.h>
|
|
#include <netinet/udp.h>
|
|
#include <netinet/icmp6.h>
|
|
|
|
#include "nest/bird.h"
|
|
#include "lib/lists.h"
|
|
#include "lib/resource.h"
|
|
#include "lib/timer.h"
|
|
#include "lib/socket.h"
|
|
#include "lib/event.h"
|
|
#include "lib/string.h"
|
|
#include "nest/iface.h"
|
|
|
|
#include "lib/unix.h"
|
|
#include "lib/sysio.h"
|
|
|
|
/* Maximum number of calls of tx handler for one socket in one
|
|
* poll iteration. Should be small enough to not monopolize CPU by
|
|
* one protocol instance.
|
|
*/
|
|
#define MAX_STEPS 4
|
|
|
|
/* Maximum number of calls of rx handler for all sockets in one poll
|
|
iteration. RX callbacks are often much more costly so we limit
|
|
this to get small latencies */
|
|
#define MAX_RX_STEPS 4
|
|
|
|
/*
|
|
* Tracked Files
|
|
*/
|
|
|
|
struct rfile {
|
|
resource r;
|
|
FILE *f;
|
|
};
|
|
|
|
static void
|
|
rf_free(resource *r)
|
|
{
|
|
struct rfile *a = (struct rfile *) r;
|
|
|
|
fclose(a->f);
|
|
}
|
|
|
|
static void
|
|
rf_dump(resource *r)
|
|
{
|
|
struct rfile *a = (struct rfile *) r;
|
|
|
|
debug("(FILE *%p)\n", a->f);
|
|
}
|
|
|
|
static struct resclass rf_class = {
|
|
"FILE",
|
|
sizeof(struct rfile),
|
|
rf_free,
|
|
rf_dump,
|
|
NULL,
|
|
NULL
|
|
};
|
|
|
|
void *
|
|
tracked_fopen(pool *p, char *name, char *mode)
|
|
{
|
|
FILE *f = fopen(name, mode);
|
|
|
|
if (f)
|
|
{
|
|
struct rfile *r = ralloc(p, &rf_class);
|
|
r->f = f;
|
|
}
|
|
return f;
|
|
}
|
|
|
|
/**
|
|
* DOC: Timers
|
|
*
|
|
* Timers are resources which represent a wish of a module to call
|
|
* a function at the specified time. The platform dependent code
|
|
* doesn't guarantee exact timing, only that a timer function
|
|
* won't be called before the requested time.
|
|
*
|
|
* In BIRD, time is represented by values of the &bird_clock_t type
|
|
* which are integral numbers interpreted as a relative number of seconds since
|
|
* some fixed time point in past. The current time can be read
|
|
* from variable @now with reasonable accuracy and is monotonic. There is also
|
|
* a current 'absolute' time in variable @now_real reported by OS.
|
|
*
|
|
* Each timer is described by a &timer structure containing a pointer
|
|
* to the handler function (@hook), data private to this function (@data),
|
|
* time the function should be called at (@expires, 0 for inactive timers),
|
|
* for the other fields see |timer.h|.
|
|
*/
|
|
|
|
#define NEAR_TIMER_LIMIT 4
|
|
|
|
static list near_timers, far_timers;
|
|
static bird_clock_t first_far_timer = TIME_INFINITY;
|
|
|
|
/* now must be different from 0, because 0 is a special value in timer->expires */
|
|
bird_clock_t now = 1, now_real, boot_time;
|
|
|
|
static void
|
|
update_times_plain(void)
|
|
{
|
|
bird_clock_t new_time = time(NULL);
|
|
int delta = new_time - now_real;
|
|
|
|
if ((delta >= 0) && (delta < 60))
|
|
now += delta;
|
|
else if (now_real != 0)
|
|
log(L_WARN "Time jump, delta %d s", delta);
|
|
|
|
now_real = new_time;
|
|
}
|
|
|
|
static void
|
|
update_times_gettime(void)
|
|
{
|
|
struct timespec ts;
|
|
int rv;
|
|
|
|
rv = clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
if (rv != 0)
|
|
die("clock_gettime: %m");
|
|
|
|
if (ts.tv_sec != now) {
|
|
if (ts.tv_sec < now)
|
|
log(L_ERR "Monotonic timer is broken");
|
|
|
|
now = ts.tv_sec;
|
|
now_real = time(NULL);
|
|
}
|
|
}
|
|
|
|
/* Set by init_times(): nonzero when clock_gettime(CLOCK_MONOTONIC) succeeded; selects the time source in update_times() */
static int clock_monotonic_available;
|
|
|
|
static inline void
|
|
update_times(void)
|
|
{
|
|
if (clock_monotonic_available)
|
|
update_times_gettime();
|
|
else
|
|
update_times_plain();
|
|
}
|
|
|
|
static inline void
|
|
init_times(void)
|
|
{
|
|
struct timespec ts;
|
|
clock_monotonic_available = (clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
|
|
if (!clock_monotonic_available)
|
|
log(L_WARN "Monotonic timer is missing");
|
|
}
|
|
|
|
|
|
static void
|
|
tm_free(resource *r)
|
|
{
|
|
timer *t = (timer *) r;
|
|
|
|
tm_stop(t);
|
|
}
|
|
|
|
static void
|
|
tm_dump(resource *r)
|
|
{
|
|
timer *t = (timer *) r;
|
|
|
|
debug("(code %p, data %p, ", t->hook, t->data);
|
|
if (t->randomize)
|
|
debug("rand %d, ", t->randomize);
|
|
if (t->recurrent)
|
|
debug("recur %d, ", t->recurrent);
|
|
if (t->expires)
|
|
debug("expires in %d sec)\n", t->expires - now);
|
|
else
|
|
debug("inactive)\n");
|
|
}
|
|
|
|
static struct resclass tm_class = {
|
|
"Timer",
|
|
sizeof(timer),
|
|
tm_free,
|
|
tm_dump,
|
|
NULL,
|
|
NULL
|
|
};
|
|
|
|
/**
|
|
* tm_new - create a timer
|
|
* @p: pool
|
|
*
|
|
* This function creates a new timer resource and returns
|
|
* a pointer to it. To use the timer, you need to fill in
|
|
* the structure fields and call tm_start() to start timing.
|
|
*/
|
|
timer *
|
|
tm_new(pool *p)
|
|
{
|
|
timer *t = ralloc(p, &tm_class);
|
|
return t;
|
|
}
|
|
|
|
static inline void
|
|
tm_insert_near(timer *t)
|
|
{
|
|
node *n = HEAD(near_timers);
|
|
|
|
while (n->next && (SKIP_BACK(timer, n, n)->expires < t->expires))
|
|
n = n->next;
|
|
insert_node(&t->n, n->prev);
|
|
}
|
|
|
|
/**
|
|
* tm_start - start a timer
|
|
* @t: timer
|
|
* @after: number of seconds the timer should be run after
|
|
*
|
|
* This function schedules the hook function of the timer to
|
|
* be called after @after seconds. If the timer has been already
|
|
* started, it's @expire time is replaced by the new value.
|
|
*
|
|
* You can have set the @randomize field of @t, the timeout
|
|
* will be increased by a random number of seconds chosen
|
|
* uniformly from range 0 .. @randomize.
|
|
*
|
|
* You can call tm_start() from the handler function of the timer
|
|
* to request another run of the timer. Also, you can set the @recurrent
|
|
* field to have the timer re-added automatically with the same timeout.
|
|
*/
|
|
void
|
|
tm_start(timer *t, unsigned after)
|
|
{
|
|
bird_clock_t when;
|
|
|
|
if (t->randomize)
|
|
after += random() % (t->randomize + 1);
|
|
when = now + after;
|
|
if (t->expires == when)
|
|
return;
|
|
if (t->expires)
|
|
rem_node(&t->n);
|
|
t->expires = when;
|
|
if (after <= NEAR_TIMER_LIMIT)
|
|
tm_insert_near(t);
|
|
else
|
|
{
|
|
if (!first_far_timer || first_far_timer > when)
|
|
first_far_timer = when;
|
|
add_tail(&far_timers, &t->n);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* tm_stop - stop a timer
|
|
* @t: timer
|
|
*
|
|
* This function stops a timer. If the timer is already stopped,
|
|
* nothing happens.
|
|
*/
|
|
void
|
|
tm_stop(timer *t)
|
|
{
|
|
if (t->expires)
|
|
{
|
|
rem_node(&t->n);
|
|
t->expires = 0;
|
|
}
|
|
}
|
|
|
|
static void
|
|
tm_dump_them(char *name, list *l)
|
|
{
|
|
node *n;
|
|
timer *t;
|
|
|
|
debug("%s timers:\n", name);
|
|
WALK_LIST(n, *l)
|
|
{
|
|
t = SKIP_BACK(timer, n, n);
|
|
debug("%p ", t);
|
|
tm_dump(&t->r);
|
|
}
|
|
debug("\n");
|
|
}
|
|
|
|
void
|
|
tm_dump_all(void)
|
|
{
|
|
tm_dump_them("Near", &near_timers);
|
|
tm_dump_them("Far", &far_timers);
|
|
}
|
|
|
|
static inline time_t
|
|
tm_first_shot(void)
|
|
{
|
|
time_t x = first_far_timer;
|
|
|
|
if (!EMPTY_LIST(near_timers))
|
|
{
|
|
timer *t = SKIP_BACK(timer, n, HEAD(near_timers));
|
|
if (t->expires < x)
|
|
x = t->expires;
|
|
}
|
|
return x;
|
|
}
|
|
|
|
/* Forward declaration (definition is outside this part of the file); tm_shot() calls it right before running each timer hook */
void io_log_event(void *hook, void *data);
|
|
|
|
static void
|
|
tm_shot(void)
|
|
{
|
|
timer *t;
|
|
node *n, *m;
|
|
|
|
if (first_far_timer <= now)
|
|
{
|
|
bird_clock_t limit = now + NEAR_TIMER_LIMIT;
|
|
first_far_timer = TIME_INFINITY;
|
|
n = HEAD(far_timers);
|
|
while (m = n->next)
|
|
{
|
|
t = SKIP_BACK(timer, n, n);
|
|
if (t->expires <= limit)
|
|
{
|
|
rem_node(n);
|
|
tm_insert_near(t);
|
|
}
|
|
else if (t->expires < first_far_timer)
|
|
first_far_timer = t->expires;
|
|
n = m;
|
|
}
|
|
}
|
|
while ((n = HEAD(near_timers)) -> next)
|
|
{
|
|
int delay;
|
|
t = SKIP_BACK(timer, n, n);
|
|
if (t->expires > now)
|
|
break;
|
|
rem_node(n);
|
|
delay = t->expires - now;
|
|
t->expires = 0;
|
|
if (t->recurrent)
|
|
{
|
|
int i = t->recurrent - delay;
|
|
if (i < 0)
|
|
i = 0;
|
|
tm_start(t, i);
|
|
}
|
|
io_log_event(t->hook, t->data);
|
|
t->hook(t);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* tm_parse_datetime - parse a date and time
|
|
* @x: datetime string
|
|
*
|
|
* tm_parse_datetime() takes a textual representation of
|
|
* a date and time (dd-mm-yyyy hh:mm:ss)
|
|
* and converts it to the corresponding value of type &bird_clock_t.
|
|
*/
|
|
bird_clock_t
|
|
tm_parse_datetime(char *x)
|
|
{
|
|
struct tm tm;
|
|
int n;
|
|
time_t t;
|
|
|
|
if (sscanf(x, "%d-%d-%d %d:%d:%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &tm.tm_hour, &tm.tm_min, &tm.tm_sec, &n) != 6 || x[n])
|
|
return tm_parse_date(x);
|
|
tm.tm_mon--;
|
|
tm.tm_year -= 1900;
|
|
t = mktime(&tm);
|
|
if (t == (time_t) -1)
|
|
return 0;
|
|
return t;
|
|
}
|
|
/**
|
|
* tm_parse_date - parse a date
|
|
* @x: date string
|
|
*
|
|
* tm_parse_date() takes a textual representation of a date (dd-mm-yyyy)
|
|
* and converts it to the corresponding value of type &bird_clock_t.
|
|
*/
|
|
bird_clock_t
|
|
tm_parse_date(char *x)
|
|
{
|
|
struct tm tm;
|
|
int n;
|
|
time_t t;
|
|
|
|
if (sscanf(x, "%d-%d-%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &n) != 3 || x[n])
|
|
return 0;
|
|
tm.tm_mon--;
|
|
tm.tm_year -= 1900;
|
|
tm.tm_hour = tm.tm_min = tm.tm_sec = 0;
|
|
t = mktime(&tm);
|
|
if (t == (time_t) -1)
|
|
return 0;
|
|
return t;
|
|
}
|
|
|
|
static void
|
|
tm_format_reltime(char *x, struct tm *tm, bird_clock_t delta)
|
|
{
|
|
static char *month_names[12] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
|
|
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
|
|
|
|
if (delta < 20*3600)
|
|
bsprintf(x, "%02d:%02d", tm->tm_hour, tm->tm_min);
|
|
else if (delta < 360*86400)
|
|
bsprintf(x, "%s%02d", month_names[tm->tm_mon], tm->tm_mday);
|
|
else
|
|
bsprintf(x, "%d", tm->tm_year+1900);
|
|
}
|
|
|
|
#include "conf/conf.h"
|
|
|
|
/**
|
|
* tm_format_datetime - convert date and time to textual representation
|
|
* @x: destination buffer of size %TM_DATETIME_BUFFER_SIZE
|
|
* @fmt_spec: specification of resulting textual representation of the time
|
|
* @t: time
|
|
*
|
|
* This function formats the given relative time value @t to a textual
|
|
* date/time representation (dd-mm-yyyy hh:mm:ss) in real time.
|
|
*/
|
|
void
|
|
tm_format_datetime(char *x, struct timeformat *fmt_spec, bird_clock_t t)
|
|
{
|
|
const char *fmt_used;
|
|
struct tm *tm;
|
|
bird_clock_t delta = now - t;
|
|
t = now_real - delta;
|
|
tm = localtime(&t);
|
|
|
|
if (fmt_spec->fmt1 == NULL)
|
|
return tm_format_reltime(x, tm, delta);
|
|
|
|
if ((fmt_spec->limit == 0) || (delta < fmt_spec->limit))
|
|
fmt_used = fmt_spec->fmt1;
|
|
else
|
|
fmt_used = fmt_spec->fmt2;
|
|
|
|
int rv = strftime(x, TM_DATETIME_BUFFER_SIZE, fmt_used, tm);
|
|
if (((rv == 0) && fmt_used[0]) || (rv == TM_DATETIME_BUFFER_SIZE))
|
|
strcpy(x, "<too-long>");
|
|
}
|
|
|
|
|
|
/**
|
|
* DOC: Sockets
|
|
*
|
|
* Socket resources represent network connections. Their data structure (&socket)
|
|
* contains a lot of fields defining the exact type of the socket, the local and
|
|
* remote addresses and ports, pointers to socket buffers and finally pointers to
|
|
* hook functions to be called when new data have arrived to the receive buffer
|
|
* (@rx_hook), when the contents of the transmit buffer have been transmitted
|
|
* (@tx_hook) and when an error or connection close occurs (@err_hook).
|
|
*
|
|
* Freeing of sockets from inside socket hooks is perfectly safe.
|
|
*/
|
|
|
|
#ifndef SOL_IP
|
|
#define SOL_IP IPPROTO_IP
|
|
#endif
|
|
|
|
#ifndef SOL_IPV6
|
|
#define SOL_IPV6 IPPROTO_IPV6
|
|
#endif
|
|
|
|
#ifndef SOL_ICMPV6
|
|
#define SOL_ICMPV6 IPPROTO_ICMPV6
|
|
#endif
|
|
|
|
|
|
/*
|
|
* Sockaddr helper functions
|
|
*/
|
|
|
|
/* Size of the socket address structure for @af; anything but AF_INET is treated as IPv6. */
static inline int UNUSED sockaddr_length(int af)
{
  if (af == AF_INET)
    return sizeof(struct sockaddr_in);

  return sizeof(struct sockaddr_in6);
}
|
|
|
|
static inline void
|
|
sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, uint port)
|
|
{
|
|
memset(sa, 0, sizeof(struct sockaddr_in));
|
|
#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
|
|
sa->sin_len = sizeof(struct sockaddr_in);
|
|
#endif
|
|
sa->sin_family = AF_INET;
|
|
sa->sin_port = htons(port);
|
|
sa->sin_addr = ipa_to_in4(a);
|
|
}
|
|
|
|
static inline void
|
|
sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
|
|
{
|
|
memset(sa, 0, sizeof(struct sockaddr_in6));
|
|
#ifdef SIN6_LEN
|
|
sa->sin6_len = sizeof(struct sockaddr_in6);
|
|
#endif
|
|
sa->sin6_family = AF_INET6;
|
|
sa->sin6_port = htons(port);
|
|
sa->sin6_flowinfo = 0;
|
|
sa->sin6_addr = ipa_to_in6(a);
|
|
|
|
if (ifa && ipa_is_link_local(a))
|
|
sa->sin6_scope_id = ifa->index;
|
|
}
|
|
|
|
void
|
|
sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
|
|
{
|
|
if (af == AF_INET)
|
|
sockaddr_fill4((struct sockaddr_in *) sa, a, port);
|
|
else if (af == AF_INET6)
|
|
sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
|
|
else
|
|
bug("Unknown AF");
|
|
}
|
|
|
|
static inline void
|
|
sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, uint *port)
|
|
{
|
|
*port = ntohs(sa->sin_port);
|
|
*a = ipa_from_in4(sa->sin_addr);
|
|
}
|
|
|
|
static inline void
|
|
sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
|
|
{
|
|
*port = ntohs(sa->sin6_port);
|
|
*a = ipa_from_in6(sa->sin6_addr);
|
|
|
|
if (ifa && ipa_is_link_local(*a))
|
|
*ifa = if_find_by_index(sa->sin6_scope_id);
|
|
}
|
|
|
|
int
|
|
sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
|
|
{
|
|
if (sa->sa.sa_family != af)
|
|
goto fail;
|
|
|
|
if (af == AF_INET)
|
|
sockaddr_read4((struct sockaddr_in *) sa, a, port);
|
|
else if (af == AF_INET6)
|
|
sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
|
|
else
|
|
goto fail;
|
|
|
|
return 0;
|
|
|
|
fail:
|
|
*a = IPA_NONE;
|
|
*port = 0;
|
|
return -1;
|
|
}
|
|
|
|
|
|
/*
|
|
* IPv6 multicast syscalls
|
|
*/
|
|
|
|
/* Fortunately standardized in RFC 3493 */
|
|
|
|
/* Initializer for struct ipv6_mreq: group @maddr on interface @ifa (a struct iface pointer expression) */
#define INIT_MREQ6(maddr,ifa) \
  { .ipv6mr_multiaddr = ipa_to_in6(maddr), .ipv6mr_interface = (ifa)->index }
|
|
|
|
static inline int
|
|
sk_setup_multicast6(sock *s)
|
|
{
|
|
int index = s->iface->index;
|
|
int ttl = s->ttl;
|
|
int n = 0;
|
|
|
|
if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &index, sizeof(index)) < 0)
|
|
ERR("IPV6_MULTICAST_IF");
|
|
|
|
if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &ttl, sizeof(ttl)) < 0)
|
|
ERR("IPV6_MULTICAST_HOPS");
|
|
|
|
if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &n, sizeof(n)) < 0)
|
|
ERR("IPV6_MULTICAST_LOOP");
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline int
|
|
sk_join_group6(sock *s, ip_addr maddr)
|
|
{
|
|
struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
|
|
|
|
if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mr, sizeof(mr)) < 0)
|
|
ERR("IPV6_JOIN_GROUP");
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline int
|
|
sk_leave_group6(sock *s, ip_addr maddr)
|
|
{
|
|
struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
|
|
|
|
if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mr, sizeof(mr)) < 0)
|
|
ERR("IPV6_LEAVE_GROUP");
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*
|
|
* IPv6 packet control messages
|
|
*/
|
|
|
|
/* Also standardized, in RFC 3542 */
|
|
|
|
/*
|
|
* RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
|
|
* type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
|
|
* don't have IPV6_RECVPKTINFO we suppose the OS implements the older
|
|
* RFC and we use IPV6_PKTINFO.
|
|
*/
|
|
#ifndef IPV6_RECVPKTINFO
|
|
#define IPV6_RECVPKTINFO IPV6_PKTINFO
|
|
#endif
|
|
/*
|
|
* Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
|
|
*/
|
|
#ifndef IPV6_RECVHOPLIMIT
|
|
#define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
|
|
#endif
|
|
|
|
|
|
#define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
|
|
#define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
|
|
|
|
static inline int
|
|
sk_request_cmsg6_pktinfo(sock *s)
|
|
{
|
|
int y = 1;
|
|
|
|
if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &y, sizeof(y)) < 0)
|
|
ERR("IPV6_RECVPKTINFO");
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline int
|
|
sk_request_cmsg6_ttl(sock *s)
|
|
{
|
|
int y = 1;
|
|
|
|
if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &y, sizeof(y)) < 0)
|
|
ERR("IPV6_RECVHOPLIMIT");
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline void
|
|
sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
|
|
{
|
|
if (cm->cmsg_type == IPV6_PKTINFO)
|
|
{
|
|
struct in6_pktinfo *pi = (struct in6_pktinfo *) CMSG_DATA(cm);
|
|
s->laddr = ipa_from_in6(pi->ipi6_addr);
|
|
s->lifindex = pi->ipi6_ifindex;
|
|
}
|
|
}
|
|
|
|
static inline void
|
|
sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
|
|
{
|
|
if (cm->cmsg_type == IPV6_HOPLIMIT)
|
|
s->rcv_ttl = * (int *) CMSG_DATA(cm);
|
|
}
|
|
|
|
static inline void
|
|
sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
|
|
{
|
|
struct cmsghdr *cm;
|
|
struct in6_pktinfo *pi;
|
|
int controllen = 0;
|
|
|
|
msg->msg_control = cbuf;
|
|
msg->msg_controllen = cbuflen;
|
|
|
|
cm = CMSG_FIRSTHDR(msg);
|
|
cm->cmsg_level = SOL_IPV6;
|
|
cm->cmsg_type = IPV6_PKTINFO;
|
|
cm->cmsg_len = CMSG_LEN(sizeof(*pi));
|
|
controllen += CMSG_SPACE(sizeof(*pi));
|
|
|
|
pi = (struct in6_pktinfo *) CMSG_DATA(cm);
|
|
pi->ipi6_ifindex = s->iface ? s->iface->index : 0;
|
|
pi->ipi6_addr = ipa_to_in6(s->saddr);
|
|
|
|
msg->msg_controllen = controllen;
|
|
}
|
|
|
|
|
|
/*
|
|
* Miscellaneous socket syscalls
|
|
*/
|
|
|
|
static inline int
|
|
sk_set_ttl4(sock *s, int ttl)
|
|
{
|
|
if (setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
|
|
ERR("IP_TTL");
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline int
|
|
sk_set_ttl6(sock *s, int ttl)
|
|
{
|
|
if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) < 0)
|
|
ERR("IPV6_UNICAST_HOPS");
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline int
|
|
sk_set_tos4(sock *s, int tos)
|
|
{
|
|
if (setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos)) < 0)
|
|
ERR("IP_TOS");
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline int
|
|
sk_set_tos6(sock *s, int tos)
|
|
{
|
|
if (setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos)) < 0)
|
|
ERR("IPV6_TCLASS");
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline int
|
|
sk_set_high_port(sock *s UNUSED)
|
|
{
|
|
/* Port range setting is optional, ignore it if not supported */
|
|
|
|
#ifdef IP_PORTRANGE
|
|
if (sk_is_ipv4(s))
|
|
{
|
|
int range = IP_PORTRANGE_HIGH;
|
|
if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
|
|
ERR("IP_PORTRANGE");
|
|
}
|
|
#endif
|
|
|
|
#ifdef IPV6_PORTRANGE
|
|
if (sk_is_ipv6(s))
|
|
{
|
|
int range = IPV6_PORTRANGE_HIGH;
|
|
if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
|
|
ERR("IPV6_PORTRANGE");
|
|
}
|
|
#endif
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Step over the IPv4 header at the start of @pkt.
 *
 * @len holds the packet length on input. Returns NULL when the packet is
 * shorter than 20 bytes, its version nibble is not 4, or the header length
 * taken from the low nibble (in 32-bit words) is below 20 bytes or exceeds
 * the packet. Otherwise *@len is reduced by the header length and a pointer
 * to the first byte after the header is returned.
 */
static inline byte *
sk_skip_ip_header(byte *pkt, int *len)
{
  /* Too short to hold even a minimal header: do not look at the data */
  if (*len < 20)
    return NULL;

  if ((pkt[0] & 0xf0) != 0x40)
    return NULL;

  int hlen = (pkt[0] & 0x0f) * 4;
  if ((hlen < 20) || (hlen > *len))
    return NULL;

  *len -= hlen;
  return pkt + hlen;
}
|
|
|
|
byte *
|
|
sk_rx_buffer(sock *s, int *len)
|
|
{
|
|
if (sk_is_ipv4(s) && (s->type == SK_IP))
|
|
return sk_skip_ip_header(s->rbuf, len);
|
|
else
|
|
return s->rbuf;
|
|
}
|
|
|
|
|
|
/*
|
|
* Public socket functions
|
|
*/
|
|
|
|
/**
|
|
* sk_setup_multicast - enable multicast for given socket
|
|
* @s: socket
|
|
*
|
|
* Prepare transmission of multicast packets for given datagram socket.
|
|
* The socket must have defined @iface.
|
|
*
|
|
* Result: 0 for success, -1 for an error.
|
|
*/
|
|
|
|
int
|
|
sk_setup_multicast(sock *s)
|
|
{
|
|
ASSERT(s->iface);
|
|
|
|
if (sk_is_ipv4(s))
|
|
return sk_setup_multicast4(s);
|
|
else
|
|
return sk_setup_multicast6(s);
|
|
}
|
|
|
|
/**
|
|
* sk_join_group - join multicast group for given socket
|
|
* @s: socket
|
|
* @maddr: multicast address
|
|
*
|
|
* Join multicast group for given datagram socket and associated interface.
|
|
* The socket must have defined @iface.
|
|
*
|
|
* Result: 0 for success, -1 for an error.
|
|
*/
|
|
|
|
int
|
|
sk_join_group(sock *s, ip_addr maddr)
|
|
{
|
|
if (sk_is_ipv4(s))
|
|
return sk_join_group4(s, maddr);
|
|
else
|
|
return sk_join_group6(s, maddr);
|
|
}
|
|
|
|
/**
|
|
* sk_leave_group - leave multicast group for given socket
|
|
* @s: socket
|
|
* @maddr: multicast address
|
|
*
|
|
* Leave multicast group for given datagram socket and associated interface.
|
|
* The socket must have defined @iface.
|
|
*
|
|
* Result: 0 for success, -1 for an error.
|
|
*/
|
|
|
|
int
|
|
sk_leave_group(sock *s, ip_addr maddr)
|
|
{
|
|
if (sk_is_ipv4(s))
|
|
return sk_leave_group4(s, maddr);
|
|
else
|
|
return sk_leave_group6(s, maddr);
|
|
}
|
|
|
|
/**
|
|
* sk_setup_broadcast - enable broadcast for given socket
|
|
* @s: socket
|
|
*
|
|
* Allow reception and transmission of broadcast packets for given datagram
|
|
* socket. The socket must have defined @iface. For transmission, packets should
|
|
* be send to @brd address of @iface.
|
|
*
|
|
* Result: 0 for success, -1 for an error.
|
|
*/
|
|
|
|
int
|
|
sk_setup_broadcast(sock *s)
|
|
{
|
|
int y = 1;
|
|
|
|
if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &y, sizeof(y)) < 0)
|
|
ERR("SO_BROADCAST");
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* sk_set_ttl - set transmit TTL for given socket
|
|
* @s: socket
|
|
* @ttl: TTL value
|
|
*
|
|
* Set TTL for already opened connections when TTL was not set before. Useful
|
|
* for accepted connections when different ones should have different TTL.
|
|
*
|
|
* Result: 0 for success, -1 for an error.
|
|
*/
|
|
|
|
int
|
|
sk_set_ttl(sock *s, int ttl)
|
|
{
|
|
s->ttl = ttl;
|
|
|
|
if (sk_is_ipv4(s))
|
|
return sk_set_ttl4(s, ttl);
|
|
else
|
|
return sk_set_ttl6(s, ttl);
|
|
}
|
|
|
|
/**
|
|
* sk_set_min_ttl - set minimal accepted TTL for given socket
|
|
* @s: socket
|
|
* @ttl: TTL value
|
|
*
|
|
* Set minimal accepted TTL for given socket. Can be used for TTL security.
|
|
* implementations.
|
|
*
|
|
* Result: 0 for success, -1 for an error.
|
|
*/
|
|
|
|
int
|
|
sk_set_min_ttl(sock *s, int ttl)
|
|
{
|
|
if (sk_is_ipv4(s))
|
|
return sk_set_min_ttl4(s, ttl);
|
|
else
|
|
return sk_set_min_ttl6(s, ttl);
|
|
}
|
|
|
|
#if 0
|
|
/**
|
|
* sk_set_md5_auth - add / remove MD5 security association for given socket
|
|
* @s: socket
|
|
* @local: IP address of local side
|
|
* @remote: IP address of remote side
|
|
* @ifa: Interface for link-local IP address
|
|
* @passwd: Password used for MD5 authentication
|
|
* @setkey: Update also system SA/SP database
|
|
*
|
|
* In TCP MD5 handling code in kernel, there is a set of security associations
|
|
* used for choosing password and other authentication parameters according to
|
|
* the local and remote address. This function is useful for listening socket,
|
|
* for active sockets it may be enough to set s->password field.
|
|
*
|
|
* When called with passwd != NULL, the new pair is added,
|
|
* When called with passwd == NULL, the existing pair is removed.
|
|
*
|
|
* Note that while in Linux, the MD5 SAs are specific to socket, in BSD they are
|
|
* stored in global SA/SP database (but the behavior also must be enabled on
|
|
* per-socket basis). In case of multiple sockets to the same neighbor, the
|
|
* socket-specific state must be configured for each socket while global state
|
|
* just once per src-dst pair. The @setkey argument controls whether the global
|
|
* state (SA/SP database) is also updated.
|
|
*
|
|
* Result: 0 for success, -1 for an error.
|
|
*/
|
|
|
|
int
|
|
sk_set_md5_auth(sock *s, ip_addr local, ip_addr remote, struct iface *ifa, char *passwd, int setkey)
|
|
{ DUMMY; }
|
|
#endif
|
|
|
|
/**
|
|
* sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
|
|
* @s: socket
|
|
* @offset: offset
|
|
*
|
|
* Specify IPv6 checksum field offset for given raw IPv6 socket. After that, the
|
|
* kernel will automatically fill it for outgoing packets and check it for
|
|
* incoming packets. Should not be used on ICMPv6 sockets, where the position is
|
|
* known to the kernel.
|
|
*
|
|
* Result: 0 for success, -1 for an error.
|
|
*/
|
|
|
|
int
|
|
sk_set_ipv6_checksum(sock *s, int offset)
|
|
{
|
|
if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
|
|
ERR("IPV6_CHECKSUM");
|
|
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
sk_set_icmp6_filter(sock *s, int p1, int p2)
|
|
{
|
|
/* a bit of lame interface, but it is here only for Radv */
|
|
struct icmp6_filter f;
|
|
|
|
ICMP6_FILTER_SETBLOCKALL(&f);
|
|
ICMP6_FILTER_SETPASS(p1, &f);
|
|
ICMP6_FILTER_SETPASS(p2, &f);
|
|
|
|
if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
|
|
ERR("ICMP6_FILTER");
|
|
|
|
return 0;
|
|
}
|
|
|
|
void
|
|
sk_log_error(sock *s, const char *p)
|
|
{
|
|
log(L_ERR "%s: Socket error: %s%#m", p, s->err);
|
|
}
|
|
|
|
|
|
/*
|
|
* Actual struct birdsock code
|
|
*/
|
|
|
|
/* List of sockets registered through sk_insert() */
static list sock_list;
/* Socket cursors; sk_free() moves them to the next socket when the one they point to is freed */
static struct birdsock *current_sock;
static struct birdsock *stored_sock;
|
|
|
|
/* Return the socket following @s on its list, or NULL when @s is the last one. */
static inline sock *
sk_next(sock *s)
{
  node *succ = s->n.next;

  /* A node without a successor is the list's tail sentinel, not a socket */
  return succ->next ? SKIP_BACK(sock, n, succ) : NULL;
}
|
|
|
|
static void
|
|
sk_alloc_bufs(sock *s)
|
|
{
|
|
if (!s->rbuf && s->rbsize)
|
|
s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
|
|
s->rpos = s->rbuf;
|
|
if (!s->tbuf && s->tbsize)
|
|
s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
|
|
s->tpos = s->ttx = s->tbuf;
|
|
}
|
|
|
|
static void
|
|
sk_free_bufs(sock *s)
|
|
{
|
|
if (s->rbuf_alloc)
|
|
{
|
|
xfree(s->rbuf_alloc);
|
|
s->rbuf = s->rbuf_alloc = NULL;
|
|
}
|
|
if (s->tbuf_alloc)
|
|
{
|
|
xfree(s->tbuf_alloc);
|
|
s->tbuf = s->tbuf_alloc = NULL;
|
|
}
|
|
}
|
|
|
|
static void
|
|
sk_free(resource *r)
|
|
{
|
|
sock *s = (sock *) r;
|
|
|
|
sk_free_bufs(s);
|
|
if (s->fd >= 0)
|
|
{
|
|
close(s->fd);
|
|
|
|
/* FIXME: we should call sk_stop() for SKF_THREAD sockets */
|
|
if (s->flags & SKF_THREAD)
|
|
return;
|
|
|
|
if (s == current_sock)
|
|
current_sock = sk_next(s);
|
|
if (s == stored_sock)
|
|
stored_sock = sk_next(s);
|
|
rem_node(&s->n);
|
|
}
|
|
}
|
|
|
|
void
|
|
sk_set_rbsize(sock *s, uint val)
|
|
{
|
|
ASSERT(s->rbuf_alloc == s->rbuf);
|
|
|
|
if (s->rbsize == val)
|
|
return;
|
|
|
|
s->rbsize = val;
|
|
xfree(s->rbuf_alloc);
|
|
s->rbuf_alloc = xmalloc(val);
|
|
s->rpos = s->rbuf = s->rbuf_alloc;
|
|
}
|
|
|
|
void
|
|
sk_set_tbsize(sock *s, uint val)
|
|
{
|
|
ASSERT(s->tbuf_alloc == s->tbuf);
|
|
|
|
if (s->tbsize == val)
|
|
return;
|
|
|
|
byte *old_tbuf = s->tbuf;
|
|
|
|
s->tbsize = val;
|
|
s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
|
|
s->tpos = s->tbuf + (s->tpos - old_tbuf);
|
|
s->ttx = s->tbuf + (s->ttx - old_tbuf);
|
|
}
|
|
|
|
void
|
|
sk_set_tbuf(sock *s, void *tbuf)
|
|
{
|
|
s->tbuf = tbuf ?: s->tbuf_alloc;
|
|
s->ttx = s->tpos = s->tbuf;
|
|
}
|
|
|
|
void
|
|
sk_reallocate(sock *s)
|
|
{
|
|
sk_free_bufs(s);
|
|
sk_alloc_bufs(s);
|
|
}
|
|
|
|
static void
|
|
sk_dump(resource *r)
|
|
{
|
|
sock *s = (sock *) r;
|
|
static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "DEL!" };
|
|
|
|
debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
|
|
sk_type_names[s->type],
|
|
s->data,
|
|
s->saddr,
|
|
s->sport,
|
|
s->daddr,
|
|
s->dport,
|
|
s->tos,
|
|
s->ttl,
|
|
s->iface ? s->iface->name : "none");
|
|
}
|
|
|
|
static struct resclass sk_class = {
|
|
"Socket",
|
|
sizeof(sock),
|
|
sk_free,
|
|
sk_dump,
|
|
NULL,
|
|
NULL
|
|
};
|
|
|
|
/**
|
|
* sk_new - create a socket
|
|
* @p: pool
|
|
*
|
|
* This function creates a new socket resource. If you want to use it,
|
|
* you need to fill in all the required fields of the structure and
|
|
* call sk_open() to do the actual opening of the socket.
|
|
*
|
|
* The real function name is sock_new(), sk_new() is a macro wrapper
|
|
* to avoid collision with OpenSSL.
|
|
*/
|
|
sock *
|
|
sock_new(pool *p)
|
|
{
|
|
sock *s = ralloc(p, &sk_class);
|
|
s->pool = p;
|
|
// s->saddr = s->daddr = IPA_NONE;
|
|
s->tos = s->priority = s->ttl = -1;
|
|
s->fd = -1;
|
|
return s;
|
|
}
|
|
|
|
static int
|
|
sk_setup(sock *s)
|
|
{
|
|
int y = 1;
|
|
int fd = s->fd;
|
|
|
|
if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
|
|
ERR("O_NONBLOCK");
|
|
|
|
if (!s->af)
|
|
return 0;
|
|
|
|
if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
|
|
s->flags |= SKF_PKTINFO;
|
|
|
|
#ifdef CONFIG_USE_HDRINCL
|
|
if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
|
|
{
|
|
s->flags &= ~SKF_PKTINFO;
|
|
s->flags |= SKF_HDRINCL;
|
|
if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
|
|
ERR("IP_HDRINCL");
|
|
}
|
|
#endif
|
|
|
|
if (s->vrf && !s->iface)
|
|
{
|
|
/* Bind socket to associated VRF interface.
|
|
This is Linux-specific, but so is SO_BINDTODEVICE. */
|
|
#ifdef SO_BINDTODEVICE
|
|
struct ifreq ifr = {};
|
|
strcpy(ifr.ifr_name, s->vrf->name);
|
|
if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
|
|
ERR("SO_BINDTODEVICE");
|
|
#endif
|
|
}
|
|
|
|
if (s->iface)
|
|
{
|
|
#ifdef SO_BINDTODEVICE
|
|
struct ifreq ifr = {};
|
|
strcpy(ifr.ifr_name, s->iface->name);
|
|
if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
|
|
ERR("SO_BINDTODEVICE");
|
|
#endif
|
|
|
|
#ifdef CONFIG_UNIX_DONTROUTE
|
|
if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
|
|
ERR("SO_DONTROUTE");
|
|
#endif
|
|
}
|
|
|
|
if (s->priority >= 0)
|
|
if (sk_set_priority(s, s->priority) < 0)
|
|
return -1;
|
|
|
|
if (sk_is_ipv4(s))
|
|
{
|
|
if (s->flags & SKF_LADDR_RX)
|
|
if (sk_request_cmsg4_pktinfo(s) < 0)
|
|
return -1;
|
|
|
|
if (s->flags & SKF_TTL_RX)
|
|
if (sk_request_cmsg4_ttl(s) < 0)
|
|
return -1;
|
|
|
|
if ((s->type == SK_UDP) || (s->type == SK_IP))
|
|
if (sk_disable_mtu_disc4(s) < 0)
|
|
return -1;
|
|
|
|
if (s->ttl >= 0)
|
|
if (sk_set_ttl4(s, s->ttl) < 0)
|
|
return -1;
|
|
|
|
if (s->tos >= 0)
|
|
if (sk_set_tos4(s, s->tos) < 0)
|
|
return -1;
|
|
}
|
|
|
|
if (sk_is_ipv6(s))
|
|
{
|
|
if (s->flags & SKF_V6ONLY)
|
|
if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
|
|
ERR("IPV6_V6ONLY");
|
|
|
|
if (s->flags & SKF_LADDR_RX)
|
|
if (sk_request_cmsg6_pktinfo(s) < 0)
|
|
return -1;
|
|
|
|
if (s->flags & SKF_TTL_RX)
|
|
if (sk_request_cmsg6_ttl(s) < 0)
|
|
return -1;
|
|
|
|
if ((s->type == SK_UDP) || (s->type == SK_IP))
|
|
if (sk_disable_mtu_disc6(s) < 0)
|
|
return -1;
|
|
|
|
if (s->ttl >= 0)
|
|
if (sk_set_ttl6(s, s->ttl) < 0)
|
|
return -1;
|
|
|
|
if (s->tos >= 0)
|
|
if (sk_set_tos6(s, s->tos) < 0)
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
sk_insert(sock *s)
|
|
{
|
|
add_tail(&sock_list, &s->n);
|
|
}
|
|
|
|
static void
|
|
sk_tcp_connected(sock *s)
|
|
{
|
|
sockaddr sa;
|
|
int sa_len = sizeof(sa);
|
|
|
|
if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
|
|
(sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
|
|
log(L_WARN "SOCK: Cannot get local IP address for TCP>");
|
|
|
|
s->type = SK_TCP;
|
|
sk_alloc_bufs(s);
|
|
s->tx_hook(s);
|
|
}
|
|
|
|
static int
|
|
sk_passive_connected(sock *s, int type)
|
|
{
|
|
sockaddr loc_sa, rem_sa;
|
|
int loc_sa_len = sizeof(loc_sa);
|
|
int rem_sa_len = sizeof(rem_sa);
|
|
|
|
int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
|
|
if (fd < 0)
|
|
{
|
|
if ((errno != EINTR) && (errno != EAGAIN))
|
|
s->err_hook(s, errno);
|
|
return 0;
|
|
}
|
|
|
|
sock *t = sk_new(s->pool);
|
|
t->type = type;
|
|
t->fd = fd;
|
|
t->af = s->af;
|
|
t->ttl = s->ttl;
|
|
t->tos = s->tos;
|
|
t->rbsize = s->rbsize;
|
|
t->tbsize = s->tbsize;
|
|
|
|
if (type == SK_TCP)
|
|
{
|
|
if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
|
|
(sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
|
|
log(L_WARN "SOCK: Cannot get local IP address for TCP<");
|
|
|
|
if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
|
|
log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
|
|
}
|
|
|
|
if (sk_setup(t) < 0)
|
|
{
|
|
/* FIXME: Call err_hook instead ? */
|
|
log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);
|
|
|
|
/* FIXME: handle it better in rfree() */
|
|
close(t->fd);
|
|
t->fd = -1;
|
|
rfree(t);
|
|
return 1;
|
|
}
|
|
|
|
sk_insert(t);
|
|
sk_alloc_bufs(t);
|
|
s->rx_hook(t, 0);
|
|
return 1;
|
|
}
|
|
|
|
/**
|
|
* sk_open - open a socket
|
|
* @s: socket
|
|
*
|
|
* This function takes a socket resource created by sk_new() and
|
|
* initialized by the user and binds a corresponding network connection
|
|
* to it.
|
|
*
|
|
* Result: 0 for success, -1 for an error.
|
|
*/
|
|
int
|
|
sk_open(sock *s)
|
|
{
|
|
int af = BIRD_AF;
|
|
int fd = -1;
|
|
int do_bind = 0;
|
|
int bind_port = 0;
|
|
ip_addr bind_addr = IPA_NONE;
|
|
sockaddr sa;
|
|
|
|
switch (s->type)
|
|
{
|
|
case SK_TCP_ACTIVE:
|
|
s->ttx = ""; /* Force s->ttx != s->tpos */
|
|
/* Fall thru */
|
|
case SK_TCP_PASSIVE:
|
|
fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
|
|
bind_port = s->sport;
|
|
bind_addr = s->saddr;
|
|
do_bind = bind_port || ipa_nonzero(bind_addr);
|
|
break;
|
|
|
|
case SK_UDP:
|
|
fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
|
|
bind_port = s->sport;
|
|
bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
|
|
do_bind = 1;
|
|
break;
|
|
|
|
case SK_IP:
|
|
fd = socket(af, SOCK_RAW, s->dport);
|
|
bind_port = 0;
|
|
bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
|
|
do_bind = ipa_nonzero(bind_addr);
|
|
break;
|
|
|
|
case SK_MAGIC:
|
|
af = 0;
|
|
fd = s->fd;
|
|
break;
|
|
|
|
default:
|
|
bug("sk_open() called for invalid sock type %d", s->type);
|
|
}
|
|
|
|
if (fd < 0)
|
|
ERR("socket");
|
|
|
|
s->af = af;
|
|
s->fd = fd;
|
|
|
|
if (sk_setup(s) < 0)
|
|
goto err;
|
|
|
|
if (do_bind)
|
|
{
|
|
if (bind_port)
|
|
{
|
|
int y = 1;
|
|
|
|
if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
|
|
ERR2("SO_REUSEADDR");
|
|
|
|
#ifdef CONFIG_NO_IFACE_BIND
|
|
/* Workaround missing ability to bind to an iface */
|
|
if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
|
|
{
|
|
if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &y, sizeof(y)) < 0)
|
|
ERR2("SO_REUSEPORT");
|
|
}
|
|
#endif
|
|
}
|
|
else
|
|
if (s->flags & SKF_HIGH_PORT)
|
|
if (sk_set_high_port(s) < 0)
|
|
log(L_WARN "Socket error: %s%#m", s->err);
|
|
|
|
sockaddr_fill(&sa, af, bind_addr, s->iface, bind_port);
|
|
if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
|
|
ERR2("bind");
|
|
}
|
|
|
|
if (s->password)
|
|
if (sk_set_md5_auth(s, s->saddr, s->daddr, s->iface, s->password, 0) < 0)
|
|
goto err;
|
|
|
|
switch (s->type)
|
|
{
|
|
case SK_TCP_ACTIVE:
|
|
sockaddr_fill(&sa, af, s->daddr, s->iface, s->dport);
|
|
if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
|
|
sk_tcp_connected(s);
|
|
else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
|
|
errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
|
|
ERR2("connect");
|
|
break;
|
|
|
|
case SK_TCP_PASSIVE:
|
|
if (listen(fd, 8) < 0)
|
|
ERR2("listen");
|
|
break;
|
|
|
|
case SK_MAGIC:
|
|
break;
|
|
|
|
default:
|
|
sk_alloc_bufs(s);
|
|
}
|
|
|
|
if (!(s->flags & SKF_THREAD))
|
|
sk_insert(s);
|
|
return 0;
|
|
|
|
err:
|
|
close(fd);
|
|
s->fd = -1;
|
|
return -1;
|
|
}
|
|
|
|
int
|
|
sk_open_unix(sock *s, char *name)
|
|
{
|
|
struct sockaddr_un sa;
|
|
int fd;
|
|
|
|
/* We are sloppy during error (leak fd and not set s->err), but we die anyway */
|
|
|
|
fd = socket(AF_UNIX, SOCK_STREAM, 0);
|
|
if (fd < 0)
|
|
return -1;
|
|
|
|
if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
|
|
return -1;
|
|
|
|
/* Path length checked in test_old_bird() */
|
|
sa.sun_family = AF_UNIX;
|
|
strcpy(sa.sun_path, name);
|
|
|
|
if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
|
|
return -1;
|
|
|
|
if (listen(fd, 8) < 0)
|
|
return -1;
|
|
|
|
s->fd = fd;
|
|
sk_insert(s);
|
|
return 0;
|
|
}
|
|
|
|
|
|
/* Control buffer size for receiving: pktinfo plus TTL, larger of IPv4/IPv6 */
#define CMSG_RX_SPACE \
  MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)

/* Control buffer size for sending: pktinfo only, larger of IPv4/IPv6 */
#define CMSG_TX_SPACE \
  MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)
|
|
|
|
static void
|
|
sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
|
|
{
|
|
if (sk_is_ipv4(s))
|
|
sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
|
|
else
|
|
sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
|
|
}
|
|
|
|
static void
|
|
sk_process_cmsgs(sock *s, struct msghdr *msg)
|
|
{
|
|
struct cmsghdr *cm;
|
|
|
|
s->laddr = IPA_NONE;
|
|
s->lifindex = 0;
|
|
s->rcv_ttl = -1;
|
|
|
|
for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
|
|
{
|
|
if ((cm->cmsg_level == SOL_IP) && sk_is_ipv4(s))
|
|
{
|
|
sk_process_cmsg4_pktinfo(s, cm);
|
|
sk_process_cmsg4_ttl(s, cm);
|
|
}
|
|
|
|
if ((cm->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
|
|
{
|
|
sk_process_cmsg6_pktinfo(s, cm);
|
|
sk_process_cmsg6_ttl(s, cm);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
static inline int
|
|
sk_sendmsg(sock *s)
|
|
{
|
|
struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
|
|
byte cmsg_buf[CMSG_TX_SPACE];
|
|
sockaddr dst;
|
|
|
|
sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);
|
|
|
|
struct msghdr msg = {
|
|
.msg_name = &dst.sa,
|
|
.msg_namelen = SA_LEN(dst),
|
|
.msg_iov = &iov,
|
|
.msg_iovlen = 1
|
|
};
|
|
|
|
#ifdef CONFIG_USE_HDRINCL
|
|
byte hdr[20];
|
|
struct iovec iov2[2] = { {hdr, 20}, iov };
|
|
|
|
if (s->flags & SKF_HDRINCL)
|
|
{
|
|
sk_prepare_ip_header(s, hdr, iov.iov_len);
|
|
msg.msg_iov = iov2;
|
|
msg.msg_iovlen = 2;
|
|
}
|
|
#endif
|
|
|
|
if (s->flags & SKF_PKTINFO)
|
|
sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));
|
|
|
|
return sendmsg(s->fd, &msg, 0);
|
|
}
|
|
|
|
static inline int
|
|
sk_recvmsg(sock *s)
|
|
{
|
|
struct iovec iov = {s->rbuf, s->rbsize};
|
|
byte cmsg_buf[CMSG_RX_SPACE];
|
|
sockaddr src;
|
|
|
|
struct msghdr msg = {
|
|
.msg_name = &src.sa,
|
|
.msg_namelen = sizeof(src), // XXXX ??
|
|
.msg_iov = &iov,
|
|
.msg_iovlen = 1,
|
|
.msg_control = cmsg_buf,
|
|
.msg_controllen = sizeof(cmsg_buf),
|
|
.msg_flags = 0
|
|
};
|
|
|
|
int rv = recvmsg(s->fd, &msg, 0);
|
|
if (rv < 0)
|
|
return rv;
|
|
|
|
//ifdef IPV4
|
|
// if (cf_type == SK_IP)
|
|
// rv = ipv4_skip_header(pbuf, rv);
|
|
//endif
|
|
|
|
sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
|
|
sk_process_cmsgs(s, &msg);
|
|
|
|
if (msg.msg_flags & MSG_TRUNC)
|
|
s->flags |= SKF_TRUNCATED;
|
|
else
|
|
s->flags &= ~SKF_TRUNCATED;
|
|
|
|
return rv;
|
|
}
|
|
|
|
|
|
/* Mark the TX buffer as empty: both ttx and tpos point back to its start. */
static inline void
reset_tx_buffer(sock *s)
{
  s->ttx = s->tpos = s->tbuf;
}
|
|
|
|
static int
|
|
sk_maybe_write(sock *s)
|
|
{
|
|
int e;
|
|
|
|
switch (s->type)
|
|
{
|
|
case SK_TCP:
|
|
case SK_MAGIC:
|
|
case SK_UNIX:
|
|
while (s->ttx != s->tpos)
|
|
{
|
|
e = write(s->fd, s->ttx, s->tpos - s->ttx);
|
|
|
|
if (e < 0)
|
|
{
|
|
if (errno != EINTR && errno != EAGAIN)
|
|
{
|
|
reset_tx_buffer(s);
|
|
/* EPIPE is just a connection close notification during TX */
|
|
s->err_hook(s, (errno != EPIPE) ? errno : 0);
|
|
return -1;
|
|
}
|
|
return 0;
|
|
}
|
|
s->ttx += e;
|
|
}
|
|
reset_tx_buffer(s);
|
|
return 1;
|
|
|
|
case SK_UDP:
|
|
case SK_IP:
|
|
{
|
|
if (s->tbuf == s->tpos)
|
|
return 1;
|
|
|
|
e = sk_sendmsg(s);
|
|
|
|
if (e < 0)
|
|
{
|
|
if (errno != EINTR && errno != EAGAIN)
|
|
{
|
|
reset_tx_buffer(s);
|
|
s->err_hook(s, errno);
|
|
return -1;
|
|
}
|
|
|
|
if (!s->tx_hook)
|
|
reset_tx_buffer(s);
|
|
return 0;
|
|
}
|
|
reset_tx_buffer(s);
|
|
return 1;
|
|
}
|
|
default:
|
|
bug("sk_maybe_write: unknown socket type %d", s->type);
|
|
}
|
|
}
|
|
|
|
int
|
|
sk_rx_ready(sock *s)
|
|
{
|
|
int rv;
|
|
struct pollfd pfd = { .fd = s->fd };
|
|
pfd.events |= POLLIN;
|
|
|
|
redo:
|
|
rv = poll(&pfd, 1, 0);
|
|
|
|
if ((rv < 0) && (errno == EINTR || errno == EAGAIN))
|
|
goto redo;
|
|
|
|
return rv;
|
|
}
|
|
|
|
/**
|
|
* sk_send - send data to a socket
|
|
* @s: socket
|
|
* @len: number of bytes to send
|
|
*
|
|
* This function sends @len bytes of data prepared in the
|
|
* transmit buffer of the socket @s to the network connection.
|
|
* If the packet can be sent immediately, it does so and returns
|
|
* 1, else it queues the packet for later processing, returns 0
|
|
* and calls the @tx_hook of the socket when the tranmission
|
|
* takes place.
|
|
*/
|
|
int
|
|
sk_send(sock *s, unsigned len)
|
|
{
|
|
s->ttx = s->tbuf;
|
|
s->tpos = s->tbuf + len;
|
|
return sk_maybe_write(s);
|
|
}
|
|
|
|
/**
|
|
* sk_send_to - send data to a specific destination
|
|
* @s: socket
|
|
* @len: number of bytes to send
|
|
* @addr: IP address to send the packet to
|
|
* @port: port to send the packet to
|
|
*
|
|
* This is a sk_send() replacement for connection-less packet sockets
|
|
* which allows destination of the packet to be chosen dynamically.
|
|
* Raw IP sockets should use 0 for @port.
|
|
*/
|
|
int
|
|
sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
|
|
{
|
|
s->daddr = addr;
|
|
if (port)
|
|
s->dport = port;
|
|
|
|
s->ttx = s->tbuf;
|
|
s->tpos = s->tbuf + len;
|
|
return sk_maybe_write(s);
|
|
}
|
|
|
|
/*
|
|
int
|
|
sk_send_full(sock *s, unsigned len, struct iface *ifa,
|
|
ip_addr saddr, ip_addr daddr, unsigned dport)
|
|
{
|
|
s->iface = ifa;
|
|
s->saddr = saddr;
|
|
s->daddr = daddr;
|
|
s->dport = dport;
|
|
s->ttx = s->tbuf;
|
|
s->tpos = s->tbuf + len;
|
|
return sk_maybe_write(s);
|
|
}
|
|
*/
|
|
|
|
/* sk_read() and sk_write() are called from BFD's event loop */
|
|
|
|
int
|
|
sk_read(sock *s, int revents)
|
|
{
|
|
switch (s->type)
|
|
{
|
|
case SK_TCP_PASSIVE:
|
|
return sk_passive_connected(s, SK_TCP);
|
|
|
|
case SK_UNIX_PASSIVE:
|
|
return sk_passive_connected(s, SK_UNIX);
|
|
|
|
case SK_TCP:
|
|
case SK_UNIX:
|
|
{
|
|
int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);
|
|
|
|
if (c < 0)
|
|
{
|
|
if (errno != EINTR && errno != EAGAIN)
|
|
s->err_hook(s, errno);
|
|
else if (errno == EAGAIN && !(revents & POLLIN))
|
|
{
|
|
log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents);
|
|
s->err_hook(s, 0);
|
|
}
|
|
}
|
|
else if (!c)
|
|
s->err_hook(s, 0);
|
|
else
|
|
{
|
|
s->rpos += c;
|
|
if (s->rx_hook(s, s->rpos - s->rbuf))
|
|
{
|
|
/* We need to be careful since the socket could have been deleted by the hook */
|
|
if (current_sock == s)
|
|
s->rpos = s->rbuf;
|
|
}
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
case SK_MAGIC:
|
|
return s->rx_hook(s, 0);
|
|
|
|
default:
|
|
{
|
|
int e = sk_recvmsg(s);
|
|
|
|
if (e < 0)
|
|
{
|
|
if (errno != EINTR && errno != EAGAIN)
|
|
s->err_hook(s, errno);
|
|
return 0;
|
|
}
|
|
|
|
s->rpos = s->rbuf + e;
|
|
s->rx_hook(s, e);
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
int
|
|
sk_write(sock *s)
|
|
{
|
|
switch (s->type)
|
|
{
|
|
case SK_TCP_ACTIVE:
|
|
{
|
|
sockaddr sa;
|
|
sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
|
|
|
|
if (connect(s->fd, &sa.sa, SA_LEN(sa)) >= 0 || errno == EISCONN)
|
|
sk_tcp_connected(s);
|
|
else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
|
|
s->err_hook(s, errno);
|
|
return 0;
|
|
}
|
|
|
|
default:
|
|
if (s->ttx != s->tpos && sk_maybe_write(s) > 0)
|
|
{
|
|
if (s->tx_hook)
|
|
s->tx_hook(s);
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
void
|
|
sk_err(sock *s, int revents)
|
|
{
|
|
int se = 0, sse = sizeof(se);
|
|
if ((s->type != SK_MAGIC) && (revents & POLLERR))
|
|
if (getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &se, &sse) < 0)
|
|
{
|
|
log(L_ERR "IO: Socket error: SO_ERROR: %m");
|
|
se = 0;
|
|
}
|
|
|
|
s->err_hook(s, se);
|
|
}
|
|
|
|
void
|
|
sk_dump_all(void)
|
|
{
|
|
node *n;
|
|
sock *s;
|
|
|
|
debug("Open sockets:\n");
|
|
WALK_LIST(n, sock_list)
|
|
{
|
|
s = SKIP_BACK(sock, n, n);
|
|
debug("%p ", s);
|
|
sk_dump(&s->r);
|
|
}
|
|
debug("\n");
|
|
}
|
|
|
|
|
|
/*
|
|
* Internal event log and watchdog
|
|
*/
|
|
|
|
#define EVENT_LOG_LENGTH 32
|
|
|
|
struct event_log_entry
|
|
{
|
|
void *hook;
|
|
void *data;
|
|
btime timestamp;
|
|
btime duration;
|
|
};
|
|
|
|
static struct event_log_entry event_log[EVENT_LOG_LENGTH];
|
|
static struct event_log_entry *event_open;
|
|
static int event_log_pos, event_log_num, watchdog_active;
|
|
static btime last_time;
|
|
static btime loop_time;
|
|
|
|
static void
|
|
io_update_time(void)
|
|
{
|
|
struct timespec ts;
|
|
int rv;
|
|
|
|
if (!clock_monotonic_available)
|
|
return;
|
|
|
|
/*
|
|
* This is third time-tracking procedure (after update_times() above and
|
|
* times_update() in BFD), dedicated to internal event log and latency
|
|
* tracking. Hopefully, we consolidate these sometimes.
|
|
*/
|
|
|
|
rv = clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
if (rv < 0)
|
|
die("clock_gettime: %m");
|
|
|
|
last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000);
|
|
|
|
if (event_open)
|
|
{
|
|
event_open->duration = last_time - event_open->timestamp;
|
|
|
|
if (event_open->duration > config->latency_limit)
|
|
log(L_WARN "Event 0x%p 0x%p took %d ms",
|
|
event_open->hook, event_open->data, (int) (event_open->duration TO_MS));
|
|
|
|
event_open = NULL;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* io_log_event - mark approaching event into event log
|
|
* @hook: event hook address
|
|
* @data: event data address
|
|
*
|
|
* Store info (hook, data, timestamp) about the following internal event into
|
|
* a circular event log (@event_log). When latency tracking is enabled, the log
|
|
* entry is kept open (in @event_open) so the duration can be filled later.
|
|
*/
|
|
void
|
|
io_log_event(void *hook, void *data)
|
|
{
|
|
if (config->latency_debug)
|
|
io_update_time();
|
|
|
|
struct event_log_entry *en = event_log + event_log_pos;
|
|
|
|
en->hook = hook;
|
|
en->data = data;
|
|
en->timestamp = last_time;
|
|
en->duration = 0;
|
|
|
|
event_log_num++;
|
|
event_log_pos++;
|
|
event_log_pos %= EVENT_LOG_LENGTH;
|
|
|
|
event_open = config->latency_debug ? en : NULL;
|
|
}
|
|
|
|
static inline void
|
|
io_close_event(void)
|
|
{
|
|
if (event_open)
|
|
io_update_time();
|
|
}
|
|
|
|
void
|
|
io_log_dump(void)
|
|
{
|
|
int i;
|
|
|
|
log(L_DEBUG "Event log:");
|
|
for (i = 0; i < EVENT_LOG_LENGTH; i++)
|
|
{
|
|
struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
|
|
if (en->hook)
|
|
log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
|
|
(int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
|
|
}
|
|
}
|
|
|
|
void
|
|
watchdog_sigalrm(int sig UNUSED)
|
|
{
|
|
/* Update last_time and duration, but skip latency check */
|
|
config->latency_limit = 0xffffffff;
|
|
io_update_time();
|
|
|
|
/* We want core dump */
|
|
abort();
|
|
}
|
|
|
|
static inline void
|
|
watchdog_start1(void)
|
|
{
|
|
io_update_time();
|
|
|
|
loop_time = last_time;
|
|
}
|
|
|
|
static inline void
|
|
watchdog_start(void)
|
|
{
|
|
io_update_time();
|
|
|
|
loop_time = last_time;
|
|
event_log_num = 0;
|
|
|
|
if (config->watchdog_timeout)
|
|
{
|
|
alarm(config->watchdog_timeout);
|
|
watchdog_active = 1;
|
|
}
|
|
}
|
|
|
|
static inline void
|
|
watchdog_stop(void)
|
|
{
|
|
io_update_time();
|
|
|
|
if (watchdog_active)
|
|
{
|
|
alarm(0);
|
|
watchdog_active = 0;
|
|
}
|
|
|
|
btime duration = last_time - loop_time;
|
|
if (duration > config->watchdog_warning)
|
|
log(L_WARN "I/O loop cycle took %d ms for %d events",
|
|
(int) (duration TO_MS), event_log_num);
|
|
}
|
|
|
|
|
|
/*
|
|
* Main I/O Loop
|
|
*/
|
|
|
|
/* Flags for asynchronously scheduled work; io_loop() tests and clears them */
volatile int async_config_flag;		/* Asynchronous reconfiguration scheduled */
volatile int async_dump_flag;		/* Asynchronous dump scheduled */
volatile int async_shutdown_flag;	/* Asynchronous shutdown scheduled */
|
|
|
|
void
|
|
io_init(void)
|
|
{
|
|
init_list(&near_timers);
|
|
init_list(&far_timers);
|
|
init_list(&sock_list);
|
|
init_list(&global_event_list);
|
|
krt_io_init();
|
|
init_times();
|
|
update_times();
|
|
boot_time = now;
|
|
srandom((int) now_real);
|
|
}
|
|
|
|
static int short_loops = 0;
|
|
#define SHORT_LOOP_MAX 10
|
|
|
|
void
|
|
io_loop(void)
|
|
{
|
|
int poll_tout;
|
|
time_t tout;
|
|
int nfds, events, pout;
|
|
sock *s;
|
|
node *n;
|
|
int fdmax = 256;
|
|
struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd));
|
|
|
|
watchdog_start1();
|
|
for(;;)
|
|
{
|
|
events = ev_run_list(&global_event_list);
|
|
timers:
|
|
update_times();
|
|
tout = tm_first_shot();
|
|
if (tout <= now)
|
|
{
|
|
tm_shot();
|
|
goto timers;
|
|
}
|
|
poll_tout = (events ? 0 : MIN(tout - now, 3)) * 1000; /* Time in milliseconds */
|
|
|
|
io_close_event();
|
|
|
|
nfds = 0;
|
|
WALK_LIST(n, sock_list)
|
|
{
|
|
pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */
|
|
s = SKIP_BACK(sock, n, n);
|
|
if (s->rx_hook)
|
|
{
|
|
pfd[nfds].fd = s->fd;
|
|
pfd[nfds].events |= POLLIN;
|
|
}
|
|
if (s->tx_hook && s->ttx != s->tpos)
|
|
{
|
|
pfd[nfds].fd = s->fd;
|
|
pfd[nfds].events |= POLLOUT;
|
|
}
|
|
if (pfd[nfds].fd != -1)
|
|
{
|
|
s->index = nfds;
|
|
nfds++;
|
|
}
|
|
else
|
|
s->index = -1;
|
|
|
|
if (nfds >= fdmax)
|
|
{
|
|
fdmax *= 2;
|
|
pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd));
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Yes, this is racy. But even if the signal comes before this test
|
|
* and entering poll(), it gets caught on the next timer tick.
|
|
*/
|
|
|
|
if (async_config_flag)
|
|
{
|
|
io_log_event(async_config, NULL);
|
|
async_config();
|
|
async_config_flag = 0;
|
|
continue;
|
|
}
|
|
if (async_dump_flag)
|
|
{
|
|
io_log_event(async_dump, NULL);
|
|
async_dump();
|
|
async_dump_flag = 0;
|
|
continue;
|
|
}
|
|
if (async_shutdown_flag)
|
|
{
|
|
io_log_event(async_shutdown, NULL);
|
|
async_shutdown();
|
|
async_shutdown_flag = 0;
|
|
continue;
|
|
}
|
|
|
|
/* And finally enter poll() to find active sockets */
|
|
watchdog_stop();
|
|
pout = poll(pfd, nfds, poll_tout);
|
|
watchdog_start();
|
|
|
|
if (pout < 0)
|
|
{
|
|
if (errno == EINTR || errno == EAGAIN)
|
|
continue;
|
|
die("poll: %m");
|
|
}
|
|
if (pout)
|
|
{
|
|
/* guaranteed to be non-empty */
|
|
current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
|
|
|
|
while (current_sock)
|
|
{
|
|
sock *s = current_sock;
|
|
if (s->index == -1)
|
|
{
|
|
current_sock = sk_next(s);
|
|
goto next;
|
|
}
|
|
|
|
int e;
|
|
int steps;
|
|
|
|
steps = MAX_STEPS;
|
|
if (s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
|
|
do
|
|
{
|
|
steps--;
|
|
io_log_event(s->rx_hook, s->data);
|
|
e = sk_read(s, pfd[s->index].revents);
|
|
if (s != current_sock)
|
|
goto next;
|
|
}
|
|
while (e && s->rx_hook && steps);
|
|
|
|
steps = MAX_STEPS;
|
|
if (pfd[s->index].revents & POLLOUT)
|
|
do
|
|
{
|
|
steps--;
|
|
io_log_event(s->tx_hook, s->data);
|
|
e = sk_write(s);
|
|
if (s != current_sock)
|
|
goto next;
|
|
}
|
|
while (e && steps);
|
|
|
|
current_sock = sk_next(s);
|
|
next: ;
|
|
}
|
|
|
|
short_loops++;
|
|
if (events && (short_loops < SHORT_LOOP_MAX))
|
|
continue;
|
|
short_loops = 0;
|
|
|
|
int count = 0;
|
|
current_sock = stored_sock;
|
|
if (current_sock == NULL)
|
|
current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
|
|
|
|
while (current_sock && count < MAX_RX_STEPS)
|
|
{
|
|
sock *s = current_sock;
|
|
if (s->index == -1)
|
|
{
|
|
current_sock = sk_next(s);
|
|
goto next2;
|
|
}
|
|
|
|
if (!s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
|
|
{
|
|
count++;
|
|
io_log_event(s->rx_hook, s->data);
|
|
sk_read(s, pfd[s->index].revents);
|
|
if (s != current_sock)
|
|
goto next2;
|
|
}
|
|
|
|
if (pfd[s->index].revents & (POLLHUP | POLLERR))
|
|
{
|
|
sk_err(s, pfd[s->index].revents);
|
|
if (s != current_sock)
|
|
goto next2;
|
|
}
|
|
|
|
current_sock = sk_next(s);
|
|
next2: ;
|
|
}
|
|
|
|
|
|
stored_sock = current_sock;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * test_old_bird - check that no other instance listens on the control socket
 * @path: filesystem path of the UNIX-domain socket
 *
 * Dies when the probe socket cannot be created, when @path does not fit
 * into a UNIX socket address, or when connecting to @path succeeds. A
 * failed connect is treated as "nothing is listening" and the function
 * returns normally.
 */
void
test_old_bird(char *path)
{
  struct sockaddr_un sa;
  int fd = socket(AF_UNIX, SOCK_STREAM, 0);

  if (fd < 0)
    die("Cannot create socket: %m");

  if (strlen(path) >= sizeof(sa.sun_path))
    die("Socket path too long");

  bzero(&sa, sizeof(sa));
  sa.sun_family = AF_UNIX;
  strcpy(sa.sun_path, path);

  if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == 0)
    die("I found another BIRD running.");

  close(fd);
}
|