0
0
mirror of https://gitlab.nic.cz/labs/bird.git synced 2024-11-14 15:18:44 +00:00
bird/sysdep/unix/io.c
Maria Matejka 144ac4c1d3 Logging: fixed size logfiles behaving as mmapped ringbuffers
This variant of logging avoids calling write() for every log line,
allowing for waitless logging. This makes heavy logging less heavy
and more useful for race condition debugging.
2023-09-24 20:43:04 +02:00

2499 lines
52 KiB
C

/*
* BIRD Internet Routing Daemon -- Unix I/O
*
* (c) 1998--2004 Martin Mares <mj@ucw.cz>
* (c) 2004 Ondrej Filip <feela@network.cz>
*
* Can be freely distributed and used under the terms of the GNU GPL.
*/
/* Unfortunately, some glibc versions hide parts of RFC 3542 API
if _GNU_SOURCE is not defined. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/un.h>
#include <poll.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/icmp6.h>
#include "nest/bird.h"
#include "lib/lists.h"
#include "lib/resource.h"
#include "lib/socket.h"
#include "lib/event.h"
#include "lib/locking.h"
#include "lib/timer.h"
#include "lib/string.h"
#include "nest/iface.h"
#include "conf/conf.h"
#include "sysdep/unix/unix.h"
#include "sysdep/unix/io-loop.h"
#include CONFIG_INCLUDE_SYSIO_H
/* Maximum number of calls of tx handler for one socket in one
* poll iteration. Should be small enough to not monopolize CPU by
* one protocol instance.
*/
#define MAX_STEPS 4
/* Maximum number of calls of rx handler for all sockets in one poll
iteration. RX callbacks are often much more costly so we limit
this to gen small latencies */
#define MAX_RX_STEPS 4
/*
* Tracked Files
*/
struct rfile {
resource r;
struct stat stat;
int fd;
off_t limit;
_Atomic off_t pos;
void *mapping;
};
struct rfile rf_stderr = {
.fd = 2,
};
static void
rf_free(resource *r)
{
struct rfile *a = (struct rfile *) r;
if (a->mapping)
munmap(a->mapping, a->limit);
close(a->fd);
}
static void
rf_dump(resource *r, unsigned indent UNUSED)
{
struct rfile *a = (struct rfile *) r;
debug("(fd %d)\n", a->fd);
}
static struct resclass rf_class = {
"FILE",
sizeof(struct rfile),
rf_free,
rf_dump,
NULL,
NULL
};
static int
rf_open_get_fd(const char *name, enum rf_mode mode)
{
int omode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
int flags;
switch (mode)
{
case RF_APPEND:
flags = O_WRONLY | O_CREAT | O_APPEND;
break;
case RF_FIXED:
flags = O_RDWR | O_CREAT;
break;
default:
bug("rf_open() must have the mode set");
}
return open(name, flags, omode);
}
static void
rf_stat(struct rfile *r)
{
if (fstat(r->fd, &r->stat) < 0)
die("fstat() failed: %m");
}
struct rfile *
rf_open(pool *p, const char *name, enum rf_mode mode, off_t limit)
{
int fd = rf_open_get_fd(name, mode);
if (fd < 0)
return NULL; /* The caller takes care of printing %m. */
struct rfile *r = ralloc(p, &rf_class);
r->fd = fd;
r->limit = limit;
switch (mode)
{
case RF_APPEND:
rf_stat(r);
r->pos = S_ISREG(r->stat.st_mode) ? r->stat.st_size : 0;
break;
case RF_FIXED:
if ((ftruncate(fd, limit) < 0)
|| ((r->mapping = mmap(NULL, limit, PROT_WRITE, MAP_SHARED, fd, 0)) == MAP_FAILED))
{
int erf = errno;
r->mapping = NULL;
rfree(r);
errno = erf;
return NULL;
}
break;
default:
bug("rf_open() must have the mode set");
}
return r;
}
off_t
rf_size(struct rfile *r)
{
return atomic_load_explicit(&r->pos, memory_order_relaxed);
}
int
rf_same(struct rfile *a, struct rfile *b)
{
rf_stat(a);
rf_stat(b);
return
(a->limit == b->limit) &&
(a->stat.st_mode == b->stat.st_mode) &&
(a->stat.st_dev == b->stat.st_dev) &&
(a->stat.st_ino == b->stat.st_ino);
}
int
rf_write(struct rfile *r, const void *buf, size_t _count)
{
off_t count = _count;
if (r->mapping)
{
/* Update the pointer */
off_t target = atomic_fetch_add_explicit(&r->pos, count, memory_order_relaxed) % r->limit;
/* Take care of wrapping */
if (target + count > r->limit)
{
memcpy(r->mapping, buf + (r->limit - target), target + count - r->limit);
count = r->limit - target;
}
/* Write the line */
memcpy(r->mapping + target, buf, count);
return 1;
}
else if (r->limit && (atomic_fetch_add_explicit(&r->pos, count, memory_order_relaxed) + count > r->limit))
{
atomic_fetch_sub_explicit(&r->pos, count, memory_order_relaxed);
return 0;
}
else
{
while ((write(r->fd, buf, count) < 0) && (errno == EINTR))
;
return 1;
}
}
/*
* Time clock
*/
btime boot_time;
void
times_update(void)
{
struct timespec ts;
int rv;
btime old_time = current_time();
btime old_real_time = current_real_time();
rv = clock_gettime(CLOCK_MONOTONIC, &ts);
if (rv < 0)
die("Monotonic clock is missing");
if ((ts.tv_sec < 0) || (((u64) ts.tv_sec) > ((u64) 1 << 40)))
log(L_WARN "Monotonic clock is crazy");
btime new_time = ts.tv_sec S + ts.tv_nsec NS;
if (new_time < old_time)
log(L_ERR "Monotonic clock is broken");
rv = clock_gettime(CLOCK_REALTIME, &ts);
if (rv < 0)
die("clock_gettime: %m");
btime new_real_time = ts.tv_sec S + ts.tv_nsec NS;
if (!atomic_compare_exchange_strong_explicit(
&last_time,
&old_time,
new_time,
memory_order_acq_rel,
memory_order_relaxed))
DBG("Time update collision: last_time");
if (!atomic_compare_exchange_strong_explicit(
&real_time,
&old_real_time,
new_real_time,
memory_order_acq_rel,
memory_order_relaxed))
DBG("Time update collision: real_time");
}
/**
* DOC: Sockets
*
* Socket resources represent network connections. Their data structure (&socket)
* contains a lot of fields defining the exact type of the socket, the local and
* remote addresses and ports, pointers to socket buffers and finally pointers to
* hook functions to be called when new data have arrived to the receive buffer
* (@rx_hook), when the contents of the transmit buffer have been transmitted
* (@tx_hook) and when an error or connection close occurs (@err_hook).
*
* Freeing of sockets from inside socket hooks is perfectly safe.
*/
#ifndef SOL_IP
#define SOL_IP IPPROTO_IP
#endif
#ifndef SOL_IPV6
#define SOL_IPV6 IPPROTO_IPV6
#endif
#ifndef SOL_ICMPV6
#define SOL_ICMPV6 IPPROTO_ICMPV6
#endif
/*
* Sockaddr helper functions
*/
static inline int UNUSED sockaddr_length(int af)
{ return (af == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); }
static inline void
sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, uint port)
{
memset(sa, 0, sizeof(struct sockaddr_in));
#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
sa->sin_len = sizeof(struct sockaddr_in);
#endif
sa->sin_family = AF_INET;
sa->sin_port = htons(port);
sa->sin_addr = ipa_to_in4(a);
}
static inline void
sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
{
memset(sa, 0, sizeof(struct sockaddr_in6));
#ifdef SIN6_LEN
sa->sin6_len = sizeof(struct sockaddr_in6);
#endif
sa->sin6_family = AF_INET6;
sa->sin6_port = htons(port);
sa->sin6_flowinfo = 0;
sa->sin6_addr = ipa_to_in6(a);
if (ifa && ipa_is_link_local(a))
sa->sin6_scope_id = ifa->index;
}
void
sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
{
if (af == AF_INET)
sockaddr_fill4((struct sockaddr_in *) sa, a, port);
else if (af == AF_INET6)
sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
else
bug("Unknown AF");
}
static inline void
sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, uint *port)
{
*port = ntohs(sa->sin_port);
*a = ipa_from_in4(sa->sin_addr);
}
static inline void
sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
{
*port = ntohs(sa->sin6_port);
*a = ipa_from_in6(sa->sin6_addr);
if (ifa && ipa_is_link_local(*a))
*ifa = if_find_by_index(sa->sin6_scope_id);
}
int
sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
{
if (sa->sa.sa_family != af)
goto fail;
if (af == AF_INET)
sockaddr_read4((struct sockaddr_in *) sa, a, port);
else if (af == AF_INET6)
sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
else
goto fail;
return 0;
fail:
*a = IPA_NONE;
*port = 0;
return -1;
}
/*
* IPv6 multicast syscalls
*/
/* Fortunately standardized in RFC 3493 */
#define INIT_MREQ6(maddr,ifa) \
{ .ipv6mr_multiaddr = ipa_to_in6(maddr), .ipv6mr_interface = ifa->index }
static inline int
sk_setup_multicast6(sock *s)
{
int index = s->iface->index;
int ttl = s->ttl;
int n = 0;
if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &index, sizeof(index)) < 0)
ERR("IPV6_MULTICAST_IF");
if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &ttl, sizeof(ttl)) < 0)
ERR("IPV6_MULTICAST_HOPS");
if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &n, sizeof(n)) < 0)
ERR("IPV6_MULTICAST_LOOP");
return 0;
}
static inline int
sk_join_group6(sock *s, ip_addr maddr)
{
struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mr, sizeof(mr)) < 0)
ERR("IPV6_JOIN_GROUP");
return 0;
}
static inline int
sk_leave_group6(sock *s, ip_addr maddr)
{
struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mr, sizeof(mr)) < 0)
ERR("IPV6_LEAVE_GROUP");
return 0;
}
/*
* IPv6 packet control messages
*/
/* Also standardized, in RFC 3542 */
/*
* RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
* type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
* don't have IPV6_RECVPKTINFO we suppose the OS implements the older
* RFC and we use IPV6_PKTINFO.
*/
#ifndef IPV6_RECVPKTINFO
#define IPV6_RECVPKTINFO IPV6_PKTINFO
#endif
/*
* Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
*/
#ifndef IPV6_RECVHOPLIMIT
#define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
#endif
#define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
#define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
static inline int
sk_request_cmsg6_pktinfo(sock *s)
{
int y = 1;
if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &y, sizeof(y)) < 0)
ERR("IPV6_RECVPKTINFO");
return 0;
}
static inline int
sk_request_cmsg6_ttl(sock *s)
{
int y = 1;
if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &y, sizeof(y)) < 0)
ERR("IPV6_RECVHOPLIMIT");
return 0;
}
static inline void
sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
{
if (cm->cmsg_type == IPV6_PKTINFO)
{
struct in6_pktinfo *pi = (struct in6_pktinfo *) CMSG_DATA(cm);
s->laddr = ipa_from_in6(pi->ipi6_addr);
s->lifindex = pi->ipi6_ifindex;
}
}
static inline void
sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
{
if (cm->cmsg_type == IPV6_HOPLIMIT)
s->rcv_ttl = * (int *) CMSG_DATA(cm);
}
static inline void
sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
{
struct cmsghdr *cm;
struct in6_pktinfo *pi;
int controllen = 0;
msg->msg_control = cbuf;
msg->msg_controllen = cbuflen;
cm = CMSG_FIRSTHDR(msg);
cm->cmsg_level = SOL_IPV6;
cm->cmsg_type = IPV6_PKTINFO;
cm->cmsg_len = CMSG_LEN(sizeof(*pi));
controllen += CMSG_SPACE(sizeof(*pi));
pi = (struct in6_pktinfo *) CMSG_DATA(cm);
pi->ipi6_ifindex = s->iface ? s->iface->index : 0;
pi->ipi6_addr = ipa_to_in6(s->saddr);
msg->msg_controllen = controllen;
}
/*
* Miscellaneous socket syscalls
*/
static inline int
sk_set_ttl4(sock *s, int ttl)
{
if (setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
ERR("IP_TTL");
return 0;
}
static inline int
sk_set_ttl6(sock *s, int ttl)
{
if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) < 0)
ERR("IPV6_UNICAST_HOPS");
return 0;
}
static inline int
sk_set_tos4(sock *s, int tos)
{
if (setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos)) < 0)
ERR("IP_TOS");
return 0;
}
static inline int
sk_set_tos6(sock *s, int tos)
{
if (setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos)) < 0)
ERR("IPV6_TCLASS");
return 0;
}
static inline int
sk_set_high_port(sock *s UNUSED)
{
/* Port range setting is optional, ignore it if not supported */
#ifdef IP_PORTRANGE
if (sk_is_ipv4(s))
{
int range = IP_PORTRANGE_HIGH;
if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
ERR("IP_PORTRANGE");
}
#endif
#ifdef IPV6_PORTRANGE
if (sk_is_ipv6(s))
{
int range = IPV6_PORTRANGE_HIGH;
if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
ERR("IPV6_PORTRANGE");
}
#endif
return 0;
}
static inline byte *
sk_skip_ip_header(byte *pkt, int *len)
{
if ((*len < 20) || ((*pkt & 0xf0) != 0x40))
return NULL;
int hlen = (*pkt & 0x0f) * 4;
if ((hlen < 20) || (hlen > *len))
return NULL;
*len -= hlen;
return pkt + hlen;
}
byte *
sk_rx_buffer(sock *s, int *len)
{
if (sk_is_ipv4(s) && (s->type == SK_IP))
return sk_skip_ip_header(s->rbuf, len);
else
return s->rbuf;
}
/*
* Public socket functions
*/
/**
* sk_setup_multicast - enable multicast for given socket
* @s: socket
*
* Prepare transmission of multicast packets for given datagram socket.
* The socket must have defined @iface.
*
* Result: 0 for success, -1 for an error.
*/
int
sk_setup_multicast(sock *s)
{
ASSERT(s->iface);
if (sk_is_ipv4(s))
return sk_setup_multicast4(s);
else
return sk_setup_multicast6(s);
}
/**
* sk_join_group - join multicast group for given socket
* @s: socket
* @maddr: multicast address
*
* Join multicast group for given datagram socket and associated interface.
* The socket must have defined @iface.
*
* Result: 0 for success, -1 for an error.
*/
int
sk_join_group(sock *s, ip_addr maddr)
{
if (sk_is_ipv4(s))
return sk_join_group4(s, maddr);
else
return sk_join_group6(s, maddr);
}
/**
* sk_leave_group - leave multicast group for given socket
* @s: socket
* @maddr: multicast address
*
* Leave multicast group for given datagram socket and associated interface.
* The socket must have defined @iface.
*
* Result: 0 for success, -1 for an error.
*/
int
sk_leave_group(sock *s, ip_addr maddr)
{
if (sk_is_ipv4(s))
return sk_leave_group4(s, maddr);
else
return sk_leave_group6(s, maddr);
}
/**
* sk_setup_broadcast - enable broadcast for given socket
* @s: socket
*
* Allow reception and transmission of broadcast packets for given datagram
* socket. The socket must have defined @iface. For transmission, packets should
* be send to @brd address of @iface.
*
* Result: 0 for success, -1 for an error.
*/
int
sk_setup_broadcast(sock *s)
{
int y = 1;
if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &y, sizeof(y)) < 0)
ERR("SO_BROADCAST");
return 0;
}
/**
* sk_set_ttl - set transmit TTL for given socket
* @s: socket
* @ttl: TTL value
*
* Set TTL for already opened connections when TTL was not set before. Useful
* for accepted connections when different ones should have different TTL.
*
* Result: 0 for success, -1 for an error.
*/
int
sk_set_ttl(sock *s, int ttl)
{
s->ttl = ttl;
if (sk_is_ipv4(s))
return sk_set_ttl4(s, ttl);
else
return sk_set_ttl6(s, ttl);
}
/**
* sk_set_min_ttl - set minimal accepted TTL for given socket
* @s: socket
* @ttl: TTL value
*
* Set minimal accepted TTL for given socket. Can be used for TTL security.
* implementations.
*
* Result: 0 for success, -1 for an error.
*/
int
sk_set_min_ttl(sock *s, int ttl)
{
if (sk_is_ipv4(s))
return sk_set_min_ttl4(s, ttl);
else
return sk_set_min_ttl6(s, ttl);
}
#if 0
/**
* sk_set_md5_auth - add / remove MD5 security association for given socket
* @s: socket
* @local: IP address of local side
* @remote: IP address of remote side
* @ifa: Interface for link-local IP address
* @passwd: Password used for MD5 authentication
* @setkey: Update also system SA/SP database
*
* In TCP MD5 handling code in kernel, there is a set of security associations
* used for choosing password and other authentication parameters according to
* the local and remote address. This function is useful for listening socket,
* for active sockets it may be enough to set s->password field.
*
* When called with passwd != NULL, the new pair is added,
* When called with passwd == NULL, the existing pair is removed.
*
* Note that while in Linux, the MD5 SAs are specific to socket, in BSD they are
* stored in global SA/SP database (but the behavior also must be enabled on
* per-socket basis). In case of multiple sockets to the same neighbor, the
* socket-specific state must be configured for each socket while global state
* just once per src-dst pair. The @setkey argument controls whether the global
* state (SA/SP database) is also updated.
*
* Result: 0 for success, -1 for an error.
*/
int
sk_set_md5_auth(sock *s, ip_addr local, ip_addr remote, struct iface *ifa, char *passwd, int setkey)
{ DUMMY; }
#endif
/**
* sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
* @s: socket
* @offset: offset
*
* Specify IPv6 checksum field offset for given raw IPv6 socket. After that, the
* kernel will automatically fill it for outgoing packets and check it for
* incoming packets. Should not be used on ICMPv6 sockets, where the position is
* known to the kernel.
*
* Result: 0 for success, -1 for an error.
*/
int
sk_set_ipv6_checksum(sock *s, int offset)
{
if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
ERR("IPV6_CHECKSUM");
return 0;
}
int
sk_set_icmp6_filter(sock *s, int p1, int p2)
{
/* a bit of lame interface, but it is here only for Radv */
struct icmp6_filter f;
ICMP6_FILTER_SETBLOCKALL(&f);
ICMP6_FILTER_SETPASS(p1, &f);
ICMP6_FILTER_SETPASS(p2, &f);
if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
ERR("ICMP6_FILTER");
return 0;
}
void
sk_log_error(sock *s, const char *p)
{
log(L_ERR "%s: Socket error: %s%#m", p, s->err);
}
/*
* Actual struct birdsock code
*/
sock *
sk_next(sock *s)
{
if (!s->n.next->next)
return NULL;
else
return SKIP_BACK(sock, n, s->n.next);
}
static void
sk_alloc_bufs(sock *s)
{
if (!s->rbuf && s->rbsize)
s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
s->rpos = s->rbuf;
if (!s->tbuf && s->tbsize)
s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
s->tpos = s->ttx = s->tbuf;
}
static void
sk_free_bufs(sock *s)
{
if (s->rbuf_alloc)
{
xfree(s->rbuf_alloc);
s->rbuf = s->rbuf_alloc = NULL;
}
if (s->tbuf_alloc)
{
xfree(s->tbuf_alloc);
s->tbuf = s->tbuf_alloc = NULL;
}
}
#ifdef HAVE_LIBSSH
static void
sk_ssh_free(sock *s)
{
struct ssh_sock *ssh = s->ssh;
if (s->ssh == NULL)
return;
s->ssh = NULL;
if (ssh->channel)
{
ssh_channel_close(ssh->channel);
ssh_channel_free(ssh->channel);
ssh->channel = NULL;
}
if (ssh->session)
{
ssh_disconnect(ssh->session);
ssh_free(ssh->session);
ssh->session = NULL;
}
}
#endif
static void
sk_free(resource *r)
{
sock *s = (sock *) r;
sk_free_bufs(s);
#ifdef HAVE_LIBSSH
if (s->type == SK_SSH || s->type == SK_SSH_ACTIVE)
sk_ssh_free(s);
#endif
if (s->loop)
birdloop_remove_socket(s->loop, s);
if (s->fd >= 0 && s->type != SK_SSH && s->type != SK_SSH_ACTIVE)
close(s->fd);
s->fd = -1;
}
void
sk_set_rbsize(sock *s, uint val)
{
ASSERT(s->rbuf_alloc == s->rbuf);
if (s->rbsize == val)
return;
s->rbsize = val;
xfree(s->rbuf_alloc);
s->rbuf_alloc = xmalloc(val);
s->rpos = s->rbuf = s->rbuf_alloc;
}
void
sk_set_tbsize(sock *s, uint val)
{
ASSERT(s->tbuf_alloc == s->tbuf);
if (s->tbsize == val)
return;
byte *old_tbuf = s->tbuf;
s->tbsize = val;
s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
s->tpos = s->tbuf + (s->tpos - old_tbuf);
s->ttx = s->tbuf + (s->ttx - old_tbuf);
}
void
sk_set_tbuf(sock *s, void *tbuf)
{
s->tbuf = tbuf ?: s->tbuf_alloc;
s->ttx = s->tpos = s->tbuf;
}
void
sk_reallocate(sock *s)
{
sk_free_bufs(s);
sk_alloc_bufs(s);
}
static void
sk_dump(resource *r, unsigned indent UNUSED)
{
sock *s = (sock *) r;
static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "SSH>", "SSH", "DEL!" };
debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
sk_type_names[s->type],
s->data,
s->saddr,
s->sport,
s->daddr,
s->dport,
s->tos,
s->ttl,
s->iface ? s->iface->name : "none");
}
static struct resclass sk_class = {
"Socket",
sizeof(sock),
sk_free,
sk_dump,
NULL,
NULL
};
/**
* sk_new - create a socket
* @p: pool
*
* This function creates a new socket resource. If you want to use it,
* you need to fill in all the required fields of the structure and
* call sk_open() to do the actual opening of the socket.
*
* The real function name is sock_new(), sk_new() is a macro wrapper
* to avoid collision with OpenSSL.
*/
sock *
sock_new(pool *p)
{
sock *s = ralloc(p, &sk_class);
s->pool = p;
// s->saddr = s->daddr = IPA_NONE;
s->tos = s->priority = s->ttl = -1;
s->fd = -1;
return s;
}
static int
sk_setup(sock *s)
{
int y = 1;
int fd = s->fd;
if (s->type == SK_SSH_ACTIVE)
return 0;
if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
ERR("O_NONBLOCK");
if (!s->af)
return 0;
if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
s->flags |= SKF_PKTINFO;
#ifdef CONFIG_USE_HDRINCL
if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
{
s->flags &= ~SKF_PKTINFO;
s->flags |= SKF_HDRINCL;
if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
ERR("IP_HDRINCL");
}
#endif
if (s->vrf && !s->iface)
{
/* Bind socket to associated VRF interface.
This is Linux-specific, but so is SO_BINDTODEVICE. */
#ifdef SO_BINDTODEVICE
struct ifreq ifr = {};
strcpy(ifr.ifr_name, s->vrf->name);
if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
ERR("SO_BINDTODEVICE");
#endif
}
if (s->iface)
{
#ifdef SO_BINDTODEVICE
struct ifreq ifr = {};
strcpy(ifr.ifr_name, s->iface->name);
if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
ERR("SO_BINDTODEVICE");
#endif
#ifdef CONFIG_UNIX_DONTROUTE
if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
ERR("SO_DONTROUTE");
#endif
}
if (sk_is_ipv4(s))
{
if (s->flags & SKF_LADDR_RX)
if (sk_request_cmsg4_pktinfo(s) < 0)
return -1;
if (s->flags & SKF_TTL_RX)
if (sk_request_cmsg4_ttl(s) < 0)
return -1;
if ((s->type == SK_UDP) || (s->type == SK_IP))
if (sk_disable_mtu_disc4(s) < 0)
return -1;
if (s->ttl >= 0)
if (sk_set_ttl4(s, s->ttl) < 0)
return -1;
if (s->tos >= 0)
if (sk_set_tos4(s, s->tos) < 0)
return -1;
}
if (sk_is_ipv6(s))
{
if ((s->type == SK_TCP_PASSIVE) || (s->type == SK_TCP_ACTIVE) || (s->type == SK_UDP))
if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
ERR("IPV6_V6ONLY");
if (s->flags & SKF_LADDR_RX)
if (sk_request_cmsg6_pktinfo(s) < 0)
return -1;
if (s->flags & SKF_TTL_RX)
if (sk_request_cmsg6_ttl(s) < 0)
return -1;
if ((s->type == SK_UDP) || (s->type == SK_IP))
if (sk_disable_mtu_disc6(s) < 0)
return -1;
if (s->ttl >= 0)
if (sk_set_ttl6(s, s->ttl) < 0)
return -1;
if (s->tos >= 0)
if (sk_set_tos6(s, s->tos) < 0)
return -1;
}
/* Must be after sk_set_tos4() as setting ToS on Linux also mangles priority */
if (s->priority >= 0)
if (sk_set_priority(s, s->priority) < 0)
return -1;
return 0;
}
static void
sk_tcp_connected(sock *s)
{
sockaddr sa;
int sa_len = sizeof(sa);
if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
(sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
log(L_WARN "SOCK: Cannot get local IP address for TCP>");
s->type = SK_TCP;
sk_alloc_bufs(s);
s->tx_hook(s);
}
#ifdef HAVE_LIBSSH
static void
sk_ssh_connected(sock *s)
{
sk_alloc_bufs(s);
s->type = SK_SSH;
s->tx_hook(s);
}
#endif
static int
sk_passive_connected(sock *s, int type)
{
sockaddr loc_sa, rem_sa;
int loc_sa_len = sizeof(loc_sa);
int rem_sa_len = sizeof(rem_sa);
int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
if (fd < 0)
{
if ((errno != EINTR) && (errno != EAGAIN))
s->err_hook(s, errno);
return 0;
}
struct domain_generic *sock_lock = DG_IS_LOCKED(s->pool->domain) ? NULL : s->pool->domain;
if (sock_lock)
DG_LOCK(sock_lock);
sock *t = sk_new(s->pool);
t->type = type;
t->data = s->data;
t->af = s->af;
t->fd = fd;
t->ttl = s->ttl;
t->tos = s->tos;
t->vrf = s->vrf;
t->rbsize = s->rbsize;
t->tbsize = s->tbsize;
if (type == SK_TCP)
{
if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
(sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
log(L_WARN "SOCK: Cannot get local IP address for TCP<");
if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
}
if (sk_setup(t) < 0)
{
/* FIXME: Call err_hook instead ? */
log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);
/* FIXME: handle it better in rfree() */
close(t->fd);
t->fd = -1;
sk_close(t);
t = NULL;
}
else
{
birdloop_add_socket(s->loop, t);
sk_alloc_bufs(t);
}
if (sock_lock)
DG_UNLOCK(sock_lock);
if (t)
s->rx_hook(t, 0);
return 1;
}
#ifdef HAVE_LIBSSH
/*
* Return SSH_OK or SSH_AGAIN or SSH_ERROR
*/
static int
sk_ssh_connect(sock *s)
{
s->fd = ssh_get_fd(s->ssh->session);
/* Big fall thru automata */
switch (s->ssh->state)
{
case SK_SSH_CONNECT:
{
switch (ssh_connect(s->ssh->session))
{
case SSH_AGAIN:
/* A quick look into libSSH shows that ssh_get_fd() should return non-(-1)
* after SSH_AGAIN is returned by ssh_connect(). This is however nowhere
* documented but our code relies on that.
*/
return SSH_AGAIN;
case SSH_OK:
break;
default:
return SSH_ERROR;
}
} /* fallthrough */
case SK_SSH_SERVER_KNOWN:
{
s->ssh->state = SK_SSH_SERVER_KNOWN;
if (s->ssh->server_hostkey_path)
{
int server_identity_is_ok = 1;
#ifdef HAVE_SSH_OLD_SERVER_VALIDATION_API
#define ssh_session_is_known_server ssh_is_server_known
#define SSH_KNOWN_HOSTS_OK SSH_SERVER_KNOWN_OK
#define SSH_KNOWN_HOSTS_UNKNOWN SSH_SERVER_NOT_KNOWN
#define SSH_KNOWN_HOSTS_CHANGED SSH_SERVER_KNOWN_CHANGED
#define SSH_KNOWN_HOSTS_NOT_FOUND SSH_SERVER_FILE_NOT_FOUND
#define SSH_KNOWN_HOSTS_ERROR SSH_SERVER_ERROR
#define SSH_KNOWN_HOSTS_OTHER SSH_SERVER_FOUND_OTHER
#endif
/* Check server identity */
switch (ssh_session_is_known_server(s->ssh->session))
{
#define LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s,msg,args...) log(L_WARN "SSH Identity %s@%s:%u: " msg, (s)->ssh->username, (s)->host, (s)->dport, ## args);
case SSH_KNOWN_HOSTS_OK:
/* The server is known and has not changed. */
break;
case SSH_KNOWN_HOSTS_UNKNOWN:
LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server is unknown, its public key was not found in the known host file %s", s->ssh->server_hostkey_path);
server_identity_is_ok = 0;
break;
case SSH_KNOWN_HOSTS_CHANGED:
LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server key has changed. Either you are under attack or the administrator changed the key.");
server_identity_is_ok = 0;
break;
case SSH_KNOWN_HOSTS_NOT_FOUND:
LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The known host file %s does not exist", s->ssh->server_hostkey_path);
server_identity_is_ok = 0;
break;
case SSH_KNOWN_HOSTS_ERROR:
LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "Some error happened");
server_identity_is_ok = 0;
break;
case SSH_KNOWN_HOSTS_OTHER:
LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server gave use a key of a type while we had an other type recorded. " \
"It is a possible attack.");
server_identity_is_ok = 0;
break;
}
if (!server_identity_is_ok)
return SSH_ERROR;
}
} /* fallthrough */
case SK_SSH_USERAUTH:
{
s->ssh->state = SK_SSH_USERAUTH;
switch (ssh_userauth_publickey_auto(s->ssh->session, NULL, NULL))
{
case SSH_AUTH_AGAIN:
return SSH_AGAIN;
case SSH_AUTH_SUCCESS:
break;
default:
return SSH_ERROR;
}
} /* fallthrough */
case SK_SSH_CHANNEL:
{
s->ssh->state = SK_SSH_CHANNEL;
s->ssh->channel = ssh_channel_new(s->ssh->session);
if (s->ssh->channel == NULL)
return SSH_ERROR;
} /* fallthrough */
case SK_SSH_SESSION:
{
s->ssh->state = SK_SSH_SESSION;
switch (ssh_channel_open_session(s->ssh->channel))
{
case SSH_AGAIN:
return SSH_AGAIN;
case SSH_OK:
break;
default:
return SSH_ERROR;
}
} /* fallthrough */
case SK_SSH_SUBSYSTEM:
{
s->ssh->state = SK_SSH_SUBSYSTEM;
if (s->ssh->subsystem)
{
switch (ssh_channel_request_subsystem(s->ssh->channel, s->ssh->subsystem))
{
case SSH_AGAIN:
return SSH_AGAIN;
case SSH_OK:
break;
default:
return SSH_ERROR;
}
}
} /* fallthrough */
case SK_SSH_ESTABLISHED:
s->ssh->state = SK_SSH_ESTABLISHED;
}
return SSH_OK;
}
/*
* Return file descriptor number if success
* Return -1 if failed
*/
static int
sk_open_ssh(sock *s)
{
if (!s->ssh)
bug("sk_open() sock->ssh is not allocated");
ssh_session sess = ssh_new();
if (sess == NULL)
ERR2("Cannot create a ssh session");
s->ssh->session = sess;
const int verbosity = SSH_LOG_NOLOG;
ssh_options_set(sess, SSH_OPTIONS_LOG_VERBOSITY, &verbosity);
ssh_options_set(sess, SSH_OPTIONS_HOST, s->host);
ssh_options_set(sess, SSH_OPTIONS_PORT, &(s->dport));
/* TODO: Add SSH_OPTIONS_BINDADDR */
ssh_options_set(sess, SSH_OPTIONS_USER, s->ssh->username);
if (s->ssh->server_hostkey_path)
ssh_options_set(sess, SSH_OPTIONS_KNOWNHOSTS, s->ssh->server_hostkey_path);
if (s->ssh->client_privkey_path)
ssh_options_set(sess, SSH_OPTIONS_IDENTITY, s->ssh->client_privkey_path);
ssh_set_blocking(sess, 0);
switch (sk_ssh_connect(s))
{
case SSH_AGAIN:
break;
case SSH_OK:
sk_ssh_connected(s);
break;
case SSH_ERROR:
ERR2(ssh_get_error(sess));
break;
}
return ssh_get_fd(sess);
err:
return -1;
}
#endif
/**
* sk_open - open a socket
* @loop: loop
* @s: socket
*
* This function takes a socket resource created by sk_new() and
* initialized by the user and binds a corresponding network connection
* to it.
*
* Result: 0 for success, -1 for an error.
*/
int
sk_open(sock *s, struct birdloop *loop)
{
int af = AF_UNSPEC;
int fd = -1;
int do_bind = 0;
int bind_port = 0;
ip_addr bind_addr = IPA_NONE;
sockaddr sa;
if (s->type <= SK_IP)
{
/*
* For TCP/IP sockets, Address family (IPv4 or IPv6) can be specified either
* explicitly (SK_IPV4 or SK_IPV6) or implicitly (based on saddr, daddr).
* But the specifications have to be consistent.
*/
switch (s->subtype)
{
case 0:
ASSERT(ipa_zero(s->saddr) || ipa_zero(s->daddr) ||
(ipa_is_ip4(s->saddr) == ipa_is_ip4(s->daddr)));
af = (ipa_is_ip4(s->saddr) || ipa_is_ip4(s->daddr)) ? AF_INET : AF_INET6;
break;
case SK_IPV4:
ASSERT(ipa_zero(s->saddr) || ipa_is_ip4(s->saddr));
ASSERT(ipa_zero(s->daddr) || ipa_is_ip4(s->daddr));
af = AF_INET;
break;
case SK_IPV6:
ASSERT(ipa_zero(s->saddr) || !ipa_is_ip4(s->saddr));
ASSERT(ipa_zero(s->daddr) || !ipa_is_ip4(s->daddr));
af = AF_INET6;
break;
default:
bug("Invalid subtype %d", s->subtype);
}
}
switch (s->type)
{
case SK_TCP_ACTIVE:
s->ttx = ""; /* Force s->ttx != s->tpos */
/* Fall thru */
case SK_TCP_PASSIVE:
fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
bind_port = s->sport;
bind_addr = s->saddr;
do_bind = bind_port || ipa_nonzero(bind_addr);
break;
#ifdef HAVE_LIBSSH
case SK_SSH_ACTIVE:
s->ttx = ""; /* Force s->ttx != s->tpos */
fd = sk_open_ssh(s);
break;
#endif
case SK_UDP:
fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
bind_port = s->sport;
bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
do_bind = 1;
break;
case SK_IP:
fd = socket(af, SOCK_RAW, s->dport);
bind_port = 0;
bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
do_bind = ipa_nonzero(bind_addr);
break;
case SK_MAGIC:
af = 0;
fd = s->fd;
break;
default:
bug("sk_open() called for invalid sock type %d", s->type);
}
if (fd < 0)
ERR("socket");
s->af = af;
s->fd = fd;
if (sk_setup(s) < 0)
goto err;
if (do_bind)
{
if (bind_port)
{
int y = 1;
if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
ERR2("SO_REUSEADDR");
#ifdef CONFIG_NO_IFACE_BIND
/* Workaround missing ability to bind to an iface */
if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
{
if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &y, sizeof(y)) < 0)
ERR2("SO_REUSEPORT");
}
#endif
}
else
if (s->flags & SKF_HIGH_PORT)
if (sk_set_high_port(s) < 0)
log(L_WARN "Socket error: %s%#m", s->err);
if (s->flags & SKF_FREEBIND)
if (sk_set_freebind(s) < 0)
log(L_WARN "Socket error: %s%#m", s->err);
sockaddr_fill(&sa, s->af, bind_addr, s->iface, bind_port);
if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
ERR2("bind");
}
if (s->password)
if (sk_set_md5_auth(s, s->saddr, s->daddr, -1, s->iface, s->password, 0) < 0)
goto err;
switch (s->type)
{
case SK_TCP_ACTIVE:
sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
sk_tcp_connected(s);
else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
ERR2("connect");
break;
case SK_TCP_PASSIVE:
if (listen(fd, 8) < 0)
ERR2("listen");
break;
case SK_SSH_ACTIVE:
case SK_MAGIC:
break;
default:
sk_alloc_bufs(s);
}
birdloop_add_socket(loop, s);
return 0;
err:
close(fd);
s->fd = -1;
return -1;
}
int
sk_open_unix(sock *s, struct birdloop *loop, char *name)
{
struct sockaddr_un sa;
int fd;
/* We are sloppy during error (leak fd and not set s->err), but we die anyway */
fd = socket(AF_UNIX, SOCK_STREAM, 0);
if (fd < 0)
return -1;
if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
return -1;
/* Path length checked in test_old_bird() but we may need unix sockets for other reasons in future */
ASSERT_DIE(strlen(name) < sizeof(sa.sun_path));
sa.sun_family = AF_UNIX;
strcpy(sa.sun_path, name);
if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
return -1;
if (listen(fd, 8) < 0)
return -1;
s->fd = fd;
birdloop_add_socket(loop, s);
return 0;
}
#define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \
CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)
#define CMSG_TX_SPACE MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)
static void
sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
{
if (sk_is_ipv4(s))
sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
else
sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
}
static void
sk_process_cmsgs(sock *s, struct msghdr *msg)
{
struct cmsghdr *cm;
s->laddr = IPA_NONE;
s->lifindex = 0;
s->rcv_ttl = -1;
for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
{
if ((cm->cmsg_level == SOL_IP) && sk_is_ipv4(s))
{
sk_process_cmsg4_pktinfo(s, cm);
sk_process_cmsg4_ttl(s, cm);
}
if ((cm->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
{
sk_process_cmsg6_pktinfo(s, cm);
sk_process_cmsg6_ttl(s, cm);
}
}
}
static inline int
sk_sendmsg(sock *s)
{
struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
byte cmsg_buf[CMSG_TX_SPACE];
sockaddr dst;
int flags = 0;
sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);
struct msghdr msg = {
.msg_name = &dst.sa,
.msg_namelen = SA_LEN(dst),
.msg_iov = &iov,
.msg_iovlen = 1
};
#ifdef CONFIG_DONTROUTE_UNICAST
/* FreeBSD silently changes TTL to 1 when MSG_DONTROUTE is used, therefore we
cannot use it for other cases (e.g. when TTL security is used). */
if (ipa_is_ip4(s->daddr) && ip4_is_unicast(ipa_to_ip4(s->daddr)) && (s->ttl == 1))
flags = MSG_DONTROUTE;
#endif
#ifdef CONFIG_USE_HDRINCL
byte hdr[20];
struct iovec iov2[2] = { {hdr, 20}, iov };
if (s->flags & SKF_HDRINCL)
{
sk_prepare_ip_header(s, hdr, iov.iov_len);
msg.msg_iov = iov2;
msg.msg_iovlen = 2;
}
#endif
if (s->flags & SKF_PKTINFO)
sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));
return sendmsg(s->fd, &msg, flags);
}
static inline int
sk_recvmsg(sock *s)
{
struct iovec iov = {s->rbuf, s->rbsize};
byte cmsg_buf[CMSG_RX_SPACE];
sockaddr src;
struct msghdr msg = {
.msg_name = &src.sa,
.msg_namelen = sizeof(src), // XXXX ??
.msg_iov = &iov,
.msg_iovlen = 1,
.msg_control = cmsg_buf,
.msg_controllen = sizeof(cmsg_buf),
.msg_flags = 0
};
int rv = recvmsg(s->fd, &msg, 0);
if (rv < 0)
return rv;
//ifdef IPV4
// if (cf_type == SK_IP)
// rv = ipv4_skip_header(pbuf, rv);
//endif
sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
sk_process_cmsgs(s, &msg);
if (msg.msg_flags & MSG_TRUNC)
s->flags |= SKF_TRUNCATED;
else
s->flags &= ~SKF_TRUNCATED;
return rv;
}
static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }
_Bool
sk_tx_pending(sock *s)
{
return s->ttx != s->tpos;
}
static int
sk_maybe_write(sock *s)
{
int e;
switch (s->type)
{
case SK_TCP:
case SK_MAGIC:
case SK_UNIX:
while (sk_tx_pending(s))
{
e = write(s->fd, s->ttx, s->tpos - s->ttx);
if (e < 0)
{
if (errno != EINTR && errno != EAGAIN)
{
reset_tx_buffer(s);
/* EPIPE is just a connection close notification during TX */
s->err_hook(s, (errno != EPIPE) ? errno : 0);
return -1;
}
return 0;
}
s->ttx += e;
}
reset_tx_buffer(s);
return 1;
#ifdef HAVE_LIBSSH
case SK_SSH:
while (sk_tx_pending(s))
{
e = ssh_channel_write(s->ssh->channel, s->ttx, s->tpos - s->ttx);
if (e < 0)
{
s->err = ssh_get_error(s->ssh->session);
s->err_hook(s, ssh_get_error_code(s->ssh->session));
reset_tx_buffer(s);
/* EPIPE is just a connection close notification during TX */
s->err_hook(s, (errno != EPIPE) ? errno : 0);
return -1;
}
s->ttx += e;
}
reset_tx_buffer(s);
return 1;
#endif
case SK_UDP:
case SK_IP:
{
if (s->tbuf == s->tpos)
return 1;
e = sk_sendmsg(s);
if (e < 0)
{
if (errno != EINTR && errno != EAGAIN)
{
reset_tx_buffer(s);
s->err_hook(s, errno);
return -1;
}
if (!s->tx_hook)
reset_tx_buffer(s);
return 0;
}
reset_tx_buffer(s);
return 1;
}
default:
bug("sk_maybe_write: unknown socket type %d", s->type);
}
}
int
sk_rx_ready(sock *s)
{
int rv;
struct pollfd pfd = { .fd = s->fd };
pfd.events |= POLLIN;
redo:
rv = poll(&pfd, 1, 0);
if ((rv < 0) && (errno == EINTR || errno == EAGAIN))
goto redo;
return rv;
}
/**
* sk_send - send data to a socket
* @s: socket
* @len: number of bytes to send
*
* This function sends @len bytes of data prepared in the
* transmit buffer of the socket @s to the network connection.
* If the packet can be sent immediately, it does so and returns
* 1, else it queues the packet for later processing, returns 0
* and calls the @tx_hook of the socket when the tranmission
* takes place.
*/
int
sk_send(sock *s, unsigned len)
{
s->ttx = s->tbuf;
s->tpos = s->tbuf + len;
int e = sk_maybe_write(s);
if (e == 0) /* Trigger thread poll reload to poll this socket's write. */
socket_changed(s);
return e;
}
/**
* sk_send_to - send data to a specific destination
* @s: socket
* @len: number of bytes to send
* @addr: IP address to send the packet to
* @port: port to send the packet to
*
* This is a sk_send() replacement for connection-less packet sockets
* which allows destination of the packet to be chosen dynamically.
* Raw IP sockets should use 0 for @port.
*/
int
sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
{
s->daddr = addr;
if (port)
s->dport = port;
s->ttx = s->tbuf;
s->tpos = s->tbuf + len;
return sk_maybe_write(s);
}
/*
int
sk_send_full(sock *s, unsigned len, struct iface *ifa,
ip_addr saddr, ip_addr daddr, unsigned dport)
{
s->iface = ifa;
s->saddr = saddr;
s->daddr = daddr;
s->dport = dport;
s->ttx = s->tbuf;
s->tpos = s->tbuf + len;
return sk_maybe_write(s);
}
*/
static void
call_rx_hook(sock *s, int size)
{
if (s->rx_hook(s, size))
{
/* We need to be careful since the socket could have been deleted by the hook */
if (s->loop->sock_active == s)
s->rpos = s->rbuf;
}
}
#ifdef HAVE_LIBSSH
static int
sk_read_ssh(sock *s)
{
ssh_channel rchans[2] = { s->ssh->channel, NULL };
struct timeval timev = { 1, 0 };
if (ssh_channel_select(rchans, NULL, NULL, &timev) == SSH_EINTR)
return 1; /* Try again */
if (ssh_channel_is_eof(s->ssh->channel) != 0)
{
/* The remote side is closing the connection */
s->err_hook(s, 0);
return 0;
}
if (rchans[0] == NULL)
return 0; /* No data is available on the socket */
const uint used_bytes = s->rpos - s->rbuf;
const int read_bytes = ssh_channel_read_nonblocking(s->ssh->channel, s->rpos, s->rbsize - used_bytes, 0);
if (read_bytes > 0)
{
/* Received data */
s->rpos += read_bytes;
call_rx_hook(s, used_bytes + read_bytes);
return 1;
}
else if (read_bytes == 0)
{
if (ssh_channel_is_eof(s->ssh->channel) != 0)
{
/* The remote side is closing the connection */
s->err_hook(s, 0);
}
}
else
{
s->err = ssh_get_error(s->ssh->session);
s->err_hook(s, ssh_get_error_code(s->ssh->session));
}
return 0; /* No data is available on the socket */
}
#endif
/* sk_read() and sk_write() are called from BFD's event loop */
static inline int
sk_read_noflush(sock *s, int revents)
{
switch (s->type)
{
case SK_TCP_PASSIVE:
return sk_passive_connected(s, SK_TCP);
case SK_UNIX_PASSIVE:
return sk_passive_connected(s, SK_UNIX);
case SK_TCP:
case SK_UNIX:
{
int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);
if (c < 0)
{
if (errno != EINTR && errno != EAGAIN)
s->err_hook(s, errno);
else if (errno == EAGAIN && !(revents & POLLIN))
{
log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents);
s->err_hook(s, 0);
}
}
else if (!c)
s->err_hook(s, 0);
else
{
s->rpos += c;
call_rx_hook(s, s->rpos - s->rbuf);
return 1;
}
return 0;
}
#ifdef HAVE_LIBSSH
case SK_SSH:
return sk_read_ssh(s);
#endif
case SK_MAGIC:
return s->rx_hook(s, 0);
default:
{
int e = sk_recvmsg(s);
if (e < 0)
{
if (errno != EINTR && errno != EAGAIN)
s->err_hook(s, errno);
return 0;
}
s->rpos = s->rbuf + e;
s->rx_hook(s, e);
return 1;
}
}
}
int
sk_read(sock *s, int revents)
{
int e = sk_read_noflush(s, revents);
tmp_flush();
return e;
}
static inline int
sk_write_noflush(sock *s)
{
switch (s->type)
{
case SK_TCP_ACTIVE:
{
sockaddr sa;
sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
if (connect(s->fd, &sa.sa, SA_LEN(sa)) >= 0 || errno == EISCONN)
sk_tcp_connected(s);
else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
s->err_hook(s, errno);
return 0;
}
#ifdef HAVE_LIBSSH
case SK_SSH_ACTIVE:
{
switch (sk_ssh_connect(s))
{
case SSH_OK:
sk_ssh_connected(s);
break;
case SSH_AGAIN:
return 1;
case SSH_ERROR:
s->err = ssh_get_error(s->ssh->session);
s->err_hook(s, ssh_get_error_code(s->ssh->session));
break;
}
return 0;
}
#endif
default:
if (sk_tx_pending(s) && sk_maybe_write(s) > 0)
{
if (s->tx_hook)
s->tx_hook(s);
return 1;
}
return 0;
}
}
int
sk_write(sock *s)
{
int e = sk_write_noflush(s);
tmp_flush();
return e;
}
int sk_is_ipv4(sock *s)
{ return s->af == AF_INET; }
int sk_is_ipv6(sock *s)
{ return s->af == AF_INET6; }
void
sk_err(sock *s, int revents)
{
int se = 0, sse = sizeof(se);
if ((s->type != SK_MAGIC) && (revents & POLLERR))
if (getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &se, &sse) < 0)
{
log(L_ERR "IO: Socket error: SO_ERROR: %m");
se = 0;
}
s->err_hook(s, se);
tmp_flush();
}
void
sk_dump_all(void)
{
node *n;
sock *s;
debug("Open sockets:\n");
WALK_LIST(n, main_birdloop.sock_list)
{
s = SKIP_BACK(sock, n, n);
debug("%p ", s);
sk_dump(&s->r, 3);
}
debug("\n");
}
/*
* Internal event log and watchdog
*/
#define EVENT_LOG_LENGTH 32
struct event_log_entry
{
void *hook;
void *data;
btime timestamp;
btime duration;
};
static struct event_log_entry event_log[EVENT_LOG_LENGTH];
static struct event_log_entry *event_open;
static int event_log_pos, event_log_num, watchdog_active;
static btime last_io_time;
static btime loop_time;
static void
io_update_time(void)
{
last_io_time = current_time();
if (event_open)
{
event_open->duration = last_io_time - event_open->timestamp;
if (event_open->duration > config->latency_limit)
log(L_WARN "Event 0x%p 0x%p took %u.%03u ms",
event_open->hook, event_open->data, (uint) (event_open->duration TO_MS), (uint) (event_open->duration % 1000));
event_open = NULL;
}
}
/**
* io_log_event - mark approaching event into event log
* @hook: event hook address
* @data: event data address
*
* Store info (hook, data, timestamp) about the following internal event into
* a circular event log (@event_log). When latency tracking is enabled, the log
* entry is kept open (in @event_open) so the duration can be filled later.
*/
void
io_log_event(void *hook, void *data)
{
if (config->latency_debug)
io_update_time();
struct event_log_entry *en = event_log + event_log_pos;
en->hook = hook;
en->data = data;
en->timestamp = last_io_time;
en->duration = 0;
event_log_num++;
event_log_pos++;
event_log_pos %= EVENT_LOG_LENGTH;
event_open = config->latency_debug ? en : NULL;
}
static inline void
io_close_event(void)
{
if (event_open)
io_update_time();
}
void
io_log_dump(void)
{
int i;
log(L_DEBUG "Event log:");
for (i = 0; i < EVENT_LOG_LENGTH; i++)
{
struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
if (en->hook)
log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
(int) ((last_io_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
}
}
void
watchdog_sigalrm(int sig UNUSED)
{
/* Update last_io_time and duration, but skip latency check */
config->latency_limit = 0xffffffff;
io_update_time();
debug_safe("Watchdog timer timed out\n");
/* We want core dump */
abort();
}
static inline void
watchdog_start1(void)
{
io_update_time();
loop_time = last_io_time;
}
static inline void
watchdog_start(void)
{
io_update_time();
loop_time = last_io_time;
event_log_num = 0;
if (config->watchdog_timeout)
{
alarm(config->watchdog_timeout);
watchdog_active = 1;
}
}
static inline void
watchdog_stop(void)
{
io_update_time();
if (watchdog_active)
{
alarm(0);
watchdog_active = 0;
}
btime duration = last_io_time - loop_time;
if (duration > config->watchdog_warning)
log(L_WARN "I/O loop cycle took %u.%03u ms for %d events",
(uint) (duration TO_MS), (uint) (duration % 1000), event_log_num);
}
/*
* Main I/O Loop
*/
void
io_init(void)
{
init_list(&main_birdloop.sock_list);
ev_init_list(&global_event_list, &main_birdloop, "Global event list");
ev_init_list(&global_work_list, &main_birdloop, "Global work list");
ev_init_list(&main_birdloop.event_list, &main_birdloop, "Global fast event list");
krt_io_init();
// XXX init_times();
// XXX update_times();
boot_time = current_time();
u64 now = (u64) current_real_time();
srandom((uint) (now ^ (now >> 32)));
}
static int short_loops = 0;
#define SHORT_LOOP_MAX 10
#define WORK_EVENTS_MAX 10
sock *stored_sock;
void
io_loop(void)
{
int poll_tout, timeout;
int events, pout;
timer *t;
struct pfd pfd;
BUFFER_INIT(pfd.pfd, &root_pool, 16);
BUFFER_INIT(pfd.loop, &root_pool, 16);
watchdog_start1();
for(;;)
{
times_update();
events = ev_run_list(&global_event_list);
events = ev_run_list_limited(&global_work_list, WORK_EVENTS_MAX) || events;
events = ev_run_list(&main_birdloop.event_list) || events;
timers_fire(&main_birdloop.time, 1);
io_close_event();
// FIXME
poll_tout = (events ? 0 : 3000); /* Time in milliseconds */
if (t = timers_first(&main_birdloop.time))
{
times_update();
timeout = (tm_remains(t) TO_MS) + 1;
poll_tout = MIN(poll_tout, timeout);
}
BUFFER_FLUSH(pfd.pfd);
BUFFER_FLUSH(pfd.loop);
pipe_pollin(&main_birdloop.thread->wakeup, &pfd);
sockets_prepare(&main_birdloop, &pfd);
/*
* Yes, this is racy. But even if the signal comes before this test
* and entering poll(), it gets caught on the next timer tick.
*/
if (async_config_flag)
{
io_log_event(async_config, NULL);
async_config();
async_config_flag = 0;
continue;
}
if (async_dump_flag)
{
io_log_event(async_dump, NULL);
async_dump();
async_dump_flag = 0;
continue;
}
if (async_shutdown_flag)
{
io_log_event(async_shutdown, NULL);
async_shutdown();
async_shutdown_flag = 0;
continue;
}
/* And finally enter poll() to find active sockets */
watchdog_stop();
birdloop_leave(&main_birdloop);
pout = poll(pfd.pfd.data, pfd.pfd.used, poll_tout);
birdloop_enter(&main_birdloop);
watchdog_start();
if (pout < 0)
{
if (errno == EINTR || errno == EAGAIN)
continue;
bug("poll: %m");
}
if (pout)
{
if (pfd.pfd.data[0].revents & POLLIN)
{
/* IO loop reload requested */
pipe_drain(&main_birdloop.thread->wakeup);
atomic_fetch_and_explicit(&main_birdloop.thread_transition, ~LTT_PING, memory_order_acq_rel);
continue;
}
times_update();
/* guaranteed to be non-empty */
main_birdloop.sock_active = SKIP_BACK(sock, n, HEAD(main_birdloop.sock_list));
while (main_birdloop.sock_active)
{
sock *s = main_birdloop.sock_active;
if (s->index != -1)
{
int e;
int steps;
steps = MAX_STEPS;
if (s->fast_rx && (pfd.pfd.data[s->index].revents & POLLIN) && s->rx_hook)
do
{
steps--;
io_log_event(s->rx_hook, s->data);
e = sk_read(s, pfd.pfd.data[s->index].revents);
}
while (e && (main_birdloop.sock_active == s) && s->rx_hook && steps);
if (s != main_birdloop.sock_active)
continue;
steps = MAX_STEPS;
if (pfd.pfd.data[s->index].revents & POLLOUT)
do
{
steps--;
io_log_event(s->tx_hook, s->data);
e = sk_write(s);
}
while (e && (main_birdloop.sock_active == s) && steps);
if (s != main_birdloop.sock_active)
continue;
}
main_birdloop.sock_active = sk_next(s);
}
short_loops++;
if (events && (short_loops < SHORT_LOOP_MAX))
continue;
short_loops = 0;
int count = 0;
main_birdloop.sock_active = stored_sock;
if (main_birdloop.sock_active == NULL)
main_birdloop.sock_active = SKIP_BACK(sock, n, HEAD(main_birdloop.sock_list));
while (main_birdloop.sock_active && count < MAX_RX_STEPS)
{
sock *s = main_birdloop.sock_active;
if (s->index == -1)
goto next2;
if (!s->fast_rx && (pfd.pfd.data[s->index].revents & POLLIN) && s->rx_hook)
{
count++;
io_log_event(s->rx_hook, s->data);
sk_read(s, pfd.pfd.data[s->index].revents);
if (s != main_birdloop.sock_active)
continue;
}
if (pfd.pfd.data[s->index].revents & (POLLHUP | POLLERR))
{
sk_err(s, pfd.pfd.data[s->index].revents);
if (s != main_birdloop.sock_active)
continue;
}
next2: ;
main_birdloop.sock_active = sk_next(s);
}
stored_sock = main_birdloop.sock_active;
}
}
}
void
test_old_bird(char *path)
{
int fd;
struct sockaddr_un sa;
fd = socket(AF_UNIX, SOCK_STREAM, 0);
if (fd < 0)
die("Cannot create socket: %m");
if (strlen(path) >= sizeof(sa.sun_path))
die("Socket path too long");
bzero(&sa, sizeof(sa));
sa.sun_family = AF_UNIX;
strcpy(sa.sun_path, path);
if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == 0)
die("I found another BIRD running.");
close(fd);
}