0
0
mirror of https://gitlab.nic.cz/labs/bird.git synced 2024-12-22 17:51:53 +00:00
bird/flock/container.c
2024-10-20 21:09:32 +02:00

1204 lines
27 KiB
C

#include "flock/flock.h"
#include "lib/birdlib.h"
#include "lib/cbor.h"
#include "lib/io-loop.h"
#include "lib/hash.h"
#include <dirent.h>
#include <poll.h>
#include <sched.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/wait.h>
#include <unistd.h>
static struct hypervisor_container_forker {
sock *s;
pool *p;
struct birdloop *loop;
HASH(struct container_runtime) hash;
struct container_runtime *cur_crt;
int ctl[2]; /* socketpair filedescriptors */
} hcf;
static struct container_config {
const char *hostname;
const char *workdir;
const char *basedir;
} ccf;
struct container_runtime {
struct container_runtime *next;
struct container_config ccf;
uint hash;
pid_t pid;
sock *s;
struct container_operation_callback {
callback cb;
sock *s;
void *data;
} *ccc;
char data[];
};
#define CRT_KEY(c) c->ccf.hostname, c->hash
#define CRT_NEXT(c) c->next
#define CRT_EQ(a,h,b,i) ((h) == (i)) && (!strcmp(a,b))
#define CRT_FN(a,h) h
static sig_atomic_t poweroff, zombie;
static void
container_poweroff_sighandler(int signo)
{
poweroff = signo;
}
static void
container_child_sighandler(int signo UNUSED)
{
zombie = 1;
}
static int container_forker_fd = -1;
static void
container_poweroff(int fd, int sig)
{
struct {
struct cbor_writer w;
struct cbor_writer_stack_item si[2];
byte buf[128];
} _cw;
struct cbor_writer *cw = cbor_writer_init(&_cw.w, 2, _cw.buf, sizeof _cw.buf);
CBOR_PUT_MAP(cw)
{
cbor_put_int(cw, -4);
cbor_put_int(cw, sig);
}
ASSERT_DIE(cbor_writer_done(cw) == 1);
s64 sz = cw->data.pos - cw->data.start;
ASSERT_DIE(write(fd, cw->data.start, sz) == sz);
unlink("/dev/log");
}
static void
container_zombie(void)
{
zombie = 0;
log(L_INFO "Zombie elimination routine invoked.");
while (1) {
int status;
pid_t p = waitpid(-1, &status, WNOHANG);
if (p < 0)
{
if (errno != ECHILD)
log(L_ERR "Zombie elimination failed: %m");
return;
}
if (p == 0)
return;
const char *coreinfo = WCOREDUMP(status) ? " (core dumped)" : "";
if (WIFEXITED(status))
log(L_INFO "Process %d ended with status %d%s", p, WEXITSTATUS(status), coreinfo);
else if (WIFSIGNALED(status))
log(L_INFO "Process %d exited by signal %d (%s)%s", p, WTERMSIG(status), strsignal(WTERMSIG(status)), coreinfo);
else
log(L_ERR "Process %d exited with a strange status %d", p, status);
}
}
//#define SYSCALL(x, ...) ({ int _e = x(__VA_ARGS__); if (_e < 0) die("Failed to run %s at %s:%d: %m", #x, __FILE__, __LINE__); else log(L_TRACE "OK %s at %s:%d", #x, __FILE__, __LINE__); _e; })
#define SYSCALL(x, ...) ({ int _e = x(__VA_ARGS__); if (_e < 0) die("Failed to run %s at %s:%d: %m", #x, __FILE__, __LINE__); _e; })
#define RUN(...) do { \
pid_t pid = fork(); \
if (pid) waitpid(pid, NULL, 0); \
else { execlp(__VA_ARGS__, NULL); bug("exec %s failed: %m", #__VA_ARGS__); } \
} while (0)
static int
container_getdir(char *path)
{
int e = open(path, O_DIRECTORY | O_PATH | O_RDWR);
if ((e >= 0) || (errno != ENOENT))
return e;
/* Split the path */
char *sl = strrchr(path, '/');
char *name = sl+1;
if (sl == path)
path = "/";
else
{
while (sl && sl[1] == 0)
{
/* Trailing slash removal */
sl[0] = 0;
sl = strrchr(path, '/');
}
if (!sl)
bug("Getdir failed, empty sl");
}
/* Open the parent directory */
*sl = 0;
int fd = container_getdir(path);
if (fd < 0)
return fd;
for (uint i=0; i<256; i++)
{
e = mkdirat(fd, name, 0755);
if ((e < 0) && (errno != EEXIST))
{
close(fd);
return e;
}
e = openat(fd, name, O_DIRECTORY | O_PATH | O_RDWR);
if ((e >= 0) || (errno != ENOENT))
{
close(fd);
return e;
}
}
die("Somebody is messing with the filesystem too badly.");
}
static void
copylink(const char *src, int sz, const char *dst)
{
char *contents = alloca(sz + 1);
int xsz = SYSCALL(readlink, src, contents, sz);
contents[xsz] = 0;
// log(L_INFO "symlinking device %s -> %s", dst, contents);
int se = symlink(contents, dst);
if (se < 0)
// die("failed to symlink %s: %m", dst);
log(L_ERR "failed to symlink %s: %m", dst);
}
#define GETDIR(_path) ({ char *path = _path; int fd = container_getdir(path); if (fd < 0) die("Failed to get the directory %s: %m", path); fd; })
#define MKDIR(_path) close(GETDIR(tmp_strdup(_path)))
struct container_logger {
struct birdloop *loop;
pool *p;
sock *rs;
sock *ws;
};
static int
container_logger_rx(sock *sk, uint sz)
{
struct container_logger *clg = sk->data;
if (clg->ws->tpos + sz >= clg->ws->tbuf + clg->ws->tbsize)
log(L_INFO "dropping a log message");
memcpy(clg->ws->tpos, sk->rbuf, sz);
clg->ws->tpos[sz] = '\n';
if (clg->ws->tpos == clg->ws->tbuf)
sk_send(clg->ws, sz + 1);
else
clg->ws->tpos += sz + 1;
return 0;
}
static void
container_logger_rerr(sock *sk UNUSED, int err)
{
if (!err)
bug("what");
die("Logger receiver socket closed unexpectedly: %s", strerror(err));
}
static void
container_logger_werr(sock *sk UNUSED, int err)
{
die("Logger writer closed unexpectedly: %s", strerror(err));
}
static void
container_init_logger(void)
{
struct birdloop *loop = birdloop_new(&root_pool, DOMAIN_ORDER(proto), 0, "Logger");
birdloop_enter(loop);
pool *p = rp_new(birdloop_pool(loop), birdloop_domain(loop), "Logger pool");
sock *s = sk_new(p);
s->type = SK_MAGIC;
s->rx_hook = container_logger_rx;
s->err_hook = container_logger_rerr;
sk_set_rbsize(s, 16384);
unlink("/dev/log");
s->fd = SYSCALL(socket, AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0);
union {
struct sockaddr sa;
struct sockaddr_un un;
} sa;
sa.un.sun_family = AF_UNIX;
strcpy(sa.un.sun_path, "/dev/log");
SYSCALL(bind, s->fd, &sa.sa, sizeof sa.un);
struct container_logger *clg = mb_allocz(p, sizeof *clg);
clg->loop = loop;
clg->p = p;
clg->rs = s;
s->data = clg;
if (sk_open(clg->rs, clg->loop) < 0)
bug("Logger failed in sk_open(r): %m");
clg->rs->type = SK_UDP;
s = clg->ws = sk_new(p);
s->data = clg;
s->type = SK_MAGIC;
s->err_hook = container_logger_werr;
sk_set_tbsize(s, 16384);
MKDIR("/var/log");
s->fd = SYSCALL(open, "/var/log/syslog", O_WRONLY | O_CREAT, 0640);
if (sk_open(clg->ws, clg->loop) < 0)
bug("Logger failed in sk_open(w): %m");
s->type = SK_UNIX;
birdloop_leave(loop);
}
static void
container_mainloop(int fd)
{
log(L_INFO "container mainloop with fd %d", fd);
signal(SIGTERM, container_poweroff_sighandler);
signal(SIGINT, container_poweroff_sighandler);
signal(SIGCHLD, container_child_sighandler);
/* Move to the workdir */
linpool *lp = lp_new(&root_pool);
if (strchr(ccf.basedir, ',') ||
strchr(ccf.basedir, '=') ||
strchr(ccf.basedir, '\\'))
die("Refusing to work with paths containing chars: ,=\\");
int wfd = GETDIR(lp_sprintf(lp, "%s%s", ccf.workdir[0] == '/' ? "" : "./", ccf.workdir));
SYSCALL(fchdir, wfd);
close(wfd); wfd = -1;
close(GETDIR(lp_strdup(lp, "./upper")));
close(GETDIR(lp_strdup(lp, "./tmp")));
close(GETDIR(lp_strdup(lp, "./root")));
bool cloneroot = !strcmp(ccf.basedir, "/");
bool clonedev = cloneroot;
if (cloneroot)
{
ccf.basedir = "./lower";
close(GETDIR(lp_strdup(lp, "./lower")));
}
const char *overlay_mount_options = lp_sprintf(lp, "lowerdir=%s,upperdir=%s,workdir=%s",
ccf.basedir, "./upper", "./tmp");
SYSCALL(mount, "overlay", "./root", "overlay", 0, overlay_mount_options);
if (cloneroot)
{
#define BINDMOUNT(path) do { \
struct stat s; \
SYSCALL(lstat, "/" #path, &s); \
switch (s.st_mode & S_IFMT) { \
case S_IFLNK: \
copylink("/" #path, s.st_size, "./lower/" #path); \
break; \
case S_IFDIR: \
close(GETDIR(lp_strdup(lp, "./lower/" #path))); \
SYSCALL(mount, "/" #path, "./root/" #path, NULL, MS_BIND | MS_REC, NULL); \
break; \
} \
} while (0)
BINDMOUNT(bin);
BINDMOUNT(etc);
BINDMOUNT(lib);
BINDMOUNT(lib32);
BINDMOUNT(lib64);
BINDMOUNT(libx32);
BINDMOUNT(sbin);
BINDMOUNT(usr);
close(GETDIR(lp_strdup(lp, "./lower/dev/pts")));
symlink("/dev/pts/ptmx", "./lower/dev/ptmx");
DIR *x = opendir("/dev");
for (struct dirent *e; e = readdir(x); )
{
if (!strcmp(e->d_name, ".")
|| !strcmp(e->d_name, "..")
|| !strcmp(e->d_name, "ptmx")
|| !strcmp(e->d_name, "log")
)
continue;
const char *path = lp_sprintf(lp, "./lower/dev/%s", e->d_name);
const char *mpnt = lp_sprintf(lp, "./root/dev/%s", e->d_name);
const char *orig = lp_sprintf(lp, "/dev/%s", e->d_name);
struct stat s;
SYSCALL(lstat, orig, &s);
if (!(s.st_mode & S_IRWXO))
{
// log(L_INFO "ignoring unusable device %s", e->d_name);
continue;
}
switch (s.st_mode & S_IFMT)
{
case S_IFSOCK:
case S_IFIFO:
case S_IFCHR:
case S_IFBLK:
case S_IFREG:
// log(L_INFO "bindmounting device %s", e->d_name);
SYSCALL(close, SYSCALL(open, path, O_WRONLY | O_CREAT, 0666));
int me = mount(orig, mpnt, NULL, MS_BIND, NULL);
if (me < 0)
log(L_ERR "failed to bindmount %s to %s: %m", orig, mpnt);
break;
case S_IFLNK:
copylink(orig, s.st_size, path);
break;
default:
// log(L_INFO "ignoring device %s", e->d_name);
break;
}
}
}
MKDIR("./lower/proc");
MKDIR("./lower/sys");
MKDIR("./lower/run");
MKDIR("./lower/tmp");
SYSCALL(chroot, "./root");
SYSCALL(chdir, "/");
/* Remounting proc to reflect the new PID namespace */
SYSCALL(mount, "proc", "/proc", "proc", MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);
SYSCALL(mount, "sysfs", "/sys", "sysfs", MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);
SYSCALL(mount, "tmpfs", "/run", "tmpfs", MS_NOSUID | MS_NODEV, NULL);
SYSCALL(mount, "tmpfs", "/tmp", "tmpfs", MS_NOSUID | MS_NODEV, NULL);
SYSCALL(mount, "devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, "ptmxmode=600");
container_init_logger();
/* Run worker threads */
struct thread_config tc = {};
bird_thread_commit(&tc);
while (1)
{
struct pollfd pfd = {
.fd = fd,
.events = POLLIN,
};
sigset_t newmask;
sigemptyset(&newmask);
int res = ppoll(&pfd, 1, NULL, &newmask);
if ((res < 0) && (errno != EINTR))
log(L_INFO "ppoll returned -1: %m");
if (poweroff)
{
container_poweroff(fd, poweroff);
exit(0);
}
if (zombie)
container_zombie();
if (pfd.revents & POLLIN)
{
int sfd = -1;
byte buf[128], cbuf[CMSG_SPACE(sizeof sfd)];
struct iovec v = {
.iov_base = buf,
.iov_len = sizeof buf,
};
struct msghdr m = {
.msg_iov = &v,
.msg_iovlen = 1,
.msg_control = &cbuf,
.msg_controllen = sizeof cbuf,
};
int sz = recvmsg(fd, &m, 0);
if (sz < 0)
{
log(L_ERR "error reading data from control socket: %m");
exit(1);
}
if (sz == 0)
{
log(L_INFO "control socket closing, shutdown");
exit(0);
}
ASSERT_DIE(sz >= 3);
ASSERT_DIE(buf[0] == 0xa1);
switch (buf[1]) {
case 0:
ASSERT_DIE(buf[2] == 0xf6);
container_poweroff(fd, 0);
exit(0);
break;
case 0x21:
ASSERT_DIE(buf[2] == 0xf6);
struct cmsghdr *c = CMSG_FIRSTHDR(&m);
memcpy(&sfd, CMSG_DATA(c), sizeof sfd);
int e = fork();
if (e < 0) bug("Cannot fork: %m");
if (e == 0) {
int fd = accept(sfd, NULL, 0);
if (fd < 0)
{
log(L_ERR "failed to accept telnet connection: %m");
exit(1);
}
log(L_INFO "telnet connected");
close(0);
close(1);
// close(2);
dup2(fd, 0);
dup2(fd, 1);
/* Unblock signals */
sigset_t newmask;
sigemptyset(&newmask);
sigprocmask(SIG_SETMASK, &newmask, NULL);
/* Exec the telnet */
// e = execl("/usr/bin/strace", "strace", "-o", "/xxx", "-ff", "telnetd", "-E", "/bin/bash", NULL);
e = execl("/usr/sbin/telnetd", "telnetd", "-E", "/bin/bash", NULL);
log(L_ERR "failed to execl telnet: %m");
exit(42);
}
close(sfd);
break;
default:
log(L_ERR "unknown command on control socket: %d", buf[1]);
break;
}
}
}
}
static uint container_counter = 0;
static void
container_start(void)
{
log(L_INFO "Requested to start a container, name %s, base %s, work %s",
ccf.hostname, ccf.basedir, ccf.workdir);
pid_t pid = fork();
if (pid < 0)
die("Failed to fork container (parent): %m");
if (pid)
{
log(L_INFO "Forked container parent pid %d", pid);
container_counter++;
int status;
pid_t pp = waitpid(pid, &status, 0);
if (pp < 0)
die("Failed to waitpid %d: %m");
if (pp != pid)
die("Waited pid %d instead of %d, wtf", pp, pid);
const char *coreinfo = WCOREDUMP(status) ? " (core dumped)" : "";
if (WIFEXITED(status))
log(L_INFO "Process %d ended with status %d%s", pp, WEXITSTATUS(status), coreinfo);
else if (WIFSIGNALED(status))
log(L_INFO "Process %d exited by signal %d (%s)%s", pp, WTERMSIG(status), strsignal(WTERMSIG(status)), coreinfo);
else
log(L_ERR "Process %d exited with a strange status %d", pp, status);
return;
}
int e = unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWTIME | CLONE_NEWNET);
if (e < 0)
die("Failed to unshare container: %m");
/* Mask signals for forking and other fragile stuff */
sigset_t oldmask;
sigset_t newmask;
sigemptyset(&newmask);
#define KILLABLE_SIGNALS SIGINT, SIGTERM, SIGHUP, SIGQUIT
#define FROB(x) sigaddset(&newmask, x);
MACRO_FOREACH(FROB, KILLABLE_SIGNALS);
#undef FROB
sigprocmask(SIG_BLOCK, &newmask, &oldmask);
/* create socketpair before forking to do communication */
int fds[2];
e = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, fds);
if (e < 0)
die("Failed to create internal socketpair: %m");
log("container fork socketpair: %d %d", fds[0], fds[1]);
pid = fork();
if (pid < 0)
die("Failed to fork container (child): %m");
if (!pid)
{
/* Cleanup in control sockets */
close(hcf.ctl[1]);
close(fds[0]);
ASSERT_DIE(container_counter < 0x6000);
this_thread_id -= (container_counter << 1) + 0x3000 ;
container_mainloop(fds[1]); /* this never returns */
bug("container_mainloop has returned");
}
close(fds[1]);
struct {
struct cbor_writer w;
struct cbor_writer_stack_item si[2];
byte buf[128];
} _cw;
struct cbor_writer *cw = cbor_writer_init(&_cw.w, 2, _cw.buf, sizeof _cw.buf);
CBOR_PUT_MAP(cw)
{
cbor_put_int(cw, -2);
cbor_put_int(cw, pid);
}
struct iovec v = {
.iov_base = cw->data.start,
.iov_len = cw->data.pos - cw->data.start,
};
byte cbuf[CMSG_SPACE(sizeof fds[0])];
struct msghdr m = {
.msg_iov = &v,
.msg_iovlen = 1,
.msg_control = &cbuf,
.msg_controllen = sizeof cbuf,
};
struct cmsghdr *c = CMSG_FIRSTHDR(&m);
c->cmsg_level = SOL_SOCKET;
c->cmsg_type = SCM_RIGHTS;
c->cmsg_len = CMSG_LEN(sizeof fds[0]);
memcpy(CMSG_DATA(c), &fds[0], sizeof fds[0]);
log(L_INFO "Sending socket");
e = sendmsg(container_forker_fd, &m, 0);
if (e < 0)
log(L_ERR "Failed to send socket: %m");
log(L_INFO "Socket sent");
exit(0);
}
/* The Parent */
static void
container_cleanup(struct container_runtime *crt)
{
HASH_REMOVE(hcf.hash, CRT, crt);
sk_close(crt->s);
mb_free(crt);
}
static int
hypervisor_container_rx(sock *sk, uint _sz UNUSED)
{
byte buf[128];
ssize_t sz = read(sk->fd, buf, sizeof buf);
if (sz < 0)
{
log(L_ERR "error reading data from %p (container_rx): %m", sk);
sk_close(sk);
return 0;
}
struct container_runtime *crt = sk->data;
ASSERT_DIE(crt->s == sk);
ASSERT_DIE(sz >= 3);
ASSERT_DIE(buf[0] == 0xa1);
switch (buf[1]) {
case 0x23:
log(L_INFO "container %s ended by signal %d", crt->ccf.hostname, buf[2]);
if (crt->ccc)
callback_activate(&crt->ccc->cb);
container_cleanup(crt);
break;
default:
log(L_ERR "container %s sent a weird message 0x%02x sz %d", crt->ccf.hostname, buf[1], sz);
break;
}
return 0;
}
static void
hypervisor_container_err(sock *sk, int err)
{
struct container_runtime *crt = sk->data;
log(L_ERR "Container %s socket closed unexpectedly: %s", crt->ccf.hostname, strerror(err));
container_cleanup(crt);
}
static int
hypervisor_container_forker_rx(sock *sk, uint sz)
{
if ((sz < 3) || (sk->rxfd < 0))
{
log(L_ERR "Container forker RX strange behavior, what the hell (sz %d, fd %d)", sz, sk->rxfd);
sk_close(sk);
ev_send_loop(&main_birdloop, &poweroff_event);
return 0;
}
ASSERT_DIE(sk->rbuf[0] == 0xa1);
ASSERT_DIE(sk->rbuf[1] == 0x21);
pid_t pid;
if (sk->rbuf[2] < 0x18)
pid = sk->rbuf[2];
else if (sk->rbuf[2] == 24)
pid = sk->rbuf[3];
else if (sk->rbuf[2] == 25)
pid = sk->rbuf[3] << 8 + sk->rbuf[4];
else if (sk->rbuf[3] == 26)
pid = sk->rbuf[3] << 32 + sk->rbuf[4] << 24 + sk->rbuf[5] << 16 + sk->rbuf[6];
else
bug("not implemented");
log(L_INFO "Machine started with PID %d", pid);
sock *skl = sk_new(sk->pool);
skl->type = SK_MAGIC;
skl->rx_hook = hypervisor_container_rx;
skl->err_hook = hypervisor_container_err;
skl->fd = sk->rxfd;
sk_set_tbsize(skl, 1024);
if (sk_open(skl, sk->loop) < 0)
bug("Machine control socket: sk_open failed");
ASSERT_DIE(birdloop_inside(hcf.loop));
ASSERT_DIE(hcf.cur_crt);
skl->data = hcf.cur_crt;
hcf.cur_crt->pid = pid;
hcf.cur_crt->s = skl;
if (hcf.cur_crt->ccc)
callback_activate(&hcf.cur_crt->ccc->cb);
hcf.cur_crt->ccc = NULL;
hcf.cur_crt = NULL;
return 0;
}
static void
hypervisor_container_forker_err(sock *sk, int e UNUSED)
{
sk_close(sk);
}
/* The child */
static void
crt_err(sock *s, int err UNUSED)
{
struct container_runtime *crt = s->data;
s->data = crt->ccc->data;
callback_cancel(&crt->ccc->cb);
mb_free(crt->ccc);
crt->ccc = NULL;
}
int
container_ctl_fd(const char *name)
{
uint h = mem_hash(name, strlen(name));
struct container_runtime *crt = HASH_FIND(hcf.hash, CRT, name, h);
return (crt && crt->s) ? crt->s->fd : -1;
}
static void
container_created(callback *cb)
{
SKIP_BACK_DECLARE(struct container_operation_callback, ccc, cb, cb);
sock *s = ccc->s;
struct {
struct cbor_writer w;
struct cbor_writer_stack_item si[2];
} _cw;
struct cbor_writer *cw = cbor_writer_init(&_cw.w, 2, s->tbuf, s->tbsize);
CBOR_PUT_MAP(cw)
{
cbor_put_int(cw, -1);
cbor_put_string(cw, "OK");
}
sk_send(s, cw->data.pos - cw->data.start);
s->data = ccc->data;
sk_resume_rx(s->loop, s);
mb_free(ccc);
}
void
hypervisor_container_request(sock *s, const char *name, const char *basedir, const char *workdir)
{
birdloop_enter(hcf.loop);
uint h = mem_hash(name, strlen(name));
struct container_runtime *crt = HASH_FIND(hcf.hash, CRT, name, h);
if (crt)
{
struct {
struct cbor_writer w;
struct cbor_writer_stack_item si[2];
} _cw;
struct cbor_writer *cw = cbor_writer_init(&_cw.w, 2, s->tbuf, s->tbsize);
CBOR_PUT_MAP(cw)
{
cbor_put_int(cw, -127);
cbor_put_string(cw, "BAD: Already exists");
}
sk_send(s, cw->data.pos - cw->data.start);
birdloop_leave(hcf.loop);
return;
}
uint nlen = strlen(name),
blen = strlen(basedir),
wlen = strlen(workdir);
crt = mb_allocz(hcf.p, sizeof *crt + nlen + blen + wlen + 3);
char *pos = crt->data;
crt->ccf.hostname = pos;
memcpy(pos, name, nlen + 1);
pos += nlen + 1;
crt->ccf.workdir = pos;
memcpy(pos, workdir, wlen + 1);
pos += wlen + 1;
crt->ccf.basedir = pos;
memcpy(pos, basedir, blen + 1);
pos += blen + 1;
crt->hash = h;
struct container_operation_callback *ccc = mb_alloc(s->pool, sizeof *ccc);
*ccc = (struct container_operation_callback) {
.s = s,
.data = s->data,
};
callback_init(&ccc->cb, container_created, s->loop);
crt->ccc = ccc;
HASH_INSERT(hcf.hash, CRT, crt);
ASSERT_DIE(hcf.cur_crt == NULL);
hcf.cur_crt = crt;
log(L_INFO "requesting machine creation, socket %p", s);
struct {
struct cbor_writer w;
struct cbor_writer_stack_item si[2];
} _cw;
struct cbor_writer *cw = cbor_writer_init(&_cw.w, 2, s->tbuf, s->tbsize);
CBOR_PUT_MAP(cw)
{
cbor_put_int(cw, 0);
cbor_put_string(cw, name);
cbor_put_int(cw, 1);
cbor_put_string(cw, basedir);
cbor_put_int(cw, 2);
cbor_put_string(cw, workdir);
}
sk_send(s, cw->data.pos - cw->data.start);
s->err_paused = crt_err;
s->data = crt;
sk_pause_rx(s->loop, s);
birdloop_leave(hcf.loop);
}
static void
container_stopped(callback *cb)
{
SKIP_BACK_DECLARE(struct container_operation_callback, ccc, cb, cb);
sock *s = ccc->s;
struct {
struct cbor_writer w;
struct cbor_writer_stack_item si[2];
} _cw;
struct cbor_writer *cw = cbor_writer_init(&_cw.w, 2, s->tbuf, s->tbsize);
CBOR_PUT_MAP(cw)
{
cbor_put_int(cw, -1);
cbor_put_string(cw, "OK");
}
sk_send(s, cw->data.pos - cw->data.start);
s->data = ccc->data;
sk_resume_rx(s->loop, s);
mb_free(ccc);
}
void
hypervisor_container_shutdown(sock *s, const char *name)
{
birdloop_enter(hcf.loop);
uint h = mem_hash(name, strlen(name));
struct container_runtime *crt = HASH_FIND(hcf.hash, CRT, name, h);
if (!crt || !crt->s)
{
struct {
struct cbor_writer w;
struct cbor_writer_stack_item si[2];
} _cw;
struct cbor_writer *cw = cbor_writer_init(&_cw.w, 2, s->tbuf, s->tbsize);
CBOR_PUT_MAP(cw)
{
cbor_put_int(cw, -127);
cbor_put_string(cw, "BAD: Not found");
}
sk_send(s, cw->data.pos - cw->data.start);
birdloop_leave(hcf.loop);
return;
}
struct {
struct cbor_writer w;
struct cbor_writer_stack_item si[2];
} _cw;
struct cbor_writer *cw = cbor_writer_init(&_cw.w, 2, s->tbuf, s->tbsize);
CBOR_PUT_MAP(cw)
{
cbor_put_int(cw, 0);
cbor_put_null(cw);
}
sk_send(s, cw->data.pos - cw->data.start);
struct container_operation_callback *ccc = mb_alloc(s->pool, sizeof *ccc);
*ccc = (struct container_operation_callback) {
.s = s,
.data = s->data,
};
callback_init(&ccc->cb, container_stopped, s->loop);
crt->ccc = ccc;
s->err_paused = crt_err;
s->data = crt;
sk_pause_rx(s->loop, s);
birdloop_leave(hcf.loop);
}
struct ccs_parser_context {
struct cbor_parser_context *ctx;
u64 bytes_consumed;
u64 major_state;
};
#define CBOR_PARSER_ERROR bug
static struct ccs_parser_context ccx_, *ccx = &ccx_;
static void
hcf_parse(byte *buf, int size)
{
ASSERT_DIE(size > 0);
struct cbor_parser_context *ctx = ccx->ctx;
for (int pos = 0; pos < size; pos++)
{
switch (cbor_parse_byte(ctx, buf[pos]))
{
case CPR_ERROR:
bug("CBOR parser failure: %s", ctx->error);
case CPR_MORE:
continue;
case CPR_MAJOR:
/* Check type acceptance */
switch (ccx->major_state)
{
case 0: /* toplevel */
if (ctx->type != 5)
CBOR_PARSER_ERROR("Expected mapping, got %u", ctx->type);
ccf = (struct container_config) {};
ccx->major_state = 1;
break;
case 1: /* inside toplevel mapping */
if (ctx->type != 0)
CBOR_PARSER_ERROR("Expected integer, got %u", ctx->type);
if (ctx->value >= 3)
CBOR_PARSER_ERROR("Mapping key too high, got %lu", ctx->value);
ccx->major_state = ctx->value + 2;
break;
case 2: /* machine hostname */
if (ctx->type != 3)
CBOR_PARSER_ERROR("Expected string, got %u", ctx->type);
if (ctx->tflags & CPT_VARLEN)
CBOR_PARSER_ERROR("Variable length string not supported yet");
if (ccf.hostname)
CBOR_PARSER_ERROR("Duplicate argument 0 / hostname");
ASSERT_DIE(!ctx->target_buf);
ccf.hostname = ctx->target_buf = lp_alloc(ctx->lp, ctx->value + 1);
ctx->target_len = ctx->value;
break;
case 3: /* basedir */
if (ctx->type != 3)
CBOR_PARSER_ERROR("Expected string, got %u", ctx->type);
if (ctx->tflags & CPT_VARLEN)
CBOR_PARSER_ERROR("Variable length string not supported yet");
if (ccf.workdir)
CBOR_PARSER_ERROR("Duplicate argument 1 / basedir");
ASSERT_DIE(!ctx->target_buf);
ccf.basedir = ctx->target_buf = lp_alloc(ctx->lp, ctx->value + 1);
ctx->target_len = ctx->value;
break;
case 4: /* workdir */
if (ctx->type != 3)
CBOR_PARSER_ERROR("Expected string, got %u", ctx->type);
if (ctx->tflags & CPT_VARLEN)
CBOR_PARSER_ERROR("Variable length string not supported yet");
if (ccf.workdir)
CBOR_PARSER_ERROR("Duplicate argument 2 / workdir");
ASSERT_DIE(!ctx->target_buf);
ccf.workdir = ctx->target_buf = lp_alloc(ctx->lp, ctx->value + 1);
ctx->target_len = ctx->value;
break;
default:
bug("invalid parser state");
}
break;
case CPR_STR_END:
/* Bytes read completely! */
switch (ccx->major_state)
{
case 2:
case 3:
case 4:
ccx->major_state = 1;
break;
default:
bug("Unexpected state to end a (byte)string in");
/* Code to run at the end of a (byte)string */
}
break;
}
/* End of array or map */
while (cbor_parse_block_end(ctx))
{
switch (ccx->major_state)
{
/* Code to run at the end of the mapping */
case 0: /* toplevel item ended */
/* Reinit the parser */
ccx->major_state = 0;
ccx->bytes_consumed = 0;
cbor_parser_reset(ccx->ctx);
if (size > pos + 1)
hcf_parse(buf + pos + 1, size - pos - 1);
return;
case 1: /* the mapping ended */
if (!ccf.hostname)
CBOR_PARSER_ERROR("Missing hostname");
if (!ccf.workdir)
CBOR_PARSER_ERROR("Missing workdir");
if (!ccf.basedir)
CBOR_PARSER_ERROR("Missing basedir");
container_start();
ccx->major_state = 0;
break;
default:
bug("Unexpected state to end a mapping in");
}
}
}
ccx->bytes_consumed += size;
}
void
hypervisor_container_fork(void)
{
int e, *fds = hcf.ctl;
/* create socketpair before forking to do communication */
e = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, fds);
if (e < 0)
die("Failed to create internal socketpair: %m");
e = fork();
if (e < 0)
die("Failed to fork container forker: %m");
if (e)
{
/* parent side */
hcf.loop = birdloop_new(&root_pool, DOMAIN_ORDER(proto), 0, "Container forker");
birdloop_enter(hcf.loop);
hcf.p = rp_new(birdloop_pool(hcf.loop), birdloop_domain(hcf.loop), "Container forker pool");
hcf.s = sk_new(hcf.p);
hcf.s->type = SK_MAGIC;
/* Set the hooks and fds according to the side we are at */
hcf.s->rx_hook = hypervisor_container_forker_rx;
hcf.s->err_hook = hypervisor_container_forker_err;
sk_set_tbsize(hcf.s, 16384);
sk_set_rbsize(hcf.s, 128);
hcf.s->fd = fds[0];
close(fds[1]);
HASH_INIT(hcf.hash, hcf.p, 6);
if (sk_open(hcf.s, hcf.loop) < 0)
bug("Container forker parent: sk_open failed");
hcf.s->type = SK_UNIX_MSG;
birdloop_leave(hcf.loop);
return;
}
/* noreturn child side */
close(fds[0]);
hexp_cleanup_after_fork();
container_forker_fd = fds[1];
this_thread_id |= 0xf000;
/* initialize the forker */
ccx->ctx = cbor_parser_new(&root_pool, 2);
while (true)
{
byte buf[4096];
ssize_t rx = read(fds[1], buf, sizeof buf);
times_update();
if (rx == 0)
{
log(L_INFO "Container forker socket closed, exiting");
exit(0);
}
if (rx < 0)
bug("Container forker child: failed to read: %m");
hcf_parse(buf, rx);
}
}