mirror of
https://gitlab.nic.cz/labs/bird.git
synced 2025-01-03 07:31:54 +00:00
1214 lines
28 KiB
C
1214 lines
28 KiB
C
#include "flock/flock.h"
|
|
|
|
#include "lib/birdlib.h"
|
|
#include "lib/cbor.h"
|
|
#include "lib/io-loop.h"
|
|
#include "lib/hash.h"
|
|
|
|
#include <dirent.h>
|
|
#include <poll.h>
|
|
#include <sched.h>
|
|
#include <stdlib.h>
|
|
#include <sys/mount.h>
|
|
#include <sys/socket.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/sysmacros.h>
|
|
#include <sys/types.h>
|
|
#include <sys/un.h>
|
|
#include <sys/wait.h>
|
|
#include <unistd.h>
|
|
|
|
static struct hypervisor_container_forker {
|
|
sock *s;
|
|
pool *p;
|
|
CBOR_STREAM_EMBED(stream, 4);
|
|
struct birdloop *loop;
|
|
HASH(struct container_runtime) hash;
|
|
struct container_runtime *cur_crt;
|
|
int ctl[2]; /* socketpair filedescriptors */
|
|
} hcf;
|
|
|
|
struct container_fork_request {
|
|
CBOR_CHANNEL_EMBED(cch, 4);
|
|
struct cbor_channel *ctl_ch;
|
|
struct container_runtime *crt;
|
|
int reply_state;
|
|
};
|
|
|
|
struct container_runtime {
|
|
struct container_runtime *next;
|
|
uint hash;
|
|
pid_t pid;
|
|
sock *s;
|
|
CBOR_STREAM_EMBED(stream, 4);
|
|
char hostname[];
|
|
};
|
|
|
|
#define CBOR_PARSER_ERROR FAIL
|
|
|
|
#define CRT_KEY(c) c->hostname, c->hash
|
|
#define CRT_NEXT(c) c->next
|
|
#define CRT_EQ(a,h,b,i) ((h) == (i)) && (!strcmp(a,b))
|
|
#define CRT_FN(a,h) h
|
|
|
|
static sig_atomic_t poweroff, zombie;
|
|
|
|
static void
|
|
container_poweroff_sighandler(int signo)
|
|
{
|
|
poweroff = signo;
|
|
}
|
|
|
|
static void
|
|
container_child_sighandler(int signo UNUSED)
|
|
{
|
|
zombie = 1;
|
|
}
|
|
|
|
static void
|
|
container_poweroff(int fd, int sig)
|
|
{
|
|
struct {
|
|
struct cbor_writer w;
|
|
struct cbor_writer_stack_item si[2];
|
|
byte buf[128];
|
|
} _cw;
|
|
|
|
struct cbor_writer *cw = cbor_writer_init(&_cw.w, 2, _cw.buf, sizeof _cw.buf);
|
|
CBOR_PUT_MAP(cw)
|
|
{
|
|
cbor_put_int(cw, -4);
|
|
cbor_put_int(cw, sig);
|
|
}
|
|
ASSERT_DIE(cbor_writer_done(cw) == 1);
|
|
s64 sz = cw->data.pos - cw->data.start;
|
|
ASSERT_DIE(write(fd, cw->data.start, sz) == sz);
|
|
|
|
unlink("/dev/log");
|
|
}
|
|
|
|
static void
|
|
container_zombie(void)
|
|
{
|
|
zombie = 0;
|
|
log(L_INFO "Zombie elimination routine invoked.");
|
|
while (1) {
|
|
int status;
|
|
pid_t p = waitpid(-1, &status, WNOHANG);
|
|
|
|
if (p < 0)
|
|
{
|
|
if (errno != ECHILD)
|
|
log(L_ERR "Zombie elimination failed: %m");
|
|
return;
|
|
}
|
|
|
|
if (p == 0)
|
|
return;
|
|
|
|
const char *coreinfo = WCOREDUMP(status) ? " (core dumped)" : "";
|
|
|
|
if (WIFEXITED(status))
|
|
log(L_INFO "Process %d ended with status %d%s", p, WEXITSTATUS(status), coreinfo);
|
|
else if (WIFSIGNALED(status))
|
|
log(L_INFO "Process %d exited by signal %d (%s)%s", p, WTERMSIG(status), strsignal(WTERMSIG(status)), coreinfo);
|
|
else
|
|
log(L_ERR "Process %d exited with a strange status %d", p, status);
|
|
}
|
|
}
|
|
|
|
//#define SYSCALL(x, ...) ({ int _e = x(__VA_ARGS__); if (_e < 0) die("Failed to run %s at %s:%d: %m", #x, __FILE__, __LINE__); else log(L_TRACE "OK %s at %s:%d", #x, __FILE__, __LINE__); _e; })
|
|
#define SYSCALL(x, ...) ({ int _e = x(__VA_ARGS__); if (_e < 0) die("Failed to run %s at %s:%d: %m", #x, __FILE__, __LINE__); _e; })
|
|
|
|
#define RUN(...) do { \
|
|
pid_t pid = fork(); \
|
|
if (pid) waitpid(pid, NULL, 0); \
|
|
else { execlp(__VA_ARGS__, NULL); bug("exec %s failed: %m", #__VA_ARGS__); } \
|
|
} while (0)
|
|
|
|
|
|
static int
|
|
container_getdir(char *path)
|
|
{
|
|
int e = open(path, O_DIRECTORY | O_PATH | O_RDWR);
|
|
if ((e >= 0) || (errno != ENOENT))
|
|
return e;
|
|
|
|
/* Split the path */
|
|
char *sl = strrchr(path, '/');
|
|
char *name = sl+1;
|
|
|
|
if (sl == path)
|
|
path = "/";
|
|
else
|
|
{
|
|
while (sl && sl[1] == 0)
|
|
{
|
|
/* Trailing slash removal */
|
|
sl[0] = 0;
|
|
sl = strrchr(path, '/');
|
|
}
|
|
|
|
if (!sl)
|
|
bug("Getdir failed, empty sl");
|
|
}
|
|
|
|
/* Open the parent directory */
|
|
*sl = 0;
|
|
int fd = container_getdir(path);
|
|
if (fd < 0)
|
|
return fd;
|
|
|
|
for (uint i=0; i<256; i++)
|
|
{
|
|
e = mkdirat(fd, name, 0755);
|
|
if ((e < 0) && (errno != EEXIST))
|
|
{
|
|
close(fd);
|
|
return e;
|
|
}
|
|
|
|
e = openat(fd, name, O_DIRECTORY | O_PATH | O_RDWR);
|
|
if ((e >= 0) || (errno != ENOENT))
|
|
{
|
|
close(fd);
|
|
return e;
|
|
}
|
|
}
|
|
|
|
die("Somebody is messing with the filesystem too badly.");
|
|
}
|
|
|
|
static void
|
|
copylink(const char *src, int sz, const char *dst)
|
|
{
|
|
char *contents = alloca(sz + 1);
|
|
int xsz = SYSCALL(readlink, src, contents, sz);
|
|
contents[xsz] = 0;
|
|
// log(L_INFO "symlinking device %s -> %s", dst, contents);
|
|
int se = symlink(contents, dst);
|
|
if (se < 0)
|
|
// die("failed to symlink %s: %m", dst);
|
|
log(L_ERR "failed to symlink %s: %m", dst);
|
|
}
|
|
|
|
#define GETDIR(_path) ({ char *path = _path; int fd = container_getdir(path); if (fd < 0) die("Failed to get the directory %s: %m", path); fd; })
|
|
#define MKDIR(_path) close(GETDIR(tmp_strdup(_path)))
|
|
|
|
struct container_logger {
|
|
struct birdloop *loop;
|
|
pool *p;
|
|
sock *rs;
|
|
sock *ws;
|
|
};
|
|
|
|
static int
|
|
container_logger_rx(sock *sk, uint sz)
|
|
{
|
|
struct container_logger *clg = sk->data;
|
|
if (clg->ws->tpos + sz >= clg->ws->tbuf + clg->ws->tbsize)
|
|
log(L_INFO "dropping a log message");
|
|
|
|
memcpy(clg->ws->tpos, sk->rbuf, sz);
|
|
clg->ws->tpos[sz] = '\n';
|
|
|
|
if (clg->ws->tpos == clg->ws->tbuf)
|
|
sk_send(clg->ws, sz + 1);
|
|
else
|
|
clg->ws->tpos += sz + 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
container_logger_rerr(sock *sk UNUSED, int err)
|
|
{
|
|
if (!err)
|
|
bug("what");
|
|
|
|
die("Logger receiver socket closed unexpectedly: %s", strerror(err));
|
|
}
|
|
|
|
static void
|
|
container_logger_werr(sock *sk UNUSED, int err)
|
|
{
|
|
die("Logger writer closed unexpectedly: %s", strerror(err));
|
|
}
|
|
|
|
static void
|
|
container_init_logger(void)
|
|
{
|
|
struct birdloop *loop = birdloop_new(&root_pool, DOMAIN_ORDER(proto), 0, "Logger");
|
|
birdloop_enter(loop);
|
|
pool *p = rp_new(birdloop_pool(loop), birdloop_domain(loop), "Logger pool");
|
|
sock *s = sk_new(p);
|
|
s->type = SK_MAGIC;
|
|
s->rx_hook = container_logger_rx;
|
|
s->err_hook = container_logger_rerr;
|
|
sk_set_rbsize(s, 16384);
|
|
|
|
unlink("/dev/log");
|
|
s->fd = SYSCALL(socket, AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0);
|
|
union {
|
|
struct sockaddr sa;
|
|
struct sockaddr_un un;
|
|
} sa;
|
|
sa.un.sun_family = AF_UNIX;
|
|
strcpy(sa.un.sun_path, "/dev/log");
|
|
SYSCALL(bind, s->fd, &sa.sa, sizeof sa.un);
|
|
|
|
struct container_logger *clg = mb_allocz(p, sizeof *clg);
|
|
clg->loop = loop;
|
|
clg->p = p;
|
|
clg->rs = s;
|
|
s->data = clg;
|
|
|
|
if (sk_open(clg->rs, clg->loop) < 0)
|
|
bug("Logger failed in sk_open(r): %m");
|
|
|
|
clg->rs->type = SK_UDP;
|
|
|
|
s = clg->ws = sk_new(p);
|
|
s->data = clg;
|
|
s->type = SK_MAGIC;
|
|
s->err_hook = container_logger_werr;
|
|
sk_set_tbsize(s, 16384);
|
|
|
|
MKDIR("/var/log");
|
|
s->fd = SYSCALL(open, "/var/log/syslog", O_WRONLY | O_CREAT, 0640);
|
|
|
|
if (sk_open(clg->ws, clg->loop) < 0)
|
|
bug("Logger failed in sk_open(w): %m");
|
|
|
|
s->type = SK_UNIX;
|
|
|
|
birdloop_leave(loop);
|
|
}
|
|
|
|
static void
|
|
container_mainloop(int fd, struct flock_machine_container_config *ccf)
|
|
{
|
|
log(L_INFO "container mainloop with fd %d", fd);
|
|
|
|
signal(SIGTERM, container_poweroff_sighandler);
|
|
signal(SIGINT, container_poweroff_sighandler);
|
|
signal(SIGCHLD, container_child_sighandler);
|
|
|
|
/* Move to the workdir */
|
|
linpool *lp = lp_new(&root_pool);
|
|
|
|
if (strchr(ccf->basedir, ',') ||
|
|
strchr(ccf->basedir, '=') ||
|
|
strchr(ccf->basedir, '\\'))
|
|
die("Refusing to work with paths containing chars: ,=\\");
|
|
|
|
int wfd = GETDIR(lp_sprintf(lp, "%s%s", ccf->workdir[0] == '/' ? "" : "./", ccf->workdir));
|
|
SYSCALL(fchdir, wfd);
|
|
close(wfd); wfd = -1;
|
|
|
|
close(GETDIR(lp_strdup(lp, "./upper")));
|
|
close(GETDIR(lp_strdup(lp, "./tmp")));
|
|
close(GETDIR(lp_strdup(lp, "./root")));
|
|
|
|
bool cloneroot = !strcmp(ccf->basedir, "/");
|
|
bool clonedev = cloneroot;
|
|
if (cloneroot)
|
|
{
|
|
ccf->basedir = "./lower";
|
|
close(GETDIR(lp_strdup(lp, "./lower")));
|
|
}
|
|
|
|
const char *overlay_mount_options = lp_sprintf(lp, "lowerdir=%s,upperdir=%s,workdir=%s",
|
|
ccf->basedir, "./upper", "./tmp");
|
|
SYSCALL(mount, "overlay", "./root", "overlay", 0, overlay_mount_options);
|
|
|
|
if (cloneroot)
|
|
{
|
|
#define BINDMOUNT(path) do { \
|
|
struct stat s; \
|
|
SYSCALL(lstat, "/" #path, &s); \
|
|
switch (s.st_mode & S_IFMT) { \
|
|
case S_IFLNK: \
|
|
copylink("/" #path, s.st_size, "./lower/" #path); \
|
|
break; \
|
|
case S_IFDIR: \
|
|
close(GETDIR(lp_strdup(lp, "./lower/" #path))); \
|
|
SYSCALL(mount, "/" #path, "./root/" #path, NULL, MS_BIND | MS_REC, NULL); \
|
|
break; \
|
|
} \
|
|
} while (0)
|
|
BINDMOUNT(bin);
|
|
BINDMOUNT(etc);
|
|
BINDMOUNT(lib);
|
|
BINDMOUNT(lib32);
|
|
BINDMOUNT(lib64);
|
|
BINDMOUNT(libx32);
|
|
BINDMOUNT(sbin);
|
|
BINDMOUNT(usr);
|
|
|
|
close(GETDIR(lp_strdup(lp, "./lower/dev/pts")));
|
|
symlink("/dev/pts/ptmx", "./lower/dev/ptmx");
|
|
|
|
DIR *x = opendir("/dev");
|
|
for (struct dirent *e; e = readdir(x); )
|
|
{
|
|
if (!strcmp(e->d_name, ".")
|
|
|| !strcmp(e->d_name, "..")
|
|
|| !strcmp(e->d_name, "ptmx")
|
|
|| !strcmp(e->d_name, "log")
|
|
)
|
|
continue;
|
|
|
|
const char *path = lp_sprintf(lp, "./lower/dev/%s", e->d_name);
|
|
const char *mpnt = lp_sprintf(lp, "./root/dev/%s", e->d_name);
|
|
const char *orig = lp_sprintf(lp, "/dev/%s", e->d_name);
|
|
|
|
struct stat s;
|
|
SYSCALL(lstat, orig, &s);
|
|
if (!(s.st_mode & S_IRWXO))
|
|
{
|
|
// log(L_INFO "ignoring unusable device %s", e->d_name);
|
|
continue;
|
|
}
|
|
|
|
switch (s.st_mode & S_IFMT)
|
|
{
|
|
case S_IFSOCK:
|
|
case S_IFIFO:
|
|
case S_IFCHR:
|
|
case S_IFBLK:
|
|
case S_IFREG:
|
|
// log(L_INFO "bindmounting device %s", e->d_name);
|
|
SYSCALL(close, SYSCALL(open, path, O_WRONLY | O_CREAT, 0666));
|
|
int me = mount(orig, mpnt, NULL, MS_BIND, NULL);
|
|
if (me < 0)
|
|
log(L_ERR "failed to bindmount %s to %s: %m", orig, mpnt);
|
|
|
|
break;
|
|
|
|
case S_IFLNK:
|
|
copylink(orig, s.st_size, path);
|
|
break;
|
|
|
|
default:
|
|
// log(L_INFO "ignoring device %s", e->d_name);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
MKDIR("./lower/proc");
|
|
MKDIR("./lower/sys");
|
|
MKDIR("./lower/run");
|
|
MKDIR("./lower/tmp");
|
|
|
|
SYSCALL(chroot, "./root");
|
|
SYSCALL(chdir, "/");
|
|
|
|
/* Remounting proc to reflect the new PID namespace */
|
|
SYSCALL(mount, "proc", "/proc", "proc", MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);
|
|
SYSCALL(mount, "sysfs", "/sys", "sysfs", MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);
|
|
SYSCALL(mount, "tmpfs", "/run", "tmpfs", MS_NOSUID | MS_NODEV, NULL);
|
|
SYSCALL(mount, "tmpfs", "/tmp", "tmpfs", MS_NOSUID | MS_NODEV, NULL);
|
|
SYSCALL(mount, "devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, "ptmxmode=600");
|
|
|
|
container_init_logger();
|
|
|
|
/* Run worker threads */
|
|
struct thread_config tc = {};
|
|
bird_thread_commit(&tc);
|
|
|
|
while (1)
|
|
{
|
|
struct pollfd pfd = {
|
|
.fd = fd,
|
|
.events = POLLIN,
|
|
};
|
|
|
|
sigset_t newmask;
|
|
sigemptyset(&newmask);
|
|
|
|
int res = ppoll(&pfd, 1, NULL, &newmask);
|
|
|
|
if ((res < 0) && (errno != EINTR))
|
|
log(L_INFO "ppoll returned -1: %m");
|
|
|
|
if (poweroff)
|
|
{
|
|
container_poweroff(fd, poweroff);
|
|
exit(0);
|
|
}
|
|
|
|
if (zombie)
|
|
container_zombie();
|
|
|
|
if (pfd.revents & POLLIN)
|
|
{
|
|
int sfd = -1;
|
|
byte buf[128], cbuf[CMSG_SPACE(sizeof sfd)];
|
|
struct iovec v = {
|
|
.iov_base = buf,
|
|
.iov_len = sizeof buf,
|
|
};
|
|
struct msghdr m = {
|
|
.msg_iov = &v,
|
|
.msg_iovlen = 1,
|
|
.msg_control = &cbuf,
|
|
.msg_controllen = sizeof cbuf,
|
|
};
|
|
|
|
int sz = recvmsg(fd, &m, 0);
|
|
if (sz < 0)
|
|
{
|
|
log(L_ERR "error reading data from control socket: %m");
|
|
exit(1);
|
|
}
|
|
|
|
if (sz == 0)
|
|
{
|
|
log(L_INFO "control socket closing, shutdown");
|
|
exit(0);
|
|
}
|
|
|
|
ASSERT_DIE(sz >= 3);
|
|
ASSERT_DIE(buf[0] == 0xa1);
|
|
switch (buf[1]) {
|
|
case 0:
|
|
ASSERT_DIE(buf[2] == 0xf6);
|
|
container_poweroff(fd, 0);
|
|
exit(0);
|
|
break;
|
|
|
|
case 0x21:
|
|
ASSERT_DIE(buf[2] == 0xf6);
|
|
struct cmsghdr *c = CMSG_FIRSTHDR(&m);
|
|
memcpy(&sfd, CMSG_DATA(c), sizeof sfd);
|
|
|
|
int e = fork();
|
|
if (e < 0) bug("Cannot fork: %m");
|
|
if (e == 0) {
|
|
int fd = accept(sfd, NULL, 0);
|
|
if (fd < 0)
|
|
{
|
|
log(L_ERR "failed to accept telnet connection: %m");
|
|
exit(1);
|
|
}
|
|
log(L_INFO "telnet connected");
|
|
|
|
close(0);
|
|
close(1);
|
|
// close(2);
|
|
|
|
dup2(fd, 0);
|
|
dup2(fd, 1);
|
|
|
|
/* Unblock signals */
|
|
sigset_t newmask;
|
|
sigemptyset(&newmask);
|
|
sigprocmask(SIG_SETMASK, &newmask, NULL);
|
|
|
|
/* Exec the telnet */
|
|
|
|
// e = execl("/usr/bin/strace", "strace", "-o", "/xxx", "-ff", "telnetd", "-E", "/bin/bash", NULL);
|
|
e = execl("/usr/sbin/telnetd", "telnetd", "-E", "/bin/bash", NULL);
|
|
log(L_ERR "failed to execl telnet: %m");
|
|
exit(42);
|
|
}
|
|
close(sfd);
|
|
break;
|
|
|
|
default:
|
|
log(L_ERR "unknown command on control socket: %d", buf[1]);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
struct container_fork_request_child {
|
|
CBOR_CHANNEL_EMBED(cch, 4);
|
|
struct flock_machine_container_config ccf;
|
|
u64 major_state;
|
|
};
|
|
|
|
static uint container_counter = 0;
|
|
|
|
static void
|
|
container_start(struct container_fork_request_child *ccx)
|
|
{
|
|
struct flock_machine_container_config *ccf = &ccx->ccf;
|
|
|
|
log(L_INFO "Requested to start a container, name %s, base %s, work %s",
|
|
ccf->cf.name, ccf->basedir, ccf->workdir);
|
|
|
|
pid_t pid = fork();
|
|
if (pid < 0)
|
|
die("Failed to fork container (parent): %m");
|
|
|
|
if (pid)
|
|
{
|
|
log(L_INFO "Forked container parent pid %d", pid);
|
|
container_counter++;
|
|
int status;
|
|
pid_t pp = waitpid(pid, &status, 0);
|
|
|
|
if (pp < 0)
|
|
die("Failed to waitpid %d: %m");
|
|
|
|
if (pp != pid)
|
|
die("Waited pid %d instead of %d, wtf", pp, pid);
|
|
|
|
const char *coreinfo = WCOREDUMP(status) ? " (core dumped)" : "";
|
|
|
|
if (WIFEXITED(status))
|
|
log(L_INFO "Process %d ended with status %d%s", pp, WEXITSTATUS(status), coreinfo);
|
|
else if (WIFSIGNALED(status))
|
|
log(L_INFO "Process %d exited by signal %d (%s)%s", pp, WTERMSIG(status), strsignal(WTERMSIG(status)), coreinfo);
|
|
else
|
|
log(L_ERR "Process %d exited with a strange status %d", pp, status);
|
|
|
|
return;
|
|
}
|
|
|
|
int e = unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWTIME | CLONE_NEWNET);
|
|
if (e < 0)
|
|
die("Failed to unshare container: %m");
|
|
|
|
/* Mask signals for forking and other fragile stuff */
|
|
sigset_t oldmask;
|
|
sigset_t newmask;
|
|
sigemptyset(&newmask);
|
|
#define KILLABLE_SIGNALS SIGINT, SIGTERM, SIGHUP, SIGQUIT
|
|
#define FROB(x) sigaddset(&newmask, x);
|
|
MACRO_FOREACH(FROB, KILLABLE_SIGNALS);
|
|
#undef FROB
|
|
sigprocmask(SIG_BLOCK, &newmask, &oldmask);
|
|
|
|
/* create socketpair before forking to do communication */
|
|
int fds[2];
|
|
e = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, fds);
|
|
if (e < 0)
|
|
die("Failed to create internal socketpair: %m");
|
|
|
|
log("container fork socketpair: %d %d", fds[0], fds[1]);
|
|
|
|
pid = fork();
|
|
if (pid < 0)
|
|
die("Failed to fork container (child): %m");
|
|
|
|
if (!pid)
|
|
{
|
|
/* Cleanup in control sockets */
|
|
close(hcf.ctl[1]);
|
|
close(fds[0]);
|
|
|
|
ASSERT_DIE(container_counter < 0x6000);
|
|
this_thread_id -= (container_counter << 1) + 0x3000 ;
|
|
container_mainloop(fds[1], ccf); /* this never returns */
|
|
bug("container_mainloop has returned");
|
|
}
|
|
|
|
close(fds[1]);
|
|
|
|
log(L_INFO "Sending socket");
|
|
|
|
ccx->cch.stream->s->txfd = fds[0];
|
|
CBOR_REPLY(&ccx->cch, cw)
|
|
CBOR_PUT_MAP(cw)
|
|
{
|
|
cbor_put_int(cw, -2);
|
|
cbor_put_int(cw, pid);
|
|
}
|
|
|
|
log(L_INFO "Socket sent");
|
|
exit(0);
|
|
}
|
|
|
|
/* The Parent */
|
|
|
|
static struct container_runtime *
|
|
container_find_by_name(const char *name)
|
|
{
|
|
uint h = mem_hash(name, strlen(name));
|
|
return HASH_FIND(hcf.hash, CRT, name, h);
|
|
}
|
|
|
|
struct cbor_channel *
|
|
container_get_channel(const char *name)
|
|
{
|
|
struct container_runtime *crt = container_find_by_name(name);
|
|
return crt ? cbor_channel_new(&crt->stream) : NULL;
|
|
}
|
|
|
|
static void
|
|
container_cleanup(struct container_runtime *crt)
|
|
{
|
|
HASH_REMOVE(hcf.hash, CRT, crt);
|
|
sk_close(crt->s);
|
|
mb_free(crt);
|
|
}
|
|
|
|
struct container_ctl_msg {
|
|
CBOR_CHANNEL_EMBED(cch, 4);
|
|
struct cbor_channel *ctl_ch;
|
|
int msg_state;
|
|
int down_signal;
|
|
};
|
|
|
|
static void
|
|
container_ctl_cancel(struct cbor_stream *stream)
|
|
{
|
|
SKIP_BACK_DECLARE(struct container_runtime, crt, stream, stream);
|
|
HASH_REMOVE(hcf.hash, CRT, crt);
|
|
mb_free(crt);
|
|
}
|
|
|
|
static enum cbor_parse_result
|
|
container_ctl_parse(struct cbor_channel *cch, enum cbor_parse_result res)
|
|
{
|
|
SKIP_BACK_DECLARE(struct container_ctl_msg, ccc, cch, cch);
|
|
SKIP_BACK_DECLARE(struct container_runtime, crt, stream, cch->stream);
|
|
struct cbor_parser_context *ctx = &crt->stream.parser;
|
|
|
|
#define FAIL(...) do { log(L_ERR "Container ctl parse: " __VA_ARGS__); return CPR_ERROR; } while (0)
|
|
|
|
switch (res)
|
|
{
|
|
case CPR_MAJOR:
|
|
switch (ccc->msg_state)
|
|
{
|
|
case 0:
|
|
if ((ctx->type != CBOR_MAP) || (ctx->value != 1))
|
|
FAIL("Expected map of size 1, got %d-%d", ctx->type, ctx->value);
|
|
|
|
ccc->msg_state = 1;
|
|
return CPR_MORE;
|
|
|
|
case 1:
|
|
if ((ctx->type != CBOR_NEGINT) || (ctx->value != 3))
|
|
FAIL("Expected key -4, got %d-%d", ctx->type, ctx->value);
|
|
|
|
ccc->msg_state = 2;
|
|
return CPR_MORE;
|
|
|
|
case 2:
|
|
CBOR_PARSE_ONLY(ctx, POSINT, ccc->down_signal);
|
|
ccc->msg_state = 3;
|
|
return CPR_MORE;
|
|
|
|
default:
|
|
FAIL("Input overflow to state %d", ccc->msg_state);
|
|
}
|
|
bug("Overrun switch");
|
|
|
|
case CPR_STR_END:
|
|
FAIL("Unexpected string end");
|
|
|
|
case CPR_BLOCK_END:
|
|
switch (ccc->msg_state) {
|
|
case 3:
|
|
ccc->msg_state = 4;
|
|
break;
|
|
|
|
default:
|
|
FAIL("Unexpected block end in state %d", ccc->msg_state);
|
|
}
|
|
break;
|
|
|
|
case CPR_ERROR:
|
|
case CPR_MORE:
|
|
FAIL("Invalid input");
|
|
}
|
|
|
|
log(L_INFO "container %s ended by signal %d", crt->hostname, ccc->down_signal);
|
|
container_cleanup(crt);
|
|
|
|
#undef FAIL
|
|
return CPR_BLOCK_END;
|
|
}
|
|
|
|
static enum cbor_parse_result
|
|
container_fork_request_reply(struct cbor_channel *cch, enum cbor_parse_result res)
|
|
{
|
|
ASSERT_DIE(cch->stream == &hcf.stream);
|
|
|
|
SKIP_BACK_DECLARE(struct container_fork_request, cfr, cch, cch);
|
|
struct container_runtime *crt = cfr->crt;
|
|
struct cbor_parser_context *ctx = &cch->stream->parser;
|
|
|
|
#define FAIL(...) do { log(L_ERR "Container fork request reply: " __VA_ARGS__); return CPR_ERROR; } while (0)
|
|
|
|
switch (res)
|
|
{
|
|
case CPR_MAJOR:
|
|
switch (cfr->reply_state) {
|
|
case 0:
|
|
if ((ctx->type != CBOR_MAP) || (ctx->value != 1))
|
|
FAIL("Expected map of size 1, got %d-%d", ctx->type, ctx->value);
|
|
|
|
cfr->reply_state = 1;
|
|
return CPR_MORE;
|
|
|
|
case 1:
|
|
if ((ctx->type != CBOR_NEGINT) || (ctx->value != 1))
|
|
FAIL("Expected key -2, got %d-%d", ctx->type, ctx->value);
|
|
|
|
cfr->reply_state = 2;
|
|
return CPR_MORE;
|
|
|
|
case 2:
|
|
CBOR_PARSE_ONLY(ctx, POSINT, crt->pid);
|
|
cfr->reply_state = 3;
|
|
return CPR_MORE;
|
|
|
|
default:
|
|
FAIL("Input overflow to state %d", cfr->reply_state);
|
|
}
|
|
bug("Overrun switch");
|
|
|
|
case CPR_STR_END:
|
|
FAIL("Unexpected string end");
|
|
|
|
case CPR_BLOCK_END:
|
|
switch (cfr->reply_state) {
|
|
case 3:
|
|
cfr->reply_state = 4;
|
|
return CPR_MORE;
|
|
|
|
case 4:
|
|
cfr->reply_state = 5;
|
|
break;
|
|
|
|
default:
|
|
FAIL("Unexpected block end in state %d", cfr->reply_state);
|
|
}
|
|
break;
|
|
|
|
case CPR_ERROR:
|
|
case CPR_MORE:
|
|
FAIL("Invalid input");
|
|
}
|
|
|
|
log(L_INFO "Machine started with PID %d", crt->pid);
|
|
|
|
if (hcf.s->rxfd < 0)
|
|
FAIL("No control socket of the new machine");
|
|
|
|
ASSERT_DIE(birdloop_inside(hcf.loop));
|
|
|
|
sock *skl = sk_new(hcf.p);
|
|
skl->type = SK_MAGIC;
|
|
|
|
skl->fd = hcf.s->rxfd;
|
|
hcf.s->rxfd = -1;
|
|
|
|
if (sk_open(skl, hcf.loop) < 0)
|
|
bug("Machine control socket: sk_open failed");
|
|
|
|
sk_set_tbsize(skl, 1024);
|
|
skl->type = SK_UNIX_MSG;
|
|
|
|
crt->s = skl;
|
|
cbor_stream_attach(&crt->stream, skl);
|
|
|
|
CBOR_REPLY(cfr->ctl_ch, cw)
|
|
CBOR_PUT_MAP(cw)
|
|
{
|
|
cbor_put_int(cw, -1);
|
|
cbor_put_string(cw, "OK");
|
|
}
|
|
|
|
cbor_channel_done(&cfr->cch);
|
|
|
|
return CPR_BLOCK_END;
|
|
#undef FAIL
|
|
}
|
|
|
|
void
|
|
hypervisor_container_start(struct cbor_channel *cch, struct flock_machine_container_config *ccf)
|
|
{
|
|
birdloop_enter(hcf.loop);
|
|
|
|
#define FAIL(id, msg) do { \
|
|
CBOR_REPLY(cch, cw) CBOR_PUT_MAP(cw) { \
|
|
cbor_put_int(cw, id); cbor_put_string(cw, msg);\
|
|
} cbor_channel_done(cch); \
|
|
birdloop_leave(hcf.loop); \
|
|
return; } while (0)
|
|
|
|
if (!ccf->cf.name)
|
|
FAIL(-101, "Machine name not specified");
|
|
|
|
if (!ccf->workdir)
|
|
FAIL(-102, "Machine workdir not specified");
|
|
|
|
if (!ccf->basedir)
|
|
FAIL(-103, "Machine basedir not specified");
|
|
|
|
const char *name = ccf->cf.name;
|
|
uint h = mem_hash(name, strlen(name));
|
|
struct container_runtime *crt = HASH_FIND(hcf.hash, CRT, name, h);
|
|
if (crt)
|
|
FAIL(-127, "Container already exists");
|
|
|
|
uint nlen = strlen(name);
|
|
crt = mb_allocz(hcf.p, sizeof *crt + nlen + 1);
|
|
crt->hash = h;
|
|
memcpy(crt->hostname, name, nlen + 1);
|
|
|
|
HASH_INSERT(hcf.hash, CRT, crt);
|
|
|
|
/* Create a new channel atop the forker stream */
|
|
log(L_INFO "requesting machine creation, name %s", name);
|
|
SKIP_BACK_DECLARE(struct container_fork_request, cfr, cch, cbor_channel_new(&hcf.stream));
|
|
cfr->ctl_ch = cch;
|
|
cfr->crt = crt;
|
|
cfr->cch.parse = container_fork_request_reply;
|
|
|
|
crt->stream.parse = container_ctl_parse;
|
|
crt->stream.cancel = container_ctl_cancel;
|
|
|
|
CBOR_STREAM_INIT(crt, stream, cch, hcf.p, struct container_ctl_msg);
|
|
|
|
CBOR_REPLY(&cfr->cch, cw)
|
|
CBOR_PUT_MAP(cw)
|
|
{
|
|
cbor_put_int(cw, 0);
|
|
cbor_put_string(cw, name);
|
|
cbor_put_int(cw, 1);
|
|
cbor_put_string(cw, ccf->basedir);
|
|
cbor_put_int(cw, 2);
|
|
cbor_put_string(cw, ccf->workdir);
|
|
}
|
|
|
|
#undef FAIL
|
|
birdloop_leave(hcf.loop);
|
|
}
|
|
|
|
static enum cbor_parse_result
|
|
container_stopped(struct cbor_channel *cch, enum cbor_parse_result res)
|
|
{
|
|
SKIP_BACK_DECLARE(struct container_ctl_msg, ccc, cch, cch);
|
|
SKIP_BACK_DECLARE(struct container_runtime, crt, stream, cch->stream);
|
|
struct cbor_parser_context *ctx = &crt->stream.parser;
|
|
|
|
#define FAIL(...) do { log(L_ERR "Container stopped parse: " __VA_ARGS__); return CPR_ERROR; } while (0)
|
|
|
|
switch (res)
|
|
{
|
|
case CPR_MAJOR:
|
|
switch (ccc->msg_state)
|
|
{
|
|
case 0:
|
|
if ((ctx->type != CBOR_MAP) || (ctx->value != 1))
|
|
FAIL("Expected map of size 1, got %d-%d", ctx->type, ctx->value);
|
|
|
|
ccc->msg_state = 1;
|
|
return CPR_MORE;
|
|
|
|
case 1:
|
|
if ((ctx->type != CBOR_NEGINT) || (ctx->value != 3))
|
|
FAIL("Expected key -4, got %d-%d", ctx->type, ctx->value);
|
|
|
|
ccc->msg_state = 2;
|
|
return CPR_MORE;
|
|
|
|
case 2:
|
|
CBOR_PARSE_ONLY(ctx, POSINT, ccc->down_signal);
|
|
ccc->msg_state = 3;
|
|
return CPR_MORE;
|
|
|
|
default:
|
|
FAIL("Input overflow to state %d", ccc->msg_state);
|
|
}
|
|
bug("Overrun switch");
|
|
|
|
case CPR_STR_END:
|
|
FAIL("Unexpected string end");
|
|
|
|
case CPR_BLOCK_END:
|
|
switch (ccc->msg_state) {
|
|
case 3:
|
|
ccc->msg_state = 4;
|
|
break;
|
|
|
|
default:
|
|
FAIL("Unexpected block end in state %d", ccc->msg_state);
|
|
}
|
|
break;
|
|
|
|
case CPR_ERROR:
|
|
case CPR_MORE:
|
|
FAIL("Invalid input");
|
|
}
|
|
|
|
CBOR_REPLY(ccc->ctl_ch, cw)
|
|
CBOR_PUT_MAP(cw)
|
|
{
|
|
cbor_put_int(cw, -1);
|
|
cbor_put_string(cw, "OK");
|
|
}
|
|
|
|
cbor_channel_done(&ccc->cch);
|
|
return CPR_BLOCK_END;
|
|
#undef FAIL
|
|
}
|
|
|
|
void
|
|
hypervisor_container_shutdown(struct cbor_channel *cch, struct flock_machine_container_config *ccf)
|
|
{
|
|
birdloop_enter(hcf.loop);
|
|
|
|
const char *name = ccf->cf.name;
|
|
uint h = mem_hash(name, strlen(name));
|
|
struct container_runtime *crt = HASH_FIND(hcf.hash, CRT, name, h);
|
|
|
|
if (!crt || !crt->s)
|
|
{
|
|
CBOR_REPLY(cch, cw)
|
|
CBOR_PUT_MAP(cw)
|
|
{
|
|
cbor_put_int(cw, -127);
|
|
cbor_put_string(cw, "BAD: Not found");
|
|
}
|
|
|
|
cbor_channel_done(cch);
|
|
birdloop_leave(hcf.loop);
|
|
return;
|
|
}
|
|
|
|
SKIP_BACK_DECLARE(struct container_ctl_msg, ccr, cch, cbor_channel_new(&crt->stream));
|
|
CBOR_REPLY(&ccr->cch, cw)
|
|
CBOR_PUT_MAP(cw)
|
|
{
|
|
cbor_put_int(cw, 0);
|
|
cbor_put_null(cw);
|
|
}
|
|
|
|
ccr->cch.parse = container_stopped;
|
|
ccr->ctl_ch = cch;
|
|
|
|
birdloop_leave(hcf.loop);
|
|
}
|
|
|
|
#undef CBOR_PARSER_ERROR
|
|
#define CBOR_PARSER_ERROR bug
|
|
|
|
static enum cbor_parse_result
|
|
hcf_parse(struct cbor_channel *cch, enum cbor_parse_result res)
|
|
{
|
|
SKIP_BACK_DECLARE(struct container_fork_request_child, ccx, cch, cch);
|
|
struct cbor_parser_context *ctx = &cch->stream->parser;
|
|
struct flock_machine_container_config *ccf = &ccx->ccf;
|
|
|
|
switch (res)
|
|
{
|
|
case CPR_ERROR:
|
|
case CPR_MORE:
|
|
CBOR_PARSER_ERROR("Invalid input");
|
|
|
|
case CPR_MAJOR:
|
|
/* Check type acceptance */
|
|
switch (ccx->major_state)
|
|
{
|
|
case 0: /* toplevel */
|
|
if (ctx->type != 5)
|
|
CBOR_PARSER_ERROR("Expected mapping, got %u", ctx->type);
|
|
|
|
ccx->major_state = 1;
|
|
break;
|
|
|
|
case 1: /* inside toplevel mapping */
|
|
if (ctx->type != 0)
|
|
CBOR_PARSER_ERROR("Expected integer, got %u", ctx->type);
|
|
|
|
if (ctx->value >= 3)
|
|
CBOR_PARSER_ERROR("Mapping key too high, got %lu", ctx->value);
|
|
|
|
ccx->major_state = ctx->value + 2;
|
|
break;
|
|
|
|
case 2: /* machine hostname */
|
|
if (ctx->type != 3)
|
|
CBOR_PARSER_ERROR("Expected string, got %u", ctx->type);
|
|
|
|
if (ctx->tflags & CPT_VARLEN)
|
|
CBOR_PARSER_ERROR("Variable length string not supported yet");
|
|
|
|
if (ccf->cf.name)
|
|
CBOR_PARSER_ERROR("Duplicate argument 0 / hostname");
|
|
|
|
ASSERT_DIE(!ctx->target_buf);
|
|
ccf->cf.name = ctx->target_buf = lp_alloc(ctx->lp, ctx->value + 1);
|
|
ctx->target_len = ctx->value;
|
|
break;
|
|
|
|
case 3: /* basedir */
|
|
if (ctx->type != 3)
|
|
CBOR_PARSER_ERROR("Expected string, got %u", ctx->type);
|
|
|
|
if (ctx->tflags & CPT_VARLEN)
|
|
CBOR_PARSER_ERROR("Variable length string not supported yet");
|
|
|
|
if (ccf->workdir)
|
|
CBOR_PARSER_ERROR("Duplicate argument 1 / basedir");
|
|
|
|
ASSERT_DIE(!ctx->target_buf);
|
|
ccf->basedir = ctx->target_buf = lp_alloc(ctx->lp, ctx->value + 1);
|
|
ctx->target_len = ctx->value;
|
|
break;
|
|
|
|
case 4: /* workdir */
|
|
if (ctx->type != 3)
|
|
CBOR_PARSER_ERROR("Expected string, got %u", ctx->type);
|
|
|
|
if (ctx->tflags & CPT_VARLEN)
|
|
CBOR_PARSER_ERROR("Variable length string not supported yet");
|
|
|
|
if (ccf->workdir)
|
|
CBOR_PARSER_ERROR("Duplicate argument 2 / workdir");
|
|
|
|
ASSERT_DIE(!ctx->target_buf);
|
|
ccf->workdir = ctx->target_buf = lp_alloc(ctx->lp, ctx->value + 1);
|
|
ctx->target_len = ctx->value;
|
|
break;
|
|
|
|
default:
|
|
bug("invalid parser state");
|
|
}
|
|
break;
|
|
|
|
case CPR_STR_END:
|
|
/* Bytes read completely! */
|
|
switch (ccx->major_state)
|
|
{
|
|
case 2:
|
|
case 3:
|
|
case 4:
|
|
ccx->major_state = 1;
|
|
break;
|
|
|
|
default:
|
|
bug("Unexpected state to end a (byte)string in");
|
|
/* Code to run at the end of a (byte)string */
|
|
}
|
|
break;
|
|
|
|
case CPR_BLOCK_END:
|
|
switch (ccx->major_state)
|
|
{
|
|
case 0:
|
|
return CPR_BLOCK_END;
|
|
|
|
case 1: /* the mapping ended */
|
|
if (!ccf->cf.name)
|
|
CBOR_PARSER_ERROR("Missing hostname");
|
|
|
|
if (!ccf->workdir)
|
|
CBOR_PARSER_ERROR("Missing workdir");
|
|
|
|
if (!ccf->basedir)
|
|
CBOR_PARSER_ERROR("Missing basedir");
|
|
|
|
container_start(ccx);
|
|
|
|
ccx->major_state = 0;
|
|
break;
|
|
|
|
default:
|
|
bug("Unexpected state to end a mapping in");
|
|
}
|
|
}
|
|
|
|
return CPR_MORE;
|
|
}
|
|
|
|
static void
|
|
hcf_cancel(struct cbor_stream *stream UNUSED)
|
|
{
|
|
bug("Forker child stream cancelled");
|
|
}
|
|
|
|
void
|
|
hypervisor_container_fork(void)
|
|
{
|
|
int e, *fds = hcf.ctl;
|
|
|
|
/* create socketpair before forking to do communication */
|
|
e = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, fds);
|
|
if (e < 0)
|
|
die("Failed to create internal socketpair: %m");
|
|
|
|
e = fork();
|
|
if (e < 0)
|
|
die("Failed to fork container forker: %m");
|
|
|
|
if (e)
|
|
{
|
|
/* parent side */
|
|
hcf.loop = birdloop_new(&root_pool, DOMAIN_ORDER(proto), 0, "Container forker");
|
|
|
|
birdloop_enter(hcf.loop);
|
|
hcf.p = rp_new(birdloop_pool(hcf.loop), birdloop_domain(hcf.loop), "Container forker pool");
|
|
hcf.s = sk_new(hcf.p);
|
|
hcf.s->type = SK_MAGIC;
|
|
hcf.s->flags |= SKF_FD_RX;
|
|
/* Set the hooks and fds according to the side we are at */
|
|
sk_set_tbsize(hcf.s, 16384);
|
|
sk_set_rbsize(hcf.s, 128);
|
|
hcf.s->fd = fds[0];
|
|
close(fds[1]);
|
|
|
|
HASH_INIT(hcf.hash, hcf.p, 6);
|
|
|
|
if (sk_open(hcf.s, hcf.loop) < 0)
|
|
bug("Container forker parent: sk_open failed");
|
|
|
|
hcf.s->type = SK_UNIX_MSG;
|
|
hcf.stream.parse = container_fork_request_reply;
|
|
CBOR_STREAM_INIT(&hcf, stream, cch, hcf.p, struct container_fork_request);
|
|
cbor_stream_attach(&hcf.stream, hcf.s);
|
|
|
|
birdloop_leave(hcf.loop);
|
|
return;
|
|
}
|
|
|
|
/* noreturn child side */
|
|
close(fds[0]);
|
|
hexp_cleanup_after_fork();
|
|
this_thread_id |= 0xf000;
|
|
|
|
/* initialize the forker */
|
|
sock *sk = sk_new(&root_pool);
|
|
sk->type = SK_UNIX_MSG;
|
|
sk->fd = fds[1];
|
|
sk->flags |= SKF_FD_TX;
|
|
sk->rbuf = sk->rpos = alloca(sk->rbsize = 4096);
|
|
sk->tbuf = sk->tpos = alloca(sk->tbsize = 64);
|
|
|
|
CBOR_STREAM_EMBED(s, 4) stream;
|
|
CBOR_STREAM_INIT(&stream, s, cch, &root_pool, struct container_fork_request_child);
|
|
cbor_stream_attach(&stream.s, sk);
|
|
stream.s.parse = hcf_parse;
|
|
stream.s.cancel = hcf_cancel;
|
|
|
|
while (true)
|
|
{
|
|
ssize_t rx = read(fds[1], sk->rpos, sk->rbsize);
|
|
times_update();
|
|
|
|
if (rx == 0)
|
|
{
|
|
log(L_INFO "Container forker socket closed, exiting");
|
|
exit(0);
|
|
}
|
|
|
|
if (rx < 0)
|
|
bug("Container forker child: failed to read: %m");
|
|
|
|
sk->rpos += rx;
|
|
if (sk->rx_hook(sk, rx))
|
|
sk->rpos = sk->rbuf;
|
|
}
|
|
}
|