0
0
mirror of https://gitlab.nic.cz/labs/bird.git synced 2024-12-22 09:41:54 +00:00

Flock: Creating the hypervisor and the external-contact process

This is the first part of rewriting Flock in C to significantly reduce the
memory footprint of individual machines from 20+M in Python to (goal)
less than 1M. Now the process eats ~460k and I suspect that this won't
even be the consumption per machine in total as fork() is involved and
some parts of the memory will be heavily shared.
This commit is contained in:
Maria Matejka 2024-08-29 07:53:47 +02:00
parent 8d5fcfc6e8
commit 01bfa5ebf3
5 changed files with 328 additions and 1 deletions

View File

@ -78,7 +78,7 @@ cli: $(client)
$(daemon): LIBS += $(DAEMON_LIBS) $(daemon): LIBS += $(DAEMON_LIBS)
# Include directories # Include directories
dirs := client conf doc filter lib nest test $(addprefix proto/,$(protocols)) @sysdep_dirs@ dirs := client conf doc filter flock lib nest test $(addprefix proto/,$(protocols)) @sysdep_dirs@
# conf/Makefile declarations needed for all other modules # conf/Makefile declarations needed for all other modules
conf-lex-targets := $(addprefix $(objdir)/conf/,cf-lex.o) conf-lex-targets := $(addprefix $(objdir)/conf/,cf-lex.o)

12
flock/Makefile Normal file
View File

@ -0,0 +1,12 @@
# Sources of the flock simulator; obj is presumably the matching object list
# derived by the common build machinery (src-o-files is defined elsewhere).
src := flock.c hypervisor.c
obj := $(src-o-files)
# The resulting executable
flock=$(exedir)/flock-sim
# Prerequisites and target-specific libraries of the executable
$(flock): $(obj)
$(flock): $(common-lib)
$(flock): LIBS += $(COMMON_LIBS)
# Link step: the $(common-lib) prerequisite is not passed to the linker itself,
# it is substituted by the contents of that file (read by cat).
$(flock):
$(E)echo LD $(LDFLAGS) -o $@ $^ $(LIBS)
$(Q)$(CC) $(LDFLAGS) -o $@ $(patsubst $(common-lib),$(shell cat $(common-lib)),$^) $(LIBS)

209
flock/flock.c Normal file
View File

@ -0,0 +1,209 @@
#include "flock/flock.h"
#include "lib/string.h"
#include "lib/timer.h"
#include "sysdep/unix/unix.h"
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
/* Overall configuration: the one and only instance, declared extern in
 * flock/flock.h and filled in by main() from the command line. */
struct flock_config flock_config;
/**
* Signal handling
*
* We want to behave as the init process inside the newly created PID namespace,
* which means that the signals have different meanings than for other processes.
* For more information, see pid_namespaces(7).
*/
/*
 * Bitmask of requests raised by the signal handlers and consumed by the
 * main loop. It is shared between a handler and normal code, therefore it
 * has to be a volatile sig_atomic_t; without volatile the compiler is
 * allowed to keep a stale copy in a register.
 */
static volatile sig_atomic_t signal_received;

/* Individual request bits stored in signal_received */
#define SIGREQ_REBOOT 1
#define SIGREQ_POWEROFF 2
#define SIGREQ_FAIL 4
/* Handler installed for SIGHUP by main(): it only marks the reboot request,
 * the actual reaction happens in the main loop. */
static void
hypervisor_reboot_sighandler(int signo UNUSED)
{
  signal_received = signal_received | SIGREQ_REBOOT;
}
/* Handler installed for SIGTERM and SIGINT by main(): it only marks the
 * poweroff request, the main loop then terminates the process. */
static void
hypervisor_poweroff_sighandler(int signo UNUSED)
{
  signal_received = signal_received | SIGREQ_POWEROFF;
}
/*
 * Handler installed for SIGQUIT by main(). It never returns to the
 * interrupted code: the fail flag is set, a forked copy of the process
 * resets SIGABRT to its default action and abort()s (presumably to leave
 * a core dump of the current state), and the original process waits for
 * that copy, if the fork succeeded, and exits with status 1.
 */
static void
hypervisor_fail_sighandler(int signo UNUSED)
{
  signal_received |= SIGREQ_FAIL;

  int dumper = fork();

  if (!dumper)
  {
    /* Forked copy: die by SIGABRT with the default disposition */
    signal(SIGABRT, SIG_DFL);
    abort();
  }
  else if (dumper > 0)
  {
    /* Original process: let the copy finish first */
    waitpid(dumper, NULL, 0);
  }

  /* Reached also when fork() failed */
  _exit(1);
}
/*
* The Main.
*
* Bootstrapping and all the fiddling around before anything can actually
* be really executed.
*/
/*
 * SYSCALL(fn, args...) calls fn(args...) and evaluates to its int result.
 * When the result is negative, the program is terminated by die() with
 * the function name, source location and the errno text (%m).
 * Implemented as a GNU statement expression, so it can be used as a value.
 */
#define SYSCALL(x, ...) ({ int e = x(__VA_ARGS__); if (e < 0) die("Failed to run %s at %s:%d: %m", #x, __FILE__, __LINE__); e; })
/* Signals which main() blocks during the fragile setup and handles afterwards */
#define KILLABLE_SIGNALS SIGINT, SIGTERM, SIGHUP, SIGQUIT
/* Print the command-line synopsis to the stream @f; the program name is
 * taken from flock_config.exec_name. */
static inline void
usage(FILE *f)
{
  const char *progname = flock_config.exec_name;

  fprintf(f,
      "Usage: %s name\n\n"
      "Runs hypervisor with the given name.\n",
      progname);
}
/*
 * Write exactly @len bytes of @data into the already existing file @file
 * and close it again. Any failure is fatal; @data is expected to be
 * a NUL-terminated string as it is printed in the error message.
 */
static void
flock_write_once(const char *file, const char *data, size_t len)
{
  int fd = SYSCALL(open, file, O_WRONLY);

  ssize_t written = write(fd, data, len);
  if (written < 0)
    die("Failed to write %s to %s: %m", data, file);

  /* A short write leaves no meaningful errno behind */
  if ((size_t) written != len)
    die("Failed to write %s to %s", data, file);

  close(fd);
}

/*
 * Entry point of the hypervisor.
 *
 * Expects exactly one non-option argument, the hypervisor name; anything
 * else prints the usage and returns 2. The process then unshares the
 * PID, mount and user namespaces and forks; the original process returns 0
 * right away, the forked one maps itself to local root, remounts /proc,
 * spawns the external-contact process, unshares time and network
 * namespaces and waits for signals until a poweroff is requested.
 */
int
main(int argc, char **argv, char **argh UNUSED)
{
  /* Prepare necessary infrastructure */
  the_bird_lock();
  times_update();
  resource_init();
  random_init();
  birdloop_init();
  boot_time = current_time();

  log_switch(1, NULL, NULL);

  /* Parse args */
  flock_config.exec_name = argv[0] ?: "flock-sim";
  int opt;
  while ((opt = getopt(argc, argv, "")) != -1)
  {
    /* TODO: add some options */
    usage(stderr);
    return 2;
  }

  /* Get hypervisor name */
  if (optind != argc - 1)
  {
    usage(stderr);
    return 2;
  }

  flock_config.hypervisor_name = argv[optind];

  /* Mask signals for forking and other fragile stuff */
  sigset_t oldmask;
  sigset_t newmask;
  sigemptyset(&newmask);
#define FROB(x) sigaddset(&newmask, x);
  MACRO_FOREACH(FROB, KILLABLE_SIGNALS);
#undef FROB
  sigprocmask(SIG_BLOCK, &newmask, &oldmask);

  /* Keep the original UID/GIDs */
  uid_t euid = geteuid();
  gid_t egid = getegid();

  /* First we need to create the PID + mount + user namespace to acquire capabilities */
  SYSCALL(unshare, CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWUSER);

  /* Then we have to fork() to become PID 1 of the new PID namespace */
  pid_t init_pid = fork();
  if (init_pid < 0)
    die("Failed to become init: %m");

  /* The parent process may end now
   * TODO: allow wait() and/or writing PIDfile
   * instead of just ending */
  if (init_pid > 0)
    return 0;

  /* We also need to fix some UID/GID mappings to become local root.
   * TODO: this will need an upgrade for full-scale containers. */
  {
    char fixer[256];

    /* The IDs are unsigned, hence %u */
    int len = bsnprintf(fixer, sizeof fixer, "0 %u 1", euid);
    flock_write_once("/proc/self/uid_map", fixer, len);

    /* Only the word itself is written, not its terminating NUL */
    flock_write_once("/proc/self/setgroups", "deny", sizeof "deny" - 1);

    len = bsnprintf(fixer, sizeof fixer, "0 %u 1", egid);
    flock_write_once("/proc/self/gid_map", fixer, len);
  }

  /* Remounting proc to reflect the new PID namespace */
  SYSCALL(mount, "none", "/", NULL, MS_REC | MS_PRIVATE, NULL);
  SYSCALL(mount, "proc", "/proc", "proc", MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);

  /* Now we are init but in the original network namespace,
   * let's spawn a child to do external communication before unsharing */
  hypervisor_exposed_fork();

  /* And now finally we can go for unsharing the rest -- networks and time */
  SYSCALL(unshare, CLONE_NEWTIME | CLONE_NEWNET);

  /* Set signal handlers as this process is init in its PID namespace */
  signal(SIGTERM, hypervisor_poweroff_sighandler);
  signal(SIGINT, hypervisor_poweroff_sighandler);
  signal(SIGHUP, hypervisor_reboot_sighandler);
  signal(SIGQUIT, hypervisor_fail_sighandler);

  /* The signals intentionally stay blocked here. They get delivered only
   * inside sigsuspend() below, so that no request can slip in between
   * checking the flags and going to sleep, and the flags can be read and
   * cleared without being interrupted by a handler. */

  /* Check limits */
  struct rlimit corelimit;
  if (getrlimit(RLIMIT_CORE, &corelimit) < 0)
    log(L_WARN "Failed to get core limit: %m");
  else
    log(L_INFO "Core limit %lu %lu",
	(unsigned long) corelimit.rlim_cur,
	(unsigned long) corelimit.rlim_max);

  /* Wait for Godot */
  log(L_INFO "Hypervisor running");
  while (1)
  {
    /* Atomically restore the original mask and sleep until a handler ran;
     * on return, the blocking mask is in effect again */
    sigsuspend(&oldmask);

    uint s = signal_received;
    signal_received &= ~s;

    if (s & SIGREQ_FAIL)
      bug("Fail flag should never propagate from signal");
    else if (s & SIGREQ_POWEROFF)
      return 0;
    else if (s & SIGREQ_REBOOT)
      log(L_ERR "Reboot requested but not implemented");
  }
}

16
flock/flock.h Normal file
View File

@ -0,0 +1,16 @@
/* Defined ahead of every include of this header; presumably needed for the
 * Linux-specific calls used in flock.c (unshare() and the CLONE_* flags). */
#define _GNU_SOURCE
#ifndef INCLUDE_FLOCK_H
#define INCLUDE_FLOCK_H
#include "lib/birdlib.h"
/* Forks the external-contact process; implemented in flock/hypervisor.c.
 * Returns only in the calling (parent) process. */
void hypervisor_exposed_fork(void);
/* Global configuration filled in from the command line */
struct flock_config {
/* Name of the hypervisor, the only positional argument */
const char *hypervisor_name;
/* argv[0], or "flock-sim" if there is none; used in the usage message */
const char *exec_name;
};
/* The only instance, defined in flock/flock.c */
extern struct flock_config flock_config;
#endif

90
flock/hypervisor.c Normal file
View File

@ -0,0 +1,90 @@
#include "lib/birdlib.h"
#include "lib/resource.h"
#include "lib/io-loop.h"
#include <sys/socket.h>
/* Local communication structure */
/* State of the link between the hypervisor and its exposed process;
 * both processes keep their own copy set up by hypervisor_exposed_fork(). */
struct hypervisor_exposed {
  pool *p;			/* Pool the socket is allocated from */
  sock *s;			/* This side's end of the socketpair */
  struct birdloop *loop;	/* Loop the socket is opened in */
};

static struct hypervisor_exposed he;
/**
* Exposed process' parent side (requestor)
**/
/*
 * RX hook of the parent's end of the interlink socket.
 *
 * There is no protocol defined yet, so whatever is pending gets read and
 * thrown away. The previous recvmsg() call with a NULL message header
 * could only fail and never consumed anything from the socket.
 *
 * Always returns 0.
 * NOTE(review): EOF (recv() returning 0) is not acted upon here; confirm
 * how the socket should be shut down once the peer disappears.
 */
static int
hypervisor_exposed_parent_rx(sock *sk, uint size UNUSED)
{
  log(L_INFO "HV EP RX");

  char discard[256];
  ssize_t got = recv(sk->fd, discard, sizeof discard, 0);
  if (got < 0)
    log(L_ERR "HV EP RX: recv failed: %m");

  return 0;
}
/* Error hook of the parent's end of the interlink socket.
 * Intentionally empty for now: both arguments are ignored.
 * NOTE(review): errors on the socket are silently dropped here. */
static void
hypervisor_exposed_parent_err(sock *sk UNUSED, int e UNUSED)
{
}
/**
* Exposed process' child side (executor)
**/
/*
 * RX hook of the child's (exposed process') end of the interlink socket.
 *
 * There is no protocol defined yet, so whatever is pending gets read and
 * thrown away. The previous recvmsg() call with a NULL message header
 * could only fail and never consumed anything from the socket.
 *
 * Always returns 0.
 * NOTE(review): EOF (recv() returning 0) is not acted upon here; confirm
 * how the exposed process should end once the hypervisor disappears.
 */
static int
hypervisor_exposed_child_rx(sock *sk, uint size UNUSED)
{
  log(L_INFO "HV EC RX");

  char discard[256];
  ssize_t got = recv(sk->fd, discard, sizeof discard, 0);
  if (got < 0)
    log(L_ERR "HV EC RX: recv failed: %m");

  return 0;
}
/* Error hook of the child's end of the interlink socket.
 * Intentionally empty for now: both arguments are ignored.
 * NOTE(review): errors on the socket are silently dropped here. */
static void
hypervisor_exposed_child_err(sock *sk UNUSED, int e UNUSED)
{
}
/**
* Common init code
*/
void
hypervisor_exposed_fork(void)
{
int fds[2], e;
/* create socketpair before forking to do communication */
e = socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
if (e < 0)
die("Failed to create internal socketpair: %m");
e = fork();
if (e < 0)
die("Failed to fork exposed: %m");
/* Create the communication channel (both sides at once) */
he.loop = birdloop_new(&root_pool, DOMAIN_ORDER(proto), 0, "Exposed interlink");
birdloop_enter(he.loop);
he.p = rp_new(birdloop_pool(he.loop), birdloop_domain(he.loop), "Exposed interlink pool");
he.s = sk_new(he.p);
he.s->type = SK_MAGIC;
he.s->rx_hook = e ? hypervisor_exposed_parent_rx : hypervisor_exposed_child_rx;
he.s->err_hook = e ? hypervisor_exposed_parent_err : hypervisor_exposed_child_err;
he.s->fd = fds[!!e];
close(fds[!e]);
if (sk_open(he.s, he.loop) < 0)
bug("Exposed parent: sk_open failed");
birdloop_leave(he.loop);
/* Now there is a loop both in child and parent, prepared to read the socket.
* There is only one difference. Whereas the parent has to continue its run
* to do other duties, the child is stuck here forever. */
if (e)
return;
/* Child-only */
while (1)
pause();
}