0
0
mirror of https://gitlab.nic.cz/labs/bird.git synced 2024-12-22 17:51:53 +00:00
bird/flock/flock.c
2024-10-05 15:54:15 +02:00

320 lines
8.0 KiB
C

#include "flock/flock.h"
#include "lib/obstacle.h"
#include "lib/runtime.h"
#include "lib/string.h"
#include "lib/timer.h"
#include "sysdep/unix/unix.h"
#include "sysdep/unix/io-loop.h"
#include <errno.h>
#include <fcntl.h>
#include <linux/mount.h>
#include <poll.h>
#include <sched.h>
#include <signal.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
/* Overall configuration. main() fills in the executable name, the hypervisor
 * name and the control socket path from the command line. */
struct flock_config flock_config;
/**
* Shutdown routines
*/
/* Events queued in this list are run by poweroff_event_hook()
 * once a shutdown is requested. */
event_list shutdown_event_list;
/* Its obstacle target is initialized in main() and bound
 * to shutdown_done_callback. */
struct shutdown_placeholder shutdown_placeholder;
/**
 * reboot_event_hook - hook of reboot_event
 * @data: unused
 *
 * Reboot is not implemented; the request is only reported in the log.
 */
static void
reboot_event_hook(void *data UNUSED)
{
  log(L_ERR "Reboot requested but not implemented");
}
/**
 * poweroff_event_hook - hook of poweroff_event
 * @data: unused
 *
 * Announces the shutdown in the log and then runs everything
 * queued in shutdown_event_list.
 */
static void
poweroff_event_hook(void *data UNUSED)
{
  log(L_INFO "Shutdown requested.");

  /* Kick off all registered shutdown events */
  ev_run_list(&shutdown_event_list);
}
/**
 * child_event_hook - hook of child_event, reaps terminated children
 * @data: unused
 *
 * Repeatedly calls waitpid() in non-blocking mode for any child and logs
 * how each reaped child ended. The loop finishes when no child has changed
 * state, when there are no children left, or when waitpid() fails.
 */
static void
child_event_hook(void *data UNUSED)
{
  log(L_INFO "Zombie elimination routine invoked.");

  while (1)
  {
    int status;
    pid_t p = waitpid(-1, &status, WNOHANG);

    if (p < 0)
    {
      /* Having no children at all is a regular end of reaping, not a failure */
      if (errno == ECHILD)
        return;

      log(L_ERR "Zombie elimination failed: %m");
      return;
    }

    /* Children exist but none of them has changed state */
    if (p == 0)
      return;

    if (WIFEXITED(status))
      log(L_INFO "Process %d ended with status %d", p, WEXITSTATUS(status));
    else if (WIFSIGNALED(status))
    {
      /* WCOREDUMP() is meaningful only for children killed by a signal */
      const char *coreinfo = WCOREDUMP(status) ? " (core dumped)" : "";
      log(L_INFO "Process %d exited by signal %d (%s)%s", p, WTERMSIG(status), strsignal(WTERMSIG(status)), coreinfo);
    }
    else
      log(L_ERR "Process %d exited with a strange status %d", p, status);
  }
}
/* Events sent to the main loop by the signal handlers below;
 * each one just dispatches to its hook defined above. */
event reboot_event = { .hook = reboot_event_hook },
poweroff_event = { .hook = poweroff_event_hook },
child_event = { .hook = child_event_hook };
/* Initialized in main() to call shutdown_done() and used there
 * as the callback of the shutdown obstacle target. */
callback shutdown_done_callback;
/**
 * shutdown_done - final step of the shutdown sequence
 * @c: unused
 *
 * Logs that the shutdown has finished and terminates the process
 * with exit status 0. Never returns.
 */
static void
shutdown_done(callback *c UNUSED)
{
  log(L_INFO "Shutdown finished.");
  exit(EXIT_SUCCESS);
}
/**
* Signal handling
*
* We want to behave as the init process inside the newly created PID namespace,
* which means that the signals have different meanings than for other processes.
* For more information, see pid_namespaces(7).
*/
/**
 * hypervisor_reboot_sighandler - signal handler requesting a reboot
 * @signo: unused
 *
 * Does no work itself; it only sends reboot_event to the main loop.
 * Installed for SIGHUP in main().
 */
static void
hypervisor_reboot_sighandler(int signo UNUSED)
{
  ev_send_loop(&main_birdloop, &reboot_event);
}
/**
 * hypervisor_poweroff_sighandler - signal handler requesting a shutdown
 * @signo: unused
 *
 * Does no work itself; it only sends poweroff_event to the main loop.
 * Installed for SIGTERM and SIGINT in main().
 */
static void
hypervisor_poweroff_sighandler(int signo UNUSED)
{
  ev_send_loop(&main_birdloop, &poweroff_event);
}
/**
 * hypervisor_fail_sighandler - signal handler for a forced failure
 * @signo: unused
 *
 * Forks a child which restores the default SIGABRT disposition and aborts.
 * The parent waits for that child (if the fork succeeded) and then leaves
 * immediately with exit status 1. Installed for SIGQUIT in main().
 */
static void
hypervisor_fail_sighandler(int signo UNUSED)
{
  int aborter = fork();

  if (!aborter)
  {
    /* Child: abort with the default SIGABRT action */
    signal(SIGABRT, SIG_DFL);
    abort();
  }
  else if (aborter > 0)
  {
    /* Parent: let the child finish before leaving */
    waitpid(aborter, NULL, 0);
  }

  /* Reached by the parent, also when fork() failed */
  _exit(1);
}
/**
 * hypervisor_child_sighandler - signal handler for child state changes
 * @signo: unused
 *
 * Does no work itself; it only sends child_event to the main loop,
 * where child_event_hook() does the reaping. Installed for SIGCHLD in main().
 */
static void
hypervisor_child_sighandler(int signo UNUSED)
{
  ev_send_loop(&main_birdloop, &child_event);
}
/*
* The Main.
*
* Bootstrapping and all the fiddling around before anything can actually
* be really executed.
*/
/* Call x(...) and evaluate to its int result. When the result is negative,
 * die() is called with the function name, source file and line, and %m.
 * Written as a ({ ... }) statement expression. */
#define SYSCALL(x, ...) ({ int e = x(__VA_ARGS__); if (e < 0) die("Failed to run %s at %s:%d: %m", #x, __FILE__, __LINE__); e; })
/* Signals which main() blocks while setting up namespaces and forking,
 * and unblocks after the signal handlers are installed. */
#define KILLABLE_SIGNALS SIGINT, SIGTERM, SIGHUP, SIGQUIT
/**
 * usage - print the command-line help
 * @f: stream to print to
 *
 * The program name shown in the synopsis is flock_config.exec_name.
 */
static inline void
usage(FILE *f)
{
  const char *prog = flock_config.exec_name;

  fprintf(f,
      "Usage: %s [options] name\n"
      "\n"
      "Runs Flock hypervisor with the given name.\n"
      "\n"
      "Options:\n"
      "\t-s <path>\topen control socket at this path\n"
      "\t-l \tshortcut for -s ./<name>.ctl\n"
      "\n",
      prog);
}
/**
 * main - entry point of the Flock hypervisor
 * @argc: number of command-line arguments
 * @argv: command-line arguments; options -s <path> or -l, then the hypervisor name
 * @argh: unused
 *
 * Sets up the BIRD runtime (locks, resources, loops, event lists, logging),
 * parses the command line, and then, with the killable signals blocked,
 * unshares PID, mount, user and time namespaces and forks. The parent
 * returns right away; the child writes its UID/GID maps, remounts /proc,
 * forks the helper processes, unshares the network namespace, opens the
 * control socket, installs the signal handlers, starts the worker threads
 * and enters the main loop.
 *
 * Returns 2 on a command-line error and 0 in the original (parent) process
 * after a successful fork. Failures of the setup syscalls end in die().
 */
int
main(int argc, char **argv, char **argh UNUSED)
{
  /* Prepare necessary infrastructure */
  the_bird_lock();
  times_update();
  resource_init();
  random_init();
  birdloop_init();

  /* Take a copy of the current global runtime, override
   * the allocator settings and switch to the modified copy */
  struct global_runtime gr = *atomic_load_explicit(&global_runtime, memory_order_relaxed);
  gr.alloc.keep_mem_max_global = BIRD_ALIGN(65536, page_size);
  gr.alloc.keep_mem_max_local = BIRD_ALIGN(16384, page_size);
  gr.alloc.at_once = BIRD_ALIGN(4096, page_size);
  // gr.latency_debug = ~0;
  switch_runtime(&gr);

  ev_init_list(&global_event_list, &main_birdloop, "Global event list");
  ev_init_list(&global_work_list, &main_birdloop, "Global work list");
  ev_init_list(&main_birdloop.event_list, &main_birdloop, "Global fast event list");

  /* Shutdown hooks */
  ev_init_list(&shutdown_event_list, &main_birdloop, "Shutdown event list");
  callback_init(&shutdown_done_callback, shutdown_done, &main_birdloop);
  obstacle_target_init(
      &shutdown_placeholder.obstacles,
      &shutdown_done_callback, &root_pool, "Shutdown");

  boot_time = current_time();
  log_switch(1, NULL, NULL);

  /* Find the original UID/GIDs */
  uid_t euid = geteuid();
  gid_t egid = getegid();

  /* Parse args */
  flock_config.exec_name = argv[0] ?: "flock-sim";
  int opt;
  bool csp_local = 0;
  while ((opt = getopt(argc, argv, "ls:")) != -1)
  {
    switch (opt)
    {
      case 'l':
	csp_local = 1;
	break;

      case 's':
	/* An explicit path overrides any previous -l */
	csp_local = 0;
	flock_config.control_socket_path = mb_strdup(&root_pool, optarg);
	break;

      default:
	usage(stderr);
	return 2;
    }
  }

  /* Get hypervisor name: exactly one positional argument is required */
  if (optind != argc - 1)
  {
    usage(stderr);
    return 2;
  }

  flock_config.hypervisor_name = argv[optind];

  /* Fix the control socket path if -l was given */
  if (csp_local)
    flock_config.control_socket_path = mb_sprintf(&root_pool, "./%s.ctl", flock_config.hypervisor_name);
  else if (!flock_config.control_socket_path)
  {
    /* Terminate the line so that the usage text starts on its own line */
    fprintf(stderr, "No control socket path given, use -s or -l.\n");
    usage(stderr);
    return 2;
  }

  /* Mask signals for forking and other fragile stuff */
  sigset_t oldmask;
  sigset_t newmask;
  sigemptyset(&newmask);
#define FROB(x) sigaddset(&newmask, x);
  MACRO_FOREACH(FROB, KILLABLE_SIGNALS);
#undef FROB
  sigprocmask(SIG_BLOCK, &newmask, &oldmask);

  /* First we need to create the PID + mount + user namespace to acquire capabilities,
   * and also time namespace for good measure */
  SYSCALL(unshare, CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWTIME);

  /* Then we have to fork() to become PID 1 of the new PID namespace */
  pid_t init_pid = fork();
  if (init_pid < 0)
    die("Failed to become init: %m");

  /* The parent process may end now
   * TODO: allow wait() and/or writing PIDfile
   * instead of just ending */
  if (init_pid > 0)
    return 0;

  /* We also need to fix some UID/GID mappings to become local root.
   * TODO: this will need an upgrade for full-scale containers. */
#define WRITE_ONCE(file, data, len) do { \
  int fd = SYSCALL(open, file, O_WRONLY); \
  int e = write(fd, data, len); \
  if (e != len) die("Failed to write %s to %s", data, file); \
  close(fd); \
} while (0)

  {
    char fixer[256];

    /* Map ID 0 inside the namespace to the original effective UID, one ID long.
     * UIDs and GIDs are unsigned, hence %u. */
    int len = bsnprintf(fixer, sizeof fixer, "0 %u 1", (unsigned int) euid);
    WRITE_ONCE("/proc/self/uid_map", fixer, len);

    /* sizeof counts the terminating NUL of the literal as well */
    WRITE_ONCE("/proc/self/setgroups", "deny", sizeof "deny");

    len = bsnprintf(fixer, sizeof fixer, "0 %u 1", (unsigned int) egid);
    WRITE_ONCE("/proc/self/gid_map", fixer, len);
  }
#undef WRITE_ONCE

  /* Remounting proc to reflect the new PID namespace */
  SYSCALL(mount, "none", "/", NULL, MS_REC | MS_PRIVATE, NULL);
  SYSCALL(mount, "proc", "/proc", "proc", MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL);

  /* Now we are init but in the original network namespace,
   * let's spawn a child to do external communication before unsharing */
  hypervisor_exposed_fork();

  /* And now we can unshare the networks */
  SYSCALL(unshare, CLONE_NEWNET);

  /* Before running in multiple threads, we also need to fork the container forker */
  hypervisor_container_fork();

  /* Control socket needs to exist */
  hypervisor_control_socket();

  /* Set signal handlers as this process is init in its PID namespace */
  signal(SIGTERM, hypervisor_poweroff_sighandler);
  signal(SIGINT, hypervisor_poweroff_sighandler);
  signal(SIGHUP, hypervisor_reboot_sighandler);
  signal(SIGQUIT, hypervisor_fail_sighandler);
  signal(SIGCHLD, hypervisor_child_sighandler);

  /* Unblock signals */
  sigprocmask(SIG_SETMASK, &oldmask, NULL);

  /* Check limits; the values are only reported, and only if they were obtained.
   * rlim_t may be wider than unsigned int, so it is passed as unsigned long. */
  struct rlimit corelimit;
  if (getrlimit(RLIMIT_CORE, &corelimit) < 0)
    log(L_ERR "Failed to get core limit: %m");
  else
    log(L_INFO "Core limit %lu %lu", (unsigned long) corelimit.rlim_cur, (unsigned long) corelimit.rlim_max);

  /* Run worker threads */
  struct thread_config tc = {};
  bird_thread_commit(&tc);

  /* Wait for Godot */
  log(L_INFO "Hypervisor running");
  birdloop_minimalist_main();
}