mirror of
https://gitlab.nic.cz/labs/bird.git
synced 2024-12-23 02:01:55 +00:00
792189e807
In our use case, these are impossibly greedy because we often free memory in a different thread than the one that allocated it, forcing the default allocator to scatter the used memory all over the place.
543 lines
16 KiB
C
543 lines
16 KiB
C
/*
|
|
* BIRD Internet Routing Daemon -- Raw allocation
|
|
*
|
|
* (c) 2020 Maria Matejka <mq@ucw.cz>
|
|
*
|
|
* Can be freely distributed and used under the terms of the GNU GPL.
|
|
*/
|
|
|
|
#include "nest/bird.h"
|
|
#include "lib/resource.h"
|
|
#include "lib/lists.h"
|
|
#include "lib/event.h"
|
|
#include "lib/io-loop.h"
|
|
|
|
#include "conf/conf.h"
|
|
|
|
#include <errno.h>
|
|
#include <stdlib.h>
|
|
#include <unistd.h>
|
|
|
|
#ifdef HAVE_MALLOC_H
|
|
# include <malloc.h>
|
|
#endif
|
|
|
|
#ifdef HAVE_MMAP
|
|
# include <sys/mman.h>
|
|
#endif
|
|
|
|
#ifdef CONFIG_DISABLE_THP
|
|
# include <sys/prctl.h>
|
|
# ifndef PR_SET_THP_DISABLE
|
|
# define PR_SET_THP_DISABLE 41
|
|
# endif
|
|
#endif
|
|
|
|
long page_size = 0;
|
|
|
|
#ifdef HAVE_MMAP
|
|
# define KEEP_PAGES_MAX 16384
|
|
# define KEEP_PAGES_MIN 32
|
|
# define KEEP_PAGES_MAX_LOCAL 128
|
|
# define ALLOC_PAGES_AT_ONCE 32
|
|
|
|
STATIC_ASSERT(KEEP_PAGES_MIN * 4 < KEEP_PAGES_MAX);
|
|
STATIC_ASSERT(ALLOC_PAGES_AT_ONCE < KEEP_PAGES_MAX_LOCAL);
|
|
|
|
static bool use_fake = 0;
|
|
static bool initialized = 0;
|
|
|
|
# define PROTECT_PAGE(pg)
|
|
# define UNPROTECT_PAGE(pg)
|
|
|
|
# if DEBUGGING
|
|
# ifdef ENABLE_EXPENSIVE_CHECKS
/* With expensive checks enabled, a page is made read-only while it is
 * protected and read-write again when unprotected. */
#  undef PROTECT_PAGE
#  undef UNPROTECT_PAGE
#  define PROTECT_PAGE(pg)	mprotect((pg), page_size, PROT_READ)
#  define UNPROTECT_PAGE(pg)	mprotect((pg), page_size, PROT_READ | PROT_WRITE)
# endif
|
|
|
|
# define AJSIZE 16384
|
|
|
|
static struct alloc_journal {
|
|
void *fp;
|
|
void *next;
|
|
u16 pos;
|
|
u16 type;
|
|
uint thread_id;
|
|
} alloc_journal[AJSIZE];
|
|
|
|
/* Journal position most recently claimed by this thread (-1 if none yet) */
_Thread_local int alloc_journal_local_pos = -1;

/* Next journal position to be claimed, shared by all threads */
_Atomic int alloc_journal_pos = 0;

/* Journal record types: allocations */
# define AJT_ALLOC_LOCAL_HOT		1
# define AJT_ALLOC_GLOBAL_HOT		2
# define AJT_ALLOC_COLD_KEEPER		4
# define AJT_ALLOC_MMAP			5

/* Journal record types: frees */
# define AJT_FREE_LOCAL_HOT		0x11
# define AJT_FREE_GLOBAL_HOT		0x12

/* Journal record types: global cleanup */
# define AJT_CLEANUP_NOTHING		0xc0
# define AJT_CLEANUP_COLD_STD		0xc3
# define AJT_CLEANUP_COLD_KEEPER	0xc4
# define AJT_CLEANUP_BEGIN		0xcb
# define AJT_CLEANUP_END		0xce

/* Journal record types: flushing and scheduling */
# define AJT_FLUSH_COLD			0xfc
# define AJT_FLUSH_LOCAL_BEGIN		0xfb
# define AJT_FLUSH_LOCAL_END		0xfe
# define AJT_SCHEDULE_CLEANUP		0xff
|
|
|
|
static void
|
|
ajlog(void *fp, void *next, u16 pos, u16 type)
|
|
{
|
|
alloc_journal[(alloc_journal_local_pos = atomic_fetch_add_explicit(&alloc_journal_pos, 1, memory_order_relaxed)) % AJSIZE] = (struct alloc_journal) {
|
|
.fp = fp,
|
|
.next = next,
|
|
.pos = pos,
|
|
.type = type,
|
|
.thread_id = THIS_THREAD_ID,
|
|
};
|
|
|
|
#if 0
|
|
log(L_TRACE "ajlog %s: %p, %p, %u, %u",
|
|
(type < 0x10) ? "alloc" : (type < 0x20) ? "free" :
|
|
(type < 0xcf) ? "cleanup" : "flush",
|
|
fp, next, pos, type);
|
|
#endif
|
|
|
|
}
|
|
|
|
struct free_page {
|
|
node unused[42];
|
|
struct free_page * _Atomic next;
|
|
};
|
|
# else /* ! DEBUGGING */
|
|
|
|
# define ajlog(...)
|
|
|
|
struct free_page {
|
|
struct free_page * _Atomic next;
|
|
};
|
|
|
|
# endif
|
|
|
|
# define WRITE_NEXT(pg, val) do { UNPROTECT_PAGE((pg)); (pg)->next = (val); PROTECT_PAGE((pg)); } while (0)
|
|
|
|
# define EP_POS_MAX ((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *))
|
|
|
|
struct empty_pages {
|
|
struct empty_pages *next;
|
|
uint pos;
|
|
void *pages[0];
|
|
};
|
|
|
|
static DOMAIN(resource) empty_pages_domain;
|
|
static struct empty_pages *empty_pages = NULL;
|
|
_Atomic int pages_kept_cold = 0;
|
|
_Atomic int pages_kept_cold_index = 0;
|
|
_Atomic int pages_total = 0;
|
|
_Atomic int alloc_locking_in_rcu = 0;
|
|
|
|
static struct free_page * _Atomic page_stack = NULL;
|
|
static _Thread_local struct free_page * local_page_stack = NULL;
|
|
static struct free_page page_stack_blocked;
|
|
|
|
/*
 * Take over the global page stack: swap its head for the cork and evaluate to
 * the previous head. While the head is already corked by somebody else,
 * yield and try again.
 */
# define PAGE_STACK_GET ({ \
    struct free_page *psg_head_; \
    while ((psg_head_ = atomic_exchange_explicit(&page_stack, &page_stack_blocked, memory_order_acq_rel)) == &page_stack_blocked) \
      birdloop_yield(); \
    psg_head_; })

/* Hand the global page stack back with @val as its new head; the stack must
 * have been corked by a previous PAGE_STACK_GET. */
# define PAGE_STACK_PUT(val) \
  ASSERT_DIE(atomic_exchange_explicit(&page_stack, (val), memory_order_acq_rel) == &page_stack_blocked)
|
|
|
|
static void page_cleanup(void *);
|
|
static event page_cleanup_event = { .hook = page_cleanup, };
|
|
# define SCHEDULE_CLEANUP do if (initialized && !shutting_down) ev_send(&global_event_list, &page_cleanup_event); while (0)
|
|
|
|
_Atomic int pages_kept = 0;
|
|
_Atomic int pages_kept_locally = 0;
|
|
static _Thread_local int pages_kept_here = 0;
|
|
|
|
static void *
|
|
alloc_sys_page(void)
|
|
{
|
|
void *ptr = mmap(NULL, page_size * ALLOC_PAGES_AT_ONCE, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
|
|
|
if (ptr == MAP_FAILED)
|
|
die("mmap(%ld) failed: %m", (s64) page_size);
|
|
|
|
atomic_fetch_add_explicit(&pages_total, ALLOC_PAGES_AT_ONCE, memory_order_acq_rel);
|
|
return ptr;
|
|
}
|
|
|
|
extern int shutting_down; /* Shutdown requested. */
|
|
|
|
#else // ! HAVE_MMAP
|
|
# define use_fake 1
|
|
#endif
|
|
|
|
/*
 * Log an allocator trace message if allocator debugging (DL_ALLOCATOR) is
 * switched on in the current global runtime. The format string is separated
 * from its arguments so that everything is passed to log() exactly once.
 */
#define ALLOC_TRACE(fmt, args...) do { \
  if (atomic_load_explicit(&global_runtime, memory_order_relaxed)->latency_debug & DL_ALLOCATOR) \
    log(L_TRACE "Allocator: " fmt, ##args); \
  } while (0)
|
|
|
|
|
|
#ifdef HAVE_MMAP
/*
 * alloc_hot_page - take one page from the global hot page stack
 * @fp: used only as a scratch variable, the value passed in is ignored
 *
 * Returns the page or NULL if the global stack is empty. The stack is corked
 * only for the time needed to unlink the topmost page.
 *
 * Everything used here exists only when mmap() is available, therefore the
 * function is compiled only in that case.
 */
static void *
alloc_hot_page(struct free_page *fp)
{
  fp = PAGE_STACK_GET;
  if (!fp)
  {
    /* Nothing to take, reinstate the stack with zero */
    PAGE_STACK_PUT(NULL);
    return NULL;
  }

  /* Reinstate the stack with the next page in list */
  struct free_page *rest = atomic_load_explicit(&fp->next, memory_order_relaxed);
  PAGE_STACK_PUT(rest);

  /* Update the counters */
  UNUSED uint pk = atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed);

  /* Release the page */
  UNPROTECT_PAGE(fp);
  ajlog(fp, rest, pk, AJT_ALLOC_GLOBAL_HOT);
  return fp;
}
#endif
|
|
|
|
void *
|
|
alloc_page(void)
|
|
{
|
|
/* If the system page allocator is goofy, we use posix_memalign to get aligned blocks of memory. */
|
|
if (use_fake)
|
|
{
|
|
atomic_fetch_add_explicit(&pages_total, 1, memory_order_acq_rel);
|
|
void *ptr = NULL;
|
|
int err = posix_memalign(&ptr, page_size, page_size);
|
|
|
|
if (err || !ptr)
|
|
die("posix_memalign(%ld) failed", (s64) page_size);
|
|
|
|
return ptr;
|
|
}
|
|
|
|
#ifdef HAVE_MMAP
|
|
/* If there is any free page kept hot in this thread, we use it. */
|
|
struct free_page *fp = local_page_stack;
|
|
if (fp)
|
|
{
|
|
local_page_stack = atomic_load_explicit(&fp->next, memory_order_relaxed);
|
|
atomic_fetch_sub_explicit(&pages_kept_locally, 1, memory_order_relaxed);
|
|
pages_kept_here--;
|
|
UNPROTECT_PAGE(fp);
|
|
ajlog(fp, local_page_stack, pages_kept_here, AJT_ALLOC_LOCAL_HOT);
|
|
return fp;
|
|
}
|
|
|
|
ASSERT_DIE(pages_kept_here == 0);
|
|
|
|
/* If there is any free page kept hot in global storage, we use it. */
|
|
if (fp = alloc_hot_page(fp))
|
|
return fp;
|
|
|
|
if (rcu_read_active())
|
|
{
|
|
/* We shouldn't alloc when rcu is active but that's a quest for another day. */
|
|
atomic_fetch_add_explicit(&alloc_locking_in_rcu, 1, memory_order_relaxed);
|
|
}
|
|
|
|
/* If there is any free page kept cold, we warm up some of these. */
|
|
LOCK_DOMAIN(resource, empty_pages_domain);
|
|
|
|
/* Threads were serialized on lock and the first one might have prepared some
|
|
* blocks for the rest of threads */
|
|
if (fp = alloc_hot_page(fp))
|
|
{
|
|
UNLOCK_DOMAIN(resource, empty_pages_domain);
|
|
return fp;
|
|
}
|
|
|
|
if (empty_pages) {
|
|
UNPROTECT_PAGE(empty_pages);
|
|
|
|
/* We flush all the pages in this block to the hot page cache
|
|
* and return the keeper page as allocated. */
|
|
if (empty_pages->pos)
|
|
{
|
|
/* Link one after another */
|
|
for (uint i = 0; i < empty_pages->pos - 1; i++)
|
|
{
|
|
ajlog(empty_pages->pages[i], empty_pages, empty_pages->pos, AJT_FLUSH_COLD);
|
|
atomic_store_explicit(
|
|
&((struct free_page *) empty_pages->pages[i])->next,
|
|
empty_pages->pages[i+1],
|
|
memory_order_relaxed);
|
|
}
|
|
|
|
/* And put into the hot page cache */
|
|
atomic_store_explicit(
|
|
&((struct free_page *) empty_pages->pages[empty_pages->pos - 1])->next,
|
|
PAGE_STACK_GET,
|
|
memory_order_release);
|
|
PAGE_STACK_PUT(empty_pages->pages[0]);
|
|
|
|
/* Update counters */
|
|
atomic_fetch_sub_explicit(&pages_kept_cold, empty_pages->pos, memory_order_relaxed);
|
|
atomic_fetch_add_explicit(&pages_kept, empty_pages->pos, memory_order_relaxed);
|
|
}
|
|
|
|
/* We can then reuse the old keeper page. */
|
|
/* Or the keeper page has no more cold page pointer, return the keeper page */
|
|
fp = (struct free_page *) empty_pages;
|
|
empty_pages = empty_pages->next;
|
|
ajlog(fp, empty_pages, 0, AJT_ALLOC_COLD_KEEPER);
|
|
atomic_fetch_sub_explicit(&pages_kept_cold_index, 1, memory_order_relaxed);
|
|
|
|
if (!empty_pages)
|
|
ALLOC_TRACE("Taken last page from cold storage");
|
|
}
|
|
UNLOCK_DOMAIN(resource, empty_pages_domain);
|
|
|
|
if (fp)
|
|
return fp;
|
|
|
|
/* And in the worst case, allocate some new pages by mmap() */
|
|
void *ptr = alloc_sys_page();
|
|
ajlog(ptr, NULL, 0, AJT_ALLOC_MMAP);
|
|
|
|
for (int i=1; i<ALLOC_PAGES_AT_ONCE; i++)
|
|
free_page(ptr + page_size * i);
|
|
|
|
return ptr;
|
|
#endif
|
|
}
|
|
|
|
void
|
|
free_page(void *ptr)
|
|
{
|
|
/* If the system page allocator is goofy, we just free the block and care no more. */
|
|
if (use_fake)
|
|
{
|
|
atomic_fetch_sub_explicit(&pages_total, 1, memory_order_acq_rel);
|
|
free(ptr);
|
|
return;
|
|
}
|
|
|
|
#ifdef HAVE_MMAP
|
|
/* We primarily try to keep the pages locally. */
|
|
struct free_page *fp = ptr;
|
|
if (pages_kept_here < KEEP_PAGES_MAX_LOCAL)
|
|
{
|
|
struct free_page *next = local_page_stack;
|
|
atomic_store_explicit(&fp->next, next, memory_order_relaxed);
|
|
PROTECT_PAGE(fp);
|
|
local_page_stack = fp;
|
|
|
|
atomic_fetch_add_explicit(&pages_kept_locally, 1, memory_order_relaxed);
|
|
pages_kept_here++;
|
|
ajlog(fp, next, pages_kept_here, AJT_FREE_LOCAL_HOT);
|
|
return;
|
|
}
|
|
|
|
/* If there are too many local pages, we add the free page to the global hot-free-page list */
|
|
struct free_page *next = PAGE_STACK_GET;
|
|
atomic_store_explicit(&fp->next, next, memory_order_relaxed);
|
|
PROTECT_PAGE(fp);
|
|
|
|
/* Unblock the stack with the page being freed */
|
|
PAGE_STACK_PUT(fp);
|
|
|
|
/* Update counters */
|
|
uint pk = atomic_fetch_add_explicit(&pages_kept, 1, memory_order_relaxed);
|
|
ajlog(fp, next, pk, AJT_FREE_GLOBAL_HOT);
|
|
|
|
/* And if there are too many global hot free pages, we ask for page cleanup */
|
|
if (pk >= KEEP_PAGES_MAX)
|
|
SCHEDULE_CLEANUP;
|
|
#endif
|
|
}
|
|
|
|
/*
 * flush_local_pages - move the hot pages of this thread to the global stack
 *
 * When the routine is going to sleep for a long time, we flush the local
 * hot page cache to not keep dirty pages for nothing. The whole local chain
 * is prepended to the global hot stack at once and a cleanup is requested
 * if the global stack has already been at its limit.
 *
 * Nothing happens with the fallback allocator, with an empty local stack
 * or during shutdown. Without mmap() there is no page cache at all.
 */
void
flush_local_pages(void)
{
#ifdef HAVE_MMAP
  if (use_fake || !local_page_stack || shutting_down)
    return;

  ajlog(local_page_stack, NULL, pages_kept_here, AJT_FLUSH_LOCAL_BEGIN);

  /* We first count the pages to enable consistency checking.
   * Also, we need to know the last page. */
  struct free_page *first = local_page_stack;
  struct free_page *last = first, *next;
  int check_count = 1;
  while ((next = atomic_load_explicit(&last->next, memory_order_relaxed)))
  {
    check_count++;
    last = next;
  }

  /* The actual number of pages must be equal to the counter value. */
  ASSERT_DIE(check_count == pages_kept_here);

  /* Block the stack by a cork and hang its current contents below our last page */
  UNPROTECT_PAGE(last);
  atomic_store_explicit(&last->next, PAGE_STACK_GET, memory_order_relaxed);
  PROTECT_PAGE(last);

  /* Update the stack. Its new top must be the first local page; publishing
   * the last one would drop all the pages in front of it. */
  PAGE_STACK_PUT(first);

  /* Finished. Now the local stack is empty. */
  local_page_stack = NULL;
  pages_kept_here = 0;

  ajlog(NULL, NULL, 0, AJT_FLUSH_LOCAL_END);

  /* Check the state of global page cache and maybe schedule its cleanup. */
  atomic_fetch_sub_explicit(&pages_kept_locally, check_count, memory_order_relaxed);
  if (atomic_fetch_add_explicit(&pages_kept, check_count, memory_order_relaxed) >= KEEP_PAGES_MAX)
    SCHEDULE_CLEANUP;
#endif
}
|
|
|
|
#ifdef HAVE_MMAP
|
|
static void
|
|
page_cleanup(void *_ UNUSED)
|
|
{
|
|
/* Cleanup on shutdown is ignored. All pages may be kept hot, OS will take care. */
|
|
if (shutting_down)
|
|
return;
|
|
|
|
/* Pages allocated inbetween */
|
|
uint pk = atomic_load_explicit(&pages_kept, memory_order_relaxed);
|
|
if (pk < KEEP_PAGES_MAX)
|
|
return;
|
|
|
|
/* Walk the pages */
|
|
ajlog(NULL, NULL, 0, AJT_CLEANUP_BEGIN);
|
|
uint count = 0;
|
|
do {
|
|
/* Get next hot page */
|
|
struct free_page *fp = PAGE_STACK_GET;
|
|
if (!fp) {
|
|
PAGE_STACK_PUT(NULL);
|
|
ajlog(NULL, NULL, 0, AJT_CLEANUP_END);
|
|
return;
|
|
}
|
|
|
|
/* Reinstate the stack with the next page in list */
|
|
PAGE_STACK_PUT(atomic_load_explicit(&fp->next, memory_order_relaxed));
|
|
|
|
/* Cold pages are locked */
|
|
LOCK_DOMAIN(resource, empty_pages_domain);
|
|
|
|
/* Empty pages are stored as pointers. To store them, we need a pointer block. */
|
|
if (!empty_pages || (empty_pages->pos == EP_POS_MAX))
|
|
{
|
|
/* There is either no pointer block or the last block is full. We use this block as a pointer block. */
|
|
struct empty_pages *ep = (struct empty_pages *) fp;
|
|
UNPROTECT_PAGE(ep);
|
|
*ep = (struct empty_pages) {
|
|
.next = empty_pages,
|
|
};
|
|
PROTECT_PAGE(ep);
|
|
empty_pages = ep;
|
|
ajlog(empty_pages, empty_pages->next, 0, AJT_CLEANUP_COLD_KEEPER);
|
|
atomic_fetch_add_explicit(&pages_kept_cold_index, 1, memory_order_relaxed);
|
|
}
|
|
else
|
|
{
|
|
/* We store this block as a pointer into the first free place
|
|
* and tell the OS that the underlying memory is trash. */
|
|
UNPROTECT_PAGE(empty_pages);
|
|
empty_pages->pages[empty_pages->pos++] = fp;
|
|
PROTECT_PAGE(empty_pages);
|
|
|
|
PROTECT_PAGE(fp);
|
|
if (madvise(fp, page_size,
|
|
#ifdef CONFIG_MADV_DONTNEED_TO_FREE
|
|
MADV_DONTNEED
|
|
#else
|
|
MADV_FREE
|
|
#endif
|
|
) < 0)
|
|
bug("madvise(%p) failed: %m", fp);
|
|
ajlog(fp, empty_pages, empty_pages->pos, AJT_CLEANUP_COLD_STD);
|
|
atomic_fetch_add_explicit(&pages_kept_cold, 1, memory_order_relaxed);
|
|
}
|
|
UNLOCK_DOMAIN(resource, empty_pages_domain);
|
|
count++;
|
|
}
|
|
while (atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX / 2);
|
|
|
|
ALLOC_TRACE("Moved %u pages to cold storage, now %u cold, %u index", count,
|
|
atomic_load_explicit(&pages_kept_cold, memory_order_relaxed),
|
|
atomic_load_explicit(&pages_kept_cold_index, memory_order_relaxed)
|
|
);
|
|
|
|
ajlog(NULL, NULL, 0, AJT_CLEANUP_END);
|
|
}
|
|
#endif
|
|
|
|
/*
 * page_dump - dump the contents of the page caches
 * @dreq: dump request to write to
 *
 * Lists the addresses of all pages in the global hot stack, then all keeper
 * pages of the cold storage together with the pages they hold. The hot stack
 * stays corked while it is being listed, the cold storage is listed under
 * its lock. Without mmap() there is nothing to dump.
 */
void
page_dump(struct dump_request *dreq)
{
#ifdef HAVE_MMAP
  RDUMP("Hot pages:\n");

  /* Cork the stack so that nobody changes it under our hands */
  struct free_page *top = PAGE_STACK_GET;
  struct free_page *fp = top;
  while (fp)
  {
    RDUMP(" %p\n", fp);
    fp = atomic_load_explicit(&fp->next, memory_order_relaxed);
  }

  /* Give the stack back untouched */
  PAGE_STACK_PUT(top);

  RDUMP("Cold pages:\n");

  LOCK_DOMAIN(resource, empty_pages_domain);
  struct empty_pages *ep = empty_pages;
  while (ep)
  {
    RDUMP(" %p (index)\n", ep);

    uint i = 0;
    while (i < ep->pos)
    {
      RDUMP(" %p\n", ep->pages[i]);
      i++;
    }

    ep = ep->next;
  }
  UNLOCK_DOMAIN(resource, empty_pages_domain);

  RDUMP("This request: %p\n", dreq);
#endif
}
|
|
|
|
void
|
|
resource_sys_init(void)
|
|
{
|
|
#ifdef HAVE_MALLOC_H
|
|
if (!mallopt(M_ARENA_MAX, 1))
|
|
log(L_WARN "Failed to disable multiple malloc arenas, memory consumption may skyrocket.");
|
|
#endif
|
|
|
|
#ifdef CONFIG_DISABLE_THP
|
|
/* Disable transparent huge pages, they do not work properly with madvice(MADV_DONTNEED) */
|
|
if (prctl(PR_SET_THP_DISABLE, (unsigned long) 1, (unsigned long) 0, (unsigned long) 0, (unsigned long) 0) < 0)
|
|
log(L_WARN "Cannot disable transparent huge pages: prctl(PR_SET_THP_DISABLE) failed: %m");
|
|
#endif
|
|
|
|
#ifdef HAVE_MMAP
|
|
/* Check what page size the system supports */
|
|
if (!(page_size = sysconf(_SC_PAGESIZE)))
|
|
die("System page size must be non-zero");
|
|
|
|
if ((u64_popcount(page_size) == 1) && (page_size >= (1 << 10)) && (page_size <= (1 << 18)))
|
|
{
|
|
/* We assume that page size has only one bit and is between 1K and 256K (incl.).
|
|
* Otherwise, the assumptions in lib/slab.c (sl_head's num_full range) aren't met. */
|
|
|
|
empty_pages_domain = DOMAIN_NEW(resource);
|
|
DOMAIN_SETUP(resource, empty_pages_domain, "Empty Pages", NULL);
|
|
initialized = 1;
|
|
return;
|
|
}
|
|
|
|
/* Too big or strange page, use the aligned allocator instead */
|
|
log(L_WARN "Got strange memory page size (%ld), using the aligned allocator instead", (s64) page_size);
|
|
use_fake = 1;
|
|
#endif
|
|
|
|
page_size = 4096;
|
|
initialized = 1;
|
|
}
|