/*
 *	BIRD Internet Routing Daemon -- Raw allocation
 *
 *	(c) 2020 Maria Matejka
 *
 *	Can be freely distributed and used under the terms of the GNU GPL.
 */

/*
 * Page allocator. Free pages are kept in three tiers:
 *
 *  - a thread-local stack of "hot" pages (local_page_stack),
 *  - a global stack of "hot" pages (page_stack), guarded by swapping its
 *    head for a sentinel while somebody works with it,
 *  - "cold" pages handed back to the OS by madvise(), remembered as plain
 *    pointers stored in keeper pages (empty_pages), guarded by a domain lock.
 *
 * When mmap() is not available or the page size is unusable, single blocks
 * from posix_memalign() are used instead (use_fake).
 */

#include "nest/bird.h"
#include "lib/resource.h"
#include "lib/lists.h"
#include "lib/event.h"
#include "lib/io-loop.h"
#include "conf/conf.h"

#include <errno.h>
#include <stdlib.h>
#include <unistd.h>

#ifdef HAVE_MMAP
# include <sys/mman.h>
#endif

#ifdef CONFIG_DISABLE_THP
# include <sys/prctl.h>
# ifndef PR_SET_THP_DISABLE
#  define PR_SET_THP_DISABLE 41
# endif
#endif

long page_size = 0;

/* Total number of pages obtained from the system. Updated by both the mmap
 * and the posix_memalign paths, so it has to exist in every configuration. */
_Atomic int pages_total = 0;

#ifdef HAVE_MMAP
# define KEEP_PAGES_MAX 16384
# define KEEP_PAGES_MIN 32
# define KEEP_PAGES_MAX_LOCAL 128
# define ALLOC_PAGES_AT_ONCE 32

STATIC_ASSERT(KEEP_PAGES_MIN * 4 < KEEP_PAGES_MAX);
STATIC_ASSERT(ALLOC_PAGES_AT_ONCE < KEEP_PAGES_MAX_LOCAL);

extern int shutting_down; /* Shutdown requested. */

static bool use_fake = 0;
static bool initialized = 0;

/* Page protection is a no-op unless expensive debugging checks are enabled. */
# define PROTECT_PAGE(pg)
# define UNPROTECT_PAGE(pg)

# if DEBUGGING
#  ifdef ENABLE_EXPENSIVE_CHECKS
#   undef PROTECT_PAGE
#   undef UNPROTECT_PAGE
#   define PROTECT_PAGE(pg)	mprotect((pg), page_size, PROT_READ)
#   define UNPROTECT_PAGE(pg)	mprotect((pg), page_size, PROT_READ | PROT_WRITE)
#  endif

/* Ring buffer of recent allocator operations, for post-mortem inspection. */
#  define AJSIZE 16384
static struct alloc_journal {
  void *fp;
  void *next;
  u16 pos;
  u16 type;
  uint thread_id;
} alloc_journal[AJSIZE];

_Thread_local int alloc_journal_local_pos = -1;
_Atomic int alloc_journal_pos = 0;

#  define AJT_ALLOC_LOCAL_HOT		1
#  define AJT_ALLOC_GLOBAL_HOT		2
#  define AJT_ALLOC_COLD_STD		3
#  define AJT_ALLOC_COLD_KEEPER		4
#  define AJT_ALLOC_MMAP		5

#  define AJT_FREE_LOCAL_HOT		0x11
#  define AJT_FREE_GLOBAL_HOT		0x12

#  define AJT_CLEANUP_NOTHING		0xc0
#  define AJT_CLEANUP_COLD_STD		0xc3
#  define AJT_CLEANUP_COLD_KEEPER	0xc4
#  define AJT_CLEANUP_BEGIN		0xcb
#  define AJT_CLEANUP_END		0xce

#  define AJT_FLUSH_LOCAL_BEGIN		0xfb
#  define AJT_FLUSH_LOCAL_END		0xfe
#  define AJT_SCHEDULE_CLEANUP		0xff

/*
 * Append one record to the allocation journal.
 *
 * @fp, @next, @pos and @type are stored verbatim together with the calling
 * thread's id. The claimed sequence number is also remembered in the
 * thread-local alloc_journal_local_pos.
 */
static void
ajlog(void *fp, void *next, u16 pos, u16 type)
{
  int seq = atomic_fetch_add_explicit(&alloc_journal_pos, 1, memory_order_relaxed);
  alloc_journal_local_pos = seq;

  /* The counter is a signed int and eventually wraps to negative values;
   * reduce it as unsigned so that the slot index always stays in range. */
  alloc_journal[(uint) seq % AJSIZE] = (struct alloc_journal) {
    .fp = fp,
    .next = next,
    .pos = pos,
    .type = type,
    .thread_id = THIS_THREAD_ID,
  };
}

struct free_page {
  node unused[42];
  struct free_page * _Atomic next;
};
# else /* ! DEBUGGING */

# define ajlog(...)

struct free_page {
  struct free_page * _Atomic next;
};

# endif

/* Store the link of a free page, lifting the write protection around it. */
# define WRITE_NEXT(pg, val) do { UNPROTECT_PAGE((pg)); (pg)->next = (val); PROTECT_PAGE((pg)); } while (0)

/* Number of cold page pointers fitting into one keeper page. */
# define EP_POS_MAX	((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *))

/* Keeper page: a page full of pointers to other cold pages. */
struct empty_pages {
  struct empty_pages *next;	/* Next keeper page */
  uint pos;			/* Number of used slots in pages[] */
  void *pages[0];
};

static DOMAIN(resource) empty_pages_domain;
static struct empty_pages *empty_pages = NULL;

_Atomic int pages_kept_cold = 0;
_Atomic int pages_kept_cold_index = 0;

static struct free_page * _Atomic page_stack = NULL;
static _Thread_local struct free_page * local_page_stack = NULL;
static struct free_page page_stack_blocked;

/* Try to replace the page stack head with a cork, until it succeeds. */
# define PAGE_STACK_GET ({ \
  struct free_page *fp; \
  while ((fp = atomic_exchange_explicit(&page_stack, &page_stack_blocked, memory_order_acq_rel)) == &page_stack_blocked) birdloop_yield(); \
  fp; })

/* Reinstate the stack with another value */
# define PAGE_STACK_PUT(val) ASSERT_DIE(atomic_exchange_explicit(&page_stack, (val), memory_order_acq_rel) == &page_stack_blocked)

static void page_cleanup(void *);
static event page_cleanup_event = { .hook = page_cleanup, };
# define SCHEDULE_CLEANUP do if (initialized && !shutting_down) ev_send(&global_event_list, &page_cleanup_event); while (0)

_Atomic int pages_kept = 0;
_Atomic int pages_kept_locally = 0;
static _Thread_local int pages_kept_here = 0;

/*
 * Map ALLOC_PAGES_AT_ONCE fresh pages at once.
 *
 * Returns the address of the first page; the caller owns the whole run.
 * Dies when the mapping fails.
 */
static void *
alloc_sys_page(void)
{
  const long map_size = page_size * ALLOC_PAGES_AT_ONCE;
  void *ptr = mmap(NULL, map_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

  /* Report the size actually requested, not the size of a single page */
  if (ptr == MAP_FAILED)
    die("mmap(%ld) failed: %m", map_size);

  atomic_fetch_add_explicit(&pages_total, ALLOC_PAGES_AT_ONCE, memory_order_acq_rel);
  return ptr;
}

#else // ! HAVE_MMAP
# define use_fake 1
#endif

/* Emit an allocator trace message when allocator latency debugging is on.
 * The arguments are forwarded to log() exactly once. */
#define ALLOC_TRACE(fmt, ...) do { \
  if (atomic_load_explicit(&global_runtime, memory_order_relaxed)->latency_debug & DL_ALLOCATOR) \
    log(L_TRACE "Allocator: " fmt, ##__VA_ARGS__); } while (0)

/*
 * Get one page of page_size bytes.
 *
 * The sources are tried in this order: thread-local hot pages, global hot
 * pages, cold pages (skipped while RCU read is active) and finally mmap().
 * Never returns NULL; dies when the system refuses to give more memory.
 */
void *
alloc_page(void)
{
  /* If the system page allocator is goofy, we use posix_memalign to get aligned blocks of memory. */
  if (use_fake)
  {
    atomic_fetch_add_explicit(&pages_total, 1, memory_order_acq_rel);

    void *ptr = NULL;
    int err = posix_memalign(&ptr, page_size, page_size);

    if (err || !ptr)
      die("posix_memalign(%ld) failed", page_size);

    return ptr;
  }

#ifdef HAVE_MMAP
  /* If there is any free page kept hot in this thread, we use it. */
  struct free_page *fp = local_page_stack;
  if (fp)
  {
    local_page_stack = atomic_load_explicit(&fp->next, memory_order_relaxed);
    atomic_fetch_sub_explicit(&pages_kept_locally, 1, memory_order_relaxed);
    pages_kept_here--;
    UNPROTECT_PAGE(fp);
    ajlog(fp, local_page_stack, pages_kept_here, AJT_ALLOC_LOCAL_HOT);
    return fp;
  }

  /* The local stack is empty, so the local counter must agree. */
  ASSERT_DIE(pages_kept_here == 0);

  /* If there is any free page kept hot in global storage, we use it. */
  if ((fp = PAGE_STACK_GET))
  {
    /* Reinstate the stack with the next page in list */
    PAGE_STACK_PUT(atomic_load_explicit(&fp->next, memory_order_relaxed));

    /* Update the counters */
    UNUSED uint pk = atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed);

    /* Release the page */
    UNPROTECT_PAGE(fp);
    ajlog(fp, atomic_load_explicit(&fp->next, memory_order_relaxed), pk, AJT_ALLOC_GLOBAL_HOT);
    return fp;
  }

  /* Reinstate the stack with zero */
  PAGE_STACK_PUT(NULL);

  if (rcu_read_active())
  {
    /* We can't lock and we actually shouldn't alloc either when rcu is active
     * but that's a quest for another day. */
  }
  else
  {
    /* If there is any free page kept cold, we use that. */
    LOCK_DOMAIN(resource, empty_pages_domain);
    if (empty_pages)
    {
      UNPROTECT_PAGE(empty_pages);
      if (empty_pages->pos)
      {
	/* Either the keeper page contains at least one cold page pointer, return that */
	fp = empty_pages->pages[--empty_pages->pos];
	PROTECT_PAGE(empty_pages);
	UNPROTECT_PAGE(fp);
	ajlog(fp, empty_pages, empty_pages->pos, AJT_ALLOC_COLD_STD);
	atomic_fetch_sub_explicit(&pages_kept_cold, 1, memory_order_relaxed);
      }
      else
      {
	/* Or the keeper page has no more cold page pointer, return the keeper page */
	fp = (struct free_page *) empty_pages;
	empty_pages = empty_pages->next;
	ajlog(fp, empty_pages, 0, AJT_ALLOC_COLD_KEEPER);
	atomic_fetch_sub_explicit(&pages_kept_cold_index, 1, memory_order_relaxed);
	if (!empty_pages)
	  ALLOC_TRACE("Taken last page from cold storage");
      }
    }
    UNLOCK_DOMAIN(resource, empty_pages_domain);

    if (fp)
      return fp;
  }

  /* And in the worst case, allocate some new pages by mmap() */
  void *ptr = alloc_sys_page();
  ajlog(ptr, NULL, 0, AJT_ALLOC_MMAP);

  /* NOTE(review): the source text between this loop header and the first
   * store in free_page() was lost; the lines from here up to that store are
   * reconstructed from the surviving code -- please verify against the
   * project history. The first page of the run is returned, the remaining
   * ones are put into the free lists. */
  for (int i = 1; i < ALLOC_PAGES_AT_ONCE; i++)
    free_page((char *) ptr + page_size * i);

  return ptr;
#endif
}

/*
 * Return one page previously obtained from alloc_page().
 *
 * The page is kept in the thread-local hot stack while there is room for it
 * (KEEP_PAGES_MAX_LOCAL), otherwise it is pushed to the global hot stack and
 * a cleanup is requested once that one grows to KEEP_PAGES_MAX.
 */
void
free_page(void *ptr)
{
  /* With the aligned allocator, the block simply goes back to the heap. */
  if (use_fake)
  {
    free(ptr);
    atomic_fetch_sub_explicit(&pages_total, 1, memory_order_acq_rel);
    return;
  }

#ifdef HAVE_MMAP
  /* We primarily try to keep the pages locally. */
  struct free_page *fp = ptr;
  if (pages_kept_here < KEEP_PAGES_MAX_LOCAL)
  {
    struct free_page *next = local_page_stack;
    atomic_store_explicit(&fp->next, next, memory_order_relaxed);
    PROTECT_PAGE(fp);
    local_page_stack = fp;

    atomic_fetch_add_explicit(&pages_kept_locally, 1, memory_order_relaxed);
    pages_kept_here++;
    ajlog(fp, next, pages_kept_here, AJT_FREE_LOCAL_HOT);
    return;
  }

  /* If there are too many local pages, we add the free page to the global hot-free-page list */
  struct free_page *next = PAGE_STACK_GET;
  atomic_store_explicit(&fp->next, next, memory_order_relaxed);
  PROTECT_PAGE(fp);

  /* Unblock the stack with the page being freed */
  PAGE_STACK_PUT(fp);

  /* Update counters */
  uint pk = atomic_fetch_add_explicit(&pages_kept, 1, memory_order_relaxed);
  ajlog(fp, next, pk, AJT_FREE_GLOBAL_HOT);

  /* And if there are too many global hot free pages, we ask for page cleanup */
  if (pk >= KEEP_PAGES_MAX)
    SCHEDULE_CLEANUP;
#endif
}

/* When the routine is going to sleep for a long time, we flush the local
 * hot page cache to not keep dirty pages for nothing.
 *
 * The whole thread-local stack is spliced on top of the global hot stack in
 * one step. Does nothing with the aligned allocator, with an empty local
 * stack, or during shutdown. */
void
flush_local_pages(void)
{
#ifdef HAVE_MMAP
  if (use_fake || !local_page_stack || shutting_down)
    return;

  ajlog(local_page_stack, NULL, pages_kept_here, AJT_FLUSH_LOCAL_BEGIN);

  /* We first count the pages to enable consistency checking.
   * Also, we need to know the last page. */
  struct free_page *last = local_page_stack, *next;
  int check_count = 1;
  while ((next = atomic_load_explicit(&last->next, memory_order_relaxed)))
  {
    check_count++;
    last = next;
  }

  /* The actual number of pages must be equal to the counter value. */
  ASSERT_DIE(check_count == pages_kept_here);

  /* Block the stack by a cork and hang the old global list below our last page */
  UNPROTECT_PAGE(last);
  atomic_store_explicit(&last->next, PAGE_STACK_GET, memory_order_relaxed);
  PROTECT_PAGE(last);

  /* Update the stack */
  PAGE_STACK_PUT(last);

  /* Finished. Now the local stack is empty. */
  local_page_stack = NULL;
  pages_kept_here = 0;

  ajlog(NULL, NULL, 0, AJT_FLUSH_LOCAL_END);

  /* Check the state of global page cache and maybe schedule its cleanup. */
  atomic_fetch_sub_explicit(&pages_kept_locally, check_count, memory_order_relaxed);
  if (atomic_fetch_add_explicit(&pages_kept, check_count, memory_order_relaxed) >= KEEP_PAGES_MAX)
    SCHEDULE_CLEANUP;
#endif
}

#ifdef HAVE_MMAP
/*
 * Event hook moving pages from the global hot stack to cold storage.
 *
 * Runs only when at least KEEP_PAGES_MAX pages are kept hot and continues
 * while the counter read before each decrement is at least KEEP_PAGES_MAX / 2,
 * or until the hot stack runs empty. The argument is unused.
 */
static void
page_cleanup(void *_ UNUSED)
{
  /* Cleanup on shutdown is ignored. All pages may be kept hot, OS will take care. */
  if (shutting_down)
    return;

  /* Pages may have been allocated in between; then there is nothing to do */
  uint pk = atomic_load_explicit(&pages_kept, memory_order_relaxed);
  if (pk < KEEP_PAGES_MAX)
    return;

  /* Walk the pages */
  ajlog(NULL, NULL, 0, AJT_CLEANUP_BEGIN);

  uint count = 0;
  do {
    /* Get next hot page */
    struct free_page *fp = PAGE_STACK_GET;
    if (!fp) {
      PAGE_STACK_PUT(NULL);
      ajlog(NULL, NULL, 0, AJT_CLEANUP_END);
      return;
    }

    /* Reinstate the stack with the next page in list */
    PAGE_STACK_PUT(atomic_load_explicit(&fp->next, memory_order_relaxed));

    /* Cold pages are locked */
    LOCK_DOMAIN(resource, empty_pages_domain);

    /* Empty pages are stored as pointers. To store them, we need a pointer block. */
    if (!empty_pages || (empty_pages->pos == EP_POS_MAX))
    {
      /* There is either no pointer block or the last block is full. We use this block as a pointer block. */
      struct empty_pages *ep = (struct empty_pages *) fp;
      UNPROTECT_PAGE(ep);
      *ep = (struct empty_pages) {
	.next = empty_pages,
      };
      PROTECT_PAGE(ep);
      empty_pages = ep;
      ajlog(empty_pages, empty_pages->next, 0, AJT_CLEANUP_COLD_KEEPER);
      atomic_fetch_add_explicit(&pages_kept_cold_index, 1, memory_order_relaxed);
    }
    else
    {
      /* We store this block as a pointer into the first free place
       * and tell the OS that the underlying memory is trash. */
      UNPROTECT_PAGE(empty_pages);
      empty_pages->pages[empty_pages->pos++] = fp;
      PROTECT_PAGE(empty_pages);

      PROTECT_PAGE(fp);
      if (madvise(fp, page_size,
#ifdef CONFIG_MADV_DONTNEED_TO_FREE
	    MADV_DONTNEED
#else
	    MADV_FREE
#endif
	    ) < 0)
	bug("madvise(%p) failed: %m", fp);

      ajlog(fp, empty_pages, empty_pages->pos, AJT_CLEANUP_COLD_STD);
      atomic_fetch_add_explicit(&pages_kept_cold, 1, memory_order_relaxed);
    }
    UNLOCK_DOMAIN(resource, empty_pages_domain);
    count++;
  }
  while (atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX / 2);

  ALLOC_TRACE("Moved %u pages to cold storage, now %u cold, %u index", count,
      atomic_load_explicit(&pages_kept_cold, memory_order_relaxed),
      atomic_load_explicit(&pages_kept_cold_index, memory_order_relaxed)
      );

  ajlog(NULL, NULL, 0, AJT_CLEANUP_END);
}
#endif

/*
 * Dump the global hot stack and the cold storage into @dreq.
 *
 * The hot stack stays corked for the whole walk, and the cold storage is
 * walked under its domain lock. Thread-local stacks are not listed.
 */
void
page_dump(struct dump_request *dreq)
{
#ifdef HAVE_MMAP
  RDUMP("Hot pages:\n");
  struct free_page *fptop = PAGE_STACK_GET;
  for (struct free_page *fp = fptop; fp; fp = atomic_load_explicit(&fp->next, memory_order_relaxed))
    RDUMP(" %p\n", fp);

  PAGE_STACK_PUT(fptop);

  RDUMP("Cold pages:\n");

  LOCK_DOMAIN(resource, empty_pages_domain);
  for (struct empty_pages *ep = empty_pages; ep; ep = ep->next)
  {
    RDUMP(" %p (index)\n", ep);
    for (uint i = 0; i < ep->pos; i++)
      RDUMP(" %p\n", ep->pages[i]);
  }
  UNLOCK_DOMAIN(resource, empty_pages_domain);
  RDUMP("This request: %p\n", dreq);
#endif
}

/*
 * Initialize the page allocator.
 *
 * Detects the system page size and decides between the mmap-based allocator
 * and the posix_memalign fallback. In the fallback case, page_size is forced
 * to 4096.
 */
void
resource_sys_init(void)
{
#ifdef CONFIG_DISABLE_THP
  /* Disable transparent huge pages, they do not work properly with madvise(MADV_DONTNEED) */
  if (prctl(PR_SET_THP_DISABLE, (unsigned long) 1, (unsigned long) 0, (unsigned long) 0, (unsigned long) 0) < 0)
    log(L_WARN "Cannot disable transparent huge pages: prctl(PR_SET_THP_DISABLE) failed: %m");
#endif

#ifdef HAVE_MMAP
  /* Check what page size the system supports */
  if (!(page_size = sysconf(_SC_PAGESIZE)))
    die("System page size must be non-zero");

  if ((u64_popcount(page_size) == 1) && (page_size >= (1 << 10)) && (page_size <= (1 << 18)))
  {
    /* We assume that page size has only one bit and is between 1K and 256K (incl.).
     * Otherwise, the assumptions in lib/slab.c (sl_head's num_full range) aren't met. */

    empty_pages_domain = DOMAIN_NEW(resource);
    DOMAIN_SETUP(resource, empty_pages_domain, "Empty Pages", NULL);
    initialized = 1;
    return;
  }

  /* Too big or strange page, use the aligned allocator instead */
  log(L_WARN "Got strange memory page size (%ld), using the aligned allocator instead", page_size);
  use_fake = 1;
#endif

  page_size = 4096;
#ifdef HAVE_MMAP
  /* The flag exists only in the mmap build */
  initialized = 1;
#endif
}