From b14c463e097688a0d95a3a1531c5e970cd9281c3 Mon Sep 17 00:00:00 2001 From: ekyooo <46103109+ekyooo@users.noreply.github.com> Date: Wed, 7 Feb 2024 12:39:37 +0900 Subject: [PATCH] libbpf-tools: add CO-RE profile (#3782) It is based on Brendan Gregg's BCC profile and much of the code for it is borrowed from offcputime.c and runqlen.c. This is an example of usage. # ./profile -iK Sampling at 49 Hertz of all threads by kernel stack... Hit Ctrl-C to end. ^C _raw_spin_unlock_irq finish_task_switch __schedule schedule schedule_timeout msleep HDMI21_Tx_EARC_MainThread kthread ret_from_fork - hdmi21_earc_eng (1427) 1 __pi___clean_dcache_area_poc dma_direct_map_page dma_direct_map_sg dma_buf_unified_map dma_buf_map_attachment kbase_mem_umm_map_attachment kbase_mem_import kbase_ioctl __arm64_compat_sys_ioctl el0_svc_common.constprop.0 el0_svc_compat_handler el0_svc_compat - surface-manager (1911) 4 __pi___clean_dcache_area_poc dma_direct_map_page dma_direct_map_sg dma_buf_unified_map dma_buf_map_attachment kbase_mem_umm_map_attachment kbase_mem_import kbase_ioctl __arm64_compat_sys_ioctl el0_svc_common.constprop.0 el0_svc_compat_handler el0_svc_compat - tAiToneService (2141) 12 [Missed Kernel Stack] - WebAppMgr (3624) 1 ... This is the result of using the -f (folded) option. # ./profile -fdK ... 
tCMState;low_mem_notify_threshold;__alloc_pages_nodemask;__get_free_pages;__pollwait;unix_poll;sock_poll;do_sys_poll;__arm64_sys_poll;el0_svc_common.constprop.0;el0_svc_compat_handler;el0_svc_compat 2 surface-manager;sock_poll;do_sys_poll;__arm64_sys_poll;el0_svc_common.constprop.0;el0_svc_compat_handler;el0_svc_compat 1 qml-runner;sock_poll;do_sys_poll;__arm64_sys_poll;el0_svc_common.constprop.0;el0_svc_compat_handler;el0_svc_compat 96 mali-cmar-backe;_raw_spin_unlock_irqrestore;kbase_pm_update_cores_state;kbase_pm_do_poweron;kbase_pm_update_active;kbase_hwaccess_pm_gpu_active;kbase_pm_context_active_handle_suspend;kbase_js_sched;kbase_jd_submit;kbase_ioctl;__arm64_compat_sys_ioctl;el0_svc_common.constprop.0;el0_svc_compat_handler;el0_svc_compat 2 surface-manager;__fget;__fget_light;__fdget;sockfd_lookup_light;__sys_sendmsg;__arm64_compat_sys_sendmsg;el0_svc_common.constprop.0;el0_svc_compat_handler;el0_svc_compat 1 ... WARNING: 4 stack traces could not be displayed. Consider increasing --stack-storage-size. 
Signed-off-by: Eunseon Lee es.lee@lge.com --- libbpf-tools/.gitignore | 1 + libbpf-tools/Makefile | 1 + libbpf-tools/profile.bpf.c | 73 +++++ libbpf-tools/profile.c | 534 +++++++++++++++++++++++++++++++++++++ libbpf-tools/profile.h | 16 ++ 5 files changed, 625 insertions(+) create mode 100644 libbpf-tools/profile.bpf.c create mode 100644 libbpf-tools/profile.c create mode 100644 libbpf-tools/profile.h diff --git a/libbpf-tools/.gitignore b/libbpf-tools/.gitignore index 2a76ec265c34..fb2ff35bf574 100644 --- a/libbpf-tools/.gitignore +++ b/libbpf-tools/.gitignore @@ -43,6 +43,7 @@ /offcputime /oomkill /opensnoop +/profile /readahead /runqlat /runqlen diff --git a/libbpf-tools/Makefile b/libbpf-tools/Makefile index 7ed7ca16ba2f..2faaaba950e5 100644 --- a/libbpf-tools/Makefile +++ b/libbpf-tools/Makefile @@ -70,6 +70,7 @@ APPS = \ numamove \ offcputime \ oomkill \ + profile \ readahead \ runqlat \ runqlen \ diff --git a/libbpf-tools/profile.bpf.c b/libbpf-tools/profile.bpf.c new file mode 100644 index 000000000000..1d0b53b03f92 --- /dev/null +++ b/libbpf-tools/profile.bpf.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * Copyright (c) 2022 LG Electronics + * + * Based on profile from BCC by Brendan Gregg and others. + * 28-Dec-2021 Eunseon Lee Created this. 
+ */ +#include +#include +#include +#include +#include "profile.h" +#include "maps.bpf.h" + +const volatile bool kernel_stacks_only = false; +const volatile bool user_stacks_only = false; +const volatile bool include_idle = false; +const volatile pid_t targ_pid = -1; +const volatile pid_t targ_tid = -1; + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __type(key, u32); +} stackmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, u64); + __uint(max_entries, MAX_ENTRIES); +} counts SEC(".maps"); + +SEC("perf_event") +int do_perf_event(struct bpf_perf_event_data *ctx) +{ + u64 id = bpf_get_current_pid_tgid(); + u32 pid = id >> 32; + u32 tid = id; + u64 *valp; + static const u64 zero; + struct key_t key = {}; + + if (!include_idle && tid == 0) + return 0; + + if (targ_pid != -1 && targ_pid != pid) + return 0; + + if (targ_tid != -1 && targ_tid != tid) + return 0; + + key.pid = pid; + bpf_get_current_comm(&key.name, sizeof(key.name)); + + if (user_stacks_only) + key.kern_stack_id = -1; + else + key.kern_stack_id = bpf_get_stackid(&ctx->regs, &stackmap, 0); + + if (kernel_stacks_only) + key.user_stack_id = -1; + else + key.user_stack_id = bpf_get_stackid(&ctx->regs, &stackmap, + BPF_F_USER_STACK); + + valp = bpf_map_lookup_or_try_init(&counts, &key, &zero); + if (valp) + __sync_fetch_and_add(valp, 1); + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/libbpf-tools/profile.c b/libbpf-tools/profile.c new file mode 100644 index 000000000000..95af848ec689 --- /dev/null +++ b/libbpf-tools/profile.c @@ -0,0 +1,534 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * profile Profile CPU usage by sampling stack traces at a timed interval. + * Copyright (c) 2022 LG Electronics + * + * Based on profile from BCC by Brendan Gregg and others. + * 28-Dec-2021 Eunseon Lee Created this. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "profile.h" +#include "profile.skel.h" +#include "trace_helpers.h" + +#define OPT_PERF_MAX_STACK_DEPTH 1 /* --perf-max-stack-depth */ +#define OPT_STACK_STORAGE_SIZE 2 /* --stack-storage-size */ + +/* + * -EFAULT in get_stackid normally means the stack-trace is not available, + * such as getting kernel stack trace in user mode + */ +#define STACK_ID_EFAULT(stack_id) (stack_id == -EFAULT) + +#define STACK_ID_ERR(stack_id) ((stack_id < 0) && !STACK_ID_EFAULT(stack_id)) + +#define NEED_DELIMITER(delimiter, ustack_id, kstack_id) \ + (delimiter && ustack_id >= 0 && kstack_id >= 0) + +/* hash collision (-EEXIST) suggests that stack map size may be too small */ +#define CHECK_STACK_COLLISION(ustack_id, kstack_id) \ + (kstack_id == -EEXIST || ustack_id == -EEXIST) + +#define MISSING_STACKS(ustack_id, kstack_id) \ + (!env.user_stacks_only && STACK_ID_ERR(kstack_id)) + (!env.kernel_stacks_only && STACK_ID_ERR(ustack_id)) + +/* This structure combines key_t and count which should be sorted together */ +struct key_ext_t { + struct key_t k; + __u64 v; +}; + +typedef const char* (*symname_fn_t)(unsigned long); + +static struct env { + pid_t pid; + pid_t tid; + bool user_stacks_only; + bool kernel_stacks_only; + int stack_storage_size; + int perf_max_stack_depth; + int duration; + bool verbose; + bool freq; + int sample_freq; + bool delimiter; + bool include_idle; + int cpu; +} env = { + .pid = -1, + .tid = -1, + .stack_storage_size = 1024, + .perf_max_stack_depth = 127, + .duration = INT_MAX, + .freq = 1, + .sample_freq = 49, + .cpu = -1, +}; + +const char *argp_program_version = "profile 0.1"; +const char *argp_program_bug_address = + "https://github.com/iovisor/bcc/tree/master/libbpf-tools"; +const char argp_program_doc[] = +"Profile CPU usage by sampling stack traces at a timed interval.\n" +"\n" +"USAGE: profile [OPTIONS...] 
[duration]\n" +"EXAMPLES:\n" +" profile # profile stack traces at 49 Hertz until Ctrl-C\n" +" profile -F 99 # profile stack traces at 99 Hertz\n" +" profile -c 1000000 # profile stack traces every 1 in a million events\n" +" profile 5 # profile at 49 Hertz for 5 seconds only\n" +" profile -p 185 # only profile process with PID 185\n" +" profile -L 185 # only profile thread with TID 185\n" +" profile -U # only show user space stacks (no kernel)\n" +" profile -K # only show kernel space stacks (no user)\n"; + +static const struct argp_option opts[] = { + { "pid", 'p', "PID", 0, "profile process with this PID only" }, + { "tid", 'L', "TID", 0, "profile thread with this TID only" }, + { "user-stacks-only", 'U', NULL, 0, + "show stacks from user space only (no kernel space stacks)" }, + { "kernel-stacks-only", 'K', NULL, 0, + "show stacks from kernel space only (no user space stacks)" }, + { "frequency", 'F', "FREQUENCY", 0, "sample frequency, Hertz" }, + { "delimited", 'd', NULL, 0, "insert delimiter between kernel/user stacks" }, + { "include-idle ", 'I', NULL, 0, "include CPU idle stacks" }, + { "stack-storage-size", OPT_STACK_STORAGE_SIZE, "STACK-STORAGE-SIZE", 0, + "the number of unique stack traces that can be stored and displayed (default 1024)" }, + { "cpu", 'C', "CPU", 0, "cpu number to run profile on" }, + { "perf-max-stack-depth", OPT_PERF_MAX_STACK_DEPTH, + "PERF-MAX-STACK-DEPTH", 0, "the limit for both kernel and user stack traces (default 127)" }, + { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, + {}, +}; + +struct ksyms *ksyms; +struct syms_cache *syms_cache; +struct syms *syms; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + static int pos_args; + + switch (key) { + case 'h': + argp_state_help(state, stderr, ARGP_HELP_STD_HELP); + break; + case 'v': + env.verbose = true; + break; + case 'p': + errno = 0; + env.pid = strtol(arg, NULL, 10); + if (errno) { + 
fprintf(stderr, "invalid PID: %s\n", arg); + argp_usage(state); + } + break; + case 'L': + errno = 0; + env.tid = strtol(arg, NULL, 10); + if (errno || env.tid <= 0) { + fprintf(stderr, "Invalid TID: %s\n", arg); + argp_usage(state); + } + break; + case 'U': + env.user_stacks_only = true; + break; + case 'K': + env.kernel_stacks_only = true; + break; + case 'F': + errno = 0; + env.sample_freq = strtol(arg, NULL, 10); + if (errno || env.sample_freq <= 0) { + fprintf(stderr, "invalid FREQUENCY: %s\n", arg); + argp_usage(state); + } + break; + case 'd': + env.delimiter = true; + break; + case 'I': + env.include_idle = true; + break; + case 'C': + errno = 0; + env.cpu = strtol(arg, NULL, 10); + if (errno) { + fprintf(stderr, "invalid CPU: %s\n", arg); + argp_usage(state); + } + break; + case OPT_PERF_MAX_STACK_DEPTH: + errno = 0; + env.perf_max_stack_depth = strtol(arg, NULL, 10); + if (errno) { + fprintf(stderr, "invalid perf max stack depth: %s\n", arg); + argp_usage(state); + } + break; + case OPT_STACK_STORAGE_SIZE: + errno = 0; + env.stack_storage_size = strtol(arg, NULL, 10); + if (errno) { + fprintf(stderr, "invalid stack storage size: %s\n", arg); + argp_usage(state); + } + break; + case ARGP_KEY_ARG: + if (pos_args++) { + fprintf(stderr, + "Unrecognized positional argument: %s\n", arg); + argp_usage(state); + } + errno = 0; + env.duration = strtol(arg, NULL, 10); + if (errno || env.duration <= 0) { + fprintf(stderr, "Invalid duration (in s): %s\n", arg); + argp_usage(state); + } + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static int nr_cpus; + +static int open_and_attach_perf_event(int freq, struct bpf_program *prog, + struct bpf_link *links[]) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .freq = env.freq, + .sample_freq = env.sample_freq, + .config = PERF_COUNT_SW_CPU_CLOCK, + }; + int i, fd; + + for (i = 0; i < nr_cpus; i++) { + if (env.cpu != -1 && env.cpu != i) + continue; + + fd = 
syscall(__NR_perf_event_open, &attr, -1, i, -1, 0); + if (fd < 0) { + /* Ignore CPU that is offline */ + if (errno == ENODEV) + continue; + + fprintf(stderr, "failed to init perf sampling: %s\n", + strerror(errno)); + return -1; + } + + links[i] = bpf_program__attach_perf_event(prog, fd); + if (!links[i]) { + fprintf(stderr, "failed to attach perf event on cpu: " + "%d\n", i); + links[i] = NULL; + close(fd); + return -1; + } + } + + return 0; +} + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !env.verbose) + return 0; + + return vfprintf(stderr, format, args); +} + +static void sig_handler(int sig) +{ +} + +static int cmp_counts(const void *a, const void *b) +{ + const __u64 x = ((struct key_ext_t *) a)->v; + const __u64 y = ((struct key_ext_t *) b)->v; + + /* descending order */ + return y - x; +} + +static int read_counts_map(int fd, struct key_ext_t *items, __u32 *count) +{ + struct key_t empty = {}; + struct key_t *lookup_key = ∅ + int i = 0; + int err; + + while (bpf_map_get_next_key(fd, lookup_key, &items[i].k) == 0) { + err = bpf_map_lookup_elem(fd, &items[i].k, &items[i].v); + if (err < 0) { + fprintf(stderr, "failed to lookup counts: %d\n", err); + return -err; + } + + if (items[i].v == 0) + continue; + + lookup_key = &items[i].k; + i++; + } + + *count = i; + return 0; +} + +static const char *ksymname(unsigned long addr) +{ + const struct ksym *ksym = ksyms__map_addr(ksyms, addr); + + return ksym ? ksym->name : "[unknown]"; +} + +static const char *usymname(unsigned long addr) +{ + const struct sym *sym = syms__map_addr(syms, addr); + + return sym ? 
sym->name : "[unknown]"; +} + +static void print_stacktrace(unsigned long *ip, symname_fn_t symname) +{ + for (size_t i = 0; ip[i] && i < env.perf_max_stack_depth; i++) + printf(" %s\n", symname(ip[i])); +} + +static int print_count(struct key_t *event, __u64 count, int stack_map) +{ + unsigned long *ip; + + ip = calloc(env.perf_max_stack_depth, sizeof(unsigned long)); + if (!ip) { + fprintf(stderr, "failed to alloc ip\n"); + return -ENOMEM; + } + + /* kernel stack */ + if (!env.user_stacks_only && !STACK_ID_EFAULT(event->kern_stack_id)) { + if (bpf_map_lookup_elem(stack_map, &event->kern_stack_id, ip) != 0) + printf(" [Missed Kernel Stack]\n"); + else + print_stacktrace(ip, ksymname); + } + + /* user stack */ + if (!env.kernel_stacks_only && !STACK_ID_EFAULT(event->user_stack_id)) { + if (NEED_DELIMITER(env.delimiter, event->user_stack_id, + event->kern_stack_id)) + printf(" --\n"); + + if (bpf_map_lookup_elem(stack_map, &event->user_stack_id, ip) != 0) { + printf(" [Missed User Stack]\n"); + } else { + syms = syms_cache__get_syms(syms_cache, event->pid); + if (!syms) + fprintf(stderr, "failed to get syms\n"); + else + print_stacktrace(ip, usymname); + } + } + + /* process information */ + printf(" %-16s %s (%d)\n", "-", event->name, event->pid); + + /* count sampled */ + printf(" %lld\n\n", count); + + free(ip); + + return 0; +} + +static int print_counts(int counts_map, int stack_map) +{ + struct key_ext_t *counts; + struct key_t *event; + __u64 count; + __u32 nr_count = MAX_ENTRIES; + size_t nr_missing_stacks = 0; + bool has_collision = false; + int i, ret = 0; + + counts = calloc(MAX_ENTRIES, sizeof(struct key_ext_t)); + if (!counts) { + fprintf(stderr, "Out of memory\n"); + return -ENOMEM; + } + + ret = read_counts_map(counts_map, counts, &nr_count); + if (ret) + goto cleanup; + + qsort(counts, nr_count, sizeof(struct key_ext_t), cmp_counts); + + for (i = 0; i < nr_count; i++) { + event = &counts[i].k; + count = counts[i].v; + + print_count(event, count, 
stack_map); + + /* handle stack id errors */ + nr_missing_stacks += MISSING_STACKS(event->user_stack_id, event->kern_stack_id); + has_collision = CHECK_STACK_COLLISION(event->user_stack_id, event->kern_stack_id); + } + + if (nr_missing_stacks > 0) { + fprintf(stderr, "WARNING: %zu stack traces could not be displayed.%s\n", + nr_missing_stacks, has_collision ? + " Consider increasing --stack-storage-size.":""); + } + +cleanup: + free(counts); + + return ret; +} + +static void print_headers() +{ + printf("Sampling at %d Hertz of", env.sample_freq); + + if (env.pid != -1) + printf(" PID %d", env.pid); + else if (env.tid != -1) + printf(" TID %d", env.tid); + else + printf(" all threads"); + + if (env.user_stacks_only) + printf(" by user"); + else if (env.kernel_stacks_only) + printf(" by kernel"); + else + printf(" by user + kernel"); + + if (env.cpu != -1) + printf(" on CPU#%d", env.cpu); + + if (env.duration < INT_MAX) + printf(" for %d secs.\n", env.duration); + else + printf("... Hit Ctrl-C to end.\n"); +} + +int main(int argc, char **argv) +{ + static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, + }; + struct bpf_link *links[MAX_CPU_NR] = {}; + struct profile_bpf *obj; + int err, i; + + err = argp_parse(&argp, argc, argv, 0, NULL, NULL); + if (err) + return err; + + if (env.user_stacks_only && env.kernel_stacks_only) { + fprintf(stderr, "user_stacks_only and kernel_stacks_only cannot be used together.\n"); + return 1; + } + + libbpf_set_print(libbpf_print_fn); + + nr_cpus = libbpf_num_possible_cpus(); + if (nr_cpus < 0) { + printf("failed to get # of possible cpus: '%s'!\n", + strerror(-nr_cpus)); + return 1; + } + if (nr_cpus > MAX_CPU_NR) { + fprintf(stderr, "the number of cpu cores is too big, please " + "increase MAX_CPU_NR's value and recompile"); + return 1; + } + + obj = profile_bpf__open(); + if (!obj) { + fprintf(stderr, "failed to open BPF object\n"); + return 1; + } + + /* initialize global data 
(filtering options) */ + obj->rodata->targ_pid = env.pid; + obj->rodata->targ_tid = env.tid; + obj->rodata->user_stacks_only = env.user_stacks_only; + obj->rodata->kernel_stacks_only = env.kernel_stacks_only; + obj->rodata->include_idle = env.include_idle; + + bpf_map__set_value_size(obj->maps.stackmap, + env.perf_max_stack_depth * sizeof(unsigned long)); + bpf_map__set_max_entries(obj->maps.stackmap, env.stack_storage_size); + + err = profile_bpf__load(obj); + if (err) { + fprintf(stderr, "failed to load BPF programs\n"); + goto cleanup; + } + + ksyms = ksyms__load(); + if (!ksyms) { + fprintf(stderr, "failed to load kallsyms\n"); + goto cleanup; + } + + syms_cache = syms_cache__new(0); + if (!syms_cache) { + fprintf(stderr, "failed to create syms_cache\n"); + goto cleanup; + } + + err = open_and_attach_perf_event(env.freq, obj->progs.do_perf_event, links); + if (err) + goto cleanup; + + signal(SIGINT, sig_handler); + + print_headers(); + + /* + * We'll get sleep interrupted when someone presses Ctrl-C. + * (which will be "handled" with noop by sig_handler) + */ + sleep(env.duration); + + print_counts(bpf_map__fd(obj->maps.counts), + bpf_map__fd(obj->maps.stackmap)); + +cleanup: + if (env.cpu != -1) + bpf_link__destroy(links[env.cpu]); + else { + for (i = 0; i < nr_cpus; i++) + bpf_link__destroy(links[i]); + } + if (syms_cache) + syms_cache__free(syms_cache); + if (ksyms) + ksyms__free(ksyms); + profile_bpf__destroy(obj); + + return err != 0; +} diff --git a/libbpf-tools/profile.h b/libbpf-tools/profile.h new file mode 100644 index 000000000000..e3a106183c9d --- /dev/null +++ b/libbpf-tools/profile.h @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#ifndef __PROFILE_H +#define __PROFILE_H + +#define TASK_COMM_LEN 16 +#define MAX_CPU_NR 128 +#define MAX_ENTRIES 10240 + +struct key_t { + __u32 pid; + int user_stack_id; + int kern_stack_id; + char name[TASK_COMM_LEN]; +}; + +#endif /* __PROFILE_H */