diff --git a/Makefile b/Makefile index 22f78cbe0d..df3774b349 100644 --- a/Makefile +++ b/Makefile @@ -176,7 +176,6 @@ build: clean_build_local _build_local copy_build_local ## Build binary and copy .PHONY: generate generate: ## Generate BPF code locally. +@$(GOENV) go generate ./pkg/bpf - +@$(GOENV) go generate ./pkg/bpftest _build_local: generate ## Build Kepler binary locally. @echo TAGS=$(GO_BUILD_TAGS) @@ -275,7 +274,7 @@ container_test: TEST_PKGS := $(shell go list -tags $(GO_BUILD_TAGS) ./... | grep -v pkg/bpf | grep -v e2e) SUDO?=sudo -SUDO_TEST_PKGS := $(shell go list -tags $(GO_BUILD_TAGS) ./... | grep pkg/bpftest) +SUDO_TEST_PKGS := $(shell go list -tags $(GO_BUILD_TAGS) ./... | grep pkg/bpf) ##@ testing @@ -305,11 +304,11 @@ bpf-test: generate ginkgo-set ## Run BPF tests.$(GOBIN) -tags $(GO_TEST_TAGS) \ -cover \ --covermode=atomic \ - ./pkg/bpftest + ./pkg/bpf $(SUDO) $(ENVTEST_ASSETS_DIR)/ginkgo \ - ./pkg/bpftest/bpftest.test + ./pkg/bpf/bpf.test -escapes_detect: tidy-vendor +escapes_detect: tidy-vendor @$(GOENV) go build -tags $(GO_BUILD_TAGS) -gcflags="-m -l" ./... 2>&1 | grep "escapes to heap" || true check-govuln: govulncheck tidy-vendor ## Check Go vulnerabilities. diff --git a/bpf/kepler.bpf.c b/bpf/kepler.bpf.c index 80fb332471..251f1bcfec 100644 --- a/bpf/kepler.bpf.c +++ b/bpf/kepler.bpf.c @@ -3,6 +3,168 @@ #include "kepler.bpf.h" +// Ring buffer sizing +// 256kB is sufficient to store around 1000 events/sec for 5 seconds +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024); // 256 KB +} rb SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __type(key, int); + __type(value, u32); +} cpu_cycles_event_reader SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __type(key, int); + __type(value, u32); +} cpu_instructions_event_reader SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __type(key, int); + __type(value, u32); +} cache_miss_event_reader SEC(".maps"); + +SEC(".rodata.config") +__attribute__((btf_decl_tag("Hardware Events Enabled"))) volatile const int HW = 1; + +static __always_inline u64 get_on_cpu_cycles(u32 *cpu_id) +{ + long error; + struct bpf_perf_event_value c = {}; + + error = bpf_perf_event_read_value( + &cpu_cycles_event_reader, *cpu_id, &c, sizeof(c)); + if (error) + return 0; + + return c.counter; +} + +static __always_inline u64 get_on_cpu_instr(u32 *cpu_id) +{ + long error; + struct bpf_perf_event_value c = {}; + + error = bpf_perf_event_read_value( + &cpu_instructions_event_reader, *cpu_id, &c, sizeof(c)); + if (error) + return 0; + + return c.counter; +} + +static __always_inline u64 get_on_cpu_cache_miss(u32 *cpu_id) +{ + long error; + struct bpf_perf_event_value c = {}; + + error = bpf_perf_event_read_value( + &cache_miss_event_reader, *cpu_id, &c, sizeof(c)); + if (error) + return 0; + + return c.counter; +} + +// Wake up userspace if there are at least 1000 events unprocessed +const long wakeup_data_size = sizeof(struct event) * 1000; + +// Get the flags for the ring buffer submit +static inline long get_flags() +{ + long sz; + + if (!wakeup_data_size) + return 0; + + sz = bpf_ringbuf_query(&rb, BPF_RB_AVAIL_DATA); + return sz >= wakeup_data_size ? 
BPF_RB_FORCE_WAKEUP : BPF_RB_NO_WAKEUP; +} + +static inline int do_kepler_sched_switch_trace( + u32 prev_pid, u32 prev_tgid, u32 next_pid, u32 next_tgid) +{ + struct event *e; + u64 cpu_cycles, cpu_instr, cache_miss = 0; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + return 0; + + e->ts = bpf_ktime_get_ns(); + e->event_type = SCHED_SWITCH; + e->cpu_id = bpf_get_smp_processor_id(); + e->pid = next_tgid; + e->tid = next_pid; + e->offcpu_pid = prev_tgid; + e->offcpu_tid = prev_pid; + if (HW) { + e->cpu_cycles = get_on_cpu_cycles(&e->cpu_id); + e->cpu_instr = get_on_cpu_instr(&e->cpu_id); + e->cache_miss = get_on_cpu_cache_miss(&e->cpu_id); + } + e->offcpu_cgroup_id = bpf_get_current_cgroup_id(); + + bpf_ringbuf_submit(e, get_flags()); + + return 0; +} + +static inline int do_kepler_irq_trace(u32 vec) +{ + struct event *e; + + // We are interested in NET_TX, NET_RX, and BLOCK + if (vec == NET_TX || vec == NET_RX || vec == BLOCK) { + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + return 0; + e->event_type = IRQ; + e->ts = bpf_ktime_get_ns(); + e->cpu_id = bpf_get_smp_processor_id(); + e->pid = bpf_get_current_pid_tgid() >> 32; + e->tid = (u32)bpf_get_current_pid_tgid(); + e->irq_number = vec; + + bpf_ringbuf_submit(e, get_flags()); + } + + return 0; +} + +static inline int do_page_cache_hit_increment(u32 curr_tgid) +{ + struct event *e; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + return 0; + e->event_type = PAGE_CACHE_HIT; + e->ts = bpf_ktime_get_ns(); + e->pid = curr_tgid; + + bpf_ringbuf_submit(e, get_flags()); + + return 0; +} + +static inline int do_process_free(u32 curr_tgid) +{ + struct event *e; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + return 0; + e->event_type = FREE; + e->ts = bpf_ktime_get_ns(); + e->pid = curr_tgid; + + bpf_ringbuf_submit(e, get_flags()); + + return 0; +} + SEC("tp_btf/sched_switch") int kepler_sched_switch_trace(u64 *ctx) { @@ -12,21 +174,17 @@ int kepler_sched_switch_trace(u64 *ctx) next_task = (struct task_struct *)ctx[2]; return do_kepler_sched_switch_trace( - prev_task->pid, next_task->pid, prev_task->tgid, next_task->tgid); + prev_task->pid, prev_task->tgid, next_task->pid, next_task->tgid); } SEC("tp_btf/softirq_entry") int kepler_irq_trace(u64 *ctx) { - u32 curr_tgid; - struct process_metrics_t *process_metrics; unsigned int vec; - - curr_tgid = bpf_get_current_pid_tgid() >> 32; vec = (unsigned int)ctx[0]; - process_metrics = bpf_map_lookup_elem(&processes, &curr_tgid); - if (process_metrics != 0 && vec < 10) - process_metrics->vec_nr[vec] += 1; + + do_kepler_irq_trace(vec); + return 0; } @@ -52,4 +210,37 @@ int kepler_write_page_trace(void *ctx) return 0; } +SEC("tp_btf/sched_process_free") +int kepler_sched_process_free(u64 *ctx) +{ + struct task_struct *task; + task = (struct task_struct *)ctx[0]; + do_process_free(task->tgid); + return 0; +} + +// TEST PROGRAMS - These programs are never attached in production + +SEC("raw_tp") +int test_kepler_write_page_trace(void *ctx) +{ + do_page_cache_hit_increment(42); + return 0; +} + +SEC("raw_tp") +int test_kepler_sched_switch_trace(u64 *ctx) +{ + // 42 going offcpu, 43 going on cpu + do_kepler_sched_switch_trace(42, 42, 43, 43); + return 0; +} + +SEC("raw_tp") +int test_kepler_sched_process_free(u64 *ctx) +{ + do_process_free(42); + return 0; +} + char __license[] SEC("license") = "Dual BSD/GPL"; diff --git a/bpf/kepler.bpf.h b/bpf/kepler.bpf.h index cb2da309a5..6eb4b96e5b 100644 --- a/bpf/kepler.bpf.h +++ b/bpf/kepler.bpf.h @@ -27,16 +27,7 @@ typedef 
struct pid_time_t { __u32 pid; } pid_time_t; -#ifndef NUM_CPUS -# define NUM_CPUS 128 -#endif - -#ifndef MAP_SIZE -# define MAP_SIZE 32768 -#endif - #include -#include enum bpf_map_type { BPF_MAP_TYPE_UNSPEC = 0, @@ -77,6 +68,29 @@ enum { BPF_F_LOCK = 4, }; +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +enum irq_type { + NET_TX = 2, + NET_RX = 3, + BLOCK = 4 +}; +const enum irq_type *unused2 __attribute__((unused)); + enum { BPF_F_INDEX_MASK = 0xffffffffULL, BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, @@ -90,285 +104,35 @@ struct bpf_perf_event_value { __u64 running; }; -typedef struct process_metrics_t { - u64 cgroup_id; - u64 pid; // pid is the kernel space view of the thread id - u64 process_run_time; +enum event_type { + SCHED_SWITCH = 1, + IRQ = 2, + PAGE_CACHE_HIT = 3, + FREE = 4 +}; + +// Force emitting enum event_type into the ELF. +const enum event_type *unused_event_type __attribute__((unused)); + +struct event { + u64 event_type; + u64 ts; + u32 pid; // kernel tgid == userspace pid + u32 tid; // kernel pid == userspace tid + u32 offcpu_pid; // kernel tgid == userspace pid + u32 offcpu_tid; // kernel pid == userspace tid + u64 offcpu_cgroup_id; // cgroup id is only known for processes going off cpu u64 cpu_cycles; u64 cpu_instr; u64 cache_miss; - u64 page_cache_hit; - u16 vec_nr[10]; - char comm[16]; -} process_metrics_t; - -struct { - __uint(type, BPF_MAP_TYPE_LRU_HASH); - __type(key, u32); - __type(value, process_metrics_t); - __uint(max_entries, MAP_SIZE); -} processes SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_LRU_HASH); - __type(key, u32); - __type(value, u64); - __uint(max_entries, MAP_SIZE); -} pid_time_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __type(key, int); - __type(value, u32); - __uint(max_entries, NUM_CPUS); -} cpu_cycles_event_reader SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, NUM_CPUS); -} cpu_cycles SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __type(key, int); - __type(value, u32); - __uint(max_entries, NUM_CPUS); -} cpu_instructions_event_reader SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, NUM_CPUS); -} cpu_instructions SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __type(key, int); - __type(value, u32); - __uint(max_entries, NUM_CPUS); -} cache_miss_event_reader SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, NUM_CPUS); -} cache_miss SEC(".maps"); - -// Test mode skips unsupported helpers -SEC(".rodata.config") -__attribute__((btf_decl_tag("Test"))) static volatile const int TEST = 0; - -// Test mode skips unsupported helpers -SEC(".rodata.config") -__attribute__((btf_decl_tag( - "Hardware Events Enabled"))) static volatile const int HW = 1; - -// The sampling rate should be disabled by default because its impact on the -// measurements is unknown. 
-SEC(".rodata.config") -__attribute__(( - btf_decl_tag("Sample Rate"))) static volatile const int SAMPLE_RATE = 0; + u32 cpu_id; + u32 irq_number; // one of NET_TX, NET_RX, BLOCK +}; -int counter_sched_switch = 0; +// Force emitting struct event into the ELF. +const struct event *unused_event __attribute__((unused)); struct task_struct { int pid; unsigned int tgid; } __attribute__((preserve_access_index)); - -static inline u64 calc_delta(u64 *prev_val, u64 val) -{ - u64 delta = 0; - // Probably a clock issue where the recorded on-CPU event had a - // timestamp later than the recorded off-CPU event, or vice versa. - if (prev_val && val > *prev_val) - delta = val - *prev_val; - - return delta; -} - -static inline u64 get_on_cpu_elapsed_time_us(u32 prev_pid, u64 curr_ts) -{ - u64 cpu_time = 0; - u64 *prev_ts; - - prev_ts = bpf_map_lookup_elem(&pid_time_map, &prev_pid); - if (prev_ts) { - cpu_time = calc_delta(prev_ts, curr_ts) / 1000; - bpf_map_delete_elem(&pid_time_map, &prev_pid); - } - - return cpu_time; -} - -static inline u64 get_on_cpu_cycles(u32 *cpu_id) -{ - u64 delta, val, *prev_val; - long error; - struct bpf_perf_event_value c = {}; - - error = bpf_perf_event_read_value( - &cpu_cycles_event_reader, *cpu_id, &c, sizeof(c)); - if (error) - return 0; - - val = c.counter; - prev_val = bpf_map_lookup_elem(&cpu_cycles, cpu_id); - delta = calc_delta(prev_val, val); - bpf_map_update_elem(&cpu_cycles, cpu_id, &val, BPF_ANY); - - return delta; -} - -static inline u64 get_on_cpu_instr(u32 *cpu_id) -{ - u64 delta, val, *prev_val; - long error; - struct bpf_perf_event_value c = {}; - - error = bpf_perf_event_read_value( - &cpu_instructions_event_reader, *cpu_id, &c, sizeof(c)); - if (error) - return 0; - - val = c.counter; - prev_val = bpf_map_lookup_elem(&cpu_instructions, cpu_id); - delta = calc_delta(prev_val, val); - bpf_map_update_elem(&cpu_instructions, cpu_id, &val, BPF_ANY); - - return delta; -} - -static inline u64 get_on_cpu_cache_miss(u32 *cpu_id) -{ - u64 delta, val, *prev_val; - long error; - struct bpf_perf_event_value c = {}; - - error = bpf_perf_event_read_value( - &cache_miss_event_reader, *cpu_id, &c, sizeof(c)); - if (error) - return 0; - val = c.counter; - prev_val = bpf_map_lookup_elem(&cache_miss, cpu_id); - delta = calc_delta(prev_val, val); - bpf_map_update_elem(&cache_miss, cpu_id, &val, BPF_ANY); - - return delta; -} - -static inline void register_new_process_if_not_exist(u32 tgid) -{ - u64 cgroup_id; - struct process_metrics_t *curr_tgid_metrics; - - // create new process metrics - curr_tgid_metrics = bpf_map_lookup_elem(&processes, &tgid); - if (!curr_tgid_metrics) { - cgroup_id = bpf_get_current_cgroup_id(); - // the Kernel tgid is the user-space PID, and the Kernel pid is the - // user-space TID - process_metrics_t new_process = { - .pid = tgid, - .cgroup_id = cgroup_id, - }; - - if (!TEST) - bpf_get_current_comm( - &new_process.comm, sizeof(new_process.comm)); - - bpf_map_update_elem(&processes, &tgid, &new_process, BPF_NOEXIST); - } -} - -static inline void collect_metrics_and_reset_counters( - struct process_metrics_t *buf, u32 prev_pid, u64 curr_ts, u32 cpu_id) -{ - if (HW) { - buf->cpu_cycles = get_on_cpu_cycles(&cpu_id); - buf->cpu_instr = get_on_cpu_instr(&cpu_id); - buf->cache_miss = get_on_cpu_cache_miss(&cpu_id); - } - // Get current time to calculate the previous task on-CPU time - buf->process_run_time = get_on_cpu_elapsed_time_us(prev_pid, curr_ts); -} - -static inline void do_page_cache_hit_increment(u32 curr_pid) -{ - struct process_metrics_t 
*process_metrics; - - process_metrics = bpf_map_lookup_elem(&processes, &curr_pid); - if (process_metrics) - process_metrics->page_cache_hit++; -} - -static inline int do_kepler_sched_switch_trace( - u32 prev_pid, u32 next_pid, u32 prev_tgid, u32 next_tgid) -{ - u32 cpu_id; - u64 curr_ts = bpf_ktime_get_ns(); - - struct process_metrics_t *curr_tgid_metrics, *prev_tgid_metrics; - struct process_metrics_t buf = {}; - - cpu_id = bpf_get_smp_processor_id(); - - // Collect metrics - // Regardless of skipping the collection, we need to update the hardware - // counter events to keep the metrics map current. - collect_metrics_and_reset_counters(&buf, prev_pid, curr_ts, cpu_id); - - // Skip some samples to minimize overhead - // Note that we can only skip samples after updating the metric maps to - // collect the right values - if (SAMPLE_RATE > 0) { - if (counter_sched_switch > 0) { - counter_sched_switch--; - return 0; - } - counter_sched_switch = SAMPLE_RATE; - } - - // The process_run_time is 0 if we do not have the previous timestamp of - // the task or due to a clock issue. In either case, we skip collecting - // all metrics to avoid discrepancies between the hardware counter and CPU - // time. - if (buf.process_run_time > 0) { - prev_tgid_metrics = bpf_map_lookup_elem(&processes, &prev_tgid); - if (prev_tgid_metrics) { - prev_tgid_metrics->process_run_time += buf.process_run_time; - prev_tgid_metrics->cpu_cycles += buf.cpu_cycles; - prev_tgid_metrics->cpu_instr += buf.cpu_instr; - prev_tgid_metrics->cache_miss += buf.cache_miss; - } - } - - // Add task on-cpu running start time - bpf_map_update_elem(&pid_time_map, &next_pid, &curr_ts, BPF_ANY); - - // create new process metrics - register_new_process_if_not_exist(next_tgid); - - return 0; -} - -static __always_inline void * -bpf_map_lookup_or_try_init(void *map, const void *key, const void *init) -{ - void *val; - int err; - - val = bpf_map_lookup_elem(map, key); - if (val) - return val; - - err = bpf_map_update_elem(map, key, init, BPF_NOEXIST); - if (err && err != -17) - return 0; - - return bpf_map_lookup_elem(map, key); -} diff --git a/bpf/test.bpf.c b/bpf/test.bpf.c deleted file mode 100644 index 83b84ae3c1..0000000000 --- a/bpf/test.bpf.c +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) -// Copyright 2021. 
- -#include "kepler.bpf.h" - -SEC("raw_tp") -int test_kepler_write_page_trace(void *ctx) -{ - do_page_cache_hit_increment(0); - return 0; -} - -SEC("raw_tp") -int test_register_new_process_if_not_exist(void *ctx) -{ - register_new_process_if_not_exist(42); - return 0; -} - -SEC("raw_tp/sched_switch") -int test_kepler_sched_switch_trace(u64 *ctx) -{ - do_kepler_sched_switch_trace(42, 43, 42, 43); - - return 0; -} - -char __license[] SEC("license") = "Dual BSD/GPL"; diff --git a/cmd/exporter/exporter.go b/cmd/exporter/exporter.go index 64f62b9525..a30c9d6fda 100644 --- a/cmd/exporter/exporter.go +++ b/cmd/exporter/exporter.go @@ -150,6 +150,11 @@ func main() { klog.Fatalf("failed to create eBPF exporter: %v", err) } defer bpfExporter.Detach() + stopCh := make(chan struct{}) + bpfErrCh := make(chan error) + go func() { + bpfErrCh <- bpfExporter.Start(stopCh) + }() m := manager.New(bpfExporter) @@ -199,6 +204,8 @@ func main() { select { case err := <-errChan: klog.Fatalf("%s", fmt.Sprintf("failed to listen and serve: %v", err)) + case err := <-bpfErrCh: + klog.Fatalf("%s", fmt.Sprintf("failed to start eBPF exporter: %v", err)) case <-signalChan: klog.Infof("Received shutdown signal") ctx, cancel := context.WithDeadline(ctx, time.Now().Add(5*time.Second)) diff --git a/go.mod b/go.mod index e7fbc8afcb..c92a5856af 100644 --- a/go.mod +++ b/go.mod @@ -8,7 +8,7 @@ require ( github.com/HabanaAI/gohlml v1.16.0 github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f github.com/NVIDIA/go-nvml v0.12.0-1 - github.com/cilium/ebpf v0.15.0 + github.com/cilium/ebpf v0.16.0 github.com/jaypipes/ghw v0.12.0 github.com/joho/godotenv v1.5.1 github.com/jszwec/csvutil v1.10.0 @@ -19,7 +19,7 @@ require ( github.com/prometheus/client_golang v1.19.1 github.com/prometheus/prometheus v0.53.1 github.com/sirupsen/logrus v1.9.3 - golang.org/x/exp v0.0.0-20240119083558-1b970713d09a + golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 golang.org/x/sys v0.22.0 gopkg.in/yaml.v3 v3.0.1 k8s.io/api v0.29.7 @@ -69,12 +69,12 @@ require ( github.com/prometheus/procfs v0.12.0 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stretchr/testify v1.9.0 // indirect - golang.org/x/net v0.26.0 // indirect + golang.org/x/net v0.27.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/term v0.21.0 // indirect + golang.org/x/term v0.22.0 // indirect golang.org/x/text v0.16.0 // indirect golang.org/x/time v0.5.0 // indirect - golang.org/x/tools v0.22.0 // indirect + golang.org/x/tools v0.23.0 // indirect google.golang.org/protobuf v1.34.1 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect diff --git a/go.sum b/go.sum index a10bf555dd..abccc13d29 100644 --- a/go.sum +++ b/go.sum @@ -16,8 +16,8 @@ github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJR github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cilium/ebpf v0.15.0 h1:7NxJhNiBT3NG8pZJ3c+yfrVdHY8ScgKD27sScgjLMMk= -github.com/cilium/ebpf v0.15.0/go.mod h1:DHp1WyrLeiBh19Cf/tfiSMhqheEiK8fXFZ4No0P1Hso= +github.com/cilium/ebpf v0.16.0 h1:+BiEnHL6Z7lXnlGUsXQPPAE7+kenAd4ES8MQ5min0Ok= +github.com/cilium/ebpf v0.16.0/go.mod h1:L7u2Blt2jMM/vLAVgjxluxtBKlz3/GWjB0dMOEngfwE= 
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= @@ -77,8 +77,12 @@ github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= +github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= +github.com/jsimonetti/rtnetlink/v2 v2.0.1 h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM= +github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jszwec/csvutil v1.10.0 h1:upMDUxhQKqZ5ZDCs/wy+8Kib8rZR8I8lOR34yJkdqhI= @@ -95,6 +99,10 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= +github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= +github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U= +github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= @@ -149,21 +157,23 @@ go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/exp v0.0.0-20240119083558-1b970713d09a h1:Q8/wZp0KX97QFTc2ywcOE0YRjZPVIx+MXInMzdvQqcA= -golang.org/x/exp v0.0.0-20240119083558-1b970713d09a/go.mod h1:idGWGoKP1toJGkd5/ig9ZLuPcZBC3ewk7SzmH0uou08= +golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= +golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= -golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= +golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -172,8 +182,8 @@ golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= +golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= @@ -184,8 +194,8 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.22.0 h1:gqSGLZqv+AI9lIQzniJ0nZDRG5GBPsSi+DRNHWNz6yA= -golang.org/x/tools v0.22.0/go.mod h1:aCwcsjqvq7Yqt6TNyX7QMU2enbQ/Gt0bo6krSeEri+c= +golang.org/x/tools v0.23.0 h1:SGsXPZ+2l4JsgaCKkx+FQ9YZ5XEtA1GZYuoDjenLjvg= +golang.org/x/tools v0.23.0/go.mod h1:pnu6ufv6vQkll6szChhK3C3L/ruaIv5eBeztNG8wtsI= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/pkg/bpf/bpf_suite_test.go b/pkg/bpf/bpf_suite_test.go index 8fc64463b8..8baecb3e15 100644 --- a/pkg/bpf/bpf_suite_test.go +++ b/pkg/bpf/bpf_suite_test.go @@ -1,13 +1,303 @@ package bpf import ( + "bytes" + "encoding/binary" "testing" + "time" + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/ringbuf" + "github.com/cilium/ebpf/rlimit" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/onsi/gomega/gmeasure" + "golang.org/x/sys/unix" ) func TestBpf(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, "Bpf Suite") } + +var _ = Describe("BPF Exporter", func() { + It("should send a page cache hit event", func() { + // Remove resource limits for kernels <5.11. + err := rlimit.RemoveMemlock() + Expect(err).NotTo(HaveOccurred()) + + // Load eBPF Specs + specs, err := loadKepler() + Expect(err).NotTo(HaveOccurred()) + + var obj keplerObjects + // Load eBPF objects + err = specs.LoadAndAssign(&obj, nil) + Expect(err).NotTo(HaveOccurred()) + + out, err := obj.TestKeplerWritePageTrace.Run(&ebpf.RunOptions{}) + Expect(err).NotTo(HaveOccurred()) + Expect(out).To(Equal(uint32(0))) + + // Read the event from the ring buffer + rd, err := ringbuf.NewReader(obj.Rb) + Expect(err).NotTo(HaveOccurred()) + defer rd.Close() + + var event keplerEvent + record, err := rd.Read() + Expect(err).NotTo(HaveOccurred()) + + err = binary.Read(bytes.NewBuffer(record.RawSample), binary.NativeEndian, &event) + Expect(err).NotTo(HaveOccurred()) + Expect(event.Pid).To(Equal(uint32(42))) + Expect(event.Ts).To(BeNumerically(">", uint64(0))) + Expect(event.EventType).To(Equal(uint64(keplerEventTypePAGE_CACHE_HIT))) + }) + + It("should send a process free event", func() { + // Remove resource limits for kernels <5.11. + err := rlimit.RemoveMemlock() + Expect(err).NotTo(HaveOccurred()) + + // Load eBPF Specs + specs, err := loadKepler() + Expect(err).NotTo(HaveOccurred()) + + var obj keplerObjects + // Load eBPF objects + err = specs.LoadAndAssign(&obj, nil) + Expect(err).NotTo(HaveOccurred()) + + out, err := obj.TestKeplerSchedProcessFree.Run(&ebpf.RunOptions{ + Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU + CPU: uint32(0), + }) + Expect(err).NotTo(HaveOccurred()) + Expect(out).To(Equal(uint32(0))) + + // Read the event from the ring buffer + rd, err := ringbuf.NewReader(obj.Rb) + Expect(err).NotTo(HaveOccurred()) + defer rd.Close() + + var event keplerEvent + record, err := rd.Read() + Expect(err).NotTo(HaveOccurred()) + + err = binary.Read(bytes.NewBuffer(record.RawSample), binary.NativeEndian, &event) + Expect(err).NotTo(HaveOccurred()) + Expect(event.Pid).To(Equal(uint32(42))) + Expect(event.Ts).To(BeNumerically(">", uint64(0))) + Expect(event.EventType).To(Equal(uint64(keplerEventTypeFREE))) + }) + + It("should increment the page hit counter efficiently", func() { + experiment := gmeasure.NewExperiment("Increment the page hit counter") + AddReportEntry(experiment.Name, experiment) + // Remove resource limits for kernels <5.11. 
+ err := rlimit.RemoveMemlock() + Expect(err).NotTo(HaveOccurred()) + + // Load eBPF Specs + specs, err := loadKepler() + Expect(err).NotTo(HaveOccurred()) + + var obj keplerObjects + // Load eBPF objects + err = specs.LoadAndAssign(&obj, nil) + Expect(err).NotTo(HaveOccurred()) + + experiment.Sample(func(idx int) { + experiment.MeasureDuration("page hit counter increment", func() { + out, err := obj.TestKeplerWritePageTrace.Run(&ebpf.RunOptions{}) + Expect(err).NotTo(HaveOccurred()) + Expect(out).To(Equal(uint32(0))) + }, gmeasure.Precision(time.Nanosecond)) + }, gmeasure.SamplingConfig{N: 1000000, Duration: 10 * time.Second}) + }) + + It("collects hardware counter metrics for sched_switch events", Label("perf_event"), func() { + // Remove resource limits for kernels <5.11. + err := rlimit.RemoveMemlock() + Expect(err).NotTo(HaveOccurred()) + + // Load eBPF Specs + specs, err := loadKepler() + Expect(err).NotTo(HaveOccurred()) + + var obj keplerObjects + // Load eBPF objects + err = specs.LoadAndAssign(&obj, nil) + Expect(err).NotTo(HaveOccurred()) + + perfEvents, err := createTestHardwarePerfEvents( + obj.CpuInstructionsEventReader, + obj.CpuCyclesEventReader, + obj.CacheMissEventReader, + ) + Expect(err).NotTo(HaveOccurred()) + defer func() { + for _, fd := range perfEvents { + unix.Close(fd) + } + }() + + out, err := obj.TestKeplerSchedSwitchTrace.Run(&ebpf.RunOptions{ + Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU + CPU: uint32(0), + }) + Expect(err).NotTo(HaveOccurred()) + Expect(out).To(Equal(uint32(0))) + + // Read the event from the ring buffer + rd, err := ringbuf.NewReader(obj.Rb) + Expect(err).NotTo(HaveOccurred()) + defer rd.Close() + + var event keplerEvent + record := new(ringbuf.Record) + + err = rd.ReadInto(record) + Expect(err).NotTo(HaveOccurred()) + + err = binary.Read(bytes.NewBuffer(record.RawSample), binary.NativeEndian, &event) + Expect(err).NotTo(HaveOccurred()) + Expect(event.Pid).To(Equal(uint32(43))) + Expect(event.Tid).To(Equal(uint32(43))) + Expect(event.Ts).To(BeNumerically(">", uint64(0))) + Expect(event.EventType).To(Equal(uint64(keplerEventTypeSCHED_SWITCH))) + Expect(event.CpuCycles).To(BeNumerically(">", uint64(0))) + Expect(event.CpuInstr).To(BeNumerically(">", uint64(0))) + Expect(event.CacheMiss).To(BeNumerically(">", uint64(0))) + Expect(event.OffcpuPid).To(Equal(uint32(42))) + Expect(event.OffcpuTid).To(Equal(uint32(42))) + }) + + It("collects metrics for sched_switch events when no hardware events are enabled", Label("perf_event"), func() { + // Remove resource limits for kernels <5.11. 
+ err := rlimit.RemoveMemlock() + Expect(err).NotTo(HaveOccurred()) + + // Load eBPF Specs + specs, err := loadKepler() + Expect(err).NotTo(HaveOccurred()) + + err = specs.RewriteConstants(map[string]interface{}{ + "HW": int32(-1), + }) + Expect(err).NotTo(HaveOccurred()) + + var obj keplerObjects + // Load eBPF objects + err = specs.LoadAndAssign(&obj, nil) + Expect(err).NotTo(HaveOccurred()) + + out, err := obj.TestKeplerSchedSwitchTrace.Run(&ebpf.RunOptions{ + Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU + CPU: uint32(0), + }) + Expect(err).NotTo(HaveOccurred()) + Expect(out).To(Equal(uint32(0))) + + // Read the event from the ring buffer + rd, err := ringbuf.NewReader(obj.Rb) + Expect(err).NotTo(HaveOccurred()) + defer rd.Close() + + var event keplerEvent + record := new(ringbuf.Record) + + err = rd.ReadInto(record) + Expect(err).NotTo(HaveOccurred()) + + err = binary.Read(bytes.NewBuffer(record.RawSample), binary.NativeEndian, &event) + Expect(err).NotTo(HaveOccurred()) + Expect(event.Pid).To(Equal(uint32(43))) + Expect(event.Tid).To(Equal(uint32(43))) + Expect(event.Ts).To(BeNumerically(">", uint64(0))) + Expect(event.EventType).To(Equal(uint64(keplerEventTypeSCHED_SWITCH))) + Expect(event.CpuCycles).To(BeNumerically("==", uint64(0))) + Expect(event.CpuInstr).To(BeNumerically("==", uint64(0))) + Expect(event.CacheMiss).To(BeNumerically("==", uint64(0))) + Expect(event.OffcpuPid).To(Equal(uint32(42))) + Expect(event.OffcpuTid).To(Equal(uint32(42))) + }) + + It("efficiently collects hardware counter metrics for sched_switch events", Label("perf_event"), func() { + experiment := gmeasure.NewExperiment("sched_switch tracepoint") + AddReportEntry(experiment.Name, experiment) + // Remove resource limits for kernels <5.11. + err := rlimit.RemoveMemlock() + Expect(err).NotTo(HaveOccurred()) + + // Load eBPF Specs + specs, err := loadKepler() + Expect(err).NotTo(HaveOccurred()) + + var obj keplerObjects + // Load eBPF objects + err = specs.LoadAndAssign(&obj, nil) + Expect(err).NotTo(HaveOccurred()) + + perfEvents, err := createTestHardwarePerfEvents( + obj.CpuInstructionsEventReader, + obj.CpuCyclesEventReader, + obj.CacheMissEventReader, + ) + Expect(err).NotTo(HaveOccurred()) + defer func() { + for _, fd := range perfEvents { + unix.Close(fd) + } + }() + experiment.Sample(func(idx int) { + experiment.MeasureDuration("sched_switch tracepoint", func() { + runSchedSwitchTracepoint(&obj) + }, gmeasure.Precision(time.Nanosecond)) + Expect(err).NotTo(HaveOccurred()) + }, gmeasure.SamplingConfig{N: 1000000, Duration: 10 * time.Second}) + }) +}) + +func runSchedSwitchTracepoint(obj *keplerObjects) { + out, err := obj.TestKeplerSchedSwitchTrace.Run(&ebpf.RunOptions{ + Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU + CPU: uint32(0), + }) + Expect(err).NotTo(HaveOccurred()) + Expect(out).To(Equal(uint32(0))) +} + +// This function is used to create hardware perf events for CPU cycles, instructions and cache misses. +// Instead of using hardware perf events, we use the software perf event for testing purposes. 
+func createTestHardwarePerfEvents(cpuCyclesMap, cpuInstructionsMap, cacheMissMap *ebpf.Map) ([]int, error) { + cpuCyclesFd, err := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_CPU_CLOCK, 1) + if err != nil { + return nil, err + } + err = cpuCyclesMap.Update(uint32(0), uint32(cpuCyclesFd[0]), ebpf.UpdateAny) + if err != nil { + return nil, err + } + + cpuInstructionsFd, err := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_CPU_CLOCK, 1) + if err != nil { + return nil, err + } + err = cpuInstructionsMap.Update(uint32(0), uint32(cpuInstructionsFd[0]), ebpf.UpdateAny) + if err != nil { + return nil, err + } + + cacheMissFd, err := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_CPU_CLOCK, 1) + if err != nil { + return nil, err + } + err = cacheMissMap.Update(uint32(0), uint32(cacheMissFd[0]), ebpf.UpdateAny) + if err != nil { + return nil, err + } + + return []int{cpuCyclesFd[0], cpuInstructionsFd[0], cacheMissFd[0]}, nil +} diff --git a/pkg/bpf/exporter.go b/pkg/bpf/exporter.go index ded8a88887..c9814fcc69 100644 --- a/pkg/bpf/exporter.go +++ b/pkg/bpf/exporter.go @@ -17,15 +17,18 @@ limitations under the License. package bpf import ( + "bytes" + "encoding/binary" "errors" "fmt" "os" "runtime" - "time" + "sync" "unsafe" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" + "github.com/cilium/ebpf/ringbuf" "github.com/cilium/ebpf/rlimit" "github.com/jaypipes/ghw" "github.com/sustainable-computing-io/kepler/pkg/config" @@ -35,23 +38,38 @@ import ( ) type exporter struct { - bpfObjects keplerObjects - + bpfObjects keplerObjects + cpus int schedSwitchLink link.Link irqLink link.Link pageWriteLink link.Link pageReadLink link.Link + processFreeLink link.Link perfEvents *hardwarePerfEvents enabledHardwareCounters sets.Set[string] enabledSoftwareCounters sets.Set[string] + + // Locks processMetrics and freedPIDs. + // Acquired in CollectProcesses - to prevent new events from being processed + // while summarizing the metrics and resetting the counters. + // Acquired in handleEvents - to prevent CollectProcesses from summarizing + // the metrics while we're handling an event from the ring buffer. + // Note: Release this lock as soon as possible as it will block the + // processing of new events from the ring buffer. 
+ mu *sync.Mutex + processMetrics map[uint32]*bpfMetrics + freedPIDs []int } func NewExporter() (Exporter, error) { e := &exporter{ + cpus: ebpf.MustPossibleCPU(), enabledHardwareCounters: sets.New[string](), enabledSoftwareCounters: sets.New[string](), + mu: &sync.Mutex{}, + processMetrics: make(map[uint32]*bpfMetrics), } err := e.attach() if err != nil { @@ -89,20 +107,20 @@ func (e *exporter) attach() error { } } - // Set program global variables - err = specs.RewriteConstants(map[string]interface{}{ - "SAMPLE_RATE": int32(config.BPFSampleRate), - }) - if err != nil { - return fmt.Errorf("error rewriting program constants: %v", err) - } - // Load the eBPF program(s) if err := specs.LoadAndAssign(&e.bpfObjects, nil); err != nil { return fmt.Errorf("error loading eBPF objects: %v", err) } // Attach the eBPF program(s) + e.processFreeLink, err = link.AttachTracing(link.TracingOptions{ + Program: e.bpfObjects.KeplerSchedProcessFree, + AttachType: ebpf.AttachTraceRawTp, + }) + if err != nil { + return fmt.Errorf("error attaching sched_process_free tracepoint: %v", err) + } + e.schedSwitchLink, err = link.AttachTracing(link.TracingOptions{ Program: e.bpfObjects.KeplerSchedSwitchTrace, AttachType: ebpf.AttachTraceRawTp, @@ -192,38 +210,212 @@ func (e *exporter) Detach() { } // Perf events - e.perfEvents.close() - e.perfEvents = nil + if e.perfEvents != nil { + e.perfEvents.close() + e.perfEvents = nil + } // Objects e.bpfObjects.Close() } -func (e *exporter) CollectProcesses() ([]ProcessMetrics, error) { - start := time.Now() - // Get the max number of entries in the map - maxEntries := e.bpfObjects.Processes.MaxEntries() - total := 0 - deleteKeys := make([]uint32, maxEntries) - deleteValues := make([]ProcessMetrics, maxEntries) - var cursor ebpf.MapBatchCursor +func (e *exporter) Start(stopChan <-chan struct{}) error { + rd, err := ringbuf.NewReader(e.bpfObjects.Rb) + if err != nil { + return fmt.Errorf("failed to create ring buffer reader: %w", err) + } + defer rd.Close() + for { - count, err := e.bpfObjects.Processes.BatchLookupAndDelete( - &cursor, - deleteKeys, - deleteValues, - &ebpf.BatchOptions{}, - ) - total += count - if errors.Is(err, ebpf.ErrKeyNotExist) { - break + var record *ringbuf.Record + + select { + case <-stopChan: + if err := rd.Close(); err != nil { + return fmt.Errorf("closing ring buffer reader: %w", err) + } + return nil + default: + var event keplerEvent + record = new(ringbuf.Record) + + err := rd.ReadInto(record) + if err != nil { + if errors.Is(err, ringbuf.ErrClosed) { + return nil + } + if errors.Is(err, ringbuf.ErrFlushed) { + record.RawSample = record.RawSample[:0] + } + klog.Errorf("reading from reader: %s", err) + continue + } + + if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.NativeEndian, &event); err != nil { + klog.Errorf("parsing ringbuf event: %s", err) + continue + } + + // Process the event + e.handleEvent(event) } - if err != nil { - return nil, fmt.Errorf("failed to batch lookup and delete: %v", err) + } +} + +type bpfMetrics struct { + CGroupID uint64 + CPUCyles PerCPUCounter + CPUInstructions PerCPUCounter + CacheMiss PerCPUCounter + CPUTime PerCPUCounter + TxIRQ uint64 + RxIRQ uint64 + BlockIRQ uint64 + PageCacheHit uint64 +} + +func (p *bpfMetrics) Reset() { + p.CPUCyles.Reset() + p.CPUInstructions.Reset() + p.CacheMiss.Reset() + p.CPUTime.Reset() + p.TxIRQ = 0 + p.RxIRQ = 0 + p.BlockIRQ = 0 + p.PageCacheHit = 0 +} + +func newBpfMetrics() *bpfMetrics { + return &bpfMetrics{ + CPUCyles: NewPerCPUCounter(), + CPUInstructions: 
NewPerCPUCounter(),
+		CacheMiss:       NewPerCPUCounter(),
+		CPUTime:         NewPerCPUCounter(),
+	}
+}
+
+type PerCPUCounter struct {
+	Values map[uint64]uint64
+	Total  uint64
+}
+
+func NewPerCPUCounter() PerCPUCounter {
+	return PerCPUCounter{
+		Values: make(map[uint64]uint64),
+	}
+}
+
+func (p *PerCPUCounter) Start(cpu, taskID uint32, value uint64) {
+	key := uint64(cpu)<<32 | uint64(taskID)
+
+	// TODO: The eBPF code would blindly overwrite the value if it already exists.
+	// We will preserve the old behavior for now, but we should consider
+	// returning an error if the value already exists.
+	p.Values[key] = value
+}
+
+func (p *PerCPUCounter) Stop(cpu, taskID uint32, value uint64) {
+	if value == 0 {
+		return
+	}
+
+	key := uint64(cpu)<<32 | uint64(taskID)
+
+	if _, ok := p.Values[key]; !ok {
+		return
+	}
+
+	delta := uint64(0)
+
+	// Probably a clock issue where the recorded on-CPU event had a
+	// timestamp later than the recorded off-CPU event, or vice versa.
+	if value > p.Values[key] {
+		delta = value - p.Values[key]
+	}
+
+	p.Total += delta
+
+	delete(p.Values, key)
+}
+
+func (p *PerCPUCounter) Reset() {
+	// Leave values in place since we may have in-flight tasks still on CPU.
+	p.Total = 0
+}
+
+func (e *exporter) handleEvent(event keplerEvent) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	var p *bpfMetrics
+
+	if _, ok := e.processMetrics[event.Pid]; !ok {
+		e.processMetrics[event.Pid] = newBpfMetrics()
+	}
+	p = e.processMetrics[event.Pid]
+
+	switch event.EventType {
+	case uint64(keplerEventTypeSCHED_SWITCH):
+		// Handle the new task going on CPU
+		p.CPUCyles.Start(event.CpuId, event.Tid, event.CpuCycles)
+		p.CPUInstructions.Start(event.CpuId, event.Tid, event.CpuInstr)
+		p.CacheMiss.Start(event.CpuId, event.Tid, event.CacheMiss)
+		p.CPUTime.Start(event.CpuId, event.Tid, event.Ts)
+
+		// Handle the task going OFF CPU
+		if _, ok := e.processMetrics[event.OffcpuPid]; !ok {
+			e.processMetrics[event.OffcpuPid] = newBpfMetrics()
+		}
+		offcpu := e.processMetrics[event.OffcpuPid]
+		offcpu.CPUCyles.Stop(event.CpuId, event.OffcpuTid, event.CpuCycles)
+		offcpu.CPUInstructions.Stop(event.CpuId, event.OffcpuTid, event.CpuInstr)
+		offcpu.CacheMiss.Stop(event.CpuId, event.OffcpuTid, event.CacheMiss)
+		offcpu.CPUTime.Stop(event.CpuId, event.OffcpuTid, event.Ts)
+		offcpu.CGroupID = event.OffcpuCgroupId
+	case uint64(keplerEventTypePAGE_CACHE_HIT):
+		p.PageCacheHit += 1
+	case uint64(keplerEventTypeIRQ):
+		switch event.IrqNumber {
+		case uint32(keplerIrqTypeNET_TX):
+			p.TxIRQ += 1
+		case uint32(keplerIrqTypeNET_RX):
+			p.RxIRQ += 1
+		case uint32(keplerIrqTypeBLOCK):
+			p.BlockIRQ += 1
+		}
+		return
+	case uint64(keplerEventTypeFREE):
+		e.freedPIDs = append(e.freedPIDs, int(event.Pid))
+	}
+}
+
+func (e *exporter) CollectProcesses() (ProcessMetricsCollection, error) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	result := ProcessMetricsCollection{
+		Metrics:   make([]ProcessMetrics, 0, len(e.processMetrics)),
+		FreedPIDs: e.freedPIDs,
+	}
-	klog.V(5).Infof("collected %d process samples in %v", total, time.Since(start))
-	return deleteValues[:total], nil
+	for pid, m := range e.processMetrics {
+		result.Metrics = append(result.Metrics, ProcessMetrics{
+			CGroupID:        m.CGroupID,
+			Pid:             uint64(pid),
+			ProcessRunTime:  m.CPUTime.Total / 1000, // convert nanoseconds to microseconds
+			CPUCyles:        m.CPUCyles.Total,
+			CPUInstructions: m.CPUInstructions.Total,
+			CacheMiss:       m.CacheMiss.Total,
+			PageCacheHit:    m.PageCacheHit,
+			NetTxIRQ:        m.TxIRQ,
+			NetRxIRQ:        m.RxIRQ,
+			NetBlockIRQ:     m.BlockIRQ,
+		})
+		m.Reset()
+	}
+	// Clear the cache of any PIDs freed
this sample period + e.freedPIDs = []int{} + + return result, nil } /////////////////////////////////////////////////////////////////////////// @@ -281,12 +473,12 @@ func (h *hardwarePerfEvents) close() { func createHardwarePerfEvents(cpuInstructionsMap, cpuCyclesMap, cacheMissMap *ebpf.Map, numCPU int) (*hardwarePerfEvents, error) { var err error events := &hardwarePerfEvents{ - cpuCyclesPerfEvents: []int{}, - cpuInstructionsPerfEvents: []int{}, - cacheMissPerfEvents: []int{}, + cpuCyclesPerfEvents: make([]int, 0, numCPU), + cpuInstructionsPerfEvents: make([]int, 0, numCPU), + cacheMissPerfEvents: make([]int, 0, numCPU), } defer func() { - if err != nil { + if err != nil && events != nil { unixClosePerfEvents(events.cpuCyclesPerfEvents) unixClosePerfEvents(events.cpuInstructionsPerfEvents) unixClosePerfEvents(events.cacheMissPerfEvents) diff --git a/pkg/bpf/exporter_test.go b/pkg/bpf/exporter_test.go new file mode 100644 index 0000000000..57736a25dc --- /dev/null +++ b/pkg/bpf/exporter_test.go @@ -0,0 +1,45 @@ +package bpf + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("PerCPUCounter", func() { + + var counter PerCPUCounter + BeforeEach(func() { + counter = NewPerCPUCounter() + }) + + It("should record the correct delta for one time period", func() { + counter.Start(1, 1, 10) + key := uint64(1)<<32 | uint64(1) + Expect(counter.Values[key]).To(Equal(uint64(10))) + counter.Stop(1, 1, 21) + Expect(counter.Values).NotTo(ContainElement(key)) + Expect(counter.Total).To(Equal(uint64(11))) + }) + + It("should record the correct delta for an additional time period", func() { + counter.Start(1, 1, 10) + key := uint64(1)<<32 | uint64(1) + Expect(counter.Values[key]).To(Equal(uint64(10))) + counter.Stop(1, 1, 21) + + Expect(counter.Values).NotTo(ContainElement(key)) + Expect(counter.Total).To(Equal(uint64(11))) + counter.Start(1, 1, 30) + + Expect(counter.Values[key]).To(Equal(uint64(30))) + counter.Stop(1, 1, 42) + Expect(counter.Values).NotTo(ContainElement(key)) + Expect(counter.Total).To(Equal(uint64(23))) + }) + + It("should not increment Total if Start() has not been called", func() { + counter.Stop(1, 1, 42) + Expect(counter.Total).To(Equal(uint64(0))) + }) + +}) diff --git a/pkg/bpf/gen.go b/pkg/bpf/gen.go index 95ac1f552e..b22c0ca7bb 100644 --- a/pkg/bpf/gen.go +++ b/pkg/bpf/gen.go @@ -1,3 +1,3 @@ package bpf -//go:generate go run github.com/cilium/ebpf/cmd/bpf2go@v0.15.0 kepler ../../bpf/kepler.bpf.c -- -I../../bpf/include +//go:generate go run github.com/cilium/ebpf/cmd/bpf2go@v0.15.0 -type event -type event_type -type irq_type kepler ../../bpf/kepler.bpf.c -- -I../../bpf/include diff --git a/pkg/bpf/kepler_bpfeb.go b/pkg/bpf/kepler_bpfeb.go index 1f82307638..eda2abc0f9 100644 --- a/pkg/bpf/kepler_bpfeb.go +++ b/pkg/bpf/kepler_bpfeb.go @@ -12,19 +12,38 @@ import ( "github.com/cilium/ebpf" ) -type keplerProcessMetricsT struct { - CgroupId uint64 - Pid uint64 - ProcessRunTime uint64 +type keplerEvent struct { + EventType uint64 + Ts uint64 + Pid uint32 + Tid uint32 + OffcpuPid uint32 + OffcpuTid uint32 + OffcpuCgroupId uint64 CpuCycles uint64 CpuInstr uint64 CacheMiss uint64 - PageCacheHit uint64 - VecNr [10]uint16 - Comm [16]int8 - _ [4]byte + CpuId uint32 + IrqNumber uint32 } +type keplerEventType uint32 + +const ( + keplerEventTypeSCHED_SWITCH keplerEventType = 1 + keplerEventTypeIRQ keplerEventType = 2 + keplerEventTypePAGE_CACHE_HIT keplerEventType = 3 + keplerEventTypeFREE keplerEventType = 4 +) + +type 
keplerIrqType uint32 + +const ( + keplerIrqTypeNET_TX keplerIrqType = 2 + keplerIrqTypeNET_RX keplerIrqType = 3 + keplerIrqTypeBLOCK keplerIrqType = 4 +) + // loadKepler returns the embedded CollectionSpec for kepler. func loadKepler() (*ebpf.CollectionSpec, error) { reader := bytes.NewReader(_KeplerBytes) @@ -66,24 +85,24 @@ type keplerSpecs struct { // // It can be passed ebpf.CollectionSpec.Assign. type keplerProgramSpecs struct { - KeplerIrqTrace *ebpf.ProgramSpec `ebpf:"kepler_irq_trace"` - KeplerReadPageTrace *ebpf.ProgramSpec `ebpf:"kepler_read_page_trace"` - KeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"kepler_sched_switch_trace"` - KeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"kepler_write_page_trace"` + KeplerIrqTrace *ebpf.ProgramSpec `ebpf:"kepler_irq_trace"` + KeplerReadPageTrace *ebpf.ProgramSpec `ebpf:"kepler_read_page_trace"` + KeplerSchedProcessFree *ebpf.ProgramSpec `ebpf:"kepler_sched_process_free"` + KeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"kepler_sched_switch_trace"` + KeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"kepler_write_page_trace"` + TestKeplerSchedProcessFree *ebpf.ProgramSpec `ebpf:"test_kepler_sched_process_free"` + TestKeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"test_kepler_sched_switch_trace"` + TestKeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"test_kepler_write_page_trace"` } // keplerMapSpecs contains maps before they are loaded into the kernel. // // It can be passed ebpf.CollectionSpec.Assign. type keplerMapSpecs struct { - CacheMiss *ebpf.MapSpec `ebpf:"cache_miss"` CacheMissEventReader *ebpf.MapSpec `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.MapSpec `ebpf:"cpu_cycles"` CpuCyclesEventReader *ebpf.MapSpec `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.MapSpec `ebpf:"cpu_instructions"` CpuInstructionsEventReader *ebpf.MapSpec `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.MapSpec `ebpf:"pid_time_map"` - Processes *ebpf.MapSpec `ebpf:"processes"` + Rb *ebpf.MapSpec `ebpf:"rb"` } // keplerObjects contains all objects after they have been loaded into the kernel. @@ -105,26 +124,18 @@ func (o *keplerObjects) Close() error { // // It can be passed to loadKeplerObjects or ebpf.CollectionSpec.LoadAndAssign. type keplerMaps struct { - CacheMiss *ebpf.Map `ebpf:"cache_miss"` CacheMissEventReader *ebpf.Map `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.Map `ebpf:"cpu_cycles"` CpuCyclesEventReader *ebpf.Map `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.Map `ebpf:"cpu_instructions"` CpuInstructionsEventReader *ebpf.Map `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.Map `ebpf:"pid_time_map"` - Processes *ebpf.Map `ebpf:"processes"` + Rb *ebpf.Map `ebpf:"rb"` } func (m *keplerMaps) Close() error { return _KeplerClose( - m.CacheMiss, m.CacheMissEventReader, - m.CpuCycles, m.CpuCyclesEventReader, - m.CpuInstructions, m.CpuInstructionsEventReader, - m.PidTimeMap, - m.Processes, + m.Rb, ) } @@ -132,18 +143,26 @@ func (m *keplerMaps) Close() error { // // It can be passed to loadKeplerObjects or ebpf.CollectionSpec.LoadAndAssign. 
type keplerPrograms struct { - KeplerIrqTrace *ebpf.Program `ebpf:"kepler_irq_trace"` - KeplerReadPageTrace *ebpf.Program `ebpf:"kepler_read_page_trace"` - KeplerSchedSwitchTrace *ebpf.Program `ebpf:"kepler_sched_switch_trace"` - KeplerWritePageTrace *ebpf.Program `ebpf:"kepler_write_page_trace"` + KeplerIrqTrace *ebpf.Program `ebpf:"kepler_irq_trace"` + KeplerReadPageTrace *ebpf.Program `ebpf:"kepler_read_page_trace"` + KeplerSchedProcessFree *ebpf.Program `ebpf:"kepler_sched_process_free"` + KeplerSchedSwitchTrace *ebpf.Program `ebpf:"kepler_sched_switch_trace"` + KeplerWritePageTrace *ebpf.Program `ebpf:"kepler_write_page_trace"` + TestKeplerSchedProcessFree *ebpf.Program `ebpf:"test_kepler_sched_process_free"` + TestKeplerSchedSwitchTrace *ebpf.Program `ebpf:"test_kepler_sched_switch_trace"` + TestKeplerWritePageTrace *ebpf.Program `ebpf:"test_kepler_write_page_trace"` } func (p *keplerPrograms) Close() error { return _KeplerClose( p.KeplerIrqTrace, p.KeplerReadPageTrace, + p.KeplerSchedProcessFree, p.KeplerSchedSwitchTrace, p.KeplerWritePageTrace, + p.TestKeplerSchedProcessFree, + p.TestKeplerSchedSwitchTrace, + p.TestKeplerWritePageTrace, ) } diff --git a/pkg/bpf/kepler_bpfeb.o b/pkg/bpf/kepler_bpfeb.o index be2eedd1df..c624302b7a 100644 Binary files a/pkg/bpf/kepler_bpfeb.o and b/pkg/bpf/kepler_bpfeb.o differ diff --git a/pkg/bpf/kepler_bpfel.go b/pkg/bpf/kepler_bpfel.go index 3fdd09bcda..f99d21dff7 100644 --- a/pkg/bpf/kepler_bpfel.go +++ b/pkg/bpf/kepler_bpfel.go @@ -12,19 +12,38 @@ import ( "github.com/cilium/ebpf" ) -type keplerProcessMetricsT struct { - CgroupId uint64 - Pid uint64 - ProcessRunTime uint64 +type keplerEvent struct { + EventType uint64 + Ts uint64 + Pid uint32 + Tid uint32 + OffcpuPid uint32 + OffcpuTid uint32 + OffcpuCgroupId uint64 CpuCycles uint64 CpuInstr uint64 CacheMiss uint64 - PageCacheHit uint64 - VecNr [10]uint16 - Comm [16]int8 - _ [4]byte + CpuId uint32 + IrqNumber uint32 } +type keplerEventType uint32 + +const ( + keplerEventTypeSCHED_SWITCH keplerEventType = 1 + keplerEventTypeIRQ keplerEventType = 2 + keplerEventTypePAGE_CACHE_HIT keplerEventType = 3 + keplerEventTypeFREE keplerEventType = 4 +) + +type keplerIrqType uint32 + +const ( + keplerIrqTypeNET_TX keplerIrqType = 2 + keplerIrqTypeNET_RX keplerIrqType = 3 + keplerIrqTypeBLOCK keplerIrqType = 4 +) + // loadKepler returns the embedded CollectionSpec for kepler. func loadKepler() (*ebpf.CollectionSpec, error) { reader := bytes.NewReader(_KeplerBytes) @@ -66,24 +85,24 @@ type keplerSpecs struct { // // It can be passed ebpf.CollectionSpec.Assign. 
type keplerProgramSpecs struct { - KeplerIrqTrace *ebpf.ProgramSpec `ebpf:"kepler_irq_trace"` - KeplerReadPageTrace *ebpf.ProgramSpec `ebpf:"kepler_read_page_trace"` - KeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"kepler_sched_switch_trace"` - KeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"kepler_write_page_trace"` + KeplerIrqTrace *ebpf.ProgramSpec `ebpf:"kepler_irq_trace"` + KeplerReadPageTrace *ebpf.ProgramSpec `ebpf:"kepler_read_page_trace"` + KeplerSchedProcessFree *ebpf.ProgramSpec `ebpf:"kepler_sched_process_free"` + KeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"kepler_sched_switch_trace"` + KeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"kepler_write_page_trace"` + TestKeplerSchedProcessFree *ebpf.ProgramSpec `ebpf:"test_kepler_sched_process_free"` + TestKeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"test_kepler_sched_switch_trace"` + TestKeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"test_kepler_write_page_trace"` } // keplerMapSpecs contains maps before they are loaded into the kernel. // // It can be passed ebpf.CollectionSpec.Assign. type keplerMapSpecs struct { - CacheMiss *ebpf.MapSpec `ebpf:"cache_miss"` CacheMissEventReader *ebpf.MapSpec `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.MapSpec `ebpf:"cpu_cycles"` CpuCyclesEventReader *ebpf.MapSpec `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.MapSpec `ebpf:"cpu_instructions"` CpuInstructionsEventReader *ebpf.MapSpec `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.MapSpec `ebpf:"pid_time_map"` - Processes *ebpf.MapSpec `ebpf:"processes"` + Rb *ebpf.MapSpec `ebpf:"rb"` } // keplerObjects contains all objects after they have been loaded into the kernel. @@ -105,26 +124,18 @@ func (o *keplerObjects) Close() error { // // It can be passed to loadKeplerObjects or ebpf.CollectionSpec.LoadAndAssign. type keplerMaps struct { - CacheMiss *ebpf.Map `ebpf:"cache_miss"` CacheMissEventReader *ebpf.Map `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.Map `ebpf:"cpu_cycles"` CpuCyclesEventReader *ebpf.Map `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.Map `ebpf:"cpu_instructions"` CpuInstructionsEventReader *ebpf.Map `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.Map `ebpf:"pid_time_map"` - Processes *ebpf.Map `ebpf:"processes"` + Rb *ebpf.Map `ebpf:"rb"` } func (m *keplerMaps) Close() error { return _KeplerClose( - m.CacheMiss, m.CacheMissEventReader, - m.CpuCycles, m.CpuCyclesEventReader, - m.CpuInstructions, m.CpuInstructionsEventReader, - m.PidTimeMap, - m.Processes, + m.Rb, ) } @@ -132,18 +143,26 @@ func (m *keplerMaps) Close() error { // // It can be passed to loadKeplerObjects or ebpf.CollectionSpec.LoadAndAssign. 
type keplerPrograms struct { - KeplerIrqTrace *ebpf.Program `ebpf:"kepler_irq_trace"` - KeplerReadPageTrace *ebpf.Program `ebpf:"kepler_read_page_trace"` - KeplerSchedSwitchTrace *ebpf.Program `ebpf:"kepler_sched_switch_trace"` - KeplerWritePageTrace *ebpf.Program `ebpf:"kepler_write_page_trace"` + KeplerIrqTrace *ebpf.Program `ebpf:"kepler_irq_trace"` + KeplerReadPageTrace *ebpf.Program `ebpf:"kepler_read_page_trace"` + KeplerSchedProcessFree *ebpf.Program `ebpf:"kepler_sched_process_free"` + KeplerSchedSwitchTrace *ebpf.Program `ebpf:"kepler_sched_switch_trace"` + KeplerWritePageTrace *ebpf.Program `ebpf:"kepler_write_page_trace"` + TestKeplerSchedProcessFree *ebpf.Program `ebpf:"test_kepler_sched_process_free"` + TestKeplerSchedSwitchTrace *ebpf.Program `ebpf:"test_kepler_sched_switch_trace"` + TestKeplerWritePageTrace *ebpf.Program `ebpf:"test_kepler_write_page_trace"` } func (p *keplerPrograms) Close() error { return _KeplerClose( p.KeplerIrqTrace, p.KeplerReadPageTrace, + p.KeplerSchedProcessFree, p.KeplerSchedSwitchTrace, p.KeplerWritePageTrace, + p.TestKeplerSchedProcessFree, + p.TestKeplerSchedSwitchTrace, + p.TestKeplerWritePageTrace, ) } diff --git a/pkg/bpf/kepler_bpfel.o b/pkg/bpf/kepler_bpfel.o index 7f61cec6f9..672e3a04f7 100644 Binary files a/pkg/bpf/kepler_bpfel.o and b/pkg/bpf/kepler_bpfel.o differ diff --git a/pkg/bpf/test_utils.go b/pkg/bpf/test_utils.go index 4105f64b96..ef71b56b07 100644 --- a/pkg/bpf/test_utils.go +++ b/pkg/bpf/test_utils.go @@ -36,6 +36,10 @@ func NewMockExporter(bpfSupportedMetrics SupportedMetrics) Exporter { } } +func (m *mockExporter) Start(<-chan struct{}) error { + return nil +} + func (m *mockExporter) SupportedMetrics() SupportedMetrics { return SupportedMetrics{ HardwareCounters: m.hardwareCounters, @@ -45,18 +49,22 @@ func (m *mockExporter) SupportedMetrics() SupportedMetrics { func (m *mockExporter) Detach() {} -func (m *mockExporter) CollectProcesses() ([]ProcessMetrics, error) { - return []ProcessMetrics{ - { - CgroupId: 0, - Pid: 0, - ProcessRunTime: 0, - CpuCycles: 0, - CpuInstr: 0, - CacheMiss: 0, - PageCacheHit: 0, - VecNr: [10]uint16{}, - Comm: [16]int8{}, +func (m *mockExporter) CollectProcesses() (ProcessMetricsCollection, error) { + return ProcessMetricsCollection{ + Metrics: []ProcessMetrics{ + { + CGroupID: 0, + Pid: 0, + ProcessRunTime: 0, + CPUCyles: 0, + CPUInstructions: 0, + CacheMiss: 0, + PageCacheHit: 0, + NetTxIRQ: 0, + NetRxIRQ: 0, + NetBlockIRQ: 0, + }, }, + FreedPIDs: []int{0}, }, nil } diff --git a/pkg/bpf/types.go b/pkg/bpf/types.go index 840a3d9bb0..077b69cd3e 100644 --- a/pkg/bpf/types.go +++ b/pkg/bpf/types.go @@ -20,22 +20,29 @@ import ( "k8s.io/apimachinery/pkg/util/sets" ) -const ( - // Per /sys/kernel/debug/tracing/events/irq/softirq_entry/format - // { 0, "HI" }, { 1, "TIMER" }, { 2, "NET_TX" }, { 3, "NET_RX" }, { 4, "BLOCK" }, { 5, "IRQ_POLL" }, { 6, "TASKLET" }, { 7, "SCHED" }, { 8, "HRTIMER" }, { 9, "RCU" } - - // IRQ vector to IRQ number - IRQNetTX = 2 - IRQNetRX = 3 - IRQBlock = 4 -) - -type ProcessMetrics = keplerProcessMetricsT - type Exporter interface { SupportedMetrics() SupportedMetrics Detach() - CollectProcesses() ([]ProcessMetrics, error) + CollectProcesses() (ProcessMetricsCollection, error) + Start(<-chan struct{}) error +} + +type ProcessMetrics struct { + CGroupID uint64 + Pid uint64 + ProcessRunTime uint64 + CPUCyles uint64 + CPUInstructions uint64 + CacheMiss uint64 + PageCacheHit uint64 + NetTxIRQ uint64 + NetRxIRQ uint64 + NetBlockIRQ uint64 +} + +type ProcessMetricsCollection struct 
{ + Metrics []ProcessMetrics + FreedPIDs []int } type SupportedMetrics struct { diff --git a/pkg/bpftest/bpf_suite_test.go b/pkg/bpftest/bpf_suite_test.go deleted file mode 100644 index 74b911431f..0000000000 --- a/pkg/bpftest/bpf_suite_test.go +++ /dev/null @@ -1,450 +0,0 @@ -package bpftest - -import ( - "fmt" - "syscall" - "testing" - "time" - "unsafe" - - "github.com/cilium/ebpf" - "github.com/cilium/ebpf/rlimit" - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" - "github.com/onsi/gomega/gmeasure" - "golang.org/x/sys/unix" -) - -func TestBpf(t *testing.T) { - RegisterFailHandler(Fail) - RunSpecs(t, "Bpf Suite") -} - -var _ = Describe("BPF Exporter", func() { - It("should increment the page cache hit counter", func() { - // Remove resource limits for kernels <5.11. - err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - key := uint32(0) - - err = obj.Processes.Put(key, testProcessMetricsT{ - CgroupId: 0, - Pid: 0, - ProcessRunTime: 0, - CpuCycles: 0, - CpuInstr: 0, - CacheMiss: 0, - PageCacheHit: 0, - VecNr: [10]uint16{}, - Comm: [16]int8{}, - }) - Expect(err).NotTo(HaveOccurred()) - - out, err := obj.TestKeplerWritePageTrace.Run(&ebpf.RunOptions{}) - Expect(err).NotTo(HaveOccurred()) - Expect(out).To(Equal(uint32(0))) - - // Read the page cache hit counter - var res testProcessMetricsT - err = obj.Processes.Lookup(key, &res) - Expect(err).NotTo(HaveOccurred()) - - Expect(res.PageCacheHit).To(BeNumerically("==", uint64(1))) - - err = obj.Processes.Delete(key) - Expect(err).NotTo(HaveOccurred()) - }) - - It("should register a new process if one doesn't exist", func() { - // Remove resource limits for kernels <5.11. - err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - out, err := obj.TestRegisterNewProcessIfNotExist.Run(&ebpf.RunOptions{}) - Expect(err).NotTo(HaveOccurred()) - Expect(out).To(BeNumerically("==", uint32(0))) - - // Read the page cache hit counter - var res testProcessMetricsT - key := uint32(42) // Kernel TGID - err = obj.Processes.Lookup(key, &res) - Expect(err).NotTo(HaveOccurred()) - - Expect(res.Pid).To(BeNumerically("==", uint64(42))) - - err = obj.Processes.Delete(key) - Expect(err).NotTo(HaveOccurred()) - }) - - It("should increment the page hit counter efficiently", func() { - experiment := gmeasure.NewExperiment("Increment the page hit counter") - AddReportEntry(experiment.Name, experiment) - // Remove resource limits for kernels <5.11. 
- err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - key := uint32(0) - - err = obj.Processes.Put(key, testProcessMetricsT{ - CgroupId: 0, - Pid: 0, - ProcessRunTime: 0, - CpuCycles: 0, - CpuInstr: 0, - CacheMiss: 0, - PageCacheHit: 0, - VecNr: [10]uint16{}, - Comm: [16]int8{}, - }) - Expect(err).NotTo(HaveOccurred()) - - experiment.Sample(func(idx int) { - experiment.MeasureDuration("page hit counter increment", func() { - out, err := obj.TestKeplerWritePageTrace.Run(&ebpf.RunOptions{}) - Expect(err).NotTo(HaveOccurred()) - Expect(out).To(Equal(uint32(0))) - }, gmeasure.Precision(time.Nanosecond)) - }, gmeasure.SamplingConfig{N: 1000000, Duration: 10 * time.Second}) - }) - - It("collects hardware counter metrics for sched_switch events", Label("perf_event"), func() { - // Remove resource limits for kernels <5.11. - err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - "HW": int32(1), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - perfEvents, err := createHardwarePerfEvents( - obj.CpuInstructionsEventReader, - obj.CpuCyclesEventReader, - obj.CacheMissEventReader, - ) - Expect(err).NotTo(HaveOccurred()) - defer func() { - for _, fd := range perfEvents { - unix.Close(fd) - } - }() - - // Register TGID 42 - This would be done by register_new_process_if_not_exist - // when we get a sched_switch event for a new process - key := uint32(42) - nsecs := getNSecs() - err = obj.Processes.Put(key, testProcessMetricsT{ - CgroupId: 0, - Pid: 42, - ProcessRunTime: nsecs, - CpuCycles: 0, - CpuInstr: 0, - CacheMiss: 0, - PageCacheHit: 0, - VecNr: [10]uint16{}, - Comm: [16]int8{}, - }) - Expect(err).NotTo(HaveOccurred()) - err = obj.PidTimeMap.Put(key, nsecs) - Expect(err).NotTo(HaveOccurred()) - - out, err := obj.TestKeplerSchedSwitchTrace.Run(&ebpf.RunOptions{ - Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU - CPU: uint32(0), - }) - Expect(err).NotTo(HaveOccurred()) - Expect(out).To(Equal(uint32(0))) - - var res testProcessMetricsT - err = obj.Processes.Lookup(key, &res) - Expect(err).NotTo(HaveOccurred()) - Expect(res.CpuCycles).To(BeNumerically(">", uint64(0))) - Expect(res.CpuInstr).To(BeNumerically(">", uint64(0))) - Expect(res.CacheMiss).To(BeNumerically(">", uint64(0))) - - err = obj.Processes.Delete(key) - Expect(err).NotTo(HaveOccurred()) - }) - - It("collects metrics for sched_switch events when no hardware events are enabled", Label("perf_event"), func() { - // Remove resource limits for kernels <5.11. 
- err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - "HW": int32(0), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - // Register TGID 42 - This would be done by register_new_process_if_not_exist - // when we get a sched_switch event for a new process - key := uint32(42) - nsecs := getNSecs() - err = obj.Processes.Put(key, testProcessMetricsT{ - CgroupId: 0, - Pid: 42, - ProcessRunTime: nsecs, - CpuCycles: 0, - CpuInstr: 0, - CacheMiss: 0, - PageCacheHit: 0, - VecNr: [10]uint16{}, - Comm: [16]int8{}, - }) - Expect(err).NotTo(HaveOccurred()) - err = obj.PidTimeMap.Put(key, nsecs) - Expect(err).NotTo(HaveOccurred()) - - out, err := obj.TestKeplerSchedSwitchTrace.Run(&ebpf.RunOptions{ - Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU - CPU: uint32(0), - }) - Expect(err).NotTo(HaveOccurred()) - Expect(out).To(Equal(uint32(0))) - - var res testProcessMetricsT - err = obj.Processes.Lookup(key, &res) - Expect(err).NotTo(HaveOccurred()) - Expect(res.CpuCycles).To(BeNumerically("==", uint64(0))) - Expect(res.ProcessRunTime).To(BeNumerically(">", uint64(0))) - - err = obj.Processes.Delete(key) - Expect(err).NotTo(HaveOccurred()) - }) - - It("efficiently collects hardware counter metrics for sched_switch events", Label("perf_event"), func() { - experiment := gmeasure.NewExperiment("sched_switch tracepoint") - AddReportEntry(experiment.Name, experiment) - // Remove resource limits for kernels <5.11. - err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - perfEvents, err := createHardwarePerfEvents( - obj.CpuInstructionsEventReader, - obj.CpuCyclesEventReader, - obj.CacheMissEventReader, - ) - Expect(err).NotTo(HaveOccurred()) - defer func() { - for _, fd := range perfEvents { - unix.Close(fd) - } - }() - experiment.Sample(func(idx int) { - preRunSchedSwitchTracepoint(&obj) - experiment.MeasureDuration("sampled sched_switch tracepoint", func() { - runSchedSwitchTracepoint(&obj) - }, gmeasure.Precision(time.Nanosecond)) - err = obj.Processes.Delete(uint32(42)) - Expect(err).NotTo(HaveOccurred()) - }, gmeasure.SamplingConfig{N: 1000000, Duration: 10 * time.Second}) - }) - - It("uses sample rate to reduce CPU time", Label("perf_event"), func() { - experiment := gmeasure.NewExperiment("sampled sched_switch tracepoint") - AddReportEntry(experiment.Name, experiment) - // Remove resource limits for kernels <5.11. 
- err := rlimit.RemoveMemlock() - Expect(err).NotTo(HaveOccurred()) - - // Load eBPF Specs - specs, err := loadTest() - Expect(err).NotTo(HaveOccurred()) - - err = specs.RewriteConstants(map[string]interface{}{ - "TEST": int32(1), - "SAMPLE_RATE": int32(1000), - }) - Expect(err).NotTo(HaveOccurred()) - - var obj testObjects - // Load eBPF objects - err = specs.LoadAndAssign(&obj, nil) - Expect(err).NotTo(HaveOccurred()) - - perfEvents, err := createHardwarePerfEvents( - obj.CpuInstructionsEventReader, - obj.CpuCyclesEventReader, - obj.CacheMissEventReader, - ) - Expect(err).NotTo(HaveOccurred()) - defer func() { - for _, fd := range perfEvents { - unix.Close(fd) - } - }() - experiment.Sample(func(idx int) { - preRunSchedSwitchTracepoint(&obj) - experiment.MeasureDuration("sampled sched_switch tracepoint", func() { - runSchedSwitchTracepoint(&obj) - }, gmeasure.Precision(time.Nanosecond)) - err = obj.Processes.Delete(uint32(42)) - Expect(err).NotTo(HaveOccurred()) - }, gmeasure.SamplingConfig{N: 1000000, Duration: 10 * time.Second}) - }) -}) - -func getNSecs() uint64 { - var ts syscall.Timespec - _, _, err := syscall.Syscall(syscall.SYS_CLOCK_GETTIME, 4, uintptr(unsafe.Pointer(&ts)), 0) - if err != 0 { - panic(err) - } - return uint64(ts.Sec*1e9 + ts.Nsec) -} - -func preRunSchedSwitchTracepoint(obj *testObjects) { - // Register TGID 42 - This would be done by register_new_process_if_not_exist - // when we get a sched_switch event for a new process - key := uint32(42) - nsecs := getNSecs() - err := obj.Processes.Put(key, testProcessMetricsT{ - CgroupId: 0, - Pid: 42, - ProcessRunTime: nsecs, - CpuCycles: 0, - CpuInstr: 0, - CacheMiss: 0, - PageCacheHit: 0, - VecNr: [10]uint16{}, - Comm: [16]int8{}, - }) - Expect(err).NotTo(HaveOccurred()) - err = obj.PidTimeMap.Put(key, nsecs) - Expect(err).NotTo(HaveOccurred()) -} - -func runSchedSwitchTracepoint(obj *testObjects) { - out, err := obj.TestKeplerSchedSwitchTrace.Run(&ebpf.RunOptions{ - Flags: uint32(1), // BPF_F_TEST_RUN_ON_CPU - CPU: uint32(0), - }) - Expect(err).NotTo(HaveOccurred()) - Expect(out).To(Equal(uint32(0))) -} - -func unixOpenPerfEvent(typ, conf int) (int, error) { - sysAttr := &unix.PerfEventAttr{ - Type: uint32(typ), - Size: uint32(unsafe.Sizeof(unix.PerfEventAttr{})), - Config: uint64(conf), - } - - cloexecFlags := unix.PERF_FLAG_FD_CLOEXEC - fd, err := unix.PerfEventOpen(sysAttr, -1, 0, -1, cloexecFlags) - if fd < 0 { - return 0, fmt.Errorf("failed to open bpf perf event on cpu 0: %w", err) - } - - return fd, nil -} - -// This function is used to create hardware perf events for CPU cycles, instructions and cache misses. -// Instead of using hardware perf events, we use the software perf event for testing purposes. 
-func createHardwarePerfEvents(cpuCyclesMap, cpuInstructionsMap, cacheMissMap *ebpf.Map) ([]int, error) { - cpuCyclesFd, err := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_CPU_CLOCK) - if err != nil { - return nil, err - } - err = cpuCyclesMap.Update(uint32(0), uint32(cpuCyclesFd), ebpf.UpdateAny) - if err != nil { - return nil, err - } - - cpuInstructionsFd, err := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_CPU_CLOCK) - if err != nil { - return nil, err - } - err = cpuInstructionsMap.Update(uint32(0), uint32(cpuInstructionsFd), ebpf.UpdateAny) - if err != nil { - return nil, err - } - - cacheMissFd, err := unixOpenPerfEvent(unix.PERF_TYPE_SOFTWARE, unix.PERF_COUNT_SW_CPU_CLOCK) - if err != nil { - return nil, err - } - err = cacheMissMap.Update(uint32(0), uint32(cacheMissFd), ebpf.UpdateAny) - if err != nil { - return nil, err - } - - return []int{cpuCyclesFd, cpuInstructionsFd, cacheMissFd}, nil -} diff --git a/pkg/bpftest/gen.go b/pkg/bpftest/gen.go deleted file mode 100644 index 94def23250..0000000000 --- a/pkg/bpftest/gen.go +++ /dev/null @@ -1,3 +0,0 @@ -package bpftest - -//go:generate go run github.com/cilium/ebpf/cmd/bpf2go@v0.15.0 test ../../bpf/test.bpf.c -- -I../../bpf/include diff --git a/pkg/bpftest/test_bpfeb.go b/pkg/bpftest/test_bpfeb.go deleted file mode 100644 index db3880c877..0000000000 --- a/pkg/bpftest/test_bpfeb.go +++ /dev/null @@ -1,159 +0,0 @@ -// Code generated by bpf2go; DO NOT EDIT. -//go:build mips || mips64 || ppc64 || s390x - -package bpftest - -import ( - "bytes" - _ "embed" - "fmt" - "io" - - "github.com/cilium/ebpf" -) - -type testProcessMetricsT struct { - CgroupId uint64 - Pid uint64 - ProcessRunTime uint64 - CpuCycles uint64 - CpuInstr uint64 - CacheMiss uint64 - PageCacheHit uint64 - VecNr [10]uint16 - Comm [16]int8 - _ [4]byte -} - -// loadTest returns the embedded CollectionSpec for test. -func loadTest() (*ebpf.CollectionSpec, error) { - reader := bytes.NewReader(_TestBytes) - spec, err := ebpf.LoadCollectionSpecFromReader(reader) - if err != nil { - return nil, fmt.Errorf("can't load test: %w", err) - } - - return spec, err -} - -// loadTestObjects loads test and converts it into a struct. -// -// The following types are suitable as obj argument: -// -// *testObjects -// *testPrograms -// *testMaps -// -// See ebpf.CollectionSpec.LoadAndAssign documentation for details. -func loadTestObjects(obj interface{}, opts *ebpf.CollectionOptions) error { - spec, err := loadTest() - if err != nil { - return err - } - - return spec.LoadAndAssign(obj, opts) -} - -// testSpecs contains maps and programs before they are loaded into the kernel. -// -// It can be passed ebpf.CollectionSpec.Assign. -type testSpecs struct { - testProgramSpecs - testMapSpecs -} - -// testSpecs contains programs before they are loaded into the kernel. -// -// It can be passed ebpf.CollectionSpec.Assign. -type testProgramSpecs struct { - TestKeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"test_kepler_sched_switch_trace"` - TestKeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"test_kepler_write_page_trace"` - TestRegisterNewProcessIfNotExist *ebpf.ProgramSpec `ebpf:"test_register_new_process_if_not_exist"` -} - -// testMapSpecs contains maps before they are loaded into the kernel. -// -// It can be passed ebpf.CollectionSpec.Assign. 
-type testMapSpecs struct { - CacheMiss *ebpf.MapSpec `ebpf:"cache_miss"` - CacheMissEventReader *ebpf.MapSpec `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.MapSpec `ebpf:"cpu_cycles"` - CpuCyclesEventReader *ebpf.MapSpec `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.MapSpec `ebpf:"cpu_instructions"` - CpuInstructionsEventReader *ebpf.MapSpec `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.MapSpec `ebpf:"pid_time_map"` - Processes *ebpf.MapSpec `ebpf:"processes"` -} - -// testObjects contains all objects after they have been loaded into the kernel. -// -// It can be passed to loadTestObjects or ebpf.CollectionSpec.LoadAndAssign. -type testObjects struct { - testPrograms - testMaps -} - -func (o *testObjects) Close() error { - return _TestClose( - &o.testPrograms, - &o.testMaps, - ) -} - -// testMaps contains all maps after they have been loaded into the kernel. -// -// It can be passed to loadTestObjects or ebpf.CollectionSpec.LoadAndAssign. -type testMaps struct { - CacheMiss *ebpf.Map `ebpf:"cache_miss"` - CacheMissEventReader *ebpf.Map `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.Map `ebpf:"cpu_cycles"` - CpuCyclesEventReader *ebpf.Map `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.Map `ebpf:"cpu_instructions"` - CpuInstructionsEventReader *ebpf.Map `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.Map `ebpf:"pid_time_map"` - Processes *ebpf.Map `ebpf:"processes"` -} - -func (m *testMaps) Close() error { - return _TestClose( - m.CacheMiss, - m.CacheMissEventReader, - m.CpuCycles, - m.CpuCyclesEventReader, - m.CpuInstructions, - m.CpuInstructionsEventReader, - m.PidTimeMap, - m.Processes, - ) -} - -// testPrograms contains all programs after they have been loaded into the kernel. -// -// It can be passed to loadTestObjects or ebpf.CollectionSpec.LoadAndAssign. -type testPrograms struct { - TestKeplerSchedSwitchTrace *ebpf.Program `ebpf:"test_kepler_sched_switch_trace"` - TestKeplerWritePageTrace *ebpf.Program `ebpf:"test_kepler_write_page_trace"` - TestRegisterNewProcessIfNotExist *ebpf.Program `ebpf:"test_register_new_process_if_not_exist"` -} - -func (p *testPrograms) Close() error { - return _TestClose( - p.TestKeplerSchedSwitchTrace, - p.TestKeplerWritePageTrace, - p.TestRegisterNewProcessIfNotExist, - ) -} - -func _TestClose(closers ...io.Closer) error { - for _, closer := range closers { - if err := closer.Close(); err != nil { - return err - } - } - return nil -} - -// Do not access this directly. -// -//go:embed test_bpfeb.o -var _TestBytes []byte diff --git a/pkg/bpftest/test_bpfeb.o b/pkg/bpftest/test_bpfeb.o deleted file mode 100644 index f190a3ab7b..0000000000 Binary files a/pkg/bpftest/test_bpfeb.o and /dev/null differ diff --git a/pkg/bpftest/test_bpfel.go b/pkg/bpftest/test_bpfel.go deleted file mode 100644 index 7317a75a1b..0000000000 --- a/pkg/bpftest/test_bpfel.go +++ /dev/null @@ -1,159 +0,0 @@ -// Code generated by bpf2go; DO NOT EDIT. -//go:build 386 || amd64 || arm || arm64 || loong64 || mips64le || mipsle || ppc64le || riscv64 - -package bpftest - -import ( - "bytes" - _ "embed" - "fmt" - "io" - - "github.com/cilium/ebpf" -) - -type testProcessMetricsT struct { - CgroupId uint64 - Pid uint64 - ProcessRunTime uint64 - CpuCycles uint64 - CpuInstr uint64 - CacheMiss uint64 - PageCacheHit uint64 - VecNr [10]uint16 - Comm [16]int8 - _ [4]byte -} - -// loadTest returns the embedded CollectionSpec for test. 
-func loadTest() (*ebpf.CollectionSpec, error) { - reader := bytes.NewReader(_TestBytes) - spec, err := ebpf.LoadCollectionSpecFromReader(reader) - if err != nil { - return nil, fmt.Errorf("can't load test: %w", err) - } - - return spec, err -} - -// loadTestObjects loads test and converts it into a struct. -// -// The following types are suitable as obj argument: -// -// *testObjects -// *testPrograms -// *testMaps -// -// See ebpf.CollectionSpec.LoadAndAssign documentation for details. -func loadTestObjects(obj interface{}, opts *ebpf.CollectionOptions) error { - spec, err := loadTest() - if err != nil { - return err - } - - return spec.LoadAndAssign(obj, opts) -} - -// testSpecs contains maps and programs before they are loaded into the kernel. -// -// It can be passed ebpf.CollectionSpec.Assign. -type testSpecs struct { - testProgramSpecs - testMapSpecs -} - -// testSpecs contains programs before they are loaded into the kernel. -// -// It can be passed ebpf.CollectionSpec.Assign. -type testProgramSpecs struct { - TestKeplerSchedSwitchTrace *ebpf.ProgramSpec `ebpf:"test_kepler_sched_switch_trace"` - TestKeplerWritePageTrace *ebpf.ProgramSpec `ebpf:"test_kepler_write_page_trace"` - TestRegisterNewProcessIfNotExist *ebpf.ProgramSpec `ebpf:"test_register_new_process_if_not_exist"` -} - -// testMapSpecs contains maps before they are loaded into the kernel. -// -// It can be passed ebpf.CollectionSpec.Assign. -type testMapSpecs struct { - CacheMiss *ebpf.MapSpec `ebpf:"cache_miss"` - CacheMissEventReader *ebpf.MapSpec `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.MapSpec `ebpf:"cpu_cycles"` - CpuCyclesEventReader *ebpf.MapSpec `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.MapSpec `ebpf:"cpu_instructions"` - CpuInstructionsEventReader *ebpf.MapSpec `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.MapSpec `ebpf:"pid_time_map"` - Processes *ebpf.MapSpec `ebpf:"processes"` -} - -// testObjects contains all objects after they have been loaded into the kernel. -// -// It can be passed to loadTestObjects or ebpf.CollectionSpec.LoadAndAssign. -type testObjects struct { - testPrograms - testMaps -} - -func (o *testObjects) Close() error { - return _TestClose( - &o.testPrograms, - &o.testMaps, - ) -} - -// testMaps contains all maps after they have been loaded into the kernel. -// -// It can be passed to loadTestObjects or ebpf.CollectionSpec.LoadAndAssign. -type testMaps struct { - CacheMiss *ebpf.Map `ebpf:"cache_miss"` - CacheMissEventReader *ebpf.Map `ebpf:"cache_miss_event_reader"` - CpuCycles *ebpf.Map `ebpf:"cpu_cycles"` - CpuCyclesEventReader *ebpf.Map `ebpf:"cpu_cycles_event_reader"` - CpuInstructions *ebpf.Map `ebpf:"cpu_instructions"` - CpuInstructionsEventReader *ebpf.Map `ebpf:"cpu_instructions_event_reader"` - PidTimeMap *ebpf.Map `ebpf:"pid_time_map"` - Processes *ebpf.Map `ebpf:"processes"` -} - -func (m *testMaps) Close() error { - return _TestClose( - m.CacheMiss, - m.CacheMissEventReader, - m.CpuCycles, - m.CpuCyclesEventReader, - m.CpuInstructions, - m.CpuInstructionsEventReader, - m.PidTimeMap, - m.Processes, - ) -} - -// testPrograms contains all programs after they have been loaded into the kernel. -// -// It can be passed to loadTestObjects or ebpf.CollectionSpec.LoadAndAssign. 
-type testPrograms struct { - TestKeplerSchedSwitchTrace *ebpf.Program `ebpf:"test_kepler_sched_switch_trace"` - TestKeplerWritePageTrace *ebpf.Program `ebpf:"test_kepler_write_page_trace"` - TestRegisterNewProcessIfNotExist *ebpf.Program `ebpf:"test_register_new_process_if_not_exist"` -} - -func (p *testPrograms) Close() error { - return _TestClose( - p.TestKeplerSchedSwitchTrace, - p.TestKeplerWritePageTrace, - p.TestRegisterNewProcessIfNotExist, - ) -} - -func _TestClose(closers ...io.Closer) error { - for _, closer := range closers { - if err := closer.Close(); err != nil { - return err - } - } - return nil -} - -// Do not access this directly. -// -//go:embed test_bpfel.o -var _TestBytes []byte diff --git a/pkg/bpftest/test_bpfel.o b/pkg/bpftest/test_bpfel.o deleted file mode 100644 index 5c8972722c..0000000000 Binary files a/pkg/bpftest/test_bpfel.o and /dev/null differ diff --git a/pkg/collector/resourceutilization/bpf/process_bpf_collector.go b/pkg/collector/resourceutilization/bpf/process_bpf_collector.go index d45bc59c3f..2a132ab553 100644 --- a/pkg/collector/resourceutilization/bpf/process_bpf_collector.go +++ b/pkg/collector/resourceutilization/bpf/process_bpf_collector.go @@ -18,11 +18,10 @@ package bpf import "C" import ( - "unsafe" - "github.com/sustainable-computing-io/kepler/pkg/bpf" "github.com/sustainable-computing-io/kepler/pkg/cgroup" "github.com/sustainable-computing-io/kepler/pkg/collector/stats" + "github.com/sustainable-computing-io/kepler/pkg/comm" "github.com/sustainable-computing-io/kepler/pkg/config" "github.com/sustainable-computing-io/kepler/pkg/libvirt" "github.com/sustainable-computing-io/kepler/pkg/utils" @@ -32,6 +31,8 @@ import ( type ProcessBPFMetrics = bpf.ProcessMetrics +var commResolver = comm.NewCommResolver() + // update software counter metrics func updateSWCounters(key uint64, ct *ProcessBPFMetrics, processStats map[uint64]*stats.ProcessStats, bpfSupportedMetrics bpf.SupportedMetrics) { // update ebpf metrics @@ -43,11 +44,11 @@ func updateSWCounters(key uint64, ct *ProcessBPFMetrics, processStats map[uint64 case config.PageCacheHit: processStats[key].ResourceUsage[config.PageCacheHit].AddDeltaStat(utils.GenericSocketID, ct.PageCacheHit/(1000*1000)) case config.IRQNetTXLabel: - processStats[key].ResourceUsage[config.IRQNetTXLabel].AddDeltaStat(utils.GenericSocketID, uint64(ct.VecNr[bpf.IRQNetTX])) + processStats[key].ResourceUsage[config.IRQNetTXLabel].AddDeltaStat(utils.GenericSocketID, ct.NetTxIRQ) case config.IRQNetRXLabel: - processStats[key].ResourceUsage[config.IRQNetRXLabel].AddDeltaStat(utils.GenericSocketID, uint64(ct.VecNr[bpf.IRQNetRX])) + processStats[key].ResourceUsage[config.IRQNetRXLabel].AddDeltaStat(utils.GenericSocketID, ct.NetRxIRQ) case config.IRQBlockLabel: - processStats[key].ResourceUsage[config.IRQBlockLabel].AddDeltaStat(utils.GenericSocketID, uint64(ct.VecNr[bpf.IRQBlock])) + processStats[key].ResourceUsage[config.IRQBlockLabel].AddDeltaStat(utils.GenericSocketID, ct.NetBlockIRQ) default: klog.Errorf("counter %s is not supported\n", counterKey) } @@ -61,13 +62,13 @@ func updateHWCounters(key uint64, ct *ProcessBPFMetrics, processStats map[uint64 var event string switch counterKey { case config.CPUCycle: - val = ct.CpuCycles + val = ct.CPUCyles event = config.CPUCycle case config.CPURefCycle: - val = ct.CpuCycles + val = ct.CPUCyles event = config.CPURefCycle case config.CPUInstruction: - val = ct.CpuInstr + val = ct.CPUInstructions event = config.CPUInstruction case config.CacheMiss: 
val = ct.CacheMiss @@ -86,12 +87,23 @@ func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]* klog.Errorln("could not collect ebpf metrics") return } - for _, ct := range processesData { - comm := C.GoString((*C.char)(unsafe.Pointer(&ct.Comm))) + + // Clear the cache of any PIDs freed this sample period. + // This is safe given that the *stats.ProcessStats.Command is only updated if it is not already known. + // If it is a long-running process, the comm will be preserved from the previous sample period. + commResolver.Clear(processesData.FreedPIDs) + + for _, ct := range processesData.Metrics { + processComm, err := commResolver.ResolveComm(int(ct.Pid)) + if err != nil { + // skip process that is not running + klog.V(6).Infof("failed to resolve comm for PID %v: %v, set comm=%s", ct.Pid, err, utils.SystemProcessName) + continue + } if ct.Pid != 0 { klog.V(6).Infof("process %s (pid=%d, cgroup=%d) has %d CPU cycles, %d instructions, %d cache misses, %d page cache hits", - comm, ct.Pid, ct.CgroupId, ct.CpuCycles, ct.CpuInstr, ct.CacheMiss, ct.PageCacheHit) + processComm, ct.Pid, ct.CGroupID, ct.CPUCyles, ct.CPUInstructions, ct.CacheMiss, ct.PageCacheHit) } // skip process without resource utilization if ct.CacheMiss == 0 && ct.PageCacheHit == 0 { @@ -99,9 +111,9 @@ func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]* } // if the pid is within a container, it will have a container ID - containerID, err := cgroup.GetContainerID(ct.CgroupId, ct.Pid, config.EnabledEBPFCgroupID) + containerID, err := cgroup.GetContainerID(ct.CGroupID, ct.Pid, config.EnabledEBPFCgroupID) if err != nil { - klog.V(6).Infof("failed to resolve container for PID %v (command=%s): %v, set containerID=%s", ct.Pid, comm, err, utils.SystemProcessName) + klog.V(6).Infof("failed to resolve container for PID %v (command=%s): %v, set containerID=%s", ct.Pid, processComm, err, utils.SystemProcessName) } // if the pid is within a VM, it will have an VM ID @@ -109,12 +121,12 @@ func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]* if config.IsExposeVMStatsEnabled() { vmID, err = libvirt.GetVMID(ct.Pid) if err != nil { - klog.V(6).Infof("failed to resolve VM ID for PID %v (command=%s): %v", ct.Pid, comm, err) + klog.V(6).Infof("failed to resolve VM ID for PID %v (command=%s): %v", ct.Pid, processComm, err) } } mapKey := ct.Pid - if ct.CgroupId == 1 && config.EnabledEBPFCgroupID { + if ct.CGroupID == 1 && config.EnabledEBPFCgroupID { // we aggregate all kernel process to minimize overhead // all kernel process has cgroup id as 1 and pid 1 is also a kernel process mapKey = 1 @@ -124,11 +136,12 @@ func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]* var ok bool var pStat *stats.ProcessStats if pStat, ok = processStats[mapKey]; !ok { - pStat = stats.NewProcessStats(ct.Pid, ct.CgroupId, containerID, vmID, comm, bpfSupportedMetrics) + pStat = stats.NewProcessStats(ct.Pid, ct.CGroupID, containerID, vmID, processComm, bpfSupportedMetrics) processStats[mapKey] = pStat } else if pStat.Command == "" { - pStat.Command = comm + pStat.Command = processComm } + // when the process metrics are updated, reset the idle counter pStat.IdleCounter = 0 diff --git a/pkg/comm/resolve_comm.go b/pkg/comm/resolve_comm.go new file mode 100644 index 0000000000..7a4ffe6053 --- /dev/null +++ b/pkg/comm/resolve_comm.go @@ -0,0 +1,105 @@ +package comm + +import ( + "bytes" + "fmt" + "os" + "strconv" + "strings" + + "golang.org/x/sys/unix" +) + +const 
unknownComm = "unknown" + +type CommResolver struct { + cacheExist map[int]string + cacheNotExist map[int]struct{} + procFsResolver func(pid int) (string, error) +} + +func NewCommResolver() *CommResolver { + return &CommResolver{ + cacheExist: map[int]string{}, + cacheNotExist: map[int]struct{}{}, + procFsResolver: readCommandFromProcFs, + } +} + +func NewTestCommResolver(procFsResolver func(pid int) (string, error)) *CommResolver { + return &CommResolver{ + cacheExist: map[int]string{}, + cacheNotExist: map[int]struct{}{}, + procFsResolver: procFsResolver, + } +} + +func (r *CommResolver) ResolveComm(pid int) (string, error) { + if comm, ok := r.cacheExist[pid]; ok { + return comm, nil + } + if _, ok := r.cacheNotExist[pid]; ok { + return unknownComm, fmt.Errorf("process not running") + } + + comm, err := r.procFsResolver(pid) + if err != nil && os.IsNotExist(err) { + // skip process that is not running + r.cacheNotExist[pid] = struct{}{} + return unknownComm, fmt.Errorf("process not running: %w", err) + } + + r.cacheExist[pid] = comm + return comm, nil +} + +func (r *CommResolver) Clear(freed []int) { + for _, pid := range freed { + delete(r.cacheExist, pid) + delete(r.cacheNotExist, pid) + } +} + +func readCommandFromProcFs(pid int) (string, error) { + if _, err := os.Stat("/proc/" + strconv.Itoa(pid)); os.IsNotExist(err) { + return "", err + } + var comm string + if cmdLineBytes, err := os.ReadFile("/proc/" + strconv.Itoa(pid) + "/cmdline"); err == nil { + comm = readCommandFromProcFsCmdline(cmdLineBytes) + } + if comm != "" { + return comm, nil + } + if commBytes, err := os.ReadFile("/proc/" + strconv.Itoa(pid) + "/comm"); err == nil { + comm = readCommandFromProcFsComm(commBytes) + } + if comm != "" { + return comm, nil + } + return unknownComm, nil +} + +// This gives the same output as `ps -o comm` command +func readCommandFromProcFsCmdline(b []byte) string { + // replace null bytes with new line + buf := bytes.ReplaceAll(b, []byte{0x0}, []byte{0x0a}) + // Using all the parts would be nice, but as these become prometheus labels + // we need to be careful about the cardinality. Just use the first part. + parts := strings.Split(strings.TrimSpace(unix.ByteSliceToString(buf)), "\n") + if len(parts) > 0 && parts[0] != "" { + return parts[0] + } + return "" +} + +// This is a fallback method when we can't read the executable name from +// the cmdline. i.e for kernel threads +func readCommandFromProcFsComm(b []byte) string { + comm := strings.TrimSpace(unix.ByteSliceToString(b)) + if comm != "" { + // return the command in square brackets, like ps does + return "[" + comm + "]" + } + return "" +} diff --git a/pkg/comm/resolve_comm_test.go b/pkg/comm/resolve_comm_test.go new file mode 100644 index 0000000000..7efd98195c --- /dev/null +++ b/pkg/comm/resolve_comm_test.go @@ -0,0 +1,83 @@ +package comm + +import ( + "os" + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestCollector(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Comm Resolver Suite") +} + +func resolveFromCmdline(pid int) (string, error) { + return readCommandFromProcFsCmdline([]byte("cmdline\x00is\x00a\x000test")), nil +} + +func resolveFromComm(pid int) (string, error) { + return readCommandFromProcFsComm([]byte("comm")), nil +} + +func resolveNotExist(pid int) (string, error) { + return unknownComm, os.ErrNotExist +} + +var _ = Describe("CommResolver", func() { + Describe("ResolveComm", func() { + Context("when the process ID exists", func() { + It("should return the resolved command name from cmdline", func() { + resolver := NewTestCommResolver(resolveFromCmdline) + resolvedComm, err := resolver.ResolveComm(1234) + Expect(err).ToNot(HaveOccurred()) + Expect(resolvedComm).To(Equal("cmdline")) + // Verify that the resolved command name is cached + Expect(resolver.cacheExist).To(HaveKey(1234)) + Expect(resolver.cacheExist[1234]).To(Equal("cmdline")) + }) + + It("should return the resolved command name from comm", func() { + resolver := NewTestCommResolver(resolveFromComm) + resolvedComm, err := resolver.ResolveComm(1234) + Expect(err).ToNot(HaveOccurred()) + Expect(resolvedComm).To(Equal("[comm]")) + // Verify that the resolved command name is cached + Expect(resolver.cacheExist).To(HaveKey(1234)) + Expect(resolver.cacheExist[1234]).To(Equal("[comm]")) + }) + }) + + Context("when the process ID does not exist", func() { + It("should return an error", func() { + resolver := NewTestCommResolver(resolveNotExist) + resolvedComm, err := resolver.ResolveComm(54321) + Expect(err).To(HaveOccurred()) + Expect(resolvedComm) + // Verify that the process ID is cached as non-existent + Expect(resolver.cacheNotExist).To(HaveKey(54321)) + }) + }) + }) + + Describe("Clear", func() { + It("should clear the cache for freed process IDs", func() { + freed := []int{123, 456, 789} + resolver := NewTestCommResolver(resolveFromCmdline) + + // Add some entries to the cache + for _, pid := range freed { + _, err := resolver.ResolveComm(pid) + Expect(err).ToNot(HaveOccurred()) + } + + // Clear the cache + resolver.Clear(freed) + + // Verify that the cache is empty for the freed process IDs + Expect(resolver.cacheExist).To(HaveLen(0)) + Expect(resolver.cacheNotExist).To(HaveLen(0)) + }) + }) +}) diff --git a/pkg/config/config.go b/pkg/config/config.go index 7d989ddd19..372e20a006 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -82,7 +82,6 @@ var ( BindAddressKey = "BIND_ADDRESS" CPUArchOverride = getConfig("CPU_ARCH_OVERRIDE", "") MaxLookupRetry = getIntConfig("MAX_LOOKUP_RETRY", defaultMaxLookupRetry) - BPFSampleRate = getIntConfig("EXPERIMENTAL_BPF_SAMPLE_RATE", 0) EstimatorModel = getConfig("ESTIMATOR_MODEL", defaultMetricValue) // auto-select EstimatorSelectFilter = getConfig("ESTIMATOR_SELECT_FILTER", defaultMetricValue) // no filter @@ -154,7 +153,6 @@ func logBoolConfigs() { klog.V(5).Infof("EXPOSE_BPF_METRICS: %t", ExposeBPFMetrics) klog.V(5).Infof("EXPOSE_COMPONENT_POWER: %t", ExposeComponentPower) klog.V(5).Infof("EXPOSE_ESTIMATED_IDLE_POWER_METRICS: %t. This only impacts when the power is estimated using pre-prained models. 
Estimated idle power is meaningful only when Kepler is running on bare-metal or with a single virtual machine (VM) on the node.", ExposeIdlePowerMetrics) - klog.V(5).Infof("EXPERIMENTAL_BPF_SAMPLE_RATE: %d", BPFSampleRate) } } diff --git a/vendor/github.com/cilium/ebpf/.vimto.toml b/vendor/github.com/cilium/ebpf/.vimto.toml new file mode 100644 index 0000000000..49a12dbc09 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/.vimto.toml @@ -0,0 +1,12 @@ +kernel="ghcr.io/cilium/ci-kernels:stable" +smp="cpus=2" +memory="1G" +user="root" +setup=[ + "mount -t cgroup2 -o nosuid,noexec,nodev cgroup2 /sys/fs/cgroup", + "/bin/sh -c 'modprobe bpf_testmod || true'", + "dmesg --clear", +] +teardown=[ + "dmesg --read-clear", +] diff --git a/vendor/github.com/cilium/ebpf/CODEOWNERS b/vendor/github.com/cilium/ebpf/CODEOWNERS index ad13437ea2..ca65d23c09 100644 --- a/vendor/github.com/cilium/ebpf/CODEOWNERS +++ b/vendor/github.com/cilium/ebpf/CODEOWNERS @@ -7,3 +7,5 @@ perf/ @florianl ringbuf/ @florianl btf/ @dylandreimerink + +cmd/bpf2go/ @mejedi diff --git a/vendor/github.com/cilium/ebpf/Makefile b/vendor/github.com/cilium/ebpf/Makefile index c55a93d9cb..d355eea71c 100644 --- a/vendor/github.com/cilium/ebpf/Makefile +++ b/vendor/github.com/cilium/ebpf/Makefile @@ -106,7 +106,7 @@ testdata/loader-%-eb.elf: testdata/loader.c $(STRIP) -g $@ .PHONY: update-kernel-deps -update-kernel-deps: export KERNEL_VERSION?=6.7 +update-kernel-deps: export KERNEL_VERSION?=6.8 update-kernel-deps: ./testdata/sh/update-kernel-deps.sh $(MAKE) container-all diff --git a/vendor/github.com/cilium/ebpf/btf/btf.go b/vendor/github.com/cilium/ebpf/btf/btf.go index 204757dbf6..671f680b2a 100644 --- a/vendor/github.com/cilium/ebpf/btf/btf.go +++ b/vendor/github.com/cilium/ebpf/btf/btf.go @@ -66,7 +66,7 @@ func (s *immutableTypes) typeByID(id TypeID) (Type, bool) { // mutableTypes is a set of types which may be changed. type mutableTypes struct { imm immutableTypes - mu *sync.RWMutex // protects copies below + mu sync.RWMutex // protects copies below copies map[Type]Type // map[orig]copy copiedTypeIDs map[Type]TypeID // map[copy]origID } @@ -94,10 +94,14 @@ func (mt *mutableTypes) add(typ Type, typeIDs map[Type]TypeID) Type { } // copy a set of mutable types. -func (mt *mutableTypes) copy() mutableTypes { - mtCopy := mutableTypes{ +func (mt *mutableTypes) copy() *mutableTypes { + if mt == nil { + return nil + } + + mtCopy := &mutableTypes{ mt.imm, - &sync.RWMutex{}, + sync.RWMutex{}, make(map[Type]Type, len(mt.copies)), make(map[Type]TypeID, len(mt.copiedTypeIDs)), } @@ -169,7 +173,7 @@ func (mt *mutableTypes) anyTypesByName(name string) ([]Type, error) { // Spec allows querying a set of Types and loading the set into the // kernel. type Spec struct { - mutableTypes + *mutableTypes // String table from ELF. strings *stringTable @@ -339,7 +343,7 @@ func loadRawSpec(btf io.ReaderAt, bo binary.ByteOrder, base *Spec) (*Spec, error typeIDs, typesByName := indexTypes(types, firstTypeID) return &Spec{ - mutableTypes{ + &mutableTypes{ immutableTypes{ types, typeIDs, @@ -347,7 +351,7 @@ func loadRawSpec(btf io.ReaderAt, bo binary.ByteOrder, base *Spec) (*Spec, error typesByName, bo, }, - &sync.RWMutex{}, + sync.RWMutex{}, make(map[Type]Type), make(map[Type]TypeID), }, @@ -522,6 +526,10 @@ func fixupDatasecLayout(ds *Datasec) error { // Copy creates a copy of Spec. 
func (s *Spec) Copy() *Spec { + if s == nil { + return nil + } + return &Spec{ s.mutableTypes.copy(), s.strings, diff --git a/vendor/github.com/cilium/ebpf/btf/handle.go b/vendor/github.com/cilium/ebpf/btf/handle.go index b6b3e87f50..adfa6fed4b 100644 --- a/vendor/github.com/cilium/ebpf/btf/handle.go +++ b/vendor/github.com/cilium/ebpf/btf/handle.go @@ -41,6 +41,8 @@ func NewHandle(b *Builder) (*Handle, error) { // // Returns an error wrapping ErrNotSupported if the kernel doesn't support BTF. func NewHandleFromRawBTF(btf []byte) (*Handle, error) { + const minLogSize = 64 * 1024 + if uint64(len(btf)) > math.MaxUint32 { return nil, errors.New("BTF exceeds the maximum size") } @@ -50,26 +52,54 @@ func NewHandleFromRawBTF(btf []byte) (*Handle, error) { BtfSize: uint32(len(btf)), } - fd, err := sys.BtfLoad(attr) - if err == nil { - return &Handle{fd, attr.BtfSize, false}, nil + var ( + logBuf []byte + err error + ) + for { + var fd *sys.FD + fd, err = sys.BtfLoad(attr) + if err == nil { + return &Handle{fd, attr.BtfSize, false}, nil + } + + if attr.BtfLogTrueSize != 0 && attr.BtfLogSize >= attr.BtfLogTrueSize { + // The log buffer already has the correct size. + break + } + + if attr.BtfLogSize != 0 && !errors.Is(err, unix.ENOSPC) { + // Up until at least kernel 6.0, the BTF verifier does not return ENOSPC + // if there are other verification errors. ENOSPC is only returned when + // the BTF blob is correct, a log was requested, and the provided buffer + // is too small. We're therefore not sure whether we got the full + // log or not. + break + } + + // Make an educated guess how large the buffer should be. Start + // at a reasonable minimum and then double the size. + logSize := uint32(max(len(logBuf)*2, minLogSize)) + if int(logSize) < len(logBuf) { + return nil, errors.New("overflow while probing log buffer size") + } + + if attr.BtfLogTrueSize != 0 { + // The kernel has given us a hint how large the log buffer has to be. + logSize = attr.BtfLogTrueSize + } + + logBuf = make([]byte, logSize) + attr.BtfLogSize = logSize + attr.BtfLogBuf = sys.NewSlicePointer(logBuf) + attr.BtfLogLevel = 1 } if err := haveBTF(); err != nil { return nil, err } - logBuf := make([]byte, 64*1024) - attr.BtfLogBuf = sys.NewSlicePointer(logBuf) - attr.BtfLogSize = uint32(len(logBuf)) - attr.BtfLogLevel = 1 - - // Up until at least kernel 6.0, the BTF verifier does not return ENOSPC - // if there are other verification errors. ENOSPC is only returned when - // the BTF blob is correct, a log was requested, and the provided buffer - // is too small. - _, ve := sys.BtfLoad(attr) - return nil, internal.ErrorWithLog("load btf", err, logBuf, errors.Is(ve, unix.ENOSPC)) + return nil, internal.ErrorWithLog("load btf", err, logBuf) } // NewHandleFromID returns the BTF handle for a given id. diff --git a/vendor/github.com/cilium/ebpf/btf/types.go b/vendor/github.com/cilium/ebpf/btf/types.go index 3cb9184f00..a3397460b9 100644 --- a/vendor/github.com/cilium/ebpf/btf/types.go +++ b/vendor/github.com/cilium/ebpf/btf/types.go @@ -682,6 +682,10 @@ func Copy(typ Type) Type { } func copyType(typ Type, ids map[Type]TypeID, copies map[Type]Type, copiedIDs map[Type]TypeID) Type { + if typ == nil { + return nil + } + cpy, ok := copies[typ] if ok { // This has been copied previously, no need to continue. 
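The nil guards added above exist because a *btf.Spec (and the mutableTypes behind it) may legitimately be absent, yet the copy path is now invoked unconditionally: the next hunk changes CollectionSpec.Copy to deep-copy cs.Types instead of sharing the pointer. A minimal sketch of the calling pattern this protects, assuming a collection spec that may have been loaded without BTF (copySpec is a hypothetical helper, not part of this change):

package example

import (
	"fmt"

	"github.com/cilium/ebpf"
)

// copySpec deep-copies a CollectionSpec. With the nil-safe Spec.Copy above,
// the copy no longer shares Types with the original, and a nil Types field
// survives the copy instead of causing a nil-pointer dereference.
func copySpec(cs *ebpf.CollectionSpec) *ebpf.CollectionSpec {
	cpy := cs.Copy()
	if cpy.Types == nil {
		fmt.Println("spec carries no BTF; Types stays nil after Copy")
	}
	return cpy
}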
diff --git a/vendor/github.com/cilium/ebpf/collection.go b/vendor/github.com/cilium/ebpf/collection.go index a5532220fd..b2cb214adc 100644 --- a/vendor/github.com/cilium/ebpf/collection.go +++ b/vendor/github.com/cilium/ebpf/collection.go @@ -57,7 +57,7 @@ func (cs *CollectionSpec) Copy() *CollectionSpec { Maps: make(map[string]*MapSpec, len(cs.Maps)), Programs: make(map[string]*ProgramSpec, len(cs.Programs)), ByteOrder: cs.ByteOrder, - Types: cs.Types, + Types: cs.Types.Copy(), } for name, spec := range cs.Maps { diff --git a/vendor/github.com/cilium/ebpf/elf_reader.go b/vendor/github.com/cilium/ebpf/elf_reader.go index d55ab88928..620037d80a 100644 --- a/vendor/github.com/cilium/ebpf/elf_reader.go +++ b/vendor/github.com/cilium/ebpf/elf_reader.go @@ -972,6 +972,9 @@ func mapSpecFromBTF(es *elfSection, vs *btf.VarSecinfo, def *btf.Struct, spec *b return nil, fmt.Errorf("resolving values contents: %w", err) } + case "map_extra": + return nil, fmt.Errorf("BTF map definition: field %s: %w", member.Name, ErrNotSupported) + default: return nil, fmt.Errorf("unrecognized field %s in BTF map definition", member.Name) } diff --git a/vendor/github.com/cilium/ebpf/info.go b/vendor/github.com/cilium/ebpf/info.go index 79b11c951f..04c60c64b8 100644 --- a/vendor/github.com/cilium/ebpf/info.go +++ b/vendor/github.com/cilium/ebpf/info.go @@ -20,6 +20,23 @@ import ( "github.com/cilium/ebpf/internal/unix" ) +// The *Info structs expose metadata about a program or map. Most +// fields are exposed via a getter: +// +// func (*MapInfo) ID() (MapID, bool) +// +// This is because the metadata available changes based on kernel version. +// The second boolean return value indicates whether a particular field is +// available on the current kernel. +// +// Always add new metadata as such a getter, unless you can somehow get the +// value of the field on all supported kernels. Also document which version +// a particular field first appeared in. +// +// Some metadata is a buffer which needs additional parsing. In this case, +// store the undecoded data in the Info struct and provide a getter which +// decodes it when necessary. See ProgramInfo.Instructions for an example. + // MapInfo describes a map. type MapInfo struct { Type MapType @@ -30,6 +47,8 @@ type MapInfo struct { Flags uint32 // Name as supplied by user space at load time. Available from 4.15. Name string + + btf btf.ID } func newMapInfoFromFd(fd *sys.FD) (*MapInfo, error) { @@ -50,6 +69,7 @@ func newMapInfoFromFd(fd *sys.FD) (*MapInfo, error) { info.MaxEntries, uint32(info.MapFlags), unix.ByteSliceToString(info.Name[:]), + btf.ID(info.BtfId), }, nil } @@ -77,12 +97,27 @@ func (mi *MapInfo) ID() (MapID, bool) { return mi.id, mi.id > 0 } +// BTFID returns the BTF ID associated with the Map. +// +// The ID is only valid as long as the associated Map is kept alive. +// Available from 4.18. +// +// The bool return value indicates whether this optional field is available and +// populated. (The field may be available but not populated if the kernel +// supports the field but the Map was loaded without BTF information.) +func (mi *MapInfo) BTFID() (btf.ID, bool) { + return mi.btf, mi.btf > 0 +} + // programStats holds statistics of a program. type programStats struct { // Total accumulated runtime of the program ins ns. runtime time.Duration // Total number of times the program was called. runCount uint64 + // Total number of times the programm was NOT called. 
+ // Added in commit 9ed9e9ba2337 ("bpf: Count the number of times recursion was prevented"). + recursionMisses uint64 } // ProgramInfo describes a program. @@ -125,8 +160,9 @@ func newProgramInfoFromFd(fd *sys.FD) (*ProgramInfo, error) { Name: unix.ByteSliceToString(info.Name[:]), btf: btf.ID(info.BtfId), stats: &programStats{ - runtime: time.Duration(info.RunTimeNs), - runCount: info.RunCnt, + runtime: time.Duration(info.RunTimeNs), + runCount: info.RunCnt, + recursionMisses: info.RecursionMisses, }, } @@ -259,6 +295,16 @@ func (pi *ProgramInfo) Runtime() (time.Duration, bool) { return time.Duration(0), false } +// RecursionMisses returns the total number of times the program was NOT called. +// This can happen when another bpf program is already running on the cpu, which +// is likely to happen for example when you interrupt bpf program execution. +func (pi *ProgramInfo) RecursionMisses() (uint64, bool) { + if pi.stats != nil { + return pi.stats.recursionMisses, true + } + return 0, false +} + // Instructions returns the 'xlated' instruction stream of the program // after it has been verified and rewritten by the kernel. These instructions // cannot be loaded back into the kernel as-is, this is mainly used for diff --git a/vendor/github.com/cilium/ebpf/internal/epoll/poller.go b/vendor/github.com/cilium/ebpf/internal/epoll/poller.go new file mode 100644 index 0000000000..ed1c3a3c8f --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/epoll/poller.go @@ -0,0 +1,278 @@ +package epoll + +import ( + "errors" + "fmt" + "math" + "os" + "runtime" + "slices" + "sync" + "time" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" +) + +var ErrFlushed = errors.New("data was flushed") + +// Poller waits for readiness notifications from multiple file descriptors. +// +// The wait can be interrupted by calling Close. +type Poller struct { + // mutexes protect the fields declared below them. If you need to + // acquire both at once you must lock epollMu before eventMu. + epollMu sync.Mutex + epollFd int + + eventMu sync.Mutex + closeEvent *eventFd + flushEvent *eventFd +} + +func New() (_ *Poller, err error) { + closeFDOnError := func(fd int) { + if err != nil { + unix.Close(fd) + } + } + closeEventFDOnError := func(e *eventFd) { + if err != nil { + e.close() + } + } + + epollFd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC) + if err != nil { + return nil, fmt.Errorf("create epoll fd: %v", err) + } + defer closeFDOnError(epollFd) + + p := &Poller{epollFd: epollFd} + p.closeEvent, err = newEventFd() + if err != nil { + return nil, err + } + defer closeEventFDOnError(p.closeEvent) + + p.flushEvent, err = newEventFd() + if err != nil { + return nil, err + } + defer closeEventFDOnError(p.flushEvent) + + if err := p.Add(p.closeEvent.raw, 0); err != nil { + return nil, fmt.Errorf("add close eventfd: %w", err) + } + + if err := p.Add(p.flushEvent.raw, 0); err != nil { + return nil, fmt.Errorf("add flush eventfd: %w", err) + } + + runtime.SetFinalizer(p, (*Poller).Close) + return p, nil +} + +// Close the poller. +// +// Interrupts any calls to Wait. Multiple calls to Close are valid, but subsequent +// calls will return os.ErrClosed. +func (p *Poller) Close() error { + runtime.SetFinalizer(p, nil) + + // Interrupt Wait() via the closeEvent fd if it's currently blocked. + if err := p.wakeWaitForClose(); err != nil { + return err + } + + // Acquire the lock. This ensures that Wait isn't running. 
+ p.epollMu.Lock() + defer p.epollMu.Unlock() + + // Prevent other calls to Close(). + p.eventMu.Lock() + defer p.eventMu.Unlock() + + if p.epollFd != -1 { + unix.Close(p.epollFd) + p.epollFd = -1 + } + + if p.closeEvent != nil { + p.closeEvent.close() + p.closeEvent = nil + } + + if p.flushEvent != nil { + p.flushEvent.close() + p.flushEvent = nil + } + + return nil +} + +// Add an fd to the poller. +// +// id is returned by Wait in the unix.EpollEvent.Pad field any may be zero. It +// must not exceed math.MaxInt32. +// +// Add is blocked by Wait. +func (p *Poller) Add(fd int, id int) error { + if int64(id) > math.MaxInt32 { + return fmt.Errorf("unsupported id: %d", id) + } + + p.epollMu.Lock() + defer p.epollMu.Unlock() + + if p.epollFd == -1 { + return fmt.Errorf("epoll add: %w", os.ErrClosed) + } + + // The representation of EpollEvent isn't entirely accurate. + // Pad is fully usable, not just padding. Hence we stuff the + // id in there, which allows us to identify the event later (e.g., + // in case of perf events, which CPU sent it). + event := unix.EpollEvent{ + Events: unix.EPOLLIN, + Fd: int32(fd), + Pad: int32(id), + } + + if err := unix.EpollCtl(p.epollFd, unix.EPOLL_CTL_ADD, fd, &event); err != nil { + return fmt.Errorf("add fd to epoll: %v", err) + } + + return nil +} + +// Wait for events. +// +// Returns the number of pending events and any errors. +// +// - [os.ErrClosed] if interrupted by [Close]. +// - [ErrFlushed] if interrupted by [Flush]. +// - [os.ErrDeadlineExceeded] if deadline is reached. +func (p *Poller) Wait(events []unix.EpollEvent, deadline time.Time) (int, error) { + p.epollMu.Lock() + defer p.epollMu.Unlock() + + if p.epollFd == -1 { + return 0, fmt.Errorf("epoll wait: %w", os.ErrClosed) + } + + for { + timeout := int(-1) + if !deadline.IsZero() { + msec := time.Until(deadline).Milliseconds() + // Deadline is in the past, don't block. + msec = max(msec, 0) + // Deadline is too far in the future. + msec = min(msec, math.MaxInt) + + timeout = int(msec) + } + + n, err := unix.EpollWait(p.epollFd, events, timeout) + if temp, ok := err.(temporaryError); ok && temp.Temporary() { + // Retry the syscall if we were interrupted, see https://github.com/golang/go/issues/20400 + continue + } + + if err != nil { + return 0, err + } + + if n == 0 { + return 0, fmt.Errorf("epoll wait: %w", os.ErrDeadlineExceeded) + } + + for i := 0; i < n; { + event := events[i] + if int(event.Fd) == p.closeEvent.raw { + // Since we don't read p.closeEvent the event is never cleared and + // we'll keep getting this wakeup until Close() acquires the + // lock and sets p.epollFd = -1. + return 0, fmt.Errorf("epoll wait: %w", os.ErrClosed) + } + if int(event.Fd) == p.flushEvent.raw { + // read event to prevent it from continuing to wake + p.flushEvent.read() + err = ErrFlushed + events = slices.Delete(events, i, i+1) + n -= 1 + continue + } + i++ + } + + return n, err + } +} + +type temporaryError interface { + Temporary() bool +} + +// wakeWaitForClose unblocks Wait if it's epoll_wait. 
+func (p *Poller) wakeWaitForClose() error { + p.eventMu.Lock() + defer p.eventMu.Unlock() + + if p.closeEvent == nil { + return fmt.Errorf("epoll wake: %w", os.ErrClosed) + } + + return p.closeEvent.add(1) +} + +// Flush unblocks Wait if it's epoll_wait, for purposes of reading pending samples +func (p *Poller) Flush() error { + p.eventMu.Lock() + defer p.eventMu.Unlock() + + if p.flushEvent == nil { + return fmt.Errorf("epoll wake: %w", os.ErrClosed) + } + + return p.flushEvent.add(1) +} + +// eventFd wraps a Linux eventfd. +// +// An eventfd acts like a counter: writes add to the counter, reads retrieve +// the counter and reset it to zero. Reads also block if the counter is zero. +// +// See man 2 eventfd. +type eventFd struct { + file *os.File + // prefer raw over file.Fd(), since the latter puts the file into blocking + // mode. + raw int +} + +func newEventFd() (*eventFd, error) { + fd, err := unix.Eventfd(0, unix.O_CLOEXEC|unix.O_NONBLOCK) + if err != nil { + return nil, err + } + file := os.NewFile(uintptr(fd), "event") + return &eventFd{file, fd}, nil +} + +func (efd *eventFd) close() error { + return efd.file.Close() +} + +func (efd *eventFd) add(n uint64) error { + var buf [8]byte + internal.NativeEndian.PutUint64(buf[:], n) + _, err := efd.file.Write(buf[:]) + return err +} + +func (efd *eventFd) read() (uint64, error) { + var buf [8]byte + _, err := efd.file.Read(buf[:]) + return internal.NativeEndian.Uint64(buf[:]), err +} diff --git a/vendor/github.com/cilium/ebpf/internal/errors.go b/vendor/github.com/cilium/ebpf/internal/errors.go index bda01e2fde..83a371ad35 100644 --- a/vendor/github.com/cilium/ebpf/internal/errors.go +++ b/vendor/github.com/cilium/ebpf/internal/errors.go @@ -12,7 +12,7 @@ import ( // // The default error output is a summary of the full log. The latter can be // accessed via VerifierError.Log or by formatting the error, see Format. -func ErrorWithLog(source string, err error, log []byte, truncated bool) *VerifierError { +func ErrorWithLog(source string, err error, log []byte) *VerifierError { const whitespace = "\t\r\v\n " // Convert verifier log C string by truncating it on the first 0 byte @@ -23,7 +23,7 @@ func ErrorWithLog(source string, err error, log []byte, truncated bool) *Verifie log = bytes.Trim(log, whitespace) if len(log) == 0 { - return &VerifierError{source, err, nil, truncated} + return &VerifierError{source, err, nil, false} } logLines := bytes.Split(log, []byte{'\n'}) @@ -34,7 +34,7 @@ func ErrorWithLog(source string, err error, log []byte, truncated bool) *Verifie lines = append(lines, string(bytes.TrimRight(line, whitespace))) } - return &VerifierError{source, err, lines, truncated} + return &VerifierError{source, err, lines, false} } // VerifierError includes information from the eBPF verifier. @@ -46,7 +46,7 @@ type VerifierError struct { Cause error // The verifier output split into lines. Log []string - // Whether the log output is truncated, based on several heuristics. + // Deprecated: the log is never truncated anymore. Truncated bool } @@ -70,7 +70,7 @@ func (le *VerifierError) Error() string { } lines := log[n-1:] - if n >= 2 && (includePreviousLine(log[n-1]) || le.Truncated) { + if n >= 2 && includePreviousLine(log[n-1]) { // Add one more line of context if it aids understanding the error. 
lines = log[n-2:] } @@ -81,22 +81,9 @@ func (le *VerifierError) Error() string { } omitted := len(le.Log) - len(lines) - if omitted == 0 && !le.Truncated { - return b.String() - } - - b.WriteString(" (") - if le.Truncated { - b.WriteString("truncated") - } - if omitted > 0 { - if le.Truncated { - b.WriteString(", ") - } - fmt.Fprintf(&b, "%d line(s) omitted", omitted) + fmt.Fprintf(&b, " (%d line(s) omitted)", omitted) } - b.WriteString(")") return b.String() } @@ -188,10 +175,6 @@ func (le *VerifierError) Format(f fmt.State, verb rune) { } } - if le.Truncated { - fmt.Fprintf(f, "\n\t(truncated)") - } - default: fmt.Fprintf(f, "%%!%c(BADVERB)", verb) } diff --git a/vendor/github.com/cilium/ebpf/internal/sys/types.go b/vendor/github.com/cilium/ebpf/internal/sys/types.go index d2ae942668..70e754de71 100644 --- a/vendor/github.com/cilium/ebpf/internal/sys/types.go +++ b/vendor/github.com/cilium/ebpf/internal/sys/types.go @@ -359,7 +359,7 @@ const ( BPF_LINK_TYPE_TCX LinkType = 11 BPF_LINK_TYPE_UPROBE_MULTI LinkType = 12 BPF_LINK_TYPE_NETKIT LinkType = 13 - MAX_BPF_LINK_TYPE LinkType = 14 + __MAX_BPF_LINK_TYPE LinkType = 14 ) type MapType uint32 @@ -528,7 +528,7 @@ type LinkInfo struct { Id LinkID ProgId uint32 _ [4]byte - Extra [40]uint8 + Extra [48]uint8 } type MapInfo struct { @@ -1263,7 +1263,7 @@ type CgroupLinkInfo struct { _ [4]byte CgroupId uint64 AttachType AttachType - _ [28]byte + _ [36]byte } type IterLinkInfo struct { @@ -1287,6 +1287,7 @@ type KprobeLinkInfo struct { Offset uint32 Addr uint64 Missed uint64 + _ [8]byte } type KprobeMultiLinkInfo struct { @@ -1298,7 +1299,7 @@ type KprobeMultiLinkInfo struct { Count uint32 Flags uint32 Missed uint64 - _ [16]byte + _ [24]byte } type NetNsLinkInfo struct { @@ -1308,7 +1309,7 @@ type NetNsLinkInfo struct { _ [4]byte NetnsIno uint32 AttachType AttachType - _ [32]byte + _ [40]byte } type NetfilterLinkInfo struct { @@ -1320,7 +1321,7 @@ type NetfilterLinkInfo struct { Hooknum uint32 Priority int32 Flags uint32 - _ [24]byte + _ [32]byte } type NetkitLinkInfo struct { @@ -1330,7 +1331,7 @@ type NetkitLinkInfo struct { _ [4]byte Ifindex uint32 AttachType AttachType - _ [32]byte + _ [40]byte } type PerfEventLinkInfo struct { @@ -1348,7 +1349,7 @@ type RawTracepointLinkInfo struct { _ [4]byte TpName Pointer TpNameLen uint32 - _ [28]byte + _ [36]byte } type TcxLinkInfo struct { @@ -1358,7 +1359,7 @@ type TcxLinkInfo struct { _ [4]byte Ifindex uint32 AttachType AttachType - _ [32]byte + _ [40]byte } type TracingLinkInfo struct { @@ -1369,7 +1370,7 @@ type TracingLinkInfo struct { AttachType AttachType TargetObjId uint32 TargetBtfId TypeID - _ [28]byte + _ [36]byte } type XDPLinkInfo struct { @@ -1378,5 +1379,5 @@ type XDPLinkInfo struct { ProgId uint32 _ [4]byte Ifindex uint32 - _ [36]byte + _ [44]byte } diff --git a/vendor/github.com/cilium/ebpf/link/cgroup.go b/vendor/github.com/cilium/ebpf/link/cgroup.go index 79f3d2b7f4..f17d34f03c 100644 --- a/vendor/github.com/cilium/ebpf/link/cgroup.go +++ b/vendor/github.com/cilium/ebpf/link/cgroup.go @@ -6,6 +6,7 @@ import ( "os" "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" ) type cgroupAttachFlags uint32 @@ -187,3 +188,21 @@ func newLinkCgroup(cgroup *os.File, attach ebpf.AttachType, prog *ebpf.Program) return &linkCgroup{*link}, err } + +func (cg *linkCgroup) Info() (*Info, error) { + var info sys.CgroupLinkInfo + if err := sys.ObjInfo(cg.fd, &info); err != nil { + return nil, fmt.Errorf("cgroup link info: %s", err) + } + extra := 
&CgroupInfo{ + CgroupId: info.CgroupId, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/kprobe.go b/vendor/github.com/cilium/ebpf/link/kprobe.go index b54ca90853..fe3f17c371 100644 --- a/vendor/github.com/cilium/ebpf/link/kprobe.go +++ b/vendor/github.com/cilium/ebpf/link/kprobe.go @@ -59,6 +59,8 @@ func (ko *KprobeOptions) cookie() uint64 { // If attaching to symbol fails, automatically retries with the running // platform's syscall prefix (e.g. __x64_) to support attaching to syscalls // in a portable fashion. +// +// The returned Link may implement [PerfEvent]. func Kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) { k, err := kprobe(symbol, prog, opts, false) if err != nil { @@ -90,6 +92,8 @@ func Kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error // // On kernels 5.10 and earlier, setting a kretprobe on a nonexistent symbol // incorrectly returns unix.EINVAL instead of os.ErrNotExist. +// +// The returned Link may implement [PerfEvent]. func Kretprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) { k, err := kprobe(symbol, prog, opts, true) if err != nil { @@ -274,7 +278,11 @@ func pmuProbe(args tracefs.ProbeArgs) (*perfEvent, error) { } } - rawFd, err := unix.PerfEventOpen(&attr, args.Pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC) + cpu := 0 + if args.Pid != perfAllThreads { + cpu = -1 + } + rawFd, err := unix.PerfEventOpen(&attr, args.Pid, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC) // On some old kernels, kprobe PMU doesn't allow `.` in symbol names and // return -EINVAL. Return ErrNotSupported to allow falling back to tracefs. diff --git a/vendor/github.com/cilium/ebpf/link/kprobe_multi.go b/vendor/github.com/cilium/ebpf/link/kprobe_multi.go index 4d364d80eb..f7a8291f94 100644 --- a/vendor/github.com/cilium/ebpf/link/kprobe_multi.go +++ b/vendor/github.com/cilium/ebpf/link/kprobe_multi.go @@ -130,12 +130,23 @@ func (kml *kprobeMultiLink) Update(prog *ebpf.Program) error { return fmt.Errorf("update kprobe_multi: %w", ErrNotSupported) } -func (kml *kprobeMultiLink) Pin(string) error { - return fmt.Errorf("pin kprobe_multi: %w", ErrNotSupported) -} +func (kml *kprobeMultiLink) Info() (*Info, error) { + var info sys.KprobeMultiLinkInfo + if err := sys.ObjInfo(kml.fd, &info); err != nil { + return nil, fmt.Errorf("kprobe multi link info: %s", err) + } + extra := &KprobeMultiInfo{ + count: info.Count, + flags: info.Flags, + missed: info.Missed, + } -func (kml *kprobeMultiLink) Unpin() error { - return fmt.Errorf("unpin kprobe_multi: %w", ErrNotSupported) + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil } var haveBPFLinkKprobeMulti = internal.NewFeatureTest("bpf_link_kprobe_multi", "5.18", func() error { diff --git a/vendor/github.com/cilium/ebpf/link/link.go b/vendor/github.com/cilium/ebpf/link/link.go index 81428568f8..9c34616c9a 100644 --- a/vendor/github.com/cilium/ebpf/link/link.go +++ b/vendor/github.com/cilium/ebpf/link/link.go @@ -119,13 +119,15 @@ func wrapRawLink(raw *RawLink) (_ Link, err error) { case UprobeMultiType: return &uprobeMultiLink{*raw}, nil case PerfEventType: - return nil, fmt.Errorf("recovering perf event fd: %w", ErrNotSupported) + return &perfEventLink{*raw, nil}, nil case TCXType: return &tcxLink{*raw}, nil case NetfilterType: return &netfilterLink{*raw}, nil case NetkitType: 
return &netkitLink{*raw}, nil + case XDPType: + return &xdpLink{*raw}, nil default: return raw, nil } @@ -438,6 +440,9 @@ func (l *RawLink) UpdateArgs(opts RawLinkUpdateOptions) error { } // Info returns metadata about the link. +// +// Linktype specific metadata is not included and can be retrieved +// via the linktype specific Info() method. func (l *RawLink) Info() (*Info, error) { var info sys.LinkInfo @@ -445,117 +450,11 @@ func (l *RawLink) Info() (*Info, error) { return nil, fmt.Errorf("link info: %s", err) } - var extra interface{} - switch info.Type { - case CgroupType: - var cgroupInfo sys.CgroupLinkInfo - if err := sys.ObjInfo(l.fd, &cgroupInfo); err != nil { - return nil, fmt.Errorf("cgroup link info: %s", err) - } - extra = &CgroupInfo{ - CgroupId: cgroupInfo.CgroupId, - AttachType: cgroupInfo.AttachType, - } - case NetNsType: - var netnsInfo sys.NetNsLinkInfo - if err := sys.ObjInfo(l.fd, &netnsInfo); err != nil { - return nil, fmt.Errorf("netns link info: %s", err) - } - extra = &NetNsInfo{ - NetnsIno: netnsInfo.NetnsIno, - AttachType: netnsInfo.AttachType, - } - case TracingType: - var tracingInfo sys.TracingLinkInfo - if err := sys.ObjInfo(l.fd, &tracingInfo); err != nil { - return nil, fmt.Errorf("tracing link info: %s", err) - } - extra = &TracingInfo{ - TargetObjId: tracingInfo.TargetObjId, - TargetBtfId: tracingInfo.TargetBtfId, - AttachType: tracingInfo.AttachType, - } - case XDPType: - var xdpInfo sys.XDPLinkInfo - if err := sys.ObjInfo(l.fd, &xdpInfo); err != nil { - return nil, fmt.Errorf("xdp link info: %s", err) - } - extra = &XDPInfo{ - Ifindex: xdpInfo.Ifindex, - } - case RawTracepointType, IterType, UprobeMultiType: - // Extra metadata not supported. - case TCXType: - var tcxInfo sys.TcxLinkInfo - if err := sys.ObjInfo(l.fd, &tcxInfo); err != nil { - return nil, fmt.Errorf("tcx link info: %s", err) - } - extra = &TCXInfo{ - Ifindex: tcxInfo.Ifindex, - AttachType: tcxInfo.AttachType, - } - case NetfilterType: - var netfilterInfo sys.NetfilterLinkInfo - if err := sys.ObjInfo(l.fd, &netfilterInfo); err != nil { - return nil, fmt.Errorf("netfilter link info: %s", err) - } - extra = &NetfilterInfo{ - Pf: netfilterInfo.Pf, - Hooknum: netfilterInfo.Hooknum, - Priority: netfilterInfo.Priority, - Flags: netfilterInfo.Flags, - } - case NetkitType: - var netkitInfo sys.NetkitLinkInfo - if err := sys.ObjInfo(l.fd, &netkitInfo); err != nil { - return nil, fmt.Errorf("tcx link info: %s", err) - } - extra = &NetkitInfo{ - Ifindex: netkitInfo.Ifindex, - AttachType: netkitInfo.AttachType, - } - case KprobeMultiType: - var kprobeMultiInfo sys.KprobeMultiLinkInfo - if err := sys.ObjInfo(l.fd, &kprobeMultiInfo); err != nil { - return nil, fmt.Errorf("kprobe multi link info: %s", err) - } - extra = &KprobeMultiInfo{ - count: kprobeMultiInfo.Count, - flags: kprobeMultiInfo.Flags, - missed: kprobeMultiInfo.Missed, - } - case PerfEventType: - var perfEventInfo sys.PerfEventLinkInfo - if err := sys.ObjInfo(l.fd, &perfEventInfo); err != nil { - return nil, fmt.Errorf("perf event link info: %s", err) - } - - var extra2 interface{} - switch perfEventInfo.PerfEventType { - case sys.BPF_PERF_EVENT_KPROBE, sys.BPF_PERF_EVENT_KRETPROBE: - var kprobeInfo sys.KprobeLinkInfo - if err := sys.ObjInfo(l.fd, &kprobeInfo); err != nil { - return nil, fmt.Errorf("kprobe multi link info: %s", err) - } - extra2 = &KprobeInfo{ - address: kprobeInfo.Addr, - missed: kprobeInfo.Missed, - } - } - - extra = &PerfEventInfo{ - Type: perfEventInfo.PerfEventType, - extra: extra2, - } - default: - return nil, 
fmt.Errorf("unknown link info type: %d", info.Type) - } - return &Info{ info.Type, info.Id, ebpf.ProgramID(info.ProgId), - extra, + nil, }, nil } diff --git a/vendor/github.com/cilium/ebpf/link/netfilter.go b/vendor/github.com/cilium/ebpf/link/netfilter.go index 250c87677b..34be390859 100644 --- a/vendor/github.com/cilium/ebpf/link/netfilter.go +++ b/vendor/github.com/cilium/ebpf/link/netfilter.go @@ -67,4 +67,24 @@ func (*netfilterLink) Update(new *ebpf.Program) error { return fmt.Errorf("netfilter update: %w", ErrNotSupported) } +func (nf *netfilterLink) Info() (*Info, error) { + var info sys.NetfilterLinkInfo + if err := sys.ObjInfo(nf.fd, &info); err != nil { + return nil, fmt.Errorf("netfilter link info: %s", err) + } + extra := &NetfilterInfo{ + Pf: info.Pf, + Hooknum: info.Hooknum, + Priority: info.Priority, + Flags: info.Flags, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} + var _ Link = (*netfilterLink)(nil) diff --git a/vendor/github.com/cilium/ebpf/link/netkit.go b/vendor/github.com/cilium/ebpf/link/netkit.go index 36ed72a480..5eee3b023a 100644 --- a/vendor/github.com/cilium/ebpf/link/netkit.go +++ b/vendor/github.com/cilium/ebpf/link/netkit.go @@ -69,3 +69,21 @@ type netkitLink struct { } var _ Link = (*netkitLink)(nil) + +func (netkit *netkitLink) Info() (*Info, error) { + var info sys.NetkitLinkInfo + if err := sys.ObjInfo(netkit.fd, &info); err != nil { + return nil, fmt.Errorf("netkit link info: %s", err) + } + extra := &NetkitInfo{ + Ifindex: info.Ifindex, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/netns.go b/vendor/github.com/cilium/ebpf/link/netns.go index 344ecced6b..b1edd340a3 100644 --- a/vendor/github.com/cilium/ebpf/link/netns.go +++ b/vendor/github.com/cilium/ebpf/link/netns.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" ) // NetNsLink is a program attached to a network namespace. @@ -34,3 +35,21 @@ func AttachNetNs(ns int, prog *ebpf.Program) (*NetNsLink, error) { return &NetNsLink{*link}, nil } + +func (ns *NetNsLink) Info() (*Info, error) { + var info sys.NetNsLinkInfo + if err := sys.ObjInfo(ns.fd, &info); err != nil { + return nil, fmt.Errorf("netns link info: %s", err) + } + extra := &NetNsInfo{ + NetnsIno: info.NetnsIno, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/perf_event.go b/vendor/github.com/cilium/ebpf/link/perf_event.go index 5f7a628b3d..1d8feb58c1 100644 --- a/vendor/github.com/cilium/ebpf/link/perf_event.go +++ b/vendor/github.com/cilium/ebpf/link/perf_event.go @@ -3,6 +3,7 @@ package link import ( "errors" "fmt" + "os" "runtime" "unsafe" @@ -78,6 +79,18 @@ func (pe *perfEvent) Close() error { return nil } +// PerfEvent is implemented by some Link types which use a perf event under +// the hood. +type PerfEvent interface { + // PerfEvent returns a file for the underlying perf event. + // + // It is the callers responsibility to close the returned file. + // + // Making changes to the associated perf event lead to + // undefined behaviour. + PerfEvent() (*os.File, error) +} + // perfEventLink represents a bpf perf link. 
type perfEventLink struct { RawLink @@ -86,30 +99,16 @@ type perfEventLink struct { func (pl *perfEventLink) isLink() {} -// Pinning requires the underlying perf event FD to stay open. -// -// | PerfEvent FD | BpfLink FD | Works | -// |--------------|------------|-------| -// | Open | Open | Yes | -// | Closed | Open | No | -// | Open | Closed | No (Pin() -> EINVAL) | -// | Closed | Closed | No (Pin() -> EINVAL) | -// -// There is currently no pretty way to recover the perf event FD -// when loading a pinned link, so leave as not supported for now. -func (pl *perfEventLink) Pin(string) error { - return fmt.Errorf("perf event link pin: %w", ErrNotSupported) -} - -func (pl *perfEventLink) Unpin() error { - return fmt.Errorf("perf event link unpin: %w", ErrNotSupported) -} - func (pl *perfEventLink) Close() error { if err := pl.fd.Close(); err != nil { return fmt.Errorf("perf link close: %w", err) } + // when created from pinned link + if pl.pe == nil { + return nil + } + if err := pl.pe.Close(); err != nil { return fmt.Errorf("perf event close: %w", err) } @@ -120,6 +119,54 @@ func (pl *perfEventLink) Update(prog *ebpf.Program) error { return fmt.Errorf("perf event link update: %w", ErrNotSupported) } +var _ PerfEvent = (*perfEventLink)(nil) + +func (pl *perfEventLink) PerfEvent() (*os.File, error) { + // when created from pinned link + if pl.pe == nil { + return nil, ErrNotSupported + } + + fd, err := pl.pe.fd.Dup() + if err != nil { + return nil, err + } + + return fd.File("perf-event"), nil +} + +func (pl *perfEventLink) Info() (*Info, error) { + var info sys.PerfEventLinkInfo + if err := sys.ObjInfo(pl.fd, &info); err != nil { + return nil, fmt.Errorf("perf event link info: %s", err) + } + + var extra2 interface{} + switch info.PerfEventType { + case sys.BPF_PERF_EVENT_KPROBE, sys.BPF_PERF_EVENT_KRETPROBE: + var kprobeInfo sys.KprobeLinkInfo + if err := sys.ObjInfo(pl.fd, &kprobeInfo); err != nil { + return nil, fmt.Errorf("kprobe link info: %s", err) + } + extra2 = &KprobeInfo{ + address: kprobeInfo.Addr, + missed: kprobeInfo.Missed, + } + } + + extra := &PerfEventInfo{ + Type: info.PerfEventType, + extra: extra2, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} + // perfEventIoctl implements Link and handles the perf event lifecycle // via ioctl(). type perfEventIoctl struct { @@ -154,6 +201,17 @@ func (pi *perfEventIoctl) Info() (*Info, error) { return nil, fmt.Errorf("perf event ioctl info: %w", ErrNotSupported) } +var _ PerfEvent = (*perfEventIoctl)(nil) + +func (pi *perfEventIoctl) PerfEvent() (*os.File, error) { + fd, err := pi.fd.Dup() + if err != nil { + return nil, err + } + + return fd.File("perf-event"), nil +} + // attach the given eBPF prog to the perf event stored in pe. // pe must contain a valid perf event fd. // prog's type must match the program type stored in pe. 
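// Editor's sketch (not part of the vendored diff): how a caller might use the new
// link.PerfEvent interface added above. The symbol "vfs_read" and the prog argument
// are illustrative assumptions; only the Kprobe and PerfEvent signatures come from
// the diff itself.
package example

import (
	"log"

	"github.com/cilium/ebpf"
	"github.com/cilium/ebpf/link"
)

func inspectPerfEventFD(prog *ebpf.Program) error {
	kp, err := link.Kprobe("vfs_read", prog, nil)
	if err != nil {
		return err
	}
	defer kp.Close()

	// Only some link types are backed by a perf event; probe for it with a type assertion.
	if pe, ok := kp.(link.PerfEvent); ok {
		f, err := pe.PerfEvent()
		if err != nil {
			return err
		}
		defer f.Close() // the caller owns the duplicated file
		log.Printf("kprobe perf event fd: %d", f.Fd())
	}
	return nil
}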
@@ -229,7 +287,11 @@ func openTracepointPerfEvent(tid uint64, pid int) (*sys.FD, error) { Wakeup: 1, } - fd, err := unix.PerfEventOpen(&attr, pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC) + cpu := 0 + if pid != perfAllThreads { + cpu = -1 + } + fd, err := unix.PerfEventOpen(&attr, pid, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC) if err != nil { return nil, fmt.Errorf("opening tracepoint perf event: %w", err) } diff --git a/vendor/github.com/cilium/ebpf/link/tcx.go b/vendor/github.com/cilium/ebpf/link/tcx.go index 88f2237d29..ac045b71da 100644 --- a/vendor/github.com/cilium/ebpf/link/tcx.go +++ b/vendor/github.com/cilium/ebpf/link/tcx.go @@ -69,3 +69,21 @@ type tcxLink struct { } var _ Link = (*tcxLink)(nil) + +func (tcx *tcxLink) Info() (*Info, error) { + var info sys.TcxLinkInfo + if err := sys.ObjInfo(tcx.fd, &info); err != nil { + return nil, fmt.Errorf("tcx link info: %s", err) + } + extra := &TCXInfo{ + Ifindex: info.Ifindex, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/tracepoint.go b/vendor/github.com/cilium/ebpf/link/tracepoint.go index 95f5fae3b0..6fc78b9828 100644 --- a/vendor/github.com/cilium/ebpf/link/tracepoint.go +++ b/vendor/github.com/cilium/ebpf/link/tracepoint.go @@ -30,6 +30,8 @@ type TracepointOptions struct { // // Note that attaching eBPF programs to syscalls (sys_enter_*/sys_exit_*) is // only possible as of kernel 4.14 (commit cf5f5ce). +// +// The returned Link may implement [PerfEvent]. func Tracepoint(group, name string, prog *ebpf.Program, opts *TracepointOptions) (Link, error) { if group == "" || name == "" { return nil, fmt.Errorf("group and name cannot be empty: %w", errInvalidInput) diff --git a/vendor/github.com/cilium/ebpf/link/tracing.go b/vendor/github.com/cilium/ebpf/link/tracing.go index 1e1a7834d8..9e570afc96 100644 --- a/vendor/github.com/cilium/ebpf/link/tracing.go +++ b/vendor/github.com/cilium/ebpf/link/tracing.go @@ -18,6 +18,25 @@ func (f *tracing) Update(new *ebpf.Program) error { return fmt.Errorf("tracing update: %w", ErrNotSupported) } +func (f *tracing) Info() (*Info, error) { + var info sys.TracingLinkInfo + if err := sys.ObjInfo(f.fd, &info); err != nil { + return nil, fmt.Errorf("tracing link info: %s", err) + } + extra := &TracingInfo{ + TargetObjId: info.TargetObjId, + TargetBtfId: info.TargetBtfId, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} + // AttachFreplace attaches the given eBPF program to the function it replaces. // // The program and name can either be provided at link time, or can be provided diff --git a/vendor/github.com/cilium/ebpf/link/uprobe.go b/vendor/github.com/cilium/ebpf/link/uprobe.go index ad85024e38..194d1d319a 100644 --- a/vendor/github.com/cilium/ebpf/link/uprobe.go +++ b/vendor/github.com/cilium/ebpf/link/uprobe.go @@ -222,6 +222,8 @@ func (ex *Executable) address(symbol string, address, offset uint64) (uint64, er // // Functions provided by shared libraries can currently not be traced and // will result in an ErrNotSupported. +// +// The returned Link may implement [PerfEvent]. 
func (ex *Executable) Uprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions) (Link, error) { u, err := ex.uprobe(symbol, prog, opts, false) if err != nil { @@ -256,6 +258,8 @@ func (ex *Executable) Uprobe(symbol string, prog *ebpf.Program, opts *UprobeOpti // // Functions provided by shared libraries can currently not be traced and // will result in an ErrNotSupported. +// +// The returned Link may implement [PerfEvent]. func (ex *Executable) Uretprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions) (Link, error) { u, err := ex.uprobe(symbol, prog, opts, true) if err != nil { diff --git a/vendor/github.com/cilium/ebpf/link/uprobe_multi.go b/vendor/github.com/cilium/ebpf/link/uprobe_multi.go index 9a8d329c8f..aea807b329 100644 --- a/vendor/github.com/cilium/ebpf/link/uprobe_multi.go +++ b/vendor/github.com/cilium/ebpf/link/uprobe_multi.go @@ -172,14 +172,6 @@ func (kml *uprobeMultiLink) Update(prog *ebpf.Program) error { return fmt.Errorf("update uprobe_multi: %w", ErrNotSupported) } -func (kml *uprobeMultiLink) Pin(string) error { - return fmt.Errorf("pin uprobe_multi: %w", ErrNotSupported) -} - -func (kml *uprobeMultiLink) Unpin() error { - return fmt.Errorf("unpin uprobe_multi: %w", ErrNotSupported) -} - var haveBPFLinkUprobeMulti = internal.NewFeatureTest("bpf_link_uprobe_multi", "6.6", func() error { prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ Name: "probe_upm_link", diff --git a/vendor/github.com/cilium/ebpf/link/xdp.go b/vendor/github.com/cilium/ebpf/link/xdp.go index aa8dd3a4cb..2ec441229a 100644 --- a/vendor/github.com/cilium/ebpf/link/xdp.go +++ b/vendor/github.com/cilium/ebpf/link/xdp.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" ) // XDPAttachFlags represents how XDP program will be attached to interface. @@ -50,5 +51,30 @@ func AttachXDP(opts XDPOptions) (Link, error) { Flags: uint32(opts.Flags), }) - return rawLink, err + if err != nil { + return nil, fmt.Errorf("failed to attach link: %w", err) + } + + return &xdpLink{*rawLink}, nil +} + +type xdpLink struct { + RawLink +} + +func (xdp *xdpLink) Info() (*Info, error) { + var info sys.XDPLinkInfo + if err := sys.ObjInfo(xdp.fd, &info); err != nil { + return nil, fmt.Errorf("xdp link info: %s", err) + } + extra := &XDPInfo{ + Ifindex: info.Ifindex, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil } diff --git a/vendor/github.com/cilium/ebpf/map.go b/vendor/github.com/cilium/ebpf/map.go index e46fa3f12e..0b62101c3c 100644 --- a/vendor/github.com/cilium/ebpf/map.go +++ b/vendor/github.com/cilium/ebpf/map.go @@ -9,6 +9,7 @@ import ( "os" "path/filepath" "reflect" + "slices" "strings" "sync" "time" @@ -28,6 +29,10 @@ var ( ErrIterationAborted = errors.New("iteration aborted") ErrMapIncompatible = errors.New("map spec is incompatible with existing map") errMapNoBTFValue = errors.New("map spec does not contain a BTF Value") + + // pre-allocating these errors here since they may get called in hot code paths + // and cause unnecessary memory allocations + errMapLookupKeyNotExist = fmt.Errorf("lookup: %w", sysErrKeyNotExist) ) // MapOptions control loading a map into the kernel. 
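// Editor's sketch (not part of the vendored diff): the kind of hot-path lookup the
// pre-allocated errMapLookupKeyNotExist above is meant to serve. Callers match the
// miss with errors.Is, so a single shared error value avoids an allocation per miss.
// The map, key and value types are illustrative assumptions.
package example

import (
	"errors"

	"github.com/cilium/ebpf"
)

func lookupOrZero(m *ebpf.Map, key uint32) (uint64, error) {
	var val uint64
	err := m.Lookup(&key, &val)
	if errors.Is(err, ebpf.ErrKeyNotExist) {
		return 0, nil // treat a missing key as zero rather than an error
	}
	return val, err
}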
@@ -96,11 +101,20 @@ func (ms *MapSpec) Copy() *MapSpec { } cpy := *ms + cpy.Contents = slices.Clone(cpy.Contents) + cpy.Key = btf.Copy(cpy.Key) + cpy.Value = btf.Copy(cpy.Value) - cpy.Contents = make([]MapKV, len(ms.Contents)) - copy(cpy.Contents, ms.Contents) + if cpy.InnerMap == ms { + cpy.InnerMap = &cpy + } else { + cpy.InnerMap = ms.InnerMap.Copy() + } - cpy.InnerMap = ms.InnerMap.Copy() + if cpy.Extra != nil { + extra := *cpy.Extra + cpy.Extra = &extra + } return &cpy } @@ -499,9 +513,6 @@ func handleMapCreateError(attr sys.MapCreateAttr, spec *MapSpec, err error) erro return fmt.Errorf("map create: %w (ring map size %d not a multiple of page size %d)", err, maxEntries, pageSize) } } - if attr.BtfFd == 0 { - return fmt.Errorf("map create: %w (without BTF k/v)", err) - } return fmt.Errorf("map create: %w", err) } @@ -571,6 +582,24 @@ func (m *Map) Info() (*MapInfo, error) { return newMapInfoFromFd(m.fd) } +// Handle returns a reference to the Map's type information in the kernel. +// +// Returns ErrNotSupported if the kernel has no BTF support, or if there is no +// BTF associated with the Map. +func (m *Map) Handle() (*btf.Handle, error) { + info, err := m.Info() + if err != nil { + return nil, err + } + + id, ok := info.BTFID() + if !ok { + return nil, fmt.Errorf("map %s: retrieve BTF ID: %w", m, ErrNotSupported) + } + + return btf.NewHandleFromID(id) +} + // MapLookupFlags controls the behaviour of the map lookup calls. type MapLookupFlags uint64 @@ -652,7 +681,7 @@ func (m *Map) LookupBytes(key interface{}) ([]byte, error) { } func (m *Map) lookupPerCPU(key, valueOut any, flags MapLookupFlags) error { - slice, err := ensurePerCPUSlice(valueOut, int(m.valueSize)) + slice, err := ensurePerCPUSlice(valueOut) if err != nil { return err } @@ -677,13 +706,16 @@ func (m *Map) lookup(key interface{}, valueOut sys.Pointer, flags MapLookupFlags } if err = sys.MapLookupElem(&attr); err != nil { + if errors.Is(err, unix.ENOENT) { + return errMapLookupKeyNotExist + } return fmt.Errorf("lookup: %w", wrapMapError(err)) } return nil } func (m *Map) lookupAndDeletePerCPU(key, valueOut any, flags MapLookupFlags) error { - slice, err := ensurePerCPUSlice(valueOut, int(m.valueSize)) + slice, err := ensurePerCPUSlice(valueOut) if err != nil { return err } @@ -695,7 +727,7 @@ func (m *Map) lookupAndDeletePerCPU(key, valueOut any, flags MapLookupFlags) err } // ensurePerCPUSlice allocates a slice for a per-CPU value if necessary. -func ensurePerCPUSlice(sliceOrPtr any, elemLength int) (any, error) { +func ensurePerCPUSlice(sliceOrPtr any) (any, error) { sliceOrPtrType := reflect.TypeOf(sliceOrPtr) if sliceOrPtrType.Kind() == reflect.Slice { // The target is a slice, the caller is responsible for ensuring that @@ -985,7 +1017,11 @@ func (m *Map) guessNonExistentKey() ([]byte, error) { // the end of all possible results, even when partial results // are returned. It should be used to evaluate when lookup is "done". 
func (m *Map) BatchLookup(cursor *MapBatchCursor, keysOut, valuesOut interface{}, opts *BatchOptions) (int, error) { - return m.batchLookup(sys.BPF_MAP_LOOKUP_BATCH, cursor, keysOut, valuesOut, opts) + n, err := m.batchLookup(sys.BPF_MAP_LOOKUP_BATCH, cursor, keysOut, valuesOut, opts) + if err != nil { + return n, fmt.Errorf("map batch lookup: %w", err) + } + return n, nil } // BatchLookupAndDelete looks up many elements in a map at once, @@ -1005,7 +1041,11 @@ func (m *Map) BatchLookup(cursor *MapBatchCursor, keysOut, valuesOut interface{} // the end of all possible results, even when partial results // are returned. It should be used to evaluate when lookup is "done". func (m *Map) BatchLookupAndDelete(cursor *MapBatchCursor, keysOut, valuesOut interface{}, opts *BatchOptions) (int, error) { - return m.batchLookup(sys.BPF_MAP_LOOKUP_AND_DELETE_BATCH, cursor, keysOut, valuesOut, opts) + n, err := m.batchLookup(sys.BPF_MAP_LOOKUP_AND_DELETE_BATCH, cursor, keysOut, valuesOut, opts) + if err != nil { + return n, fmt.Errorf("map batch lookup and delete: %w", err) + } + return n, nil } // MapBatchCursor represents a starting point for a batch operation. @@ -1027,7 +1067,11 @@ func (m *Map) batchLookup(cmd sys.Cmd, cursor *MapBatchCursor, keysOut, valuesOu valueBuf := sysenc.SyscallOutput(valuesOut, count*int(m.fullValueSize)) n, err := m.batchLookupCmd(cmd, cursor, count, keysOut, valueBuf.Pointer(), opts) - if err != nil { + if errors.Is(err, unix.ENOSPC) { + // Hash tables return ENOSPC when the size of the batch is smaller than + // any bucket. + return n, fmt.Errorf("%w (batch size too small?)", err) + } else if err != nil { return n, err } diff --git a/vendor/github.com/cilium/ebpf/prog.go b/vendor/github.com/cilium/ebpf/prog.go index f4f3af7c36..9bc6325f88 100644 --- a/vendor/github.com/cilium/ebpf/prog.go +++ b/vendor/github.com/cilium/ebpf/prog.go @@ -46,13 +46,13 @@ const ( outputPad = 256 + 2 ) -// DefaultVerifierLogSize is the default number of bytes allocated for the -// verifier log. +// Deprecated: the correct log size is now detected automatically and this +// constant is unused. const DefaultVerifierLogSize = 64 * 1024 -// maxVerifierLogSize is the maximum size of verifier log buffer the kernel -// will accept before returning EINVAL. -const maxVerifierLogSize = math.MaxUint32 >> 2 +// minVerifierLogSize is the default number of bytes allocated for the +// verifier log. +const minVerifierLogSize = 64 * 1024 // ProgramOptions control loading a program into the kernel. type ProgramOptions struct { @@ -73,15 +73,8 @@ type ProgramOptions struct { // attempt at loading the program. LogLevel LogLevel - // Controls the output buffer size for the verifier log, in bytes. See the - // documentation on ProgramOptions.LogLevel for details about how this value - // is used. - // - // If this value is set too low to fit the verifier log, the resulting - // [ebpf.VerifierError]'s Truncated flag will be true, and the error string - // will also contain a hint to that effect. - // - // Defaults to DefaultVerifierLogSize. + // Deprecated: the correct log buffer size is determined automatically + // and this field is ignored. LogSize int // Disables the verifier log completely, regardless of other options. 
@@ -262,10 +255,6 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er return nil, fmt.Errorf("can't load %s program on %s", spec.ByteOrder, internal.NativeEndian) } - if opts.LogSize < 0 { - return nil, errors.New("ProgramOptions.LogSize must be a positive value; disable verifier logs using ProgramOptions.LogDisabled") - } - // Kernels before 5.0 (6c4fc209fcf9 "bpf: remove useless version check for prog load") // require the version field to be set to the value of the KERNEL_VERSION // macro for kprobe-type programs. @@ -404,37 +393,59 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er } } - if opts.LogSize == 0 { - opts.LogSize = DefaultVerifierLogSize - } - - // The caller requested a specific verifier log level. Set up the log buffer. + // The caller requested a specific verifier log level. Set up the log buffer + // so that there is a chance of loading the program in a single shot. var logBuf []byte if !opts.LogDisabled && opts.LogLevel != 0 { - logBuf = make([]byte, opts.LogSize) + logBuf = make([]byte, minVerifierLogSize) attr.LogLevel = opts.LogLevel attr.LogSize = uint32(len(logBuf)) attr.LogBuf = sys.NewSlicePointer(logBuf) } - fd, err := sys.ProgLoad(attr) - if err == nil { - return &Program{unix.ByteSliceToString(logBuf), fd, spec.Name, "", spec.Type}, nil - } + for { + var fd *sys.FD + fd, err = sys.ProgLoad(attr) + if err == nil { + return &Program{unix.ByteSliceToString(logBuf), fd, spec.Name, "", spec.Type}, nil + } - // An error occurred loading the program, but the caller did not explicitly - // enable the verifier log. Re-run with branch-level verifier logs enabled to - // obtain more info. Preserve the original error to return it to the caller. - // An undersized log buffer will result in ENOSPC regardless of the underlying - // cause. - var err2 error - if !opts.LogDisabled && opts.LogLevel == 0 { - logBuf = make([]byte, opts.LogSize) - attr.LogLevel = LogLevelBranch - attr.LogSize = uint32(len(logBuf)) - attr.LogBuf = sys.NewSlicePointer(logBuf) + if opts.LogDisabled { + break + } - _, err2 = sys.ProgLoad(attr) + if attr.LogTrueSize != 0 && attr.LogSize >= attr.LogTrueSize { + // The log buffer already has the correct size. + break + } + + if attr.LogSize != 0 && !errors.Is(err, unix.ENOSPC) { + // Logging is enabled and the error is not ENOSPC, so we can infer + // that the log buffer is large enough. + break + } + + if attr.LogLevel == 0 { + // Logging is not enabled but loading the program failed. Enable + // basic logging. + attr.LogLevel = LogLevelBranch + } + + // Make an educated guess how large the buffer should be. Start + // at minVerifierLogSize and then double the size. + logSize := uint32(max(len(logBuf)*2, minVerifierLogSize)) + if int(logSize) < len(logBuf) { + return nil, errors.New("overflow while probing log buffer size") + } + + if attr.LogTrueSize != 0 { + // The kernel has given us a hint how large the log buffer has to be. 
+ logSize = attr.LogTrueSize + } + + logBuf = make([]byte, logSize) + attr.LogSize = logSize + attr.LogBuf = sys.NewSlicePointer(logBuf) } end := bytes.IndexByte(logBuf, 0) @@ -452,10 +463,6 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er } case errors.Is(err, unix.EINVAL): - if opts.LogSize > maxVerifierLogSize { - return nil, fmt.Errorf("load program: %w (ProgramOptions.LogSize exceeds maximum value of %d)", err, maxVerifierLogSize) - } - if bytes.Contains(tail, coreBadCall) { err = errBadRelocation break @@ -479,8 +486,7 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er } } - truncated := errors.Is(err, unix.ENOSPC) || errors.Is(err2, unix.ENOSPC) - return nil, internal.ErrorWithLog("load program", err, logBuf, truncated) + return nil, internal.ErrorWithLog("load program", err, logBuf) } // NewProgramFromFD creates a program from a raw fd. diff --git a/vendor/github.com/cilium/ebpf/ringbuf/doc.go b/vendor/github.com/cilium/ebpf/ringbuf/doc.go new file mode 100644 index 0000000000..9e45012187 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ringbuf/doc.go @@ -0,0 +1,6 @@ +// Package ringbuf allows interacting with Linux BPF ring buffer. +// +// BPF allows submitting custom events to a BPF ring buffer map set up +// by userspace. This is very useful to push things like packet samples +// from BPF to a daemon running in user space. +package ringbuf diff --git a/vendor/github.com/cilium/ebpf/ringbuf/reader.go b/vendor/github.com/cilium/ebpf/ringbuf/reader.go new file mode 100644 index 0000000000..3d3ba0ecfa --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ringbuf/reader.go @@ -0,0 +1,197 @@ +package ringbuf + +import ( + "errors" + "fmt" + "os" + "sync" + "time" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/epoll" + "github.com/cilium/ebpf/internal/unix" +) + +var ( + ErrClosed = os.ErrClosed + ErrFlushed = epoll.ErrFlushed + errEOR = errors.New("end of ring") + errBusy = errors.New("sample not committed yet") +) + +// ringbufHeader from 'struct bpf_ringbuf_hdr' in kernel/bpf/ringbuf.c +type ringbufHeader struct { + Len uint32 + _ uint32 // pg_off, only used by kernel internals +} + +func (rh *ringbufHeader) isBusy() bool { + return rh.Len&unix.BPF_RINGBUF_BUSY_BIT != 0 +} + +func (rh *ringbufHeader) isDiscard() bool { + return rh.Len&unix.BPF_RINGBUF_DISCARD_BIT != 0 +} + +func (rh *ringbufHeader) dataLen() int { + return int(rh.Len & ^uint32(unix.BPF_RINGBUF_BUSY_BIT|unix.BPF_RINGBUF_DISCARD_BIT)) +} + +type Record struct { + RawSample []byte + + // The minimum number of bytes remaining in the ring buffer after this Record has been read. + Remaining int +} + +// Reader allows reading bpf_ringbuf_output +// from user space. +type Reader struct { + poller *epoll.Poller + + // mu protects read/write access to the Reader structure + mu sync.Mutex + ring *ringbufEventRing + epollEvents []unix.EpollEvent + haveData bool + deadline time.Time + bufferSize int + pendingErr error +} + +// NewReader creates a new BPF ringbuf reader. 
+func NewReader(ringbufMap *ebpf.Map) (*Reader, error) { + if ringbufMap.Type() != ebpf.RingBuf { + return nil, fmt.Errorf("invalid Map type: %s", ringbufMap.Type()) + } + + maxEntries := int(ringbufMap.MaxEntries()) + if maxEntries == 0 || (maxEntries&(maxEntries-1)) != 0 { + return nil, fmt.Errorf("ringbuffer map size %d is zero or not a power of two", maxEntries) + } + + poller, err := epoll.New() + if err != nil { + return nil, err + } + + if err := poller.Add(ringbufMap.FD(), 0); err != nil { + poller.Close() + return nil, err + } + + ring, err := newRingBufEventRing(ringbufMap.FD(), maxEntries) + if err != nil { + poller.Close() + return nil, fmt.Errorf("failed to create ringbuf ring: %w", err) + } + + return &Reader{ + poller: poller, + ring: ring, + epollEvents: make([]unix.EpollEvent, 1), + bufferSize: ring.size(), + }, nil +} + +// Close frees resources used by the reader. +// +// It interrupts calls to Read. +func (r *Reader) Close() error { + if err := r.poller.Close(); err != nil { + if errors.Is(err, os.ErrClosed) { + return nil + } + return err + } + + // Acquire the lock. This ensures that Read isn't running. + r.mu.Lock() + defer r.mu.Unlock() + + if r.ring != nil { + r.ring.Close() + r.ring = nil + } + + return nil +} + +// SetDeadline controls how long Read and ReadInto will block waiting for samples. +// +// Passing a zero time.Time will remove the deadline. +func (r *Reader) SetDeadline(t time.Time) { + r.mu.Lock() + defer r.mu.Unlock() + + r.deadline = t +} + +// Read the next record from the BPF ringbuf. +// +// Calling [Close] interrupts the method with [os.ErrClosed]. Calling [Flush] +// makes it return all records currently in the ring buffer, followed by [ErrFlushed]. +// +// Returns [os.ErrDeadlineExceeded] if a deadline was set and after all records +// have been read from the ring. +// +// See [ReadInto] for a more efficient version of this method. +func (r *Reader) Read() (Record, error) { + var rec Record + return rec, r.ReadInto(&rec) +} + +// ReadInto is like Read except that it allows reusing Record and associated buffers. +func (r *Reader) ReadInto(rec *Record) error { + r.mu.Lock() + defer r.mu.Unlock() + + if r.ring == nil { + return fmt.Errorf("ringbuffer: %w", ErrClosed) + } + + for { + if !r.haveData { + if pe := r.pendingErr; pe != nil { + r.pendingErr = nil + return pe + } + + _, err := r.poller.Wait(r.epollEvents[:cap(r.epollEvents)], r.deadline) + if errors.Is(err, os.ErrDeadlineExceeded) || errors.Is(err, ErrFlushed) { + // Ignoring this for reading a valid entry after timeout or flush. + // This can occur if the producer submitted to the ring buffer + // with BPF_RB_NO_WAKEUP. + r.pendingErr = err + } else if err != nil { + return err + } + r.haveData = true + } + + for { + err := r.ring.readRecord(rec) + // Not using errors.Is which is quite a bit slower + // For a tight loop it might make a difference + if err == errBusy { + continue + } + if err == errEOR { + r.haveData = false + break + } + return err + } + } +} + +// BufferSize returns the size in bytes of the ring buffer +func (r *Reader) BufferSize() int { + return r.bufferSize +} + +// Flush unblocks Read/ReadInto and successive Read/ReadInto calls will return pending samples at this point, +// until you receive a ErrFlushed error. 
+func (r *Reader) Flush() error { + return r.poller.Flush() +} diff --git a/vendor/github.com/cilium/ebpf/ringbuf/ring.go b/vendor/github.com/cilium/ebpf/ringbuf/ring.go new file mode 100644 index 0000000000..8f8f4bce36 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ringbuf/ring.go @@ -0,0 +1,137 @@ +package ringbuf + +import ( + "fmt" + "io" + "os" + "runtime" + "sync/atomic" + "unsafe" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" +) + +type ringbufEventRing struct { + prod []byte + cons []byte + *ringReader +} + +func newRingBufEventRing(mapFD, size int) (*ringbufEventRing, error) { + cons, err := unix.Mmap(mapFD, 0, os.Getpagesize(), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED) + if err != nil { + return nil, fmt.Errorf("can't mmap consumer page: %w", err) + } + + prod, err := unix.Mmap(mapFD, (int64)(os.Getpagesize()), os.Getpagesize()+2*size, unix.PROT_READ, unix.MAP_SHARED) + if err != nil { + _ = unix.Munmap(cons) + return nil, fmt.Errorf("can't mmap data pages: %w", err) + } + + cons_pos := (*uint64)(unsafe.Pointer(&cons[0])) + prod_pos := (*uint64)(unsafe.Pointer(&prod[0])) + + ring := &ringbufEventRing{ + prod: prod, + cons: cons, + ringReader: newRingReader(cons_pos, prod_pos, prod[os.Getpagesize():]), + } + runtime.SetFinalizer(ring, (*ringbufEventRing).Close) + + return ring, nil +} + +func (ring *ringbufEventRing) Close() { + runtime.SetFinalizer(ring, nil) + + _ = unix.Munmap(ring.prod) + _ = unix.Munmap(ring.cons) + + ring.prod = nil + ring.cons = nil +} + +type ringReader struct { + // These point into mmap'ed memory and must be accessed atomically. + prod_pos, cons_pos *uint64 + mask uint64 + ring []byte +} + +func newRingReader(cons_ptr, prod_ptr *uint64, ring []byte) *ringReader { + return &ringReader{ + prod_pos: prod_ptr, + cons_pos: cons_ptr, + // cap is always a power of two + mask: uint64(cap(ring)/2 - 1), + ring: ring, + } +} + +// To be able to wrap around data, data pages in ring buffers are mapped twice in +// a single contiguous virtual region. +// Therefore the returned usable size is half the size of the mmaped region. +func (rr *ringReader) size() int { + return cap(rr.ring) / 2 +} + +// Read a record from an event ring. +func (rr *ringReader) readRecord(rec *Record) error { + prod := atomic.LoadUint64(rr.prod_pos) + cons := atomic.LoadUint64(rr.cons_pos) + + for { + if remaining := prod - cons; remaining == 0 { + return errEOR + } else if remaining < unix.BPF_RINGBUF_HDR_SZ { + return fmt.Errorf("read record header: %w", io.ErrUnexpectedEOF) + } + + // read the len field of the header atomically to ensure a happens before + // relationship with the xchg in the kernel. Without this we may see len + // without BPF_RINGBUF_BUSY_BIT before the written data is visible. + // See https://github.com/torvalds/linux/blob/v6.8/kernel/bpf/ringbuf.c#L484 + start := cons & rr.mask + len := atomic.LoadUint32((*uint32)((unsafe.Pointer)(&rr.ring[start]))) + header := ringbufHeader{Len: len} + + if header.isBusy() { + // the next sample in the ring is not committed yet so we + // exit without storing the reader/consumer position + // and start again from the same position. + return errBusy + } + + cons += unix.BPF_RINGBUF_HDR_SZ + + // Data is always padded to 8 byte alignment. 
+ dataLenAligned := uint64(internal.Align(header.dataLen(), 8)) + if remaining := prod - cons; remaining < dataLenAligned { + return fmt.Errorf("read sample data: %w", io.ErrUnexpectedEOF) + } + + start = cons & rr.mask + cons += dataLenAligned + + if header.isDiscard() { + // when the record header indicates that the data should be + // discarded, we skip it by just updating the consumer position + // to the next record. + atomic.StoreUint64(rr.cons_pos, cons) + continue + } + + if n := header.dataLen(); cap(rec.RawSample) < n { + rec.RawSample = make([]byte, n) + } else { + rec.RawSample = rec.RawSample[:n] + } + + copy(rec.RawSample, rr.ring[start:]) + rec.Remaining = int(prod - cons) + atomic.StoreUint64(rr.cons_pos, cons) + return nil + } +} diff --git a/vendor/github.com/cilium/ebpf/run-tests.sh b/vendor/github.com/cilium/ebpf/run-tests.sh deleted file mode 100644 index c7ff7ea333..0000000000 --- a/vendor/github.com/cilium/ebpf/run-tests.sh +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env bash -# Test the current package under a different kernel. -# Requires virtme and qemu to be installed. -# Examples: -# Run all tests on a 5.4 kernel -# $ ./run-tests.sh 5.4 -# Run a subset of tests: -# $ ./run-tests.sh 5.4 ./link -# Run using a local kernel image -# $ ./run-tests.sh /path/to/bzImage - -set -euo pipefail - -script="$(realpath "$0")" -readonly script - -source "$(dirname "$script")/testdata/sh/lib.sh" - -quote_env() { - for var in "$@"; do - if [ -v "$var" ]; then - printf "%s=%q " "$var" "${!var}" - fi - done -} - -declare -a preserved_env=( - PATH - CI_MAX_KERNEL_VERSION - TEST_SEED - KERNEL_VERSION -) - -# This script is a bit like a Matryoshka doll since it keeps re-executing itself -# in various different contexts: -# -# 1. invoked by the user like run-tests.sh 5.4 -# 2. invoked by go test like run-tests.sh --exec-vm -# 3. invoked by init in the vm like run-tests.sh --exec-test -# -# This allows us to use all available CPU on the host machine to compile our -# code, and then only use the VM to execute the test. This is because the VM -# is usually slower at compiling than the host. -if [[ "${1:-}" = "--exec-vm" ]]; then - shift - - input="$1" - shift - - # Use sudo if /dev/kvm isn't accessible by the current user. - sudo="" - if [[ ! -r /dev/kvm || ! -w /dev/kvm ]]; then - sudo="sudo" - fi - readonly sudo - - testdir="$(dirname "$1")" - output="$(mktemp -d)" - printf -v cmd "%q " "$@" - - if [[ "$(stat -c '%t:%T' -L /proc/$$/fd/0)" == "1:3" ]]; then - # stdin is /dev/null, which doesn't play well with qemu. Use a fifo as a - # blocking substitute. - mkfifo "${output}/fake-stdin" - # Open for reading and writing to avoid blocking. - exec 0<> "${output}/fake-stdin" - rm "${output}/fake-stdin" - fi - - if ! $sudo virtme-run --kimg "${input}/boot/vmlinuz" --cpus 2 --memory 1G --pwd \ - --rwdir="${testdir}=${testdir}" \ - --rodir=/run/input="${input}" \ - --rwdir=/run/output="${output}" \ - --script-sh "$(quote_env "${preserved_env[@]}") \"$script\" \ - --exec-test $cmd"; then - exit 23 - fi - - if ! 
[[ -e "${output}/status" ]]; then - exit 42 - fi - - rc=$(<"${output}/status") - $sudo rm -r "$output" - exit "$rc" -elif [[ "${1:-}" = "--exec-test" ]]; then - shift - - mount -t bpf bpf /sys/fs/bpf - mount -t tracefs tracefs /sys/kernel/debug/tracing - - if [[ -d "/run/input/usr/src/linux/tools/testing/selftests/bpf" ]]; then - export KERNEL_SELFTESTS="/run/input/usr/src/linux/tools/testing/selftests/bpf" - fi - - if [[ -d "/run/input/lib/modules" ]]; then - find /run/input/lib/modules -type f -name bpf_testmod.ko -exec insmod {} \; - fi - - dmesg --clear - rc=0 - "$@" || rc=$? - dmesg - echo $rc > "/run/output/status" - exit $rc # this return code is "swallowed" by qemu -fi - -if [[ -z "${1:-}" ]]; then - echo "Expecting kernel version or path as first argument" - exit 1 -fi - -input="$(mktemp -d)" -readonly input - -if [[ -f "${1}" ]]; then - # First argument is a local file. - readonly kernel="${1}" - cp "${1}" "${input}/boot/vmlinuz" -else - readonly kernel="${1}" - - # LINUX_VERSION_CODE test compares this to discovered value. - export KERNEL_VERSION="${1}" - - if ! extract_oci_image "ghcr.io/cilium/ci-kernels:${kernel}-selftests" "${input}"; then - extract_oci_image "ghcr.io/cilium/ci-kernels:${kernel}" "${input}" - fi -fi -shift - -args=(-short -coverpkg=./... -coverprofile=coverage.out -count 1 ./...) -if (( $# > 0 )); then - args=("$@") -fi - -export GOFLAGS=-mod=readonly -export CGO_ENABLED=0 - -echo Testing on "${kernel}" -go test -exec "$script --exec-vm $input" "${args[@]}" -echo "Test successful on ${kernel}" - -rm -r "${input}" diff --git a/vendor/golang.org/x/exp/LICENSE b/vendor/golang.org/x/exp/LICENSE index 6a66aea5ea..2a7cf70da6 100644 --- a/vendor/golang.org/x/exp/LICENSE +++ b/vendor/golang.org/x/exp/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
diff --git a/vendor/golang.org/x/net/http2/transport.go b/vendor/golang.org/x/net/http2/transport.go index 98a49c6b6e..61f511f97a 100644 --- a/vendor/golang.org/x/net/http2/transport.go +++ b/vendor/golang.org/x/net/http2/transport.go @@ -827,10 +827,6 @@ func (t *Transport) newClientConn(c net.Conn, singleUse bool) (*ClientConn, erro cc.henc.SetMaxDynamicTableSizeLimit(t.maxEncoderHeaderTableSize()) cc.peerMaxHeaderTableSize = initialHeaderTableSize - if t.AllowHTTP { - cc.nextStreamID = 3 - } - if cs, ok := c.(connectionStater); ok { state := cs.ConnectionState() cc.tlsState = &state diff --git a/vendor/modules.txt b/vendor/modules.txt index 7378971bf2..5a84ffb51c 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -23,12 +23,13 @@ github.com/bits-and-blooms/bitset # github.com/cespare/xxhash/v2 v2.3.0 ## explicit; go 1.11 github.com/cespare/xxhash/v2 -# github.com/cilium/ebpf v0.15.0 -## explicit; go 1.21.0 +# github.com/cilium/ebpf v0.16.0 +## explicit; go 1.21 github.com/cilium/ebpf github.com/cilium/ebpf/asm github.com/cilium/ebpf/btf github.com/cilium/ebpf/internal +github.com/cilium/ebpf/internal/epoll github.com/cilium/ebpf/internal/kallsyms github.com/cilium/ebpf/internal/kconfig github.com/cilium/ebpf/internal/sys @@ -36,6 +37,7 @@ github.com/cilium/ebpf/internal/sysenc github.com/cilium/ebpf/internal/tracefs github.com/cilium/ebpf/internal/unix github.com/cilium/ebpf/link +github.com/cilium/ebpf/ringbuf github.com/cilium/ebpf/rlimit # github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc ## explicit @@ -260,11 +262,11 @@ github.com/spf13/pflag # github.com/stretchr/testify v1.9.0 ## explicit; go 1.17 github.com/stretchr/testify/assert -# golang.org/x/exp v0.0.0-20240119083558-1b970713d09a +# golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 ## explicit; go 1.20 golang.org/x/exp/constraints golang.org/x/exp/maps -# golang.org/x/net v0.26.0 +# golang.org/x/net v0.27.0 ## explicit; go 1.18 golang.org/x/net/context golang.org/x/net/html @@ -285,7 +287,7 @@ golang.org/x/oauth2/internal golang.org/x/sys/plan9 golang.org/x/sys/unix golang.org/x/sys/windows -# golang.org/x/term v0.21.0 +# golang.org/x/term v0.22.0 ## explicit; go 1.18 golang.org/x/term # golang.org/x/text v0.16.0 @@ -313,7 +315,7 @@ golang.org/x/text/unicode/norm # golang.org/x/time v0.5.0 ## explicit; go 1.18 golang.org/x/time/rate -# golang.org/x/tools v0.22.0 +# golang.org/x/tools v0.23.0 ## explicit; go 1.19 golang.org/x/tools/cover golang.org/x/tools/go/ast/inspector
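// Editor's sketch (not part of the vendored diff): the cursor-driven BatchLookup loop
// whose error wrapping changes above ("map batch lookup: ..."). The key/value types,
// batch size and map argument are illustrative assumptions; per the vendored docs,
// ErrKeyNotExist marks the end of all results even when a partial batch is returned.
package example

import (
	"errors"
	"log"

	"github.com/cilium/ebpf"
)

func dumpAll(m *ebpf.Map) error {
	var cursor ebpf.MapBatchCursor
	keys := make([]uint32, 64)
	vals := make([]uint64, 64)

	for {
		n, err := m.BatchLookup(&cursor, keys, vals, nil)
		for i := 0; i < n; i++ {
			log.Printf("key=%d value=%d", keys[i], vals[i])
		}
		if errors.Is(err, ebpf.ErrKeyNotExist) {
			return nil // end of map reached
		}
		if err != nil {
			return err
		}
	}
}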