diff options
Diffstat (limited to 'tools')
37 files changed, 1588 insertions, 49 deletions
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4fa4bc4505f5..4252fc22f78f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -880,15 +880,26 @@ struct bpf_object *bpf_object__open(const char *path) } struct bpf_object *bpf_object__open_buffer(void *obj_buf, - size_t obj_buf_sz) + size_t obj_buf_sz, + const char *name) { + char tmp_name[64]; + /* param validation */ if (!obj_buf || obj_buf_sz <= 0) return NULL; - pr_debug("loading object from buffer\n"); + if (!name) { + snprintf(tmp_name, sizeof(tmp_name), "%lx-%lx", + (unsigned long)obj_buf, + (unsigned long)obj_buf_sz); + tmp_name[sizeof(tmp_name) - 1] = '\0'; + name = tmp_name; + } + pr_debug("loading object '%s' from buffer\n", + name); - return __bpf_object__open("[buffer]", obj_buf, obj_buf_sz); + return __bpf_object__open(name, obj_buf, obj_buf_sz); } int bpf_object__unload(struct bpf_object *obj) @@ -975,6 +986,14 @@ bpf_object__next(struct bpf_object *prev) return next; } +const char * +bpf_object__get_name(struct bpf_object *obj) +{ + if (!obj) + return NULL; + return obj->path; +} + struct bpf_program * bpf_program__next(struct bpf_program *prev, struct bpf_object *obj) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index ea8adc206b62..f16170c95ffd 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -28,12 +28,14 @@ struct bpf_object; struct bpf_object *bpf_object__open(const char *path); struct bpf_object *bpf_object__open_buffer(void *obj_buf, - size_t obj_buf_sz); + size_t obj_buf_sz, + const char *name); void bpf_object__close(struct bpf_object *object); /* Load/unload object into/from kernel */ int bpf_object__load(struct bpf_object *obj); int bpf_object__unload(struct bpf_object *obj); +const char *bpf_object__get_name(struct bpf_object *obj); struct bpf_object *bpf_object__next(struct bpf_object *prev); #define bpf_object__for_each_safe(pos, tmp) \ diff --git a/tools/net/bpf_jit_disasm.c b/tools/net/bpf_jit_disasm.c index 
618c2bcd4eab..2cd3d4c99738 100644 --- a/tools/net/bpf_jit_disasm.c +++ b/tools/net/bpf_jit_disasm.c @@ -22,9 +22,14 @@ #include <string.h> #include <bfd.h> #include <dis-asm.h> +#include <regex.h> +#include <fcntl.h> #include <sys/klog.h> #include <sys/types.h> -#include <regex.h> +#include <sys/stat.h> + +#define CMD_ACTION_SIZE_BUFFER 10 +#define CMD_ACTION_READ_ALL 3 static void get_exec_path(char *tpath, size_t size) { @@ -87,20 +92,66 @@ static void get_asm_insns(uint8_t *image, size_t len, int opcodes) bfd_close(bfdf); } -static char *get_klog_buff(int *klen) +static char *get_klog_buff(unsigned int *klen) { - int ret, len = klogctl(10, NULL, 0); - char *buff = malloc(len); + int ret, len; + char *buff; + + len = klogctl(CMD_ACTION_SIZE_BUFFER, NULL, 0); + buff = malloc(len); + if (!buff) + return NULL; + + ret = klogctl(CMD_ACTION_READ_ALL, buff, len); + if (ret < 0) { + free(buff); + return NULL; + } - assert(buff && klen); - ret = klogctl(3, buff, len); - assert(ret >= 0); *klen = ret; + return buff; +} +static char *get_flog_buff(const char *file, unsigned int *klen) +{ + int fd, ret, len; + struct stat fi; + char *buff; + + fd = open(file, O_RDONLY); + if (fd < 0) + return NULL; + + ret = fstat(fd, &fi); + if (ret < 0 || !S_ISREG(fi.st_mode)) + goto out; + + len = fi.st_size + 1; + buff = malloc(len); + if (!buff) + goto out; + + memset(buff, 0, len); + ret = read(fd, buff, len - 1); + if (ret <= 0) + goto out_free; + + close(fd); + *klen = ret; return buff; +out_free: + free(buff); +out: + close(fd); + return NULL; +} + +static char *get_log_buff(const char *file, unsigned int *klen) +{ + return file ? 
get_flog_buff(file, klen) : get_klog_buff(klen); } -static void put_klog_buff(char *buff) +static void put_log_buff(char *buff) { free(buff); } @@ -138,8 +189,10 @@ static int get_last_jit_image(char *haystack, size_t hlen, ptr = haystack + off - (pmatch[0].rm_eo - pmatch[0].rm_so); ret = sscanf(ptr, "flen=%d proglen=%d pass=%d image=%lx", &flen, &proglen, &pass, &base); - if (ret != 4) + if (ret != 4) { + regfree(&regex); return 0; + } tmp = ptr = haystack + off; while ((ptr = strtok(tmp, "\n")) != NULL && ulen < ilen) { @@ -169,31 +222,49 @@ static int get_last_jit_image(char *haystack, size_t hlen, return ulen; } +static void usage(void) +{ + printf("Usage: bpf_jit_disasm [...]\n"); + printf(" -o Also display related opcodes (default: off).\n"); + printf(" -f <file> Read last image dump from file or stdin (default: klog).\n"); + printf(" -h Display this help.\n"); +} + int main(int argc, char **argv) { - int len, klen, opcodes = 0; - char *kbuff; + unsigned int len, klen, opt, opcodes = 0; static uint8_t image[32768]; + char *kbuff, *file = NULL; - if (argc > 1) { - if (!strncmp("-o", argv[argc - 1], 2)) { + while ((opt = getopt(argc, argv, "of:")) != -1) { + switch (opt) { + case 'o': opcodes = 1; - } else { - printf("usage: bpf_jit_disasm [-o: show opcodes]\n"); - exit(0); + break; + case 'f': + file = optarg; + break; + default: + usage(); + return -1; } } bfd_init(); memset(image, 0, sizeof(image)); - kbuff = get_klog_buff(&klen); + kbuff = get_log_buff(file, &klen); + if (!kbuff) { + fprintf(stderr, "Could not retrieve log buffer!\n"); + return -1; + } len = get_last_jit_image(kbuff, klen, image, sizeof(image)); if (len > 0) get_asm_insns(image, len, opcodes); + else + fprintf(stderr, "No JIT image found!\n"); - put_klog_buff(kbuff); - + put_log_buff(kbuff); return 0; } diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 347a27322ed8..2e9ce77b5e14 100644 --- a/tools/perf/Documentation/perf-record.txt +++
b/tools/perf/Documentation/perf-record.txt @@ -276,7 +276,11 @@ filter out the startup phase of the program, which is often very different. --intr-regs:: Capture machine state (registers) at interrupt, i.e., on counter overflows for each sample. List of captured registers depends on the architecture. This option -is off by default. +is off by default. It is possible to select the registers to sample using their +symbolic names, e.g. on x86, ax, si. To list the available registers use +--intr-regs=\?. To name registers, pass a comma separated list such as +--intr-regs=ax,bx. The list of register is architecture dependent. + --running-time:: Record running and enabled time for read events (:S) diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 614b2c7b0293..dc3ec783b7bd 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -116,7 +116,7 @@ OPTIONS --fields:: Comma separated list of fields to print. Options are: comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, - srcline, period, flags. + srcline, period, iregs, flags. Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. e.g., -f sw:comm,tid,time,ip,sym and -f trace:time,cpu,trace diff --git a/tools/perf/arch/sh/util/dwarf-regs.c b/tools/perf/arch/sh/util/dwarf-regs.c index 0d0897f57a10..f8dfa89696f4 100644 --- a/tools/perf/arch/sh/util/dwarf-regs.c +++ b/tools/perf/arch/sh/util/dwarf-regs.c @@ -51,5 +51,5 @@ const char *sh_regs_table[SH_MAX_REGS] = { /* Return architecture dependent register string (for kprobe-tracer) */ const char *get_arch_regstr(unsigned int n) { - return (n <= SH_MAX_REGS) ? sh_regs_table[n] : NULL; + return (n < SH_MAX_REGS) ? 
sh_regs_table[n] : NULL; } diff --git a/tools/perf/arch/sparc/util/dwarf-regs.c b/tools/perf/arch/sparc/util/dwarf-regs.c index 92eda412fed3..b704fdb9237a 100644 --- a/tools/perf/arch/sparc/util/dwarf-regs.c +++ b/tools/perf/arch/sparc/util/dwarf-regs.c @@ -39,5 +39,5 @@ const char *sparc_regs_table[SPARC_MAX_REGS] = { */ const char *get_arch_regstr(unsigned int n) { - return (n <= SPARC_MAX_REGS) ? sparc_regs_table[n] : NULL; + return (n < SPARC_MAX_REGS) ? sparc_regs_table[n] : NULL; } diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index 2c55e1b336c5..ff63649fa9ac 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -2,6 +2,7 @@ libperf-y += header.o libperf-y += tsc.o libperf-y += pmu.o libperf-y += kvm-stat.o +libperf-y += perf_regs.o libperf-$(CONFIG_DWARF) += dwarf-regs.o diff --git a/tools/perf/arch/x86/util/dwarf-regs.c b/tools/perf/arch/x86/util/dwarf-regs.c index be22dd463232..a08de0a35b83 100644 --- a/tools/perf/arch/x86/util/dwarf-regs.c +++ b/tools/perf/arch/x86/util/dwarf-regs.c @@ -71,5 +71,5 @@ const char *x86_64_regs_table[X86_64_MAX_REGS] = { /* Return architecture dependent register string (for kprobe-tracer) */ const char *get_arch_regstr(unsigned int n) { - return (n <= ARCH_MAX_REGS) ? arch_regs_table[n] : NULL; + return (n < ARCH_MAX_REGS) ? 
arch_regs_table[n] : NULL; } diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c new file mode 100644 index 000000000000..c5db14f36cc7 --- /dev/null +++ b/tools/perf/arch/x86/util/perf_regs.c @@ -0,0 +1,28 @@ +#include "../../perf.h" +#include "../../util/perf_regs.h" + +const struct sample_reg sample_reg_masks[] = { + SMPL_REG(AX, PERF_REG_X86_AX), + SMPL_REG(BX, PERF_REG_X86_BX), + SMPL_REG(CX, PERF_REG_X86_CX), + SMPL_REG(DX, PERF_REG_X86_DX), + SMPL_REG(SI, PERF_REG_X86_SI), + SMPL_REG(DI, PERF_REG_X86_DI), + SMPL_REG(BP, PERF_REG_X86_BP), + SMPL_REG(SP, PERF_REG_X86_SP), + SMPL_REG(IP, PERF_REG_X86_IP), + SMPL_REG(FLAGS, PERF_REG_X86_FLAGS), + SMPL_REG(CS, PERF_REG_X86_CS), + SMPL_REG(SS, PERF_REG_X86_SS), +#ifdef HAVE_ARCH_X86_64_SUPPORT + SMPL_REG(R8, PERF_REG_X86_R8), + SMPL_REG(R9, PERF_REG_X86_R9), + SMPL_REG(R10, PERF_REG_X86_R10), + SMPL_REG(R11, PERF_REG_X86_R11), + SMPL_REG(R12, PERF_REG_X86_R12), + SMPL_REG(R13, PERF_REG_X86_R13), + SMPL_REG(R14, PERF_REG_X86_R14), + SMPL_REG(R15, PERF_REG_X86_R15), +#endif + SMPL_REG_END +}; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index a660022f2c92..142eeb341b29 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -27,8 +27,10 @@ #include "util/cpumap.h" #include "util/thread_map.h" #include "util/data.h" +#include "util/perf_regs.h" #include "util/auxtrace.h" #include "util/parse-branch-options.h" +#include "util/parse-regs-options.h" #include <unistd.h> #include <sched.h> @@ -279,7 +281,7 @@ static int record__open(struct record *rec) evlist__for_each(evlist, pos) { try_again: - if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) { + if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) { if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) { if (verbose) ui__warning("%s\n", msg); @@ -1080,8 +1082,9 @@ struct option __record_options[] = { "sample transaction flags (special events only)"), OPT_BOOLEAN(0, 
"per-thread", &record.opts.target.per_thread, "use per-thread mmaps"), - OPT_BOOLEAN('I', "intr-regs", &record.opts.sample_intr_regs, - "Sample machine registers on interrupt"), + OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register", + "sample selected machine registers on interrupt," + " use -I ? to list register names", parse_regs), OPT_BOOLEAN(0, "running-time", &record.opts.running_time, "Record running/enabled time of read (:S) events"), OPT_CALLBACK('k', "clockid", &record.opts, diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 4430340292c0..eb51325e8ad9 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -6,6 +6,7 @@ #include "util/exec_cmd.h" #include "util/header.h" #include "util/parse-options.h" +#include "util/perf_regs.h" #include "util/session.h" #include "util/tool.h" #include "util/symbol.h" @@ -46,6 +47,7 @@ enum perf_output_field { PERF_OUTPUT_SYMOFFSET = 1U << 11, PERF_OUTPUT_SRCLINE = 1U << 12, PERF_OUTPUT_PERIOD = 1U << 13, + PERF_OUTPUT_IREGS = 1U << 14, }; struct output_option { @@ -66,6 +68,7 @@ struct output_option { {.str = "symoff", .field = PERF_OUTPUT_SYMOFFSET}, {.str = "srcline", .field = PERF_OUTPUT_SRCLINE}, {.str = "period", .field = PERF_OUTPUT_PERIOD}, + {.str = "iregs", .field = PERF_OUTPUT_IREGS}, }; /* default set to maintain compatibility with current format */ @@ -255,6 +258,11 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, PERF_OUTPUT_PERIOD)) return -EINVAL; + if (PRINT_FIELD(IREGS) && + perf_evsel__check_stype(evsel, PERF_SAMPLE_REGS_INTR, "IREGS", + PERF_OUTPUT_IREGS)) + return -EINVAL; + return 0; } @@ -352,6 +360,24 @@ out: return 0; } +static void print_sample_iregs(union perf_event *event __maybe_unused, + struct perf_sample *sample, + struct thread *thread __maybe_unused, + struct perf_event_attr *attr) +{ + struct regs_dump *regs = &sample->intr_regs; + uint64_t mask = attr->sample_regs_intr; + unsigned i = 0, 
r; + + if (!regs) + return; + + for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) { + u64 val = regs->regs[i++]; + printf("%5s:0x%"PRIx64" ", perf_reg_name(r), val); + } +} + static void print_sample_start(struct perf_sample *sample, struct thread *thread, struct perf_evsel *evsel) @@ -525,6 +551,9 @@ static void process_event(union perf_event *event, struct perf_sample *sample, PERF_MAX_STACK_DEPTH); } + if (PRINT_FIELD(IREGS)) + print_sample_iregs(event, sample, thread, attr); + printf("\n"); } @@ -1643,7 +1672,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) "comma separated output fields prepend with 'type:'. " "Valid types: hw,sw,trace,raw. " "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso," - "addr,symoff,period,flags", parse_output_fields), + "addr,symoff,period,iregs,flags", parse_output_fields), OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]", diff --git a/tools/perf/perf.h b/tools/perf/perf.h index cccb4cf575d3..90129accffbe 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -54,7 +54,6 @@ struct record_opts { bool sample_time_set; bool callgraph_set; bool period; - bool sample_intr_regs; bool running_time; bool full_auxtrace; bool auxtrace_snapshot_mode; @@ -64,6 +63,7 @@ struct record_opts { unsigned int auxtrace_mmap_pages; unsigned int user_freq; u64 branch_stack; + u64 sample_intr_regs; u64 default_interval; u64 user_interval; size_t auxtrace_snapshot_size; diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c index a337356fd979..52d55971f66f 100644 --- a/tools/perf/tests/llvm.c +++ b/tools/perf/tests/llvm.c @@ -26,7 +26,7 @@ static int test__bpf_parsing(void *obj_buf, size_t obj_buf_sz) { struct bpf_object *obj; - obj = bpf_object__open_buffer(obj_buf, obj_buf_sz); + obj = bpf_object__open_buffer(obj_buf, obj_buf_sz, NULL); if (!obj) return -1; bpf_object__close(obj); 
diff --git a/tools/perf/util/Build b/tools/perf/util/Build index e912856cc4e5..349bc96ca1fe 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -75,6 +75,7 @@ libperf-y += record.o libperf-y += srcline.o libperf-y += data.o libperf-$(CONFIG_X86) += tsc.o +libperf-$(CONFIG_AUXTRACE) += tsc.o libperf-y += cloexec.o libperf-y += thread-stack.o libperf-$(CONFIG_AUXTRACE) += auxtrace.o @@ -82,6 +83,7 @@ libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/ libperf-$(CONFIG_AUXTRACE) += intel-pt.o libperf-$(CONFIG_AUXTRACE) += intel-bts.o libperf-y += parse-branch-options.o +libperf-y += parse-regs-options.o libperf-$(CONFIG_LIBELF) += symbol-elf.o libperf-$(CONFIG_LIBELF) += probe-file.o diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 8d00039d6a20..d51a5200c8af 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1181,6 +1181,10 @@ int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **e if (evsel->filter == NULL) continue; + /* + * filters only work for tracepoint event, which doesn't have cpu limit. + * So evlist and evsel should always be same. 
+ */ err = perf_evsel__apply_filter(evsel, ncpus, nthreads, evsel->filter); if (err) { *err_evsel = evsel; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index bac25f41a751..c53f79123b37 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -787,7 +787,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) perf_evsel__config_callgraph(evsel, opts, &callchain_param); if (opts->sample_intr_regs) { - attr->sample_regs_intr = PERF_REGS_MASK; + attr->sample_regs_intr = opts->sample_intr_regs; perf_evsel__set_sample_bit(evsel, REGS_INTR); } diff --git a/tools/perf/util/intel-pt-decoder/Build b/tools/perf/util/intel-pt-decoder/Build index 240730d682c1..2386322ece4f 100644 --- a/tools/perf/util/intel-pt-decoder/Build +++ b/tools/perf/util/intel-pt-decoder/Build @@ -4,6 +4,7 @@ inat_tables_script = util/intel-pt-decoder/gen-insn-attr-x86.awk inat_tables_maps = util/intel-pt-decoder/x86-opcode-map.txt $(OUTPUT)util/intel-pt-decoder/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) + $(call rule_mkdir) @$(call echo-cmd,gen)$(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@ $(OUTPUT)util/intel-pt-decoder/intel-pt-insn-decoder.o: util/intel-pt-decoder/inat.c $(OUTPUT)util/intel-pt-decoder/inat-tables.c diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c index 9e4eb8fcd559..d23138c06665 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c @@ -146,6 +146,9 @@ static void intel_pt_insn_decoder(struct insn *insn, case 4: intel_pt_insn->rel = bswap_32(insn->immediate.value); break; + default: + intel_pt_insn->rel = 0; + break; } #else intel_pt_insn->rel = insn->immediate.value; diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c new file mode 100644 index 000000000000..4f2c1c255d81 --- /dev/null +++ 
b/tools/perf/util/parse-regs-options.c @@ -0,0 +1,71 @@ +#include "perf.h" +#include "util/util.h" +#include "util/debug.h" +#include "util/parse-options.h" +#include "util/parse-regs-options.h" + +int +parse_regs(const struct option *opt, const char *str, int unset) +{ + uint64_t *mode = (uint64_t *)opt->value; + const struct sample_reg *r; + char *s, *os = NULL, *p; + int ret = -1; + + if (unset) + return 0; + + /* + * cannot set it twice + */ + if (*mode) + return -1; + + /* str may be NULL in case no arg is passed to -I */ + if (str) { + /* because str is read-only */ + s = os = strdup(str); + if (!s) + return -1; + + for (;;) { + p = strchr(s, ','); + if (p) + *p = '\0'; + + if (!strcmp(s, "?")) { + fprintf(stderr, "available registers: "); + for (r = sample_reg_masks; r->name; r++) { + fprintf(stderr, "%s ", r->name); + } + fputc('\n', stderr); + /* just printing available regs */ + return -1; + } + for (r = sample_reg_masks; r->name; r++) { + if (!strcasecmp(s, r->name)) + break; + } + if (!r->name) { + ui__warning("unknown register %s," + " check man page\n", s); + goto error; + } + + *mode |= r->mask; + + if (!p) + break; + + s = p + 1; + } + } + ret = 0; + + /* default to all possible regs */ + if (*mode == 0) + *mode = PERF_REGS_MASK; +error: + free(os); + return ret; +} diff --git a/tools/perf/util/parse-regs-options.h b/tools/perf/util/parse-regs-options.h new file mode 100644 index 000000000000..7d762b188007 --- /dev/null +++ b/tools/perf/util/parse-regs-options.h @@ -0,0 +1,5 @@ +#ifndef _PERF_PARSE_REGS_OPTIONS_H +#define _PERF_PARSE_REGS_OPTIONS_H 1 +struct option; +int parse_regs(const struct option *opt, const char *str, int unset); +#endif /* _PERF_PARSE_REGS_OPTIONS_H */ diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c index 43168fb0d9a2..885e8ac83997 100644 --- a/tools/perf/util/perf_regs.c +++ b/tools/perf/util/perf_regs.c @@ -2,6 +2,10 @@ #include "perf_regs.h" #include "event.h" +const struct sample_reg __weak 
sample_reg_masks[] = { + SMPL_REG_END +}; + int perf_reg_value(u64 *valp, struct regs_dump *regs, int id) { int i, idx = 0; diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h index 980dbf76bc98..2984dcc54d67 100644 --- a/tools/perf/util/perf_regs.h +++ b/tools/perf/util/perf_regs.h @@ -5,6 +5,15 @@ struct regs_dump; +struct sample_reg { + const char *name; + uint64_t mask; +}; +#define SMPL_REG(n, b) { .name = #n, .mask = 1ULL << (b) } +#define SMPL_REG_END { .name = NULL } + +extern const struct sample_reg sample_reg_masks[]; + #ifdef HAVE_PERF_REGS_SUPPORT #include <perf_regs.h> diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 24ae9e829e9a..b8f12e0897e6 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -20,6 +20,7 @@ ifneq (1, $(quicktest)) TARGETS += timers endif TARGETS += user +TARGETS += jumplabel TARGETS += vm TARGETS += x86 #Please keep the TARGETS list alphabetically sorted diff --git a/tools/testing/selftests/capabilities/.gitignore b/tools/testing/selftests/capabilities/.gitignore new file mode 100644 index 000000000000..b732dd0d4738 --- /dev/null +++ b/tools/testing/selftests/capabilities/.gitignore @@ -0,0 +1,2 @@ +test_execve +validate_cap diff --git a/tools/testing/selftests/capabilities/Makefile b/tools/testing/selftests/capabilities/Makefile new file mode 100644 index 000000000000..8c8f0c1f0889 --- /dev/null +++ b/tools/testing/selftests/capabilities/Makefile @@ -0,0 +1,18 @@ +all: + +include ../lib.mk + +.PHONY: all clean + +TARGETS := validate_cap test_execve +TEST_PROGS := test_execve + +CFLAGS := -O2 -g -std=gnu99 -Wall -lcap-ng + +all: $(TARGETS) + +clean: + $(RM) $(TARGETS) + +$(TARGETS): %: %.c + $(CC) -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c new file mode 100644 index 000000000000..10a21a958aaf --- /dev/null +++ 
b/tools/testing/selftests/capabilities/test_execve.c @@ -0,0 +1,427 @@ +#define _GNU_SOURCE + +#include <cap-ng.h> +#include <err.h> +#include <linux/capability.h> +#include <stdbool.h> +#include <string.h> +#include <stdio.h> +#include <fcntl.h> +#include <errno.h> +#include <stdarg.h> +#include <sched.h> +#include <sys/mount.h> +#include <limits.h> +#include <libgen.h> +#include <malloc.h> +#include <sys/wait.h> +#include <sys/prctl.h> +#include <sys/stat.h> + +#ifndef PR_CAP_AMBIENT +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 +#endif + +static int nerrs; + +static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap) +{ + char buf[4096]; + int fd; + ssize_t written; + int buf_len; + + buf_len = vsnprintf(buf, sizeof(buf), fmt, ap); + if (buf_len < 0) { + err(1, "vsnprintf failed"); + } + if (buf_len >= sizeof(buf)) { + errx(1, "vsnprintf output truncated"); + } + + fd = open(filename, O_WRONLY); + if (fd < 0) { + if ((errno == ENOENT) && enoent_ok) + return; + err(1, "open of %s failed", filename); + } + written = write(fd, buf, buf_len); + if (written != buf_len) { + if (written >= 0) { + errx(1, "short write to %s", filename); + } else { + err(1, "write to %s failed", filename); + } + } + if (close(fd) != 0) { + err(1, "close of %s failed", filename); + } +} + +static void maybe_write_file(char *filename, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vmaybe_write_file(true, filename, fmt, ap); + va_end(ap); +} + +static void write_file(char *filename, char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + vmaybe_write_file(false, filename, fmt, ap); + va_end(ap); +} + +static bool create_and_enter_ns(uid_t inner_uid) +{ + uid_t outer_uid; + gid_t outer_gid; + int i; + bool have_outer_privilege; + + outer_uid = getuid(); + outer_gid = getgid(); + + /* + * TODO: If we're already root, we could skip creating the userns. + */ + + if (unshare(CLONE_NEWNS) == 0) { + printf("[NOTE]\tUsing global UIDs for tests\n"); + if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) != 0) + err(1, "PR_SET_KEEPCAPS"); + if (setresuid(inner_uid, inner_uid, -1) != 0) + err(1, "setresuid"); + + // Re-enable effective caps + capng_get_caps_process(); + for (i = 0; i < CAP_LAST_CAP; i++) + if (capng_have_capability(CAPNG_PERMITTED, i)) + capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + + have_outer_privilege = true; + } else if (unshare(CLONE_NEWUSER | CLONE_NEWNS) == 0) { + printf("[NOTE]\tUsing a user namespace for tests\n"); + maybe_write_file("/proc/self/setgroups", "deny"); + write_file("/proc/self/uid_map", "%d %d 1", inner_uid, outer_uid); + write_file("/proc/self/gid_map", "0 %d 1", outer_gid); + + have_outer_privilege = false; + } else { + errx(1, "must be root or be able to create a userns"); + } + + if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0) + err(1, "remount everything private"); + + return have_outer_privilege; +} + +static void chdir_to_tmpfs(void) +{ + char cwd[PATH_MAX]; + if (getcwd(cwd, sizeof(cwd)) != cwd) + err(1, "getcwd"); + + if (mount("private_tmp", ".", "tmpfs", 0, "mode=0777") != 0) + err(1, "mount private tmpfs"); + + if (chdir(cwd) != 0) + err(1, "chdir to private tmpfs"); + + if (umount2(".", MNT_DETACH) != 0) + err(1, "detach private tmpfs"); +} + +static void copy_fromat_to(int fromfd, const char *fromname, const char *toname) +{ + int from = openat(fromfd, fromname, O_RDONLY); + if (from == -1) + err(1, "open copy source"); + + int to = open(toname, 
O_CREAT | O_WRONLY | O_EXCL, 0700); + + while (true) { + char buf[4096]; + ssize_t sz = read(from, buf, sizeof(buf)); + if (sz == 0) + break; + if (sz < 0) + err(1, "read"); + + if (write(to, buf, sz) != sz) + err(1, "write"); /* no short writes on tmpfs */ + } + + close(from); + close(to); +} + +static bool fork_wait(void) +{ + pid_t child = fork(); + if (child == 0) { + nerrs = 0; + return true; + } else if (child > 0) { + int status; + if (waitpid(child, &status, 0) != child || + !WIFEXITED(status)) { + printf("[FAIL]\tChild died\n"); + nerrs++; + } else if (WEXITSTATUS(status) != 0) { + printf("[FAIL]\tChild failed\n"); + nerrs++; + } else { + printf("[OK]\tChild succeeded\n"); + } + + return false; + } else { + err(1, "fork"); + } +} + +static void exec_other_validate_cap(const char *name, + bool eff, bool perm, bool inh, bool ambient) +{ + execl(name, name, (eff ? "1" : "0"), + (perm ? "1" : "0"), (inh ? "1" : "0"), (ambient ? "1" : "0"), + NULL); + err(1, "execl"); +} + +static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient) +{ + exec_other_validate_cap("./validate_cap", eff, perm, inh, ambient); +} + +static int do_tests(int uid, const char *our_path) +{ + bool have_outer_privilege = create_and_enter_ns(uid); + + int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY); + if (ourpath_fd == -1) + err(1, "open '%s'", our_path); + + chdir_to_tmpfs(); + + copy_fromat_to(ourpath_fd, "validate_cap", "validate_cap"); + + if (have_outer_privilege) { + uid_t gid = getegid(); + + copy_fromat_to(ourpath_fd, "validate_cap", + "validate_cap_suidroot"); + if (chown("validate_cap_suidroot", 0, -1) != 0) + err(1, "chown"); + if (chmod("validate_cap_suidroot", S_ISUID | 0700) != 0) + err(1, "chmod"); + + copy_fromat_to(ourpath_fd, "validate_cap", + "validate_cap_suidnonroot"); + if (chown("validate_cap_suidnonroot", uid + 1, -1) != 0) + err(1, "chown"); + if (chmod("validate_cap_suidnonroot", S_ISUID | 0700) != 0) + err(1, "chmod"); + + 
copy_fromat_to(ourpath_fd, "validate_cap", + "validate_cap_sgidroot"); + if (chown("validate_cap_sgidroot", -1, 0) != 0) + err(1, "chown"); + if (chmod("validate_cap_sgidroot", S_ISGID | 0710) != 0) + err(1, "chmod"); + + copy_fromat_to(ourpath_fd, "validate_cap", + "validate_cap_sgidnonroot"); + if (chown("validate_cap_sgidnonroot", -1, gid + 1) != 0) + err(1, "chown"); + if (chmod("validate_cap_sgidnonroot", S_ISGID | 0710) != 0) + err(1, "chmod"); +} + + capng_get_caps_process(); + + /* Make sure that i starts out clear */ + capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + + if (uid == 0) { + printf("[RUN]\tRoot => ep\n"); + if (fork_wait()) + exec_validate_cap(true, true, false, false); + } else { + printf("[RUN]\tNon-root => no caps\n"); + if (fork_wait()) + exec_validate_cap(false, false, false, false); + } + + printf("[OK]\tCheck cap_ambient manipulation rules\n"); + + /* We should not be able to add ambient caps yet. 
*/ + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != -1 || errno != EPERM) { + if (errno == EINVAL) + printf("[FAIL]\tPR_CAP_AMBIENT_RAISE isn't supported\n"); + else + printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed eith EPERM on a non-inheritable cap\n"); + return 1; + } + printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-inheritable cap\n"); + + capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_RAW); + capng_update(CAPNG_DROP, CAPNG_PERMITTED, CAP_NET_RAW); + capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, CAP_NET_RAW); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_RAW, 0, 0, 0) != -1 || errno != EPERM) { + printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed on a non-permitted cap\n"); + return 1; + } + printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-permitted cap\n"); + + capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) { + printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have succeeded\n"); + return 1; + } + printf("[OK]\tPR_CAP_AMBIENT_RAISE worked\n"); + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 1) { + printf("[FAIL]\tPR_CAP_AMBIENT_IS_SET is broken\n"); + return 1; + } + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0, 0) != 0) + err(1, "PR_CAP_AMBIENT_CLEAR_ALL"); + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) { + printf("[FAIL]\tPR_CAP_AMBIENT_CLEAR_ALL didn't work\n"); + return 1; + } + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) + err(1, "PR_CAP_AMBIENT_RAISE"); + + capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + + if (prctl(PR_CAP_AMBIENT, 
PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) { + printf("[FAIL]\tDropping I should have dropped A\n"); + return 1; + } + + printf("[OK]\tBasic manipulation appears to work\n"); + + capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + if (uid == 0) { + printf("[RUN]\tRoot +i => eip\n"); + if (fork_wait()) + exec_validate_cap(true, true, true, false); + } else { + printf("[RUN]\tNon-root +i => i\n"); + if (fork_wait()) + exec_validate_cap(false, false, true, false); + } + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) + err(1, "PR_CAP_AMBIENT_RAISE"); + + printf("[RUN]\tUID %d +ia => eipa\n", uid); + if (fork_wait()) + exec_validate_cap(true, true, true, true); + + /* The remaining tests need real privilege */ + + if (!have_outer_privilege) { + printf("[SKIP]\tSUID/SGID tests (needs privilege)\n"); + goto done; + } + + if (uid == 0) { + printf("[RUN]\tRoot +ia, suidroot => eipa\n"); + if (fork_wait()) + exec_other_validate_cap("./validate_cap_suidroot", + true, true, true, true); + + printf("[RUN]\tRoot +ia, suidnonroot => ip\n"); + if (fork_wait()) + exec_other_validate_cap("./validate_cap_suidnonroot", + false, true, true, false); + + printf("[RUN]\tRoot +ia, sgidroot => eipa\n"); + if (fork_wait()) + exec_other_validate_cap("./validate_cap_sgidroot", + true, true, true, true); + + if (fork_wait()) { + printf("[RUN]\tRoot, gid != 0, +ia, sgidroot => eip\n"); + if (setresgid(1, 1, 1) != 0) + err(1, "setresgid"); + exec_other_validate_cap("./validate_cap_sgidroot", + true, true, true, false); + } + + printf("[RUN]\tRoot +ia, sgidnonroot => eip\n"); + if (fork_wait()) + exec_other_validate_cap("./validate_cap_sgidnonroot", + true, true, true, false); + } else { + printf("[RUN]\tNon-root +ia, sgidnonroot => i\n"); + exec_other_validate_cap("./validate_cap_sgidnonroot", + false, false, true, false); + + if (fork_wait()) { + 
printf("[RUN]\tNon-root +ia, sgidroot => i\n"); + if (setresgid(1, 1, 1) != 0) + err(1, "setresgid"); + exec_other_validate_cap("./validate_cap_sgidroot", + false, false, true, false); + } + } + +done: + return nerrs ? 1 : 0; +} + +int main(int argc, char **argv) +{ + char *tmp1, *tmp2, *our_path; + + /* Find our path */ + tmp1 = strdup(argv[0]); + if (!tmp1) + err(1, "strdup"); + tmp2 = dirname(tmp1); + our_path = strdup(tmp2); + if (!our_path) + err(1, "strdup"); + free(tmp1); + + if (fork_wait()) { + printf("[RUN]\t+++ Tests with uid == 0 +++\n"); + return do_tests(0, our_path); + } + + if (fork_wait()) { + printf("[RUN]\t+++ Tests with uid != 0 +++\n"); + return do_tests(1, our_path); + } + + return nerrs ? 1 : 0; +} diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c new file mode 100644 index 000000000000..dd3c45f7b23c --- /dev/null +++ b/tools/testing/selftests/capabilities/validate_cap.c @@ -0,0 +1,73 @@ +#include <cap-ng.h> +#include <err.h> +#include <linux/capability.h> +#include <stdbool.h> +#include <string.h> +#include <stdio.h> +#include <sys/prctl.h> +#include <sys/auxv.h> + +#ifndef PR_CAP_AMBIENT +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 +#endif + +#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 19) +# define HAVE_GETAUXVAL +#endif + +static bool bool_arg(char **argv, int i) +{ + if (!strcmp(argv[i], "0")) + return false; + else if (!strcmp(argv[i], "1")) + return true; + else + errx(1, "wrong argv[%d]", i); +} + +int main(int argc, char **argv) +{ + const char *atsec = ""; + + /* + * Be careful just in case a setgid or setcapped copy of this + * helper gets out. 
+ */ + + if (argc != 5) + errx(1, "wrong argc"); + +#ifdef HAVE_GETAUXVAL + if (getauxval(AT_SECURE)) + atsec = " (AT_SECURE is set)"; + else + atsec = " (AT_SECURE is not set)"; +#endif + + capng_get_caps_process(); + + if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) { + printf("[FAIL]\tWrong effective state%s\n", atsec); + return 1; + } + if (capng_have_capability(CAPNG_PERMITTED, CAP_NET_BIND_SERVICE) != bool_arg(argv, 2)) { + printf("[FAIL]\tWrong permitted state%s\n", atsec); + return 1; + } + if (capng_have_capability(CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 3)) { + printf("[FAIL]\tWrong inheritable state%s\n", atsec); + return 1; + } + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != bool_arg(argv, 4)) { + printf("[FAIL]\tWrong ambient state%s\n", atsec); + return 1; + } + + printf("[OK]\tCapabilities after execve were correct\n"); + return 0; +} diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c index 08c2a36ef7a9..412459369686 100644 --- a/tools/testing/selftests/net/psock_fanout.c +++ b/tools/testing/selftests/net/psock_fanout.c @@ -19,6 +19,8 @@ * - PACKET_FANOUT_LB * - PACKET_FANOUT_CPU * - PACKET_FANOUT_ROLLOVER + * - PACKET_FANOUT_CBPF + * - PACKET_FANOUT_EBPF * * Todo: * - functionality: PACKET_FANOUT_FLAG_DEFRAG @@ -44,7 +46,9 @@ #include <arpa/inet.h> #include <errno.h> #include <fcntl.h> +#include <linux/unistd.h> /* for __NR_bpf */ #include <linux/filter.h> +#include <linux/bpf.h> #include <linux/if_packet.h> #include <net/ethernet.h> #include <netinet/ip.h> @@ -91,6 +95,51 @@ static int sock_fanout_open(uint16_t typeflags, int num_packets) return fd; } +static void sock_fanout_set_ebpf(int fd) +{ + const int len_off = __builtin_offsetof(struct __sk_buff, len); + struct bpf_insn prog[] = { + { BPF_ALU64 | BPF_MOV | BPF_X, 6, 1, 0, 0 }, + { BPF_LDX | BPF_W | BPF_MEM, 0, 6, len_off, 0 }, + { BPF_JMP | 
BPF_JGE | BPF_K, 0, 0, 1, DATA_LEN }, + { BPF_JMP | BPF_JA | BPF_K, 0, 0, 4, 0 }, + { BPF_LD | BPF_B | BPF_ABS, 0, 0, 0, 0x50 }, + { BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 2, DATA_CHAR }, + { BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1, DATA_CHAR_1 }, + { BPF_ALU | BPF_MOV | BPF_K, 0, 0, 0, 0 }, + { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 } + }; + char log_buf[512]; + union bpf_attr attr; + int pfd; + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insns = (unsigned long) prog; + attr.insn_cnt = sizeof(prog) / sizeof(prog[0]); + attr.license = (unsigned long) "GPL"; + attr.log_buf = (unsigned long) log_buf, + attr.log_size = sizeof(log_buf), + attr.log_level = 1, + + pfd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + if (pfd < 0) { + perror("bpf"); + fprintf(stderr, "bpf verifier:\n%s\n", log_buf); + exit(1); + } + + if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA, &pfd, sizeof(pfd))) { + perror("fanout data ebpf"); + exit(1); + } + + if (close(pfd)) { + perror("close ebpf"); + exit(1); + } +} + static char *sock_fanout_open_ring(int fd) { struct tpacket_req req = { @@ -115,8 +164,8 @@ static char *sock_fanout_open_ring(int fd) ring = mmap(0, req.tp_block_size * req.tp_block_nr, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (!ring) { - fprintf(stderr, "packetsock ring mmap\n"); + if (ring == MAP_FAILED) { + perror("packetsock ring mmap"); exit(1); } @@ -209,6 +258,7 @@ static int test_datapath(uint16_t typeflags, int port_off, { const int expect0[] = { 0, 0 }; char *rings[2]; + uint8_t type = typeflags & 0xFF; int fds[2], fds_udp[2][2], ret; fprintf(stderr, "test: datapath 0x%hx\n", typeflags); @@ -219,6 +269,11 @@ static int test_datapath(uint16_t typeflags, int port_off, fprintf(stderr, "ERROR: failed open\n"); exit(1); } + if (type == PACKET_FANOUT_CBPF) + sock_setfilter(fds[0], SOL_PACKET, PACKET_FANOUT_DATA); + else if (type == PACKET_FANOUT_EBPF) + sock_fanout_set_ebpf(fds[0]); + rings[0] = sock_fanout_open_ring(fds[0]); 
rings[1] = sock_fanout_open_ring(fds[1]); pair_udp_open(fds_udp[0], PORT_BASE); @@ -227,11 +282,11 @@ static int test_datapath(uint16_t typeflags, int port_off, /* Send data, but not enough to overflow a queue */ pair_udp_send(fds_udp[0], 15); - pair_udp_send(fds_udp[1], 5); + pair_udp_send_char(fds_udp[1], 5, DATA_CHAR_1); ret = sock_fanout_read(fds, rings, expect1); /* Send more data, overflow the queue */ - pair_udp_send(fds_udp[0], 15); + pair_udp_send_char(fds_udp[0], 15, DATA_CHAR_1); /* TODO: ensure consistent order between expect1 and expect2 */ ret |= sock_fanout_read(fds, rings, expect2); @@ -275,6 +330,7 @@ int main(int argc, char **argv) const int expect_rb[2][2] = { { 15, 5 }, { 20, 15 } }; const int expect_cpu0[2][2] = { { 20, 0 }, { 20, 0 } }; const int expect_cpu1[2][2] = { { 0, 20 }, { 0, 20 } }; + const int expect_bpf[2][2] = { { 15, 5 }, { 15, 20 } }; int port_off = 2, tries = 5, ret; test_control_single(); @@ -296,6 +352,11 @@ int main(int argc, char **argv) ret |= test_datapath(PACKET_FANOUT_ROLLOVER, port_off, expect_rb[0], expect_rb[1]); + ret |= test_datapath(PACKET_FANOUT_CBPF, + port_off, expect_bpf[0], expect_bpf[1]); + ret |= test_datapath(PACKET_FANOUT_EBPF, + port_off, expect_bpf[0], expect_bpf[1]); + set_cpuaffinity(0); ret |= test_datapath(PACKET_FANOUT_CPU, port_off, expect_cpu0[0], expect_cpu0[1]); diff --git a/tools/testing/selftests/net/psock_lib.h b/tools/testing/selftests/net/psock_lib.h index 37da54ac85a9..24bc7ec1be7d 100644 --- a/tools/testing/selftests/net/psock_lib.h +++ b/tools/testing/selftests/net/psock_lib.h @@ -30,6 +30,7 @@ #define DATA_LEN 100 #define DATA_CHAR 'a' +#define DATA_CHAR_1 'b' #define PORT_BASE 8000 @@ -37,29 +38,36 @@ # define __maybe_unused __attribute__ ((__unused__)) #endif -static __maybe_unused void pair_udp_setfilter(int fd) +static __maybe_unused void sock_setfilter(int fd, int lvl, int optnum) { struct sock_filter bpf_filter[] = { { 0x80, 0, 0, 0x00000000 }, /* LD pktlen */ - { 0x35, 0, 5, 
DATA_LEN }, /* JGE DATA_LEN [f goto nomatch]*/ + { 0x35, 0, 4, DATA_LEN }, /* JGE DATA_LEN [f goto nomatch]*/ { 0x30, 0, 0, 0x00000050 }, /* LD ip[80] */ - { 0x15, 0, 3, DATA_CHAR }, /* JEQ DATA_CHAR [f goto nomatch]*/ - { 0x30, 0, 0, 0x00000051 }, /* LD ip[81] */ - { 0x15, 0, 1, DATA_CHAR }, /* JEQ DATA_CHAR [f goto nomatch]*/ + { 0x15, 1, 0, DATA_CHAR }, /* JEQ DATA_CHAR [t goto match]*/ + { 0x15, 0, 1, DATA_CHAR_1}, /* JEQ DATA_CHAR_1 [t goto match]*/ { 0x06, 0, 0, 0x00000060 }, /* RET match */ { 0x06, 0, 0, 0x00000000 }, /* RET no match */ }; struct sock_fprog bpf_prog; + if (lvl == SOL_PACKET && optnum == PACKET_FANOUT_DATA) + bpf_filter[5].code = 0x16; /* RET A */ + bpf_prog.filter = bpf_filter; bpf_prog.len = sizeof(bpf_filter) / sizeof(struct sock_filter); - if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf_prog, + if (setsockopt(fd, lvl, optnum, &bpf_prog, sizeof(bpf_prog))) { perror("setsockopt SO_ATTACH_FILTER"); exit(1); } } +static __maybe_unused void pair_udp_setfilter(int fd) +{ + sock_setfilter(fd, SOL_SOCKET, SO_ATTACH_FILTER); +} + static __maybe_unused void pair_udp_open(int fds[], uint16_t port) { struct sockaddr_in saddr, daddr; @@ -96,11 +104,11 @@ static __maybe_unused void pair_udp_open(int fds[], uint16_t port) } } -static __maybe_unused void pair_udp_send(int fds[], int num) +static __maybe_unused void pair_udp_send_char(int fds[], int num, char payload) { char buf[DATA_LEN], rbuf[DATA_LEN]; - memset(buf, DATA_CHAR, sizeof(buf)); + memset(buf, payload, sizeof(buf)); while (num--) { /* Should really handle EINTR and EAGAIN */ if (write(fds[0], buf, sizeof(buf)) != sizeof(buf)) { @@ -118,6 +126,11 @@ static __maybe_unused void pair_udp_send(int fds[], int num) } } +static __maybe_unused void pair_udp_send(int fds[], int num) +{ + return pair_udp_send_char(fds, num, DATA_CHAR); +} + static __maybe_unused void pair_udp_close(int fds[]) { close(fds[0]); diff --git a/tools/testing/selftests/powerpc/mm/Makefile 
b/tools/testing/selftests/powerpc/mm/Makefile index 41cc3ed66818..ee179e22308c 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -2,8 +2,9 @@ noarg: $(MAKE) -C ../ TEST_PROGS := hugetlb_vs_thp_test subpage_prot +TEST_FILES := tempfile -all: $(TEST_PROGS) tempfile +all: $(TEST_PROGS) $(TEST_FILES) $(TEST_PROGS): ../harness.c diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index c5abe7fd7590..a004b4cce99e 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -14,6 +14,7 @@ #include <linux/filter.h> #include <sys/prctl.h> #include <sys/ptrace.h> +#include <sys/types.h> #include <sys/user.h> #include <linux/prctl.h> #include <linux/ptrace.h> @@ -82,7 +83,13 @@ struct seccomp_data { }; #endif +#if __BYTE_ORDER == __LITTLE_ENDIAN #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n])) +#elif __BYTE_ORDER == __BIG_ENDIAN +#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32)) +#else +#error "wut? Unknown __BYTE_ORDER?!" 
+#endif #define SIBLING_EXIT_UNKILLED 0xbadbeef #define SIBLING_EXIT_FAILURE 0xbadface @@ -1199,6 +1206,10 @@ TEST_F(TRACE_poke, getpid_runs_normally) # define ARCH_REGS struct user_pt_regs # define SYSCALL_NUM regs[8] # define SYSCALL_RET regs[0] +#elif defined(__powerpc__) +# define ARCH_REGS struct pt_regs +# define SYSCALL_NUM gpr[0] +# define SYSCALL_RET gpr[3] #else # error "Do not know how to find your architecture's registers and syscalls" #endif @@ -1232,7 +1243,7 @@ void change_syscall(struct __test_metadata *_metadata, ret = ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov); EXPECT_EQ(0, ret); -#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || defined(__powerpc__) { regs.SYSCALL_NUM = syscall; } @@ -1396,6 +1407,8 @@ TEST_F(TRACE_syscall, syscall_dropped) # define __NR_seccomp 383 # elif defined(__aarch64__) # define __NR_seccomp 277 +# elif defined(__powerpc__) +# define __NR_seccomp 358 # else # warning "seccomp syscall number unknown for this architecture" # define __NR_seccomp 0xffff diff --git a/tools/testing/selftests/static_keys/Makefile b/tools/testing/selftests/static_keys/Makefile new file mode 100644 index 000000000000..9cdadf37f114 --- /dev/null +++ b/tools/testing/selftests/static_keys/Makefile @@ -0,0 +1,8 @@ +# Makefile for static keys selftests + +# No binaries, but make sure arg-less "make" doesn't trigger "run_tests" +all: + +TEST_PROGS := test_static_keys.sh + +include ../lib.mk diff --git a/tools/testing/selftests/static_keys/test_static_keys.sh b/tools/testing/selftests/static_keys/test_static_keys.sh new file mode 100644 index 000000000000..1261e3fa1e3a --- /dev/null +++ b/tools/testing/selftests/static_keys/test_static_keys.sh @@ -0,0 +1,16 @@ +#!/bin/sh +# Runs static keys kernel module tests + +if /sbin/modprobe -q test_static_key_base; then + if /sbin/modprobe -q test_static_keys; then + echo "static_key: ok" + /sbin/modprobe -q -r 
test_static_keys + /sbin/modprobe -q -r test_static_key_base + else + echo "static_keys: [FAIL]" + /sbin/modprobe -q -r test_static_key_base + fi +else + echo "static_key: [FAIL]" + exit 1 +fi diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 231b9a031f6a..0d6854744b37 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -8,10 +8,13 @@ BINARIES += hugetlbfstest BINARIES += map_hugetlb BINARIES += thuge-gen BINARIES += transhuge-stress +BINARIES += userfaultfd all: $(BINARIES) %: %.c $(CC) $(CFLAGS) -o $@ $^ -lrt +userfaultfd: userfaultfd.c + $(CC) $(CFLAGS) -O2 -o $@ $^ -lpthread TEST_PROGS := run_vmtests TEST_FILES := $(BINARIES) diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index 49ece11ff7fd..831adeb5fc55 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -86,6 +86,17 @@ else echo "[PASS]" fi +echo "--------------------" +echo "running userfaultfd" +echo "--------------------" +./userfaultfd 128 32 +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + #cleanup umount $mnt rm -rf $mnt diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c new file mode 100644 index 000000000000..0c0b83953352 --- /dev/null +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -0,0 +1,636 @@ +/* + * Stress userfaultfd syscall. + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * This test allocates two virtual areas and bounces the physical + * memory across the two virtual areas (from area_src to area_dst) + * using userfaultfd. 
+ * + * There are three threads running per CPU: + * + * 1) one per-CPU thread takes a per-page pthread_mutex in a random + * page of the area_dst (while the physical page may still be in + * area_src), and increments a per-page counter in the same page, + * and checks its value against a verification region. + * + * 2) another per-CPU thread handles the userfaults generated by + * thread 1 above. userfaultfd blocking reads or poll() modes are + * exercised interleaved. + * + * 3) one last per-CPU thread transfers the memory in the background + * at maximum bandwidth (if not already transferred by thread + * 2). Each cpu thread takes care of transferring a portion of the + * area. + * + * When all threads of type 3 have completed the transfer, one bounce is + * complete. area_src and area_dst are then swapped. All threads are + * respawned and so the bounce is immediately restarted in the + * opposite direction. + * + * per-CPU threads 1 by triggering userfaults inside + * pthread_mutex_lock will also verify the atomicity of the memory + * transfer (UFFDIO_COPY). + * + * The program takes two parameters: the amount of physical memory in + * megabytes (MiB) of the area and the number of bounces to execute. 
+ * + * # 100MiB 99999 bounces + * ./userfaultfd 100 99999 + * + * # 1GiB 99 bounces + * ./userfaultfd 1000 99 + * + * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers + * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <errno.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <time.h> +#include <signal.h> +#include <poll.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <pthread.h> +#include "../../../../include/uapi/linux/userfaultfd.h" + +#ifdef __x86_64__ +#define __NR_userfaultfd 323 +#elif defined(__i386__) +#define __NR_userfaultfd 359 +#elif defined(__powerpc__) +#define __NR_userfaultfd 364 +#else +#error "missing __NR_userfaultfd definition" +#endif + +static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; + +#define BOUNCE_RANDOM (1<<0) +#define BOUNCE_RACINGFAULTS (1<<1) +#define BOUNCE_VERIFY (1<<2) +#define BOUNCE_POLL (1<<3) +static int bounces; + +static unsigned long long *count_verify; +static int uffd, finished, *pipefd; +static char *area_src, *area_dst; +static char *zeropage; +pthread_attr_t attr; + +/* pthread_mutex_t starts at page offset 0 */ +#define area_mutex(___area, ___nr) \ + ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) +/* + * count is placed in the page after pthread_mutex_t naturally aligned + * to avoid non alignment faults on non-x86 archs. 
+ */ +#define area_count(___area, ___nr) \ + ((volatile unsigned long long *) ((unsigned long) \ + ((___area) + (___nr)*page_size + \ + sizeof(pthread_mutex_t) + \ + sizeof(unsigned long long) - 1) & \ + ~(unsigned long)(sizeof(unsigned long long) \ + - 1))) + +static int my_bcmp(char *str1, char *str2, size_t n) +{ + unsigned long i; + for (i = 0; i < n; i++) + if (str1[i] != str2[i]) + return 1; + return 0; +} + +static void *locking_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + struct random_data rand; + unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */ + int32_t rand_nr; + unsigned long long count; + char randstate[64]; + unsigned int seed; + time_t start; + + if (bounces & BOUNCE_RANDOM) { + seed = (unsigned int) time(NULL) - bounces; + if (!(bounces & BOUNCE_RACINGFAULTS)) + seed += cpu; + bzero(&rand, sizeof(rand)); + bzero(&randstate, sizeof(randstate)); + if (initstate_r(seed, randstate, sizeof(randstate), &rand)) + fprintf(stderr, "srandom_r error\n"), exit(1); + } else { + page_nr = -bounces; + if (!(bounces & BOUNCE_RACINGFAULTS)) + page_nr += cpu * nr_pages_per_cpu; + } + + while (!finished) { + if (bounces & BOUNCE_RANDOM) { + if (random_r(&rand, &rand_nr)) + fprintf(stderr, "random_r 1 error\n"), exit(1); + page_nr = rand_nr; + if (sizeof(page_nr) > sizeof(rand_nr)) { + if (random_r(&rand, &rand_nr)) + fprintf(stderr, "random_r 2 error\n"), exit(1); + page_nr |= ((unsigned long) rand_nr) << 32; + } + } else + page_nr += 1; + page_nr %= nr_pages; + + start = time(NULL); + if (bounces & BOUNCE_VERIFY) { + count = *area_count(area_dst, page_nr); + if (!count) + fprintf(stderr, + "page_nr %lu wrong count %Lu %Lu\n", + page_nr, count, + count_verify[page_nr]), exit(1); + + + /* + * We can't use bcmp (or memcmp) because that + * returns 0 erroneously if the memory is + * changing under it (even if the end of the + * page is never changing and always + * different). 
+ */ +#if 1 + if (!my_bcmp(area_dst + page_nr * page_size, zeropage, + page_size)) + fprintf(stderr, + "my_bcmp page_nr %lu wrong count %Lu %Lu\n", + page_nr, count, + count_verify[page_nr]), exit(1); +#else + unsigned long loops; + + loops = 0; + /* uncomment the below line to test with mutex */ + /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */ + while (!bcmp(area_dst + page_nr * page_size, zeropage, + page_size)) { + loops += 1; + if (loops > 10) + break; + } + /* uncomment below line to test with mutex */ + /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */ + if (loops) { + fprintf(stderr, + "page_nr %lu all zero thread %lu %p %lu\n", + page_nr, cpu, area_dst + page_nr * page_size, + loops); + if (loops > 10) + exit(1); + } +#endif + } + + pthread_mutex_lock(area_mutex(area_dst, page_nr)); + count = *area_count(area_dst, page_nr); + if (count != count_verify[page_nr]) { + fprintf(stderr, + "page_nr %lu memory corruption %Lu %Lu\n", + page_nr, count, + count_verify[page_nr]), exit(1); + } + count++; + *area_count(area_dst, page_nr) = count_verify[page_nr] = count; + pthread_mutex_unlock(area_mutex(area_dst, page_nr)); + + if (time(NULL) - start > 1) + fprintf(stderr, + "userfault too slow %ld " + "possible false positive with overcommit\n", + time(NULL) - start); + } + + return NULL; +} + +static int copy_page(unsigned long offset) +{ + struct uffdio_copy uffdio_copy; + + if (offset >= nr_pages * page_size) + fprintf(stderr, "unexpected offset %lu\n", + offset), exit(1); + uffdio_copy.dst = (unsigned long) area_dst + offset; + uffdio_copy.src = (unsigned long) area_src + offset; + uffdio_copy.len = page_size; + uffdio_copy.mode = 0; + uffdio_copy.copy = 0; + if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy.copy != -EEXIST) + fprintf(stderr, "UFFDIO_COPY error %Ld\n", + uffdio_copy.copy), exit(1); + } else if (uffdio_copy.copy != page_size) { + fprintf(stderr, "UFFDIO_COPY unexpected copy 
%Ld\n", + uffdio_copy.copy), exit(1); + } else + return 1; + return 0; +} + +static void *uffd_poll_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + struct pollfd pollfd[2]; + struct uffd_msg msg; + int ret; + unsigned long offset; + char tmp_chr; + unsigned long userfaults = 0; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd[cpu*2]; + pollfd[1].events = POLLIN; + + for (;;) { + ret = poll(pollfd, 2, -1); + if (!ret) + fprintf(stderr, "poll error %d\n", ret), exit(1); + if (ret < 0) + perror("poll"), exit(1); + if (pollfd[1].revents & POLLIN) { + if (read(pollfd[1].fd, &tmp_chr, 1) != 1) + fprintf(stderr, "read pipefd error\n"), + exit(1); + break; + } + if (!(pollfd[0].revents & POLLIN)) + fprintf(stderr, "pollfd[0].revents %d\n", + pollfd[0].revents), exit(1); + ret = read(uffd, &msg, sizeof(msg)); + if (ret < 0) { + if (errno == EAGAIN) + continue; + perror("nonblocking read error"), exit(1); + } + if (msg.event != UFFD_EVENT_PAGEFAULT) + fprintf(stderr, "unexpected msg event %u\n", + msg.event), exit(1); + if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + fprintf(stderr, "unexpected write fault\n"), exit(1); + offset = (char *)msg.arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + if (copy_page(offset)) + userfaults++; + } + return (void *)userfaults; +} + +pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; + +static void *uffd_read_thread(void *arg) +{ + unsigned long *this_cpu_userfaults; + struct uffd_msg msg; + unsigned long offset; + int ret; + + this_cpu_userfaults = (unsigned long *) arg; + *this_cpu_userfaults = 0; + + pthread_mutex_unlock(&uffd_read_mutex); + /* from here cancellation is ok */ + + for (;;) { + ret = read(uffd, &msg, sizeof(msg)); + if (ret != sizeof(msg)) { + if (ret < 0) + perror("blocking read error"), exit(1); + else + fprintf(stderr, "short read\n"), exit(1); + } + if (msg.event != UFFD_EVENT_PAGEFAULT) + fprintf(stderr, "unexpected msg event %u\n", + 
msg.event), exit(1); + if (bounces & BOUNCE_VERIFY && + msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + fprintf(stderr, "unexpected write fault\n"), exit(1); + offset = (char *)msg.arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + if (copy_page(offset)) + (*this_cpu_userfaults)++; + } + return (void *)NULL; +} + +static void *background_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr; + + for (page_nr = cpu * nr_pages_per_cpu; + page_nr < (cpu+1) * nr_pages_per_cpu; + page_nr++) + copy_page(page_nr * page_size); + + return NULL; +} + +static int stress(unsigned long *userfaults) +{ + unsigned long cpu; + pthread_t locking_threads[nr_cpus]; + pthread_t uffd_threads[nr_cpus]; + pthread_t background_threads[nr_cpus]; + void **_userfaults = (void **) userfaults; + + finished = 0; + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (pthread_create(&locking_threads[cpu], &attr, + locking_thread, (void *)cpu)) + return 1; + if (bounces & BOUNCE_POLL) { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_poll_thread, (void *)cpu)) + return 1; + } else { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_read_thread, + &_userfaults[cpu])) + return 1; + pthread_mutex_lock(&uffd_read_mutex); + } + if (pthread_create(&background_threads[cpu], &attr, + background_thread, (void *)cpu)) + return 1; + } + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(background_threads[cpu], NULL)) + return 1; + + /* + * Be strict and immediately zap area_src, the whole area has + * been transferred already by the background threads. The + * area_src could then be faulted in in a racy way by still + * running uffdio_threads reading zeropages after we zapped + * area_src (but they're guaranteed to get -EEXIST from + * UFFDIO_COPY without writing zero pages into area_dst + * because the background threads already completed). 
+ */ + if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) { + perror("madvise"); + return 1; + } + + for (cpu = 0; cpu < nr_cpus; cpu++) { + char c = 0; + if (bounces & BOUNCE_POLL) { + if (write(pipefd[cpu*2+1], &c, 1) != 1) { + fprintf(stderr, "pipefd write error\n"); + return 1; + } + if (pthread_join(uffd_threads[cpu], &_userfaults[cpu])) + return 1; + } else { + if (pthread_cancel(uffd_threads[cpu])) + return 1; + if (pthread_join(uffd_threads[cpu], NULL)) + return 1; + } + } + + finished = 1; + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(locking_threads[cpu], NULL)) + return 1; + + return 0; +} + +static int userfaultfd_stress(void) +{ + void *area; + char *tmp_area; + unsigned long nr; + struct uffdio_register uffdio_register; + struct uffdio_api uffdio_api; + unsigned long cpu; + int uffd_flags; + unsigned long userfaults[nr_cpus]; + + if (posix_memalign(&area, page_size, nr_pages * page_size)) { + fprintf(stderr, "out of memory\n"); + return 1; + } + area_src = area; + if (posix_memalign(&area, page_size, nr_pages * page_size)) { + fprintf(stderr, "out of memory\n"); + return 1; + } + area_dst = area; + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd < 0) { + fprintf(stderr, + "userfaultfd syscall not available in this kernel\n"); + return 1; + } + uffd_flags = fcntl(uffd, F_GETFL, NULL); + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { + fprintf(stderr, "UFFDIO_API\n"); + return 1; + } + if (uffdio_api.api != UFFD_API) { + fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api); + return 1; + } + + count_verify = malloc(nr_pages * sizeof(unsigned long long)); + if (!count_verify) { + perror("count_verify"); + return 1; + } + + for (nr = 0; nr < nr_pages; nr++) { + *area_mutex(area_src, nr) = (pthread_mutex_t) + PTHREAD_MUTEX_INITIALIZER; + count_verify[nr] = *area_count(area_src, nr) = 1; + } + + pipefd = malloc(sizeof(int) * nr_cpus * 2); + if (!pipefd) { + 
perror("pipefd"); + return 1; + } + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) { + perror("pipe"); + return 1; + } + } + + if (posix_memalign(&area, page_size, page_size)) { + fprintf(stderr, "out of memory\n"); + return 1; + } + zeropage = area; + bzero(zeropage, page_size); + + pthread_mutex_lock(&uffd_read_mutex); + + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 16*1024*1024); + + while (bounces--) { + unsigned long expected_ioctls; + + printf("bounces: %d, mode:", bounces); + if (bounces & BOUNCE_RANDOM) + printf(" rnd"); + if (bounces & BOUNCE_RACINGFAULTS) + printf(" racing"); + if (bounces & BOUNCE_VERIFY) + printf(" ver"); + if (bounces & BOUNCE_POLL) + printf(" poll"); + printf(", "); + fflush(stdout); + + if (bounces & BOUNCE_POLL) + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + else + fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); + + /* register */ + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { + fprintf(stderr, "register failure\n"); + return 1; + } + expected_ioctls = (1 << _UFFDIO_WAKE) | + (1 << _UFFDIO_COPY) | + (1 << _UFFDIO_ZEROPAGE); + if ((uffdio_register.ioctls & expected_ioctls) != + expected_ioctls) { + fprintf(stderr, + "unexpected missing ioctl for anon memory\n"); + return 1; + } + + /* + * The madvise done previously isn't enough: some + * uffd_thread could have read userfaults (one of + * those already resolved by the background thread) + * and it may be in the process of calling + * UFFDIO_COPY. UFFDIO_COPY will read the zapped + * area_src and it would map a zero page in it (of + * course such a UFFDIO_COPY is perfectly safe as it'd + * return -EEXIST). 
The problem comes at the next + * bounce though: that racing UFFDIO_COPY would + * generate zeropages in the area_src, so invalidating + * the previous MADV_DONTNEED. Without this additional + * MADV_DONTNEED those zeropages leftovers in the + * area_src would lead to -EEXIST failure during the + * next bounce, effectively leaving a zeropage in the + * area_dst. + * + * Try to comment out this madvise to see the memory + * corruption being caught pretty quick. + * + * khugepaged is also inhibited to collapse THP after + * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's + * required to MADV_DONTNEED here. + */ + if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) { + perror("madvise 2"); + return 1; + } + + /* bounce pass */ + if (stress(userfaults)) + return 1; + + /* unregister */ + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) { + fprintf(stderr, "unregister failure\n"); + return 1; + } + + /* verification */ + if (bounces & BOUNCE_VERIFY) { + for (nr = 0; nr < nr_pages; nr++) { + if (my_bcmp(area_dst, + area_dst + nr * page_size, + sizeof(pthread_mutex_t))) { + fprintf(stderr, + "error mutex 2 %lu\n", + nr); + bounces = 0; + } + if (*area_count(area_dst, nr) != count_verify[nr]) { + fprintf(stderr, + "error area_count %Lu %Lu %lu\n", + *area_count(area_dst, nr), + count_verify[nr], + nr); + bounces = 0; + } + } + } + + /* prepare next bounce */ + tmp_area = area_src; + area_src = area_dst; + area_dst = tmp_area; + + printf("userfaults:"); + for (cpu = 0; cpu < nr_cpus; cpu++) + printf(" %lu", userfaults[cpu]); + printf("\n"); + } + + return 0; +} + +int main(int argc, char **argv) +{ + if (argc < 3) + fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + page_size = sysconf(_SC_PAGE_SIZE); + if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) > + page_size) + fprintf(stderr, "Impossible to run this test\n"), exit(2); + nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size 
/ + nr_cpus; + if (!nr_pages_per_cpu) { + fprintf(stderr, "invalid MiB\n"); + fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + } + bounces = atoi(argv[2]); + if (bounces <= 0) { + fprintf(stderr, "invalid bounces\n"); + fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + } + nr_pages = nr_pages_per_cpu * nr_cpus; + printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", + nr_pages, nr_pages_per_cpu); + return userfaultfd_stress(); +} |