summaryrefslogtreecommitdiffstats
path: root/tools
diff options
context:
space:
mode:
Diffstat (limited to 'tools')
-rw-r--r--tools/testing/nvdimm/Kbuild13
-rw-r--r--tools/testing/nvdimm/test/iomap.c85
-rw-r--r--tools/testing/nvdimm/test/nfit.c209
-rw-r--r--tools/testing/selftests/capabilities/.gitignore2
-rw-r--r--tools/testing/selftests/capabilities/Makefile18
-rw-r--r--tools/testing/selftests/capabilities/test_execve.c427
-rw-r--r--tools/testing/selftests/capabilities/validate_cap.c73
-rw-r--r--tools/testing/selftests/vm/Makefile3
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests11
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c636
10 files changed, 1401 insertions, 76 deletions
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index f56914c7929b..38b00ecb2ed5 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -1,9 +1,12 @@
-ldflags-y += --wrap=ioremap_wt
ldflags-y += --wrap=ioremap_wc
+ldflags-y += --wrap=memremap
ldflags-y += --wrap=devm_ioremap_nocache
-ldflags-y += --wrap=ioremap_cache
+ldflags-y += --wrap=devm_memremap
+ldflags-y += --wrap=devm_memunmap
ldflags-y += --wrap=ioremap_nocache
ldflags-y += --wrap=iounmap
+ldflags-y += --wrap=memunmap
+ldflags-y += --wrap=__devm_request_region
ldflags-y += --wrap=__request_region
ldflags-y += --wrap=__release_region
@@ -15,6 +18,7 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
obj-$(CONFIG_ND_BTT) += nd_btt.o
obj-$(CONFIG_ND_BLK) += nd_blk.o
+obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
obj-$(CONFIG_ACPI_NFIT) += nfit.o
nfit-y := $(ACPI_SRC)/nfit.o
@@ -29,6 +33,9 @@ nd_btt-y += config_check.o
nd_blk-y := $(NVDIMM_SRC)/blk.o
nd_blk-y += config_check.o
+nd_e820-y := $(NVDIMM_SRC)/e820.o
+nd_e820-y += config_check.o
+
libnvdimm-y := $(NVDIMM_SRC)/core.o
libnvdimm-y += $(NVDIMM_SRC)/bus.o
libnvdimm-y += $(NVDIMM_SRC)/dimm_devs.o
@@ -37,7 +44,9 @@ libnvdimm-y += $(NVDIMM_SRC)/region_devs.o
libnvdimm-y += $(NVDIMM_SRC)/region.o
libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o
libnvdimm-y += $(NVDIMM_SRC)/label.o
+libnvdimm-$(CONFIG_ND_CLAIM) += $(NVDIMM_SRC)/claim.o
libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o
+libnvdimm-$(CONFIG_NVDIMM_PFN) += $(NVDIMM_SRC)/pfn_devs.o
libnvdimm-y += config_check.o
obj-m += test/
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index 64bfaa50831c..b7251314bbc0 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -80,23 +80,52 @@ void __iomem *__wrap_devm_ioremap_nocache(struct device *dev,
}
EXPORT_SYMBOL(__wrap_devm_ioremap_nocache);
-void __iomem *__wrap_ioremap_cache(resource_size_t offset, unsigned long size)
+void *__wrap_devm_memremap(struct device *dev, resource_size_t offset,
+ size_t size, unsigned long flags)
{
- return __nfit_test_ioremap(offset, size, ioremap_cache);
+ struct nfit_test_resource *nfit_res;
+
+ rcu_read_lock();
+ nfit_res = get_nfit_res(offset);
+ rcu_read_unlock();
+ if (nfit_res)
+ return nfit_res->buf + offset - nfit_res->res->start;
+ return devm_memremap(dev, offset, size, flags);
}
-EXPORT_SYMBOL(__wrap_ioremap_cache);
+EXPORT_SYMBOL(__wrap_devm_memremap);
-void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size)
+void *__wrap_memremap(resource_size_t offset, size_t size,
+ unsigned long flags)
{
- return __nfit_test_ioremap(offset, size, ioremap_nocache);
+ struct nfit_test_resource *nfit_res;
+
+ rcu_read_lock();
+ nfit_res = get_nfit_res(offset);
+ rcu_read_unlock();
+ if (nfit_res)
+ return nfit_res->buf + offset - nfit_res->res->start;
+ return memremap(offset, size, flags);
}
-EXPORT_SYMBOL(__wrap_ioremap_nocache);
+EXPORT_SYMBOL(__wrap_memremap);
+
+/*
+ * Link-time wrapper for devm_memunmap() (selected through the
+ * --wrap=devm_memunmap linker flag in the Kbuild file).
+ *
+ * If get_nfit_res() returns a test resource for @addr the call is a no-op;
+ * otherwise it is forwarded to the real devm_memunmap().
+ * NOTE(review): presumably get_nfit_res() matches addresses backed by
+ * nfit_test buffers - confirm against its definition.
+ */
+void __wrap_devm_memunmap(struct device *dev, void *addr)
+{
+ struct nfit_test_resource *nfit_res;
+
+ rcu_read_lock();
+ nfit_res = get_nfit_res((unsigned long) addr);
+ rcu_read_unlock();
+ /* only the NULL-ness of the lookup result is used past this point */
+ if (nfit_res)
+ return;
+ return devm_memunmap(dev, addr);
+}
+EXPORT_SYMBOL(__wrap_devm_memunmap);
-void __iomem *__wrap_ioremap_wt(resource_size_t offset, unsigned long size)
+void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size)
{
- return __nfit_test_ioremap(offset, size, ioremap_wt);
+ return __nfit_test_ioremap(offset, size, ioremap_nocache);
}
-EXPORT_SYMBOL(__wrap_ioremap_wt);
+EXPORT_SYMBOL(__wrap_ioremap_nocache);
void __iomem *__wrap_ioremap_wc(resource_size_t offset, unsigned long size)
{
@@ -117,9 +146,22 @@ void __wrap_iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(__wrap_iounmap);
-struct resource *__wrap___request_region(struct resource *parent,
- resource_size_t start, resource_size_t n, const char *name,
- int flags)
+/*
+ * Link-time wrapper for memunmap() (selected through the --wrap=memunmap
+ * linker flag in the Kbuild file).
+ *
+ * If get_nfit_res() returns a test resource for @addr the call is a no-op;
+ * otherwise it is forwarded to the real memunmap().
+ */
+void __wrap_memunmap(void *addr)
+{
+ struct nfit_test_resource *nfit_res;
+
+ rcu_read_lock();
+ nfit_res = get_nfit_res((unsigned long) addr);
+ rcu_read_unlock();
+ /* only the NULL-ness of the lookup result is used past this point */
+ if (nfit_res)
+ return;
+ return memunmap(addr);
+}
+EXPORT_SYMBOL(__wrap_memunmap);
+
+static struct resource *nfit_test_request_region(struct device *dev,
+ struct resource *parent, resource_size_t start,
+ resource_size_t n, const char *name, int flags)
{
struct nfit_test_resource *nfit_res;
@@ -147,10 +189,29 @@ struct resource *__wrap___request_region(struct resource *parent,
return res;
}
}
+ if (dev)
+ return __devm_request_region(dev, parent, start, n, name);
return __request_region(parent, start, n, name, flags);
}
+
+/*
+ * Link-time wrapper for __request_region(): delegates to
+ * nfit_test_request_region() with a NULL device, which makes the helper
+ * fall back to the non-devm __request_region() path.
+ */
+struct resource *__wrap___request_region(struct resource *parent,
+ resource_size_t start, resource_size_t n, const char *name,
+ int flags)
+{
+ return nfit_test_request_region(NULL, parent, start, n, name, flags);
+}
EXPORT_SYMBOL(__wrap___request_region);
+/*
+ * Link-time wrapper for __devm_request_region().
+ *
+ * Returns NULL when @dev is NULL; otherwise delegates to
+ * nfit_test_request_region() with flags fixed to 0.
+ */
+struct resource *__wrap___devm_request_region(struct device *dev,
+ struct resource *parent, resource_size_t start,
+ resource_size_t n, const char *name)
+{
+ /* a NULL dev would select the non-devm path in the shared helper */
+ if (!dev)
+ return NULL;
+ return nfit_test_request_region(dev, parent, start, n, name, 0);
+}
+EXPORT_SYMBOL(__wrap___devm_request_region);
+
void __wrap___release_region(struct resource *parent, resource_size_t start,
resource_size_t n)
{
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index d0bdae40ccc9..021e6f97f33e 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -147,75 +147,153 @@ static struct nfit_test *to_nfit_test(struct device *dev)
return container_of(pdev, struct nfit_test, pdev);
}
+/*
+ * Emulate ND_CMD_GET_CONFIG_SIZE for the test dimms.
+ *
+ * Fills @nd_cmd with status 0, a config area of LABEL_SIZE bytes and a
+ * maximum transfer size of SZ_4K.
+ *
+ * Returns 0 on success, -EINVAL if @buf_len is smaller than the command
+ * structure.
+ */
+static int nfit_test_cmd_get_config_size(struct nd_cmd_get_config_size *nd_cmd,
+ unsigned int buf_len)
+{
+ if (buf_len < sizeof(*nd_cmd))
+ return -EINVAL;
+
+ nd_cmd->status = 0;
+ nd_cmd->config_size = LABEL_SIZE;
+ nd_cmd->max_xfer = SZ_4K;
+
+ return 0;
+}
+
+/*
+ * Emulate ND_CMD_GET_CONFIG_DATA: copy bytes from the dimm's label area
+ * into the command's output buffer.
+ *
+ * @nd_cmd:  command header followed by the output buffer
+ * @buf_len: total size of the buffer @nd_cmd points to
+ * @label:   start of the label area (LABEL_SIZE bytes) for this dimm
+ *
+ * At most LABEL_SIZE - in_offset bytes are copied, so a request that runs
+ * past the end of the label area is shortened rather than rejected.
+ *
+ * Returns -EINVAL if the buffer is smaller than the header, the offset is
+ * outside the label area, or in_length does not fit in the buffer.
+ * Otherwise returns the number of bytes of @buf_len left over after the
+ * header and the copied data.
+ */
+static int nfit_test_cmd_get_config_data(struct nd_cmd_get_config_data_hdr
+		*nd_cmd, unsigned int buf_len, void *label)
+{
+	unsigned int len, offset;
+	int rc;
+
+	/* the header must be fully present before any of its fields is read */
+	if (buf_len < sizeof(*nd_cmd))
+		return -EINVAL;
+	offset = nd_cmd->in_offset;
+	if (offset >= LABEL_SIZE)
+		return -EINVAL;
+	if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len)
+		return -EINVAL;
+
+	nd_cmd->status = 0;
+	len = min(nd_cmd->in_length, LABEL_SIZE - offset);
+	memcpy(nd_cmd->out_buf, label + offset, len);
+	rc = buf_len - sizeof(*nd_cmd) - len;
+
+	return rc;
+}
+
+/*
+ * Emulate ND_CMD_SET_CONFIG_DATA: copy bytes from the command's input
+ * buffer into the dimm's label area.
+ *
+ * @nd_cmd:  command header followed by in_length bytes of input and a
+ *           4-byte status word
+ * @buf_len: total size of the buffer @nd_cmd points to
+ * @label:   start of the label area (LABEL_SIZE bytes) for this dimm
+ *
+ * At most LABEL_SIZE - in_offset bytes are written, so a request that runs
+ * past the end of the label area is shortened rather than rejected.
+ *
+ * Returns -EINVAL if the buffer is smaller than the header, the offset is
+ * outside the label area, or header + in_length + status do not fit in the
+ * buffer.  Otherwise returns the number of bytes of @buf_len left over
+ * after the header, the copied data and the status word.
+ */
+static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd,
+		unsigned int buf_len, void *label)
+{
+	unsigned int len, offset;
+	u32 *status;
+	int rc;
+
+	/* the header must be fully present before any of its fields is read */
+	if (buf_len < sizeof(*nd_cmd))
+		return -EINVAL;
+	offset = nd_cmd->in_offset;
+	if (offset >= LABEL_SIZE)
+		return -EINVAL;
+	if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len)
+		return -EINVAL;
+
+	/* the status word trails the variable-length input payload */
+	status = (void *)nd_cmd + nd_cmd->in_length + sizeof(*nd_cmd);
+	*status = 0;
+	len = min(nd_cmd->in_length, LABEL_SIZE - offset);
+	memcpy(label + offset, nd_cmd->in_buf, len);
+	rc = buf_len - sizeof(*nd_cmd) - (len + 4);
+
+	return rc;
+}
+
+/*
+ * Emulate ND_CMD_ARS_CAP for the test bus.
+ *
+ * Reports max_ars_out of 256 and a status word carrying the
+ * ND_ARS_PERSISTENT and ND_ARS_VOLATILE flags shifted left by 16 bits.
+ *
+ * Returns 0 on success, -EINVAL if @buf_len is smaller than the command
+ * structure.
+ */
+static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd,
+ unsigned int buf_len)
+{
+ if (buf_len < sizeof(*nd_cmd))
+ return -EINVAL;
+
+ nd_cmd->max_ars_out = 256;
+ nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16;
+
+ return 0;
+}
+
+/*
+ * Emulate ND_CMD_ARS_START for the test bus: the request is accepted
+ * unconditionally by setting status to 0; no other state is touched.
+ *
+ * Returns 0 on success, -EINVAL if @buf_len is smaller than the command
+ * structure.
+ */
+static int nfit_test_cmd_ars_start(struct nd_cmd_ars_start *nd_cmd,
+ unsigned int buf_len)
+{
+ if (buf_len < sizeof(*nd_cmd))
+ return -EINVAL;
+
+ nd_cmd->status = 0;
+
+ return 0;
+}
+
+/*
+ * Emulate ND_CMD_ARS_STATUS for the test bus: always reports zero records
+ * with status 0 and an out_length of 256.
+ * NOTE(review): 256 equals the max_ars_out value reported by
+ * nfit_test_cmd_ars_cap(); presumably the two are meant to match.
+ *
+ * Returns 0 on success, -EINVAL if @buf_len is smaller than the command
+ * structure.
+ */
+static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd,
+ unsigned int buf_len)
+{
+ if (buf_len < sizeof(*nd_cmd))
+ return -EINVAL;
+
+ nd_cmd->out_length = 256;
+ nd_cmd->num_records = 0;
+ nd_cmd->status = 0;
+
+ return 0;
+}
+
static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
struct nvdimm *nvdimm, unsigned int cmd, void *buf,
unsigned int buf_len)
{
struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc);
- struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
- int i, rc;
+ int i, rc = 0;
- if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask))
- return -ENOTTY;
+ if (nvdimm) {
+ struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
- /* lookup label space for the given dimm */
- for (i = 0; i < ARRAY_SIZE(handle); i++)
- if (__to_nfit_memdev(nfit_mem)->device_handle == handle[i])
+ if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask))
+ return -ENOTTY;
+
+ /* lookup label space for the given dimm */
+ for (i = 0; i < ARRAY_SIZE(handle); i++)
+ if (__to_nfit_memdev(nfit_mem)->device_handle ==
+ handle[i])
+ break;
+ if (i >= ARRAY_SIZE(handle))
+ return -ENXIO;
+
+ switch (cmd) {
+ case ND_CMD_GET_CONFIG_SIZE:
+ rc = nfit_test_cmd_get_config_size(buf, buf_len);
break;
- if (i >= ARRAY_SIZE(handle))
- return -ENXIO;
+ case ND_CMD_GET_CONFIG_DATA:
+ rc = nfit_test_cmd_get_config_data(buf, buf_len,
+ t->label[i]);
+ break;
+ case ND_CMD_SET_CONFIG_DATA:
+ rc = nfit_test_cmd_set_config_data(buf, buf_len,
+ t->label[i]);
+ break;
+ default:
+ return -ENOTTY;
+ }
+ } else {
+ if (!nd_desc || !test_bit(cmd, &nd_desc->dsm_mask))
+ return -ENOTTY;
- switch (cmd) {
- case ND_CMD_GET_CONFIG_SIZE: {
- struct nd_cmd_get_config_size *nd_cmd = buf;
-
- if (buf_len < sizeof(*nd_cmd))
- return -EINVAL;
- nd_cmd->status = 0;
- nd_cmd->config_size = LABEL_SIZE;
- nd_cmd->max_xfer = SZ_4K;
- rc = 0;
- break;
- }
- case ND_CMD_GET_CONFIG_DATA: {
- struct nd_cmd_get_config_data_hdr *nd_cmd = buf;
- unsigned int len, offset = nd_cmd->in_offset;
-
- if (buf_len < sizeof(*nd_cmd))
- return -EINVAL;
- if (offset >= LABEL_SIZE)
- return -EINVAL;
- if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len)
- return -EINVAL;
-
- nd_cmd->status = 0;
- len = min(nd_cmd->in_length, LABEL_SIZE - offset);
- memcpy(nd_cmd->out_buf, t->label[i] + offset, len);
- rc = buf_len - sizeof(*nd_cmd) - len;
- break;
- }
- case ND_CMD_SET_CONFIG_DATA: {
- struct nd_cmd_set_config_hdr *nd_cmd = buf;
- unsigned int len, offset = nd_cmd->in_offset;
- u32 *status;
-
- if (buf_len < sizeof(*nd_cmd))
- return -EINVAL;
- if (offset >= LABEL_SIZE)
- return -EINVAL;
- if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len)
- return -EINVAL;
-
- status = buf + nd_cmd->in_length + sizeof(*nd_cmd);
- *status = 0;
- len = min(nd_cmd->in_length, LABEL_SIZE - offset);
- memcpy(t->label[i] + offset, nd_cmd->in_buf, len);
- rc = buf_len - sizeof(*nd_cmd) - (len + 4);
- break;
- }
- default:
- return -ENOTTY;
+ switch (cmd) {
+ case ND_CMD_ARS_CAP:
+ rc = nfit_test_cmd_ars_cap(buf, buf_len);
+ break;
+ case ND_CMD_ARS_START:
+ rc = nfit_test_cmd_ars_start(buf, buf_len);
+ break;
+ case ND_CMD_ARS_STATUS:
+ rc = nfit_test_cmd_ars_status(buf, buf_len);
+ break;
+ default:
+ return -ENOTTY;
+ }
}
return rc;
@@ -876,6 +954,9 @@ static void nfit_test0_setup(struct nfit_test *t)
set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en);
set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
+ set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
+ set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
+ set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
nd_desc = &acpi_desc->nd_desc;
nd_desc->ndctl = nfit_test_ctl;
}
@@ -948,9 +1029,13 @@ static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
lane = nd_region_acquire_lane(nd_region);
if (rw)
- memcpy(mmio->base + dpa, iobuf, len);
- else
- memcpy(iobuf, mmio->base + dpa, len);
+ memcpy(mmio->addr.base + dpa, iobuf, len);
+ else {
+ memcpy(iobuf, mmio->addr.base + dpa, len);
+
+ /* give us some coverage of the mmio_flush_range() API */
+ mmio_flush_range(mmio->addr.base + dpa, len);
+ }
nd_region_release_lane(nd_region, lane);
return 0;
diff --git a/tools/testing/selftests/capabilities/.gitignore b/tools/testing/selftests/capabilities/.gitignore
new file mode 100644
index 000000000000..b732dd0d4738
--- /dev/null
+++ b/tools/testing/selftests/capabilities/.gitignore
@@ -0,0 +1,2 @@
+test_execve
+validate_cap
diff --git a/tools/testing/selftests/capabilities/Makefile b/tools/testing/selftests/capabilities/Makefile
new file mode 100644
index 000000000000..8c8f0c1f0889
--- /dev/null
+++ b/tools/testing/selftests/capabilities/Makefile
@@ -0,0 +1,18 @@
+all:
+
+include ../lib.mk
+
+.PHONY: all clean
+
+TARGETS := validate_cap test_execve
+TEST_PROGS := test_execve
+
+# Libraries are kept out of CFLAGS: the linker resolves symbols from left
+# to right, so every -l option has to follow the source file that uses it.
+CFLAGS := -O2 -g -std=gnu99 -Wall
+LDLIBS := -lcap-ng -lrt -ldl
+
+all: $(TARGETS)
+
+clean:
+	$(RM) $(TARGETS)
+
+$(TARGETS): %: %.c
+	$(CC) -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ $(LDLIBS)
diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c
new file mode 100644
index 000000000000..10a21a958aaf
--- /dev/null
+++ b/tools/testing/selftests/capabilities/test_execve.c
@@ -0,0 +1,427 @@
+#define _GNU_SOURCE
+
+#include <cap-ng.h>
+#include <err.h>
+#include <linux/capability.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <sched.h>
+#include <sys/mount.h>
+#include <limits.h>
+#include <libgen.h>
+#include <malloc.h>
+#include <sys/wait.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+
+#ifndef PR_CAP_AMBIENT
+#define PR_CAP_AMBIENT 47
+# define PR_CAP_AMBIENT_IS_SET 1
+# define PR_CAP_AMBIENT_RAISE 2
+# define PR_CAP_AMBIENT_LOWER 3
+# define PR_CAP_AMBIENT_CLEAR_ALL 4
+#endif
+
+static int nerrs;
+
+/*
+ * Format a message and write it to an already existing file with a single
+ * write() call.
+ *
+ * @enoent_ok: when true, a missing file (open() failing with ENOENT) is
+ *             silently ignored and nothing is written
+ * @filename:  path to open with O_WRONLY (the file is never created)
+ * @fmt, @ap:  printf-style format and arguments; the result must fit in a
+ *             4096-byte buffer including the terminating NUL
+ *
+ * Any other failure - formatting error, truncated output, open/write/close
+ * error or a short write - terminates the process through err()/errx()
+ * with exit status 1.
+ */
+static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap)
+{
+ char buf[4096];
+ int fd;
+ ssize_t written;
+ int buf_len;
+
+ buf_len = vsnprintf(buf, sizeof(buf), fmt, ap);
+ if (buf_len < 0) {
+ err(1, "vsnprintf failed");
+ }
+ /* vsnprintf returns the untruncated length, so >= size means truncation */
+ if (buf_len >= sizeof(buf)) {
+ errx(1, "vsnprintf output truncated");
+ }
+
+ fd = open(filename, O_WRONLY);
+ if (fd < 0) {
+ if ((errno == ENOENT) && enoent_ok)
+ return;
+ err(1, "open of %s failed", filename);
+ }
+ written = write(fd, buf, buf_len);
+ if (written != buf_len) {
+ /* errno is only meaningful when write() itself reported failure */
+ if (written >= 0) {
+ errx(1, "short write to %s", filename);
+ } else {
+ err(1, "write to %s failed", filename);
+ }
+ }
+ if (close(fd) != 0) {
+ err(1, "close of %s failed", filename);
+ }
+}
+
+/*
+ * printf-style write to @filename that tolerates the file not existing:
+ * calls vmaybe_write_file() with enoent_ok set to true.
+ */
+static void maybe_write_file(char *filename, char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vmaybe_write_file(true, filename, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * printf-style write to @filename that requires the file to exist:
+ * calls vmaybe_write_file() with enoent_ok set to false, so a missing
+ * file terminates the process.
+ */
+static void write_file(char *filename, char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vmaybe_write_file(false, filename, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Enter a private mount namespace and switch to @inner_uid.
+ *
+ * Two strategies are tried in order:
+ *  1. unshare(CLONE_NEWNS) alone.  When it succeeds the real and effective
+ *     UID are changed to @inner_uid with PR_SET_KEEPCAPS set, and every
+ *     capability still in the permitted set is raised in the effective set
+ *     again.
+ *  2. unshare(CLONE_NEWUSER | CLONE_NEWNS).  @inner_uid is mapped to the
+ *     caller's UID and GID 0 to the caller's GID inside the new user
+ *     namespace.
+ *
+ * In both cases "/" is then remounted recursively private.
+ *
+ * Returns true when strategy 1 was used ("outer" privilege, global UIDs)
+ * and false when a user namespace was created.  Exits the process if
+ * neither strategy works or any later step fails.
+ */
+static bool create_and_enter_ns(uid_t inner_uid)
+{
+	uid_t outer_uid;
+	gid_t outer_gid;
+	int i;
+	bool have_outer_privilege;
+
+	outer_uid = getuid();
+	outer_gid = getgid();
+
+	/*
+	 * TODO: If we're already root, we could skip creating the userns.
+	 */
+
+	if (unshare(CLONE_NEWNS) == 0) {
+		printf("[NOTE]\tUsing global UIDs for tests\n");
+		if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) != 0)
+			err(1, "PR_SET_KEEPCAPS");
+		if (setresuid(inner_uid, inner_uid, -1) != 0)
+			err(1, "setresuid");
+
+		// Re-enable effective caps
+		capng_get_caps_process();
+		/* CAP_LAST_CAP is itself a valid capability, so include it */
+		for (i = 0; i <= CAP_LAST_CAP; i++)
+			if (capng_have_capability(CAPNG_PERMITTED, i))
+				capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i);
+		if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+			err(1, "capng_apply");
+
+		have_outer_privilege = true;
+	} else if (unshare(CLONE_NEWUSER | CLONE_NEWNS) == 0) {
+		printf("[NOTE]\tUsing a user namespace for tests\n");
+		/* the setgroups file may not exist, hence the "maybe" variant */
+		maybe_write_file("/proc/self/setgroups", "deny");
+		write_file("/proc/self/uid_map", "%d %d 1", inner_uid, outer_uid);
+		write_file("/proc/self/gid_map", "0 %d 1", outer_gid);
+
+		have_outer_privilege = false;
+	} else {
+		errx(1, "must be root or be able to create a userns");
+	}
+
+	if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0)
+		err(1, "remount everything private");
+
+	return have_outer_privilege;
+}
+
+/*
+ * Replace the current working directory with a fresh tmpfs.
+ *
+ * A tmpfs (mode 0777) is mounted over ".", the saved cwd path is re-entered
+ * so that the process's cwd refers to the new mount, and the mount is then
+ * lazily detached with MNT_DETACH.
+ * NOTE(review): presumably the detach leaves the tmpfs reachable only
+ * through this process's cwd - confirm against umount2(2).
+ *
+ * Any failure terminates the process through err().
+ */
+static void chdir_to_tmpfs(void)
+{
+ char cwd[PATH_MAX];
+ if (getcwd(cwd, sizeof(cwd)) != cwd)
+ err(1, "getcwd");
+
+ if (mount("private_tmp", ".", "tmpfs", 0, "mode=0777") != 0)
+ err(1, "mount private tmpfs");
+
+ /* "." still names the covered directory; re-resolve the path */
+ if (chdir(cwd) != 0)
+ err(1, "chdir to private tmpfs");
+
+ if (umount2(".", MNT_DETACH) != 0)
+ err(1, "detach private tmpfs");
+}
+
+/*
+ * Copy the file @fromname, looked up relative to directory fd @fromfd,
+ * to a newly created file @toname (mode 0700) relative to the cwd.
+ *
+ * The destination must not already exist (O_EXCL).  Any open, read or
+ * write failure terminates the process through err().
+ */
+static void copy_fromat_to(int fromfd, const char *fromname, const char *toname)
+{
+	int from = openat(fromfd, fromname, O_RDONLY);
+	if (from == -1)
+		err(1, "open copy source");
+
+	int to = open(toname, O_CREAT | O_WRONLY | O_EXCL, 0700);
+	if (to == -1)
+		err(1, "open copy destination");
+
+	while (true) {
+		char buf[4096];
+		ssize_t sz = read(from, buf, sizeof(buf));
+		if (sz == 0)
+			break;
+		if (sz < 0)
+			err(1, "read");
+
+		if (write(to, buf, sz) != sz)
+			err(1, "write");	/* no short writes on tmpfs */
+	}
+
+	close(from);
+	close(to);
+}
+
+/*
+ * Fork and, in the parent, wait for the child and report its outcome.
+ *
+ * Returns true in the child (with the error counter reset to 0) and false
+ * in the parent once the child has been reaped.  In the parent, nerrs is
+ * incremented if the child did not exit normally or exited non-zero.
+ * A failing fork() terminates the process through err().
+ */
+static bool fork_wait(void)
+{
+	pid_t child;
+
+	/*
+	 * Flush pending output first: otherwise the child inherits the
+	 * unflushed stdio buffer and the same text is emitted twice when
+	 * stdout is a pipe or a file.
+	 */
+	fflush(stdout);
+
+	child = fork();
+	if (child == 0) {
+		nerrs = 0;
+		return true;
+	} else if (child > 0) {
+		int status;
+		if (waitpid(child, &status, 0) != child ||
+		    !WIFEXITED(status)) {
+			printf("[FAIL]\tChild died\n");
+			nerrs++;
+		} else if (WEXITSTATUS(status) != 0) {
+			printf("[FAIL]\tChild failed\n");
+			nerrs++;
+		} else {
+			printf("[OK]\tChild succeeded\n");
+		}
+
+		return false;
+	} else {
+		err(1, "fork");
+	}
+}
+
+/*
+ * Replace the current process with the helper binary @name, passing the
+ * expected effective/permitted/inheritable/ambient states as the strings
+ * "1" or "0" in argv[1..4].
+ *
+ * Only returns control to err() - i.e. exits - if execl() fails.
+ */
+static void exec_other_validate_cap(const char *name,
+ bool eff, bool perm, bool inh, bool ambient)
+{
+ execl(name, name, (eff ? "1" : "0"),
+ (perm ? "1" : "0"), (inh ? "1" : "0"), (ambient ? "1" : "0"),
+ NULL);
+ err(1, "execl");
+}
+
+/*
+ * Convenience wrapper: exec the plain "./validate_cap" helper from the
+ * current directory with the given expected capability states.
+ */
+static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient)
+{
+ exec_other_validate_cap("./validate_cap", eff, perm, inh, ambient);
+}
+
+/*
+ * Run the capability tests as @uid.
+ *
+ * @uid:      UID to run as inside the test namespace (0 or non-zero)
+ * @our_path: directory holding the validate_cap helper binary
+ *
+ * The function enters a private namespace, moves into a scratch tmpfs,
+ * copies the helper there (plus setuid/setgid variants when real privilege
+ * is available) and then walks through a fixed sequence of checks using
+ * CAP_NET_BIND_SERVICE and CAP_NET_RAW.  Checks that exec the helper run in
+ * a forked child so that this process survives to run the next one.
+ *
+ * Returns 0 if every check passed, 1 otherwise.  Setup failures terminate
+ * the process through err().
+ */
+static int do_tests(int uid, const char *our_path)
+{
+	bool have_outer_privilege = create_and_enter_ns(uid);
+
+	int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY);
+	if (ourpath_fd == -1)
+		err(1, "open '%s'", our_path);
+
+	chdir_to_tmpfs();
+
+	copy_fromat_to(ourpath_fd, "validate_cap", "validate_cap");
+
+	if (have_outer_privilege) {
+		gid_t gid = getegid();
+
+		copy_fromat_to(ourpath_fd, "validate_cap",
+			       "validate_cap_suidroot");
+		if (chown("validate_cap_suidroot", 0, -1) != 0)
+			err(1, "chown");
+		if (chmod("validate_cap_suidroot", S_ISUID | 0700) != 0)
+			err(1, "chmod");
+
+		copy_fromat_to(ourpath_fd, "validate_cap",
+			       "validate_cap_suidnonroot");
+		if (chown("validate_cap_suidnonroot", uid + 1, -1) != 0)
+			err(1, "chown");
+		if (chmod("validate_cap_suidnonroot", S_ISUID | 0700) != 0)
+			err(1, "chmod");
+
+		copy_fromat_to(ourpath_fd, "validate_cap",
+			       "validate_cap_sgidroot");
+		if (chown("validate_cap_sgidroot", -1, 0) != 0)
+			err(1, "chown");
+		if (chmod("validate_cap_sgidroot", S_ISGID | 0710) != 0)
+			err(1, "chmod");
+
+		copy_fromat_to(ourpath_fd, "validate_cap",
+			       "validate_cap_sgidnonroot");
+		if (chown("validate_cap_sgidnonroot", -1, gid + 1) != 0)
+			err(1, "chown");
+		if (chmod("validate_cap_sgidnonroot", S_ISGID | 0710) != 0)
+			err(1, "chmod");
+	}
+
+	capng_get_caps_process();
+
+	/* Make sure that i starts out clear */
+	capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
+	if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+		err(1, "capng_apply");
+
+	if (uid == 0) {
+		printf("[RUN]\tRoot => ep\n");
+		if (fork_wait())
+			exec_validate_cap(true, true, false, false);
+	} else {
+		printf("[RUN]\tNon-root => no caps\n");
+		if (fork_wait())
+			exec_validate_cap(false, false, false, false);
+	}
+
+	printf("[OK]\tCheck cap_ambient manipulation rules\n");
+
+	/* We should not be able to add ambient caps yet. */
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != -1 || errno != EPERM) {
+		if (errno == EINVAL)
+			printf("[FAIL]\tPR_CAP_AMBIENT_RAISE isn't supported\n");
+		else
+			printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed eith EPERM on a non-inheritable cap\n");
+		return 1;
+	}
+	printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-inheritable cap\n");
+
+	/* inheritable but not permitted: raising ambient must still fail */
+	capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_RAW);
+	capng_update(CAPNG_DROP, CAPNG_PERMITTED, CAP_NET_RAW);
+	capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, CAP_NET_RAW);
+	if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+		err(1, "capng_apply");
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_RAW, 0, 0, 0) != -1 || errno != EPERM) {
+		printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed on a non-permitted cap\n");
+		return 1;
+	}
+	printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-permitted cap\n");
+
+	capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
+	if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+		err(1, "capng_apply");
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
+		printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have succeeded\n");
+		return 1;
+	}
+	printf("[OK]\tPR_CAP_AMBIENT_RAISE worked\n");
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 1) {
+		printf("[FAIL]\tPR_CAP_AMBIENT_IS_SET is broken\n");
+		return 1;
+	}
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0, 0) != 0)
+		err(1, "PR_CAP_AMBIENT_CLEAR_ALL");
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
+		printf("[FAIL]\tPR_CAP_AMBIENT_CLEAR_ALL didn't work\n");
+		return 1;
+	}
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0)
+		err(1, "PR_CAP_AMBIENT_RAISE");
+
+	capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
+	if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+		err(1, "capng_apply");
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
+		printf("[FAIL]\tDropping I should have dropped A\n");
+		return 1;
+	}
+
+	printf("[OK]\tBasic manipulation appears to work\n");
+
+	capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
+	if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+		err(1, "capng_apply");
+	if (uid == 0) {
+		printf("[RUN]\tRoot +i => eip\n");
+		if (fork_wait())
+			exec_validate_cap(true, true, true, false);
+	} else {
+		printf("[RUN]\tNon-root +i => i\n");
+		if (fork_wait())
+			exec_validate_cap(false, false, true, false);
+	}
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0)
+		err(1, "PR_CAP_AMBIENT_RAISE");
+
+	printf("[RUN]\tUID %d +ia => eipa\n", uid);
+	if (fork_wait())
+		exec_validate_cap(true, true, true, true);
+
+	/* The remaining tests need real privilege */
+
+	if (!have_outer_privilege) {
+		printf("[SKIP]\tSUID/SGID tests (needs privilege)\n");
+		goto done;
+	}
+
+	if (uid == 0) {
+		printf("[RUN]\tRoot +ia, suidroot => eipa\n");
+		if (fork_wait())
+			exec_other_validate_cap("./validate_cap_suidroot",
+						true, true, true, true);
+
+		printf("[RUN]\tRoot +ia, suidnonroot => ip\n");
+		if (fork_wait())
+			exec_other_validate_cap("./validate_cap_suidnonroot",
+						false, true, true, false);
+
+		printf("[RUN]\tRoot +ia, sgidroot => eipa\n");
+		if (fork_wait())
+			exec_other_validate_cap("./validate_cap_sgidroot",
+						true, true, true, true);
+
+		if (fork_wait()) {
+			printf("[RUN]\tRoot, gid != 0, +ia, sgidroot => eip\n");
+			if (setresgid(1, 1, 1) != 0)
+				err(1, "setresgid");
+			exec_other_validate_cap("./validate_cap_sgidroot",
+						true, true, true, false);
+		}
+
+		printf("[RUN]\tRoot +ia, sgidnonroot => eip\n");
+		if (fork_wait())
+			exec_other_validate_cap("./validate_cap_sgidnonroot",
+						true, true, true, false);
+	} else {
+		printf("[RUN]\tNon-root +ia, sgidnonroot => i\n");
+		/*
+		 * exec in a child: exec'ing here directly would replace the
+		 * test process, so the sgidroot check below would never run
+		 * and the accumulated nerrs result would be lost.
+		 */
+		if (fork_wait())
+			exec_other_validate_cap("./validate_cap_sgidnonroot",
+						false, false, true, false);
+
+		if (fork_wait()) {
+			printf("[RUN]\tNon-root +ia, sgidroot => i\n");
+			if (setresgid(1, 1, 1) != 0)
+				err(1, "setresgid");
+			exec_other_validate_cap("./validate_cap_sgidroot",
+						false, false, true, false);
+		}
+	}
+
+done:
+	return nerrs ? 1 : 0;
+}
+
+/*
+ * Entry point: derive the directory containing this binary from argv[0],
+ * then run the full test sequence twice in forked children - once with
+ * uid 0 and once with uid 1.
+ *
+ * Each child returns the result of do_tests() as its exit status; the
+ * parent returns 1 if fork_wait() recorded any failing child, 0 otherwise.
+ */
+int main(int argc, char **argv)
+{
+ char *tmp1, *tmp2, *our_path;
+
+ /* Find our path */
+ tmp1 = strdup(argv[0]);
+ if (!tmp1)
+ err(1, "strdup");
+ /* dirname() may modify or point into its argument, so copy the result */
+ tmp2 = dirname(tmp1);
+ our_path = strdup(tmp2);
+ if (!our_path)
+ err(1, "strdup");
+ free(tmp1);
+
+ if (fork_wait()) {
+ printf("[RUN]\t+++ Tests with uid == 0 +++\n");
+ return do_tests(0, our_path);
+ }
+
+ if (fork_wait()) {
+ printf("[RUN]\t+++ Tests with uid != 0 +++\n");
+ return do_tests(1, our_path);
+ }
+
+ return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c
new file mode 100644
index 000000000000..dd3c45f7b23c
--- /dev/null
+++ b/tools/testing/selftests/capabilities/validate_cap.c
@@ -0,0 +1,73 @@
+#include <cap-ng.h>
+#include <err.h>
+#include <linux/capability.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/prctl.h>
+#include <sys/auxv.h>
+
+#ifndef PR_CAP_AMBIENT
+#define PR_CAP_AMBIENT 47
+# define PR_CAP_AMBIENT_IS_SET 1
+# define PR_CAP_AMBIENT_RAISE 2
+# define PR_CAP_AMBIENT_LOWER 3
+# define PR_CAP_AMBIENT_CLEAR_ALL 4
+#endif
+
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 19)
+# define HAVE_GETAUXVAL
+#endif
+
+/*
+ * Parse argv[i] as a strict boolean: "0" yields false, "1" yields true.
+ * Any other string terminates the process through errx() with status 1.
+ */
+static bool bool_arg(char **argv, int i)
+{
+ if (!strcmp(argv[i], "0"))
+ return false;
+ else if (!strcmp(argv[i], "1"))
+ return true;
+ else
+ errx(1, "wrong argv[%d]", i);
+}
+
+/*
+ * Helper executed by test_execve: checks that CAP_NET_BIND_SERVICE has the
+ * expected state in the effective, permitted, inheritable and ambient sets
+ * of this process.
+ *
+ * argv[1..4] are "0" or "1" and give the expected effective, permitted,
+ * inheritable and ambient state, in that order.
+ *
+ * Exits 0 when all four states match, 1 on the first mismatch (after
+ * printing which set was wrong) or on malformed arguments.
+ */
+int main(int argc, char **argv)
+{
+ const char *atsec = "";
+
+ /*
+ * Be careful just in case a setgid or setcapped copy of this
+ * helper gets out.
+ */
+
+ if (argc != 5)
+ errx(1, "wrong argc");
+
+ /* AT_SECURE is only reported in the failure messages as a debugging aid */
+#ifdef HAVE_GETAUXVAL
+ if (getauxval(AT_SECURE))
+ atsec = " (AT_SECURE is set)";
+ else
+ atsec = " (AT_SECURE is not set)";
+#endif
+
+ capng_get_caps_process();
+
+ if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) {
+ printf("[FAIL]\tWrong effective state%s\n", atsec);
+ return 1;
+ }
+ if (capng_have_capability(CAPNG_PERMITTED, CAP_NET_BIND_SERVICE) != bool_arg(argv, 2)) {
+ printf("[FAIL]\tWrong permitted state%s\n", atsec);
+ return 1;
+ }
+ if (capng_have_capability(CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 3)) {
+ printf("[FAIL]\tWrong inheritable state%s\n", atsec);
+ return 1;
+ }
+
+ /* a prctl() error (-1) never equals 0 or 1, so it is reported as a mismatch */
+ if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != bool_arg(argv, 4)) {
+ printf("[FAIL]\tWrong ambient state%s\n", atsec);
+ return 1;
+ }
+
+ printf("[OK]\tCapabilities after execve were correct\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 231b9a031f6a..0d6854744b37 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -8,10 +8,13 @@ BINARIES += hugetlbfstest
BINARIES += map_hugetlb
BINARIES += thuge-gen
BINARIES += transhuge-stress
+BINARIES += userfaultfd
all: $(BINARIES)
%: %.c
$(CC) $(CFLAGS) -o $@ $^ -lrt
+userfaultfd: userfaultfd.c
+ $(CC) $(CFLAGS) -O2 -o $@ $^ -lpthread
TEST_PROGS := run_vmtests
TEST_FILES := $(BINARIES)
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 49ece11ff7fd..831adeb5fc55 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -86,6 +86,17 @@ else
echo "[PASS]"
fi
+echo "--------------------"
+echo "running userfaultfd"
+echo "--------------------"
+./userfaultfd 128 32
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
#cleanup
umount $mnt
rm -rf $mnt
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
new file mode 100644
index 000000000000..76071b14cb93
--- /dev/null
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -0,0 +1,636 @@
+/*
+ * Stress userfaultfd syscall.
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * This test allocates two virtual areas and bounces the physical
+ * memory across the two virtual areas (from area_src to area_dst)
+ * using userfaultfd.
+ *
+ * There are three threads running per CPU:
+ *
+ * 1) one per-CPU thread takes a per-page pthread_mutex in a random
+ * page of the area_dst (while the physical page may still be in
+ * area_src), and increments a per-page counter in the same page,
+ * and checks its value against a verification region.
+ *
+ * 2) another per-CPU thread handles the userfaults generated by
+ * thread 1 above. userfaultfd blocking reads or poll() modes are
+ * exercised interleaved.
+ *
+ * 3) one last per-CPU thread transfers the memory in the background
+ * at maximum bandwidth (if not already transferred by thread
+ * 2). Each per-CPU thread takes care of transferring its own portion
+ * of the area.
+ *
+ * When all threads of type 3 completed the transfer, one bounce is
+ * complete. area_src and area_dst are then swapped. All threads are
+ * respawned and so the bounce is immediately restarted in the
+ * opposite direction.
+ *
+ * per-CPU threads 1 by triggering userfaults inside
+ * pthread_mutex_lock will also verify the atomicity of the memory
+ * transfer (UFFDIO_COPY).
+ *
+ * The program takes two parameters: the amounts of physical memory in
+ * megabytes (MiB) of the area and the number of bounces to execute.
+ *
+ * # 100MiB 99999 bounces
+ * ./userfaultfd 100 99999
+ *
+ * # 1GiB 99 bounces
+ * ./userfaultfd 1000 99
+ *
+ * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
+ * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <poll.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include "../../../../include/uapi/linux/userfaultfd.h"
+
+/*
+ * Fallback userfaultfd syscall numbers for the architectures this test
+ * knows about. Skipped entirely if the number is already defined by
+ * the system headers; any other architecture fails the build.
+ */
+#ifndef __NR_userfaultfd
+#ifdef __x86_64__
+#define __NR_userfaultfd 323
+#elif defined(__i386__)
+#define __NR_userfaultfd 374
+#elif defined(__powerpc__)
+#define __NR_userfaultfd 364
+#else
+#error "missing __NR_userfaultfd definition"
+#endif
+#endif
+
+/* Geometry of the test, computed once in main(). */
+static unsigned long nr_cpus;
+static unsigned long nr_pages;
+static unsigned long nr_pages_per_cpu;
+static unsigned long page_size;
+
+/*
+ * "bounces" is the number of bounces left to run; its low bits are
+ * also tested against the flags below to select the mode of the
+ * current bounce.
+ */
+#define BOUNCE_RANDOM		(1<<0)
+#define BOUNCE_RACINGFAULTS	(1<<1)
+#define BOUNCE_VERIFY		(1<<2)
+#define BOUNCE_POLL		(1<<3)
+static int bounces;
+
+/* expected value of the per-page counter, indexed by page number */
+static unsigned long long *count_verify;
+/* the userfaultfd file descriptor */
+static int uffd;
+/* set to 1 by stress() to make the locking threads leave their loop */
+static int finished;
+/* one pipe (two descriptors) per cpu, used to stop the poll threads */
+static int *pipefd;
+/* source and destination of the current bounce, swapped after each one */
+static char *area_src;
+static char *area_dst;
+/* one page of zeroes, compared against pages of area_dst */
+static char *zeropage;
+/* attributes shared by every thread created by stress() */
+pthread_attr_t attr;
+
+/*
+ * Per-page mutex: page ___nr of ___area starts with a pthread_mutex_t
+ * at page offset 0. ___area must be a char pointer so that the
+ * addition is done in bytes.
+ */
+#define area_mutex(___area, ___nr) \
+ ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
+/*
+ * Per-page counter: it is placed in the same page, right after the
+ * pthread_mutex_t, with its address rounded up to a multiple of
+ * sizeof(unsigned long long) so that it is naturally aligned (this
+ * avoids alignment faults on non-x86 archs). The pointer is volatile
+ * qualified, so every access through it is a real memory access.
+ */
+#define area_count(___area, ___nr) \
+ ((volatile unsigned long long *) ((unsigned long) \
+ ((___area) + (___nr)*page_size + \
+ sizeof(pthread_mutex_t) + \
+ sizeof(unsigned long long) - 1) & \
+ ~(unsigned long)(sizeof(unsigned long long) \
+ - 1)))
+
+/*
+ * Compare the first n bytes of str1 and str2 one byte at a time, in
+ * increasing address order.
+ *
+ * Returns 1 as soon as a differing byte is found, 0 if all n bytes
+ * are equal (which includes n == 0).
+ */
+static int my_bcmp(char *str1, char *str2, size_t n)
+{
+	char *end = str1 + n;
+
+	while (str1 < end) {
+		if (*str1++ != *str2++)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Thread body: keep touching pages of area_dst until "finished" is set.
+ *
+ * arg is the cpu index cast to a pointer. For every iteration a page
+ * is selected (randomly with BOUNCE_RANDOM, sequentially otherwise;
+ * unless BOUNCE_RACINGFAULTS is set each cpu gets its own seed or its
+ * own starting page), then:
+ *  - with BOUNCE_VERIFY the page is first read without the mutex and
+ *    the test aborts if its counter is zero or the page is all zeroes;
+ *  - the per-page mutex is taken, the per-page counter is checked
+ *    against count_verify[] and both are incremented.
+ * A warning is printed when one iteration took more than one second.
+ *
+ * Any inconsistency terminates the whole process with exit(1).
+ * Always returns NULL.
+ */
+static void *locking_thread(void *arg)
+{
+	unsigned long cpu = (unsigned long) arg;
+	struct random_data rand;
+	/*
+	 * Always assigned before it is used (in the else branch below or
+	 * at the top of the loop); the initializer only keeps the
+	 * compiler quiet without reading an indeterminate value.
+	 */
+	unsigned long page_nr = 0;
+	int32_t rand_nr;
+	unsigned long long count;
+	char randstate[64];
+	unsigned int seed;
+	time_t start;
+
+	if (bounces & BOUNCE_RANDOM) {
+		seed = (unsigned int) time(NULL) - bounces;
+		if (!(bounces & BOUNCE_RACINGFAULTS))
+			seed += cpu;
+		bzero(&rand, sizeof(rand));
+		bzero(&randstate, sizeof(randstate));
+		if (initstate_r(seed, randstate, sizeof(randstate), &rand))
+			fprintf(stderr, "initstate_r error\n"), exit(1);
+	} else {
+		page_nr = -bounces;
+		if (!(bounces & BOUNCE_RACINGFAULTS))
+			page_nr += cpu * nr_pages_per_cpu;
+	}
+
+	while (!finished) {
+		if (bounces & BOUNCE_RANDOM) {
+			if (random_r(&rand, &rand_nr))
+				fprintf(stderr, "random_r 1 error\n"), exit(1);
+			page_nr = rand_nr;
+			if (sizeof(page_nr) > sizeof(rand_nr)) {
+				if (random_r(&rand, &rand_nr))
+					fprintf(stderr, "random_r 2 error\n"), exit(1);
+				/*
+				 * Two 16 bit shifts instead of one 32 bit
+				 * shift: the latter is not a valid
+				 * expression where unsigned long is only
+				 * 32 bits wide, even if never executed.
+				 */
+				page_nr |= (((unsigned long) rand_nr) << 16) << 16;
+			}
+		} else
+			page_nr += 1;
+		page_nr %= nr_pages;
+
+		start = time(NULL);
+		if (bounces & BOUNCE_VERIFY) {
+			count = *area_count(area_dst, page_nr);
+			if (!count)
+				fprintf(stderr,
+					"page_nr %lu wrong count %Lu %Lu\n",
+					page_nr, count,
+					count_verify[page_nr]), exit(1);
+
+
+			/*
+			 * We can't use bcmp (or memcmp) because that
+			 * returns 0 erroneously if the memory is
+			 * changing under it (even if the end of the
+			 * page is never changing and always
+			 * different).
+			 */
+#if 1
+			if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
+				     page_size))
+				fprintf(stderr,
+					"my_bcmp page_nr %lu wrong count %Lu %Lu\n",
+					page_nr, count,
+					count_verify[page_nr]), exit(1);
+#else
+			unsigned long loops;
+
+			loops = 0;
+			/* uncomment the below line to test with mutex */
+			/* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
+			while (!bcmp(area_dst + page_nr * page_size, zeropage,
+				     page_size)) {
+				loops += 1;
+				if (loops > 10)
+					break;
+			}
+			/* uncomment below line to test with mutex */
+			/* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
+			if (loops) {
+				fprintf(stderr,
+					"page_nr %lu all zero thread %lu %p %lu\n",
+					page_nr, cpu, area_dst + page_nr * page_size,
+					loops);
+				if (loops > 10)
+					exit(1);
+			}
+#endif
+		}
+
+		/* the counter is only modified with the page mutex held */
+		pthread_mutex_lock(area_mutex(area_dst, page_nr));
+		count = *area_count(area_dst, page_nr);
+		if (count != count_verify[page_nr]) {
+			fprintf(stderr,
+				"page_nr %lu memory corruption %Lu %Lu\n",
+				page_nr, count,
+				count_verify[page_nr]), exit(1);
+		}
+		count++;
+		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
+		pthread_mutex_unlock(area_mutex(area_dst, page_nr));
+
+		if (time(NULL) - start > 1)
+			fprintf(stderr,
+				"userfault too slow %ld "
+				"possible false positive with overcommit\n",
+				time(NULL) - start);
+	}
+
+	return NULL;
+}
+
+/*
+ * Ask the kernel to atomically copy one page, located "offset" bytes
+ * from the start of the areas, from area_src into area_dst with
+ * UFFDIO_COPY.
+ *
+ * Returns 1 if this call copied the page, 0 if the ioctl failed with
+ * uffdio_copy.copy set to -EEXIST. An offset outside of the area or
+ * any other outcome of the ioctl terminates the process with exit(1).
+ */
+static int copy_page(unsigned long offset)
+{
+	struct uffdio_copy uffdio_copy;
+
+	if (offset >= nr_pages * page_size)
+		fprintf(stderr, "unexpected offset %lu\n",
+			offset), exit(1);
+
+	uffdio_copy.src = (unsigned long) area_src + offset;
+	uffdio_copy.dst = (unsigned long) area_dst + offset;
+	uffdio_copy.len = page_size;
+	uffdio_copy.mode = 0;
+	uffdio_copy.copy = 0;
+
+	if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == 0) {
+		/* on success a full page must have been copied */
+		if (uffdio_copy.copy != page_size)
+			fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
+				uffdio_copy.copy), exit(1);
+		return 1;
+	}
+
+	/* the real return value is in uffdio_copy.copy */
+	if (uffdio_copy.copy != -EEXIST)
+		fprintf(stderr, "UFFDIO_COPY error %Ld\n",
+			uffdio_copy.copy), exit(1);
+	return 0;
+}
+
+/*
+ * Thread body: resolve userfaults using poll() plus read() on uffd.
+ *
+ * arg is the cpu index cast to a pointer; it selects the pipe whose
+ * read end (pipefd[cpu*2]) is polled together with uffd. One byte
+ * readable on that pipe makes the thread leave its loop.
+ *
+ * Every message read from uffd must be a non-write page fault; the
+ * faulting page is then filled with copy_page(). Anything unexpected
+ * terminates the process with exit(1).
+ *
+ * Returns, cast to void *, the number of faults for which copy_page()
+ * returned 1.
+ */
+static void *uffd_poll_thread(void *arg)
+{
+	unsigned long cpu = (unsigned long) arg;
+	struct pollfd pollfd[2];
+	struct uffd_msg msg;
+	int ret;
+	unsigned long offset;
+	char tmp_chr;
+	unsigned long userfaults = 0;
+
+	pollfd[0].fd = uffd;
+	pollfd[0].events = POLLIN;
+	pollfd[1].fd = pipefd[cpu*2];
+	pollfd[1].events = POLLIN;
+
+	for (;;) {
+		ret = poll(pollfd, 2, -1);
+		if (!ret)
+			fprintf(stderr, "poll error %d\n", ret), exit(1);
+		if (ret < 0)
+			perror("poll"), exit(1);
+		/* the pipe is checked first: it is the request to stop */
+		if (pollfd[1].revents & POLLIN) {
+			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
+				fprintf(stderr, "read pipefd error\n"),
+					exit(1);
+			break;
+		}
+		if (!(pollfd[0].revents & POLLIN))
+			fprintf(stderr, "pollfd[0].revents %d\n",
+				pollfd[0].revents), exit(1);
+		ret = read(uffd, &msg, sizeof(msg));
+		if (ret < 0) {
+			/* nothing to read for this thread: poll again */
+			if (errno == EAGAIN)
+				continue;
+			perror("nonblocking read error"), exit(1);
+		}
+		/* never parse a partially filled message */
+		if (ret != sizeof(msg))
+			fprintf(stderr, "short read\n"), exit(1);
+		if (msg.event != UFFD_EVENT_PAGEFAULT)
+			fprintf(stderr, "unexpected msg event %u\n",
+				msg.event), exit(1);
+		if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+			fprintf(stderr, "unexpected write fault\n"), exit(1);
+		/* round the fault address down to the start of its page */
+		offset = (char *)msg.arg.pagefault.address - area_dst;
+		offset &= ~(page_size-1);
+		if (copy_page(offset))
+			userfaults++;
+	}
+	return (void *)userfaults;
+}
+
+/*
+ * Start-up handshake with uffd_read_thread(): the thread unlocks this
+ * mutex once it has initialized its fault counter.
+ */
+pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Thread body: resolve userfaults using blocking read() on uffd.
+ *
+ * arg points to the unsigned long where this thread counts the faults
+ * for which copy_page() returned 1. The counter is zeroed before
+ * uffd_read_mutex is unlocked.
+ *
+ * The loop has no exit: read errors, short reads and unexpected
+ * messages terminate the process with exit(1).
+ */
+static void *uffd_read_thread(void *arg)
+{
+	unsigned long *fault_counter = (unsigned long *) arg;
+	struct uffd_msg msg;
+	unsigned long offset;
+	int nread;
+
+	*fault_counter = 0;
+
+	pthread_mutex_unlock(&uffd_read_mutex);
+	/* from here cancellation is ok */
+
+	for (;;) {
+		nread = read(uffd, &msg, sizeof(msg));
+		if (nread < 0)
+			perror("blocking read error"), exit(1);
+		if (nread != sizeof(msg))
+			fprintf(stderr, "short read\n"), exit(1);
+
+		if (msg.event != UFFD_EVENT_PAGEFAULT)
+			fprintf(stderr, "unexpected msg event %u\n",
+				msg.event), exit(1);
+		/* write faults are only an error in verify mode */
+		if ((bounces & BOUNCE_VERIFY) &&
+		    (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE))
+			fprintf(stderr, "unexpected write fault\n"), exit(1);
+
+		/* round the fault address down to the start of its page */
+		offset = (char *)msg.arg.pagefault.address - area_dst;
+		offset &= ~(page_size-1);
+		if (copy_page(offset))
+			*fault_counter += 1;
+	}
+	return (void *)NULL;
+}
+
+/*
+ * Thread body: call copy_page() on every page of the slice of the
+ * area that belongs to this cpu, i.e. pages
+ * [cpu * nr_pages_per_cpu, (cpu + 1) * nr_pages_per_cpu).
+ *
+ * arg is the cpu index cast to a pointer. The return value of
+ * copy_page() is ignored. Always returns NULL.
+ */
+static void *background_thread(void *arg)
+{
+	unsigned long cpu = (unsigned long) arg;
+	unsigned long nr = cpu * nr_pages_per_cpu;
+	unsigned long end = (cpu+1) * nr_pages_per_cpu;
+
+	while (nr < end) {
+		copy_page(nr * page_size);
+		nr++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Run one bounce: for every cpu start a locking thread, a userfault
+ * handling thread (poll or blocking read flavour, depending on
+ * BOUNCE_POLL) and a background copy thread, then tear them down in
+ * this order: background threads, userfault threads, locking threads.
+ *
+ * userfaults must point to nr_cpus entries; on success entry "cpu"
+ * holds the number of userfaults resolved by the userfault thread of
+ * that cpu.
+ *
+ * Returns 0 on success, 1 if any thread, madvise or pipe operation
+ * failed (threads already started are not cleaned up in that case).
+ */
+static int stress(unsigned long *userfaults)
+{
+	unsigned long cpu;
+	pthread_t locking_threads[nr_cpus];
+	pthread_t uffd_threads[nr_cpus];
+	pthread_t background_threads[nr_cpus];
+	/*
+	 * Same array seen as void * slots: poll threads return their
+	 * count through pthread_join(), read threads get a pointer to
+	 * their slot and update it in place.
+	 */
+	void **_userfaults = (void **) userfaults;
+	/* byte sent to the poll threads; only its arrival matters */
+	char c = 0;
+
+	finished = 0;
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		if (pthread_create(&locking_threads[cpu], &attr,
+				   locking_thread, (void *)cpu))
+			return 1;
+		if (bounces & BOUNCE_POLL) {
+			if (pthread_create(&uffd_threads[cpu], &attr,
+					   uffd_poll_thread, (void *)cpu))
+				return 1;
+		} else {
+			if (pthread_create(&uffd_threads[cpu], &attr,
+					   uffd_read_thread,
+					   &_userfaults[cpu]))
+				return 1;
+			/*
+			 * Wait for the read thread to zero its counter
+			 * and release the mutex before going on.
+			 */
+			pthread_mutex_lock(&uffd_read_mutex);
+		}
+		if (pthread_create(&background_threads[cpu], &attr,
+				   background_thread, (void *)cpu))
+			return 1;
+	}
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		if (pthread_join(background_threads[cpu], NULL))
+			return 1;
+
+	/*
+	 * Be strict and immediately zap area_src, the whole area has
+	 * been transferred already by the background threads. The
+	 * area_src could then be faulted in in a racy way by still
+	 * running uffd threads reading zeropages after we zapped
+	 * area_src (but they're guaranteed to get -EEXIST from
+	 * UFFDIO_COPY without writing zero pages into area_dst
+	 * because the background threads already completed).
+	 */
+	if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) {
+		perror("madvise");
+		return 1;
+	}
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		if (bounces & BOUNCE_POLL) {
+			/* wake the poll thread up and collect its count */
+			if (write(pipefd[cpu*2+1], &c, 1) != 1) {
+				fprintf(stderr, "pipefd write error\n");
+				return 1;
+			}
+			if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
+				return 1;
+		} else {
+			/* the read thread never returns: cancel it */
+			if (pthread_cancel(uffd_threads[cpu]))
+				return 1;
+			if (pthread_join(uffd_threads[cpu], NULL))
+				return 1;
+		}
+	}
+
+	finished = 1;
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		if (pthread_join(locking_threads[cpu], NULL))
+			return 1;
+
+	return 0;
+}
+
+/*
+ * Set up the two areas, the userfaultfd, the verification array, the
+ * per-cpu pipes and the thread attributes, then run "bounces" bounces.
+ *
+ * For every bounce: select blocking or non blocking mode on uffd,
+ * register area_dst for missing faults, zap area_dst, run stress(),
+ * unregister, optionally verify area_dst (BOUNCE_VERIFY), swap
+ * area_src and area_dst and print the per-cpu userfault counts.
+ *
+ * Returns 0 if every bounce completed and no verification failed, 1
+ * otherwise. A verification failure stops the run after the current
+ * bounce. Resources are not released: the caller is expected to exit.
+ */
+static int userfaultfd_stress(void)
+{
+	void *area;
+	char *tmp_area;
+	unsigned long nr;
+	struct uffdio_register uffdio_register;
+	struct uffdio_api uffdio_api;
+	unsigned long cpu;
+	int uffd_flags;
+	int err = 0;
+	unsigned long userfaults[nr_cpus];
+
+	if (posix_memalign(&area, page_size, nr_pages * page_size)) {
+		fprintf(stderr, "out of memory\n");
+		return 1;
+	}
+	area_src = area;
+	if (posix_memalign(&area, page_size, nr_pages * page_size)) {
+		fprintf(stderr, "out of memory\n");
+		return 1;
+	}
+	area_dst = area;
+
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+	if (uffd < 0) {
+		fprintf(stderr,
+			"userfaultfd syscall not available in this kernel\n");
+		return 1;
+	}
+	/*
+	 * These flags are written back with F_SETFL below, so they have
+	 * to be the file status flags (F_GETFL), not the descriptor
+	 * flags (F_GETFD).
+	 */
+	uffd_flags = fcntl(uffd, F_GETFL, NULL);
+	if (uffd_flags < 0) {
+		perror("fcntl F_GETFL");
+		return 1;
+	}
+
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = 0;
+	if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
+		fprintf(stderr, "UFFDIO_API\n");
+		return 1;
+	}
+	if (uffdio_api.api != UFFD_API) {
+		fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
+		return 1;
+	}
+
+	count_verify = malloc(nr_pages * sizeof(unsigned long long));
+	if (!count_verify) {
+		perror("count_verify");
+		return 1;
+	}
+
+	/* every page starts with an unlocked mutex and a counter of 1 */
+	for (nr = 0; nr < nr_pages; nr++) {
+		*area_mutex(area_src, nr) = (pthread_mutex_t)
+			PTHREAD_MUTEX_INITIALIZER;
+		count_verify[nr] = *area_count(area_src, nr) = 1;
+	}
+
+	pipefd = malloc(sizeof(int) * nr_cpus * 2);
+	if (!pipefd) {
+		perror("pipefd");
+		return 1;
+	}
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
+			perror("pipe");
+			return 1;
+		}
+	}
+
+	if (posix_memalign(&area, page_size, page_size)) {
+		fprintf(stderr, "out of memory\n");
+		return 1;
+	}
+	zeropage = area;
+	bzero(zeropage, page_size);
+
+	/*
+	 * Held by this thread from now on: stress() re-locks it after
+	 * spawning each read thread, which unlocks it once started.
+	 */
+	pthread_mutex_lock(&uffd_read_mutex);
+
+	pthread_attr_init(&attr);
+	pthread_attr_setstacksize(&attr, 16*1024*1024);
+
+	while (bounces--) {
+		unsigned long expected_ioctls;
+
+		printf("bounces: %d, mode:", bounces);
+		if (bounces & BOUNCE_RANDOM)
+			printf(" rnd");
+		if (bounces & BOUNCE_RACINGFAULTS)
+			printf(" racing");
+		if (bounces & BOUNCE_VERIFY)
+			printf(" ver");
+		if (bounces & BOUNCE_POLL)
+			printf(" poll");
+		printf(", ");
+		fflush(stdout);
+
+		/* poll mode reads uffd non blocking, read mode blocking */
+		if (bounces & BOUNCE_POLL)
+			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+		else
+			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
+
+		/* register */
+		uffdio_register.range.start = (unsigned long) area_dst;
+		uffdio_register.range.len = nr_pages * page_size;
+		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+			fprintf(stderr, "register failure\n");
+			return 1;
+		}
+		expected_ioctls = (1 << _UFFDIO_WAKE) |
+				  (1 << _UFFDIO_COPY) |
+				  (1 << _UFFDIO_ZEROPAGE);
+		if ((uffdio_register.ioctls & expected_ioctls) !=
+		    expected_ioctls) {
+			fprintf(stderr,
+				"unexpected missing ioctl for anon memory\n");
+			return 1;
+		}
+
+		/*
+		 * The madvise done previously isn't enough: some
+		 * uffd_thread could have read userfaults (one of
+		 * those already resolved by the background thread)
+		 * and it may be in the process of calling
+		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
+		 * area_src and it would map a zero page in it (of
+		 * course such a UFFDIO_COPY is perfectly safe as it'd
+		 * return -EEXIST). The problem comes at the next
+		 * bounce though: that racing UFFDIO_COPY would
+		 * generate zeropages in the area_src, so invalidating
+		 * the previous MADV_DONTNEED. Without this additional
+		 * MADV_DONTNEED those zeropages leftovers in the
+		 * area_src would lead to -EEXIST failure during the
+		 * next bounce, effectively leaving a zeropage in the
+		 * area_dst.
+		 *
+		 * Try commenting out this madvise to see the memory
+		 * corruption being caught pretty quick.
+		 *
+		 * khugepaged is also inhibited to collapse THP after
+		 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
+		 * required to MADV_DONTNEED here.
+		 */
+		if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) {
+			perror("madvise 2");
+			return 1;
+		}
+
+		/* bounce pass */
+		if (stress(userfaults))
+			return 1;
+
+		/* unregister */
+		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
+			fprintf(stderr, "unregister failure\n");
+			return 1;
+		}
+
+		/*
+		 * verification: on failure remember the error and clear
+		 * "bounces" so that this is the last bounce.
+		 */
+		if (bounces & BOUNCE_VERIFY) {
+			for (nr = 0; nr < nr_pages; nr++) {
+				/* every mutex must look like the first one */
+				if (my_bcmp(area_dst,
+					    area_dst + nr * page_size,
+					    sizeof(pthread_mutex_t))) {
+					fprintf(stderr,
+						"error mutex 2 %lu\n",
+						nr);
+					err = 1;
+					bounces = 0;
+				}
+				if (*area_count(area_dst, nr) != count_verify[nr]) {
+					fprintf(stderr,
+						"error area_count %Lu %Lu %lu\n",
+						*area_count(area_dst, nr),
+						count_verify[nr],
+						nr);
+					err = 1;
+					bounces = 0;
+				}
+			}
+		}
+
+		/* prepare next bounce */
+		tmp_area = area_src;
+		area_src = area_dst;
+		area_dst = tmp_area;
+
+		printf("userfaults:");
+		for (cpu = 0; cpu < nr_cpus; cpu++)
+			printf(" %lu", userfaults[cpu]);
+		printf("\n");
+	}
+
+	return err;
+}
+
+/*
+ * Entry point: ./userfaultfd <MiB> <bounces>
+ *
+ * argv[1] is the size of each area in MiB, argv[2] the number of
+ * bounces to run; both must be positive. The size is rounded down to
+ * a whole number of pages per online cpu.
+ *
+ * Exits with 1 on bad arguments, 2 if the system parameters cannot be
+ * read or a page cannot hold the per-page mutex and counter;
+ * otherwise returns the result of userfaultfd_stress().
+ */
+int main(int argc, char **argv)
+{
+	long cpus, psize, mib;
+
+	if (argc < 3)
+		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+
+	/* sysconf() returns -1 on failure: check before going unsigned */
+	cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	psize = sysconf(_SC_PAGE_SIZE);
+	if (cpus <= 0 || psize <= 0)
+		fprintf(stderr, "sysconf failure\n"), exit(2);
+	nr_cpus = cpus;
+	page_size = psize;
+
+	/* the mutex and the aligned counter must fit in a single page */
+	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) >
+	    page_size)
+		fprintf(stderr, "Impossible to run this test\n"), exit(2);
+
+	/* a negative size would wrap to a huge unsigned page count */
+	mib = atol(argv[1]);
+	if (mib <= 0) {
+		fprintf(stderr, "invalid MiB\n");
+		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+	}
+	nr_pages_per_cpu = mib * 1024*1024 / page_size /
+		nr_cpus;
+	if (!nr_pages_per_cpu) {
+		fprintf(stderr, "invalid MiB\n");
+		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+	}
+
+	bounces = atoi(argv[2]);
+	if (bounces <= 0) {
+		fprintf(stderr, "invalid bounces\n");
+		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
+	}
+
+	nr_pages = nr_pages_per_cpu * nr_cpus;
+	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
+	       nr_pages, nr_pages_per_cpu);
+	return userfaultfd_stress();
+}
OpenPOWER on IntegriCloud