From 0898a16a362d436464b34fa644d0d46efc81df92 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:50 +0000 Subject: lib/vdso: Add unlikely() hint into vdso_read_begin() Place the branch with no concurrent write before the contended case. Performance numbers for Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz (more clock_gettime() cycles - the better): | before | after ----------------------------------- | 150252214 | 153242367 | 150301112 | 153324800 | 150392773 | 153125401 | 150373957 | 153399355 | 150303157 | 153489417 | 150365237 | 153494270 ----------------------------------- avg | 150331408 | 153345935 diff % | 2 | 0 ----------------------------------- stdev % | 0.3 | 0.1 Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Tested-by: Vincenzo Frascino Reviewed-by: Vincenzo Frascino Link: https://lore.kernel.org/r/20191112012724.250792-2-dima@arista.com --- include/vdso/helpers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h index 01641dbb68ef..9a2af9fca45e 100644 --- a/include/vdso/helpers.h +++ b/include/vdso/helpers.h @@ -10,7 +10,7 @@ static __always_inline u32 vdso_read_begin(const struct vdso_data *vd) { u32 seq; - while ((seq = READ_ONCE(vd->seq)) & 1) + while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) cpu_relax(); smp_rmb(); -- cgit v1.2.3 From 769071ac9f20b6a447410c7eaa55d1a5233ef40c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:52 +0000 Subject: ns: Introduce Time Namespace Time Namespace isolates clock values. The kernel provides access to several clocks CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, etc. CLOCK_REALTIME System-wide clock that measures real (i.e., wall-clock) time. CLOCK_MONOTONIC Clock that cannot be set and represents monotonic time since some unspecified starting point. CLOCK_BOOTTIME Identical to CLOCK_MONOTONIC, except it also includes any time that the system is suspended. For many users, the time namespace means the ability to changes date and time in a container (CLOCK_REALTIME). Providing per namespace notions of CLOCK_REALTIME would be complex with a massive overhead, but has a dubious value. But in the context of checkpoint/restore functionality, monotonic and boottime clocks become interesting. Both clocks are monotonic with unspecified starting points. These clocks are widely used to measure time slices and set timers. After restoring or migrating processes, it has to be guaranteed that they never go backward. In an ideal case, the behavior of these clocks should be the same as for a case when a whole system is suspended. All this means that it is required to set CLOCK_MONOTONIC and CLOCK_BOOTTIME clocks, which can be achieved by adding per-namespace offsets for clocks. A time namespace is similar to a pid namespace in the way how it is created: unshare(CLONE_NEWTIME) system call creates a new time namespace, but doesn't set it to the current process. Then all children of the process will be born in the new time namespace, or a process can use the setns() system call to join a namespace. This scheme allows setting clock offsets for a namespace, before any processes appear in it. All available clone flags have been used, so CLONE_NEWTIME uses the highest bit of CSIGNAL. It means that it can be used only with the unshare() and the clone3() system calls. [ tglx: Adjusted paragraph about clone3() to reality and massaged the changelog a bit. ] Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://criu.org/Time_namespace Link: https://lists.openvz.org/pipermail/criu/2018-June/041504.html Link: https://lore.kernel.org/r/20191112012724.250792-4-dima@arista.com --- MAINTAINERS | 2 + fs/proc/namespaces.c | 4 + include/linux/nsproxy.h | 2 + include/linux/proc_ns.h | 3 + include/linux/time_namespace.h | 71 ++++++++++++++ include/linux/user_namespace.h | 1 + include/uapi/linux/sched.h | 6 ++ init/Kconfig | 7 ++ kernel/fork.c | 16 ++- kernel/nsproxy.c | 41 ++++++-- kernel/time/Makefile | 1 + kernel/time/namespace.c | 217 +++++++++++++++++++++++++++++++++++++++++ 12 files changed, 361 insertions(+), 10 deletions(-) create mode 100644 include/linux/time_namespace.h create mode 100644 kernel/time/namespace.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 8982c6e013b3..f6d00023e56f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13214,6 +13214,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core S: Maintained F: fs/timerfd.c F: include/linux/timer* +F: include/linux/time_namespace.h +F: kernel/time_namespace.c F: kernel/time/*timer* POWER MANAGEMENT CORE diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index dd2b35f78b09..8b5c720fe5d7 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -33,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_CGROUPS &cgroupns_operations, #endif +#ifdef CONFIG_TIME_NS + &timens_operations, + &timens_for_children_operations, +#endif }; static const char *proc_ns_get_link(struct dentry *dentry, diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 2ae1b1a4d84d..074f395b9ad2 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -35,6 +35,8 @@ struct nsproxy { struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; + struct time_namespace *time_ns; + struct time_namespace *time_ns_for_children; struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index d31cb6215905..d312e6281e69 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -32,6 +32,8 @@ extern const struct proc_ns_operations pidns_for_children_operations; extern const struct proc_ns_operations userns_operations; extern const struct proc_ns_operations mntns_operations; extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; /* * We always define these enumerators @@ -43,6 +45,7 @@ enum { PROC_USER_INIT_INO = 0xEFFFFFFDU, PROC_PID_INIT_INO = 0xEFFFFFFCU, PROC_CGROUP_INIT_INO = 0xEFFFFFFBU, + PROC_TIME_INIT_INO = 0xEFFFFFFAU, }; #ifdef CONFIG_PROC_FS diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h new file mode 100644 index 000000000000..8c74cc12ad24 --- /dev/null +++ b/include/linux/time_namespace.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TIMENS_H +#define _LINUX_TIMENS_H + + +#include +#include +#include +#include +#include + +struct user_namespace; +extern struct user_namespace init_user_ns; + +struct time_namespace { + struct kref kref; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct ns_common ns; +} __randomize_layout; + +extern struct time_namespace init_time_ns; + +#ifdef CONFIG_TIME_NS +static inline struct time_namespace *get_time_ns(struct time_namespace *ns) +{ + kref_get(&ns->kref); + return ns; +} + +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, + struct time_namespace *old_ns); +void free_time_ns(struct kref *kref); +int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); + +static inline void put_time_ns(struct time_namespace *ns) +{ + kref_put(&ns->kref, free_time_ns); +} + +#else +static inline struct time_namespace *get_time_ns(struct time_namespace *ns) +{ + return NULL; +} + +static inline void put_time_ns(struct time_namespace *ns) +{ +} + +static inline +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, + struct time_namespace *old_ns) +{ + if (flags & CLONE_NEWTIME) + return ERR_PTR(-EINVAL); + + return old_ns; +} + +static inline int timens_on_fork(struct nsproxy *nsproxy, + struct task_struct *tsk) +{ + return 0; +} + +#endif + +#endif /* _LINUX_TIMENS_H */ diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index fb9f4f799554..6ef1c7109fc4 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -45,6 +45,7 @@ enum ucount_type { UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, + UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 4a0217832464..2e3bc22c6f20 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -36,6 +36,12 @@ /* Flags for the clone3() syscall. */ #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ +/* + * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 + * syscalls only: + */ +#define CLONE_NEWTIME 0x00000080 /* New time namespace */ + #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall diff --git a/init/Kconfig b/init/Kconfig index a34064a031a5..b34314fc75f7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1080,6 +1080,13 @@ config UTS_NS In this namespace tasks see different info provided with the uname() system call +config TIME_NS + bool "TIME namespace" + default y + help + In this namespace boottime and monotonic clocks can be set. + The time will keep going with the same pace. + config IPC_NS bool "IPC namespace" depends on (SYSVIPC || POSIX_MQUEUE) diff --git a/kernel/fork.c b/kernel/fork.c index 2508a4f238a3..363595815144 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1832,6 +1832,7 @@ static __latent_entropy struct task_struct *copy_process( struct multiprocess_signals delayed; struct file *pidfile = NULL; u64 clone_flags = args->flags; + struct nsproxy *nsp = current->nsproxy; /* * Don't allow sharing the root directory with processes in a different @@ -1874,8 +1875,16 @@ static __latent_entropy struct task_struct *copy_process( */ if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || - (task_active_pid_ns(current) != - current->nsproxy->pid_ns_for_children)) + (task_active_pid_ns(current) != nsp->pid_ns_for_children)) + return ERR_PTR(-EINVAL); + } + + /* + * If the new process will be in a different time namespace + * do not allow it to share VM or a thread group with the forking task. + */ + if (clone_flags & (CLONE_THREAD | CLONE_VM)) { + if (nsp->time_ns != nsp->time_ns_for_children) return ERR_PTR(-EINVAL); } @@ -2811,7 +2820,8 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| + CLONE_NEWTIME)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index c815f58e6bc0..ed9882108cd2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,10 @@ struct nsproxy init_nsproxy = { #ifdef CONFIG_CGROUPS .cgroup_ns = &init_cgroup_ns, #endif +#ifdef CONFIG_TIME_NS + .time_ns = &init_time_ns, + .time_ns_for_children = &init_time_ns, +#endif }; static inline struct nsproxy *create_nsproxy(void) @@ -106,8 +111,18 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_net; } + new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns, + tsk->nsproxy->time_ns_for_children); + if (IS_ERR(new_nsp->time_ns_for_children)) { + err = PTR_ERR(new_nsp->time_ns_for_children); + goto out_time; + } + new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns); + return new_nsp; +out_time: + put_net(new_nsp->net_ns); out_net: put_cgroup_ns(new_nsp->cgroup_ns); out_cgroup: @@ -136,15 +151,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; + int ret; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | - CLONE_NEWCGROUP)))) { - get_nsproxy(old_ns); - return 0; - } - - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + CLONE_NEWCGROUP | CLONE_NEWTIME)))) { + if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) { + get_nsproxy(old_ns); + return 0; + } + } else if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; /* @@ -162,6 +178,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (IS_ERR(new_ns)) return PTR_ERR(new_ns); + ret = timens_on_fork(new_ns, tsk); + if (ret) { + free_nsproxy(new_ns); + return ret; + } + tsk->nsproxy = new_ns; return 0; } @@ -176,6 +198,10 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns_for_children) put_pid_ns(ns->pid_ns_for_children); + if (ns->time_ns) + put_time_ns(ns->time_ns); + if (ns->time_ns_for_children) + put_time_ns(ns->time_ns_for_children); put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); @@ -192,7 +218,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP))) + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP | + CLONE_NEWTIME))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 1867044800bb..c8f00168afe8 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o +obj-$(CONFIG_TIME_NS) += namespace.o diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c new file mode 100644 index 000000000000..2662a69e0382 --- /dev/null +++ b/kernel/time/namespace.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: Andrei Vagin + * Author: Dmitry Safonov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct ucounts *inc_time_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); +} + +static void dec_time_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); +} + +/** + * clone_time_ns - Clone a time namespace + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * Clone @old_ns and set the clone refcount to 1 + * + * Return: The new namespace or ERR_PTR. + */ +static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, + struct time_namespace *old_ns) +{ + struct time_namespace *ns; + struct ucounts *ucounts; + int err; + + err = -ENOSPC; + ucounts = inc_time_namespaces(user_ns); + if (!ucounts) + goto fail; + + err = -ENOMEM; + ns = kmalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + goto fail_dec; + + kref_init(&ns->kref); + + err = ns_alloc_inum(&ns->ns); + if (err) + goto fail_free; + + ns->ucounts = ucounts; + ns->ns.ops = &timens_operations; + ns->user_ns = get_user_ns(user_ns); + return ns; + +fail_free: + kfree(ns); +fail_dec: + dec_time_namespaces(ucounts); +fail: + return ERR_PTR(err); +} + +/** + * copy_time_ns - Create timens_for_children from @old_ns + * @flags: Cloning flags + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; + * adds a refcounter to @old_ns otherwise. + * + * Return: timens_for_children namespace or ERR_PTR. + */ +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, struct time_namespace *old_ns) +{ + if (!(flags & CLONE_NEWTIME)) + return get_time_ns(old_ns); + + return clone_time_ns(user_ns, old_ns); +} + +void free_time_ns(struct kref *kref) +{ + struct time_namespace *ns; + + ns = container_of(kref, struct time_namespace, kref); + dec_time_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} + +static struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} + +static struct ns_common *timens_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static struct ns_common *timens_for_children_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns_for_children; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void timens_put(struct ns_common *ns) +{ + put_time_ns(to_time_ns(ns)); +} + +static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) +{ + struct time_namespace *ns = to_time_ns(new); + + if (!current_is_single_threaded()) + return -EUSERS; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EPERM; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns_for_children); + nsproxy->time_ns_for_children = ns; + return 0; +} + +int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) +{ + struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; + struct time_namespace *ns = to_time_ns(nsc); + + /* create_new_namespaces() already incremented the ref counter */ + if (nsproxy->time_ns == nsproxy->time_ns_for_children) + return 0; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + return 0; +} + +static struct user_namespace *timens_owner(struct ns_common *ns) +{ + return to_time_ns(ns)->user_ns; +} + +const struct proc_ns_operations timens_operations = { + .name = "time", + .type = CLONE_NEWTIME, + .get = timens_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +const struct proc_ns_operations timens_for_children_operations = { + .name = "time_for_children", + .type = CLONE_NEWTIME, + .get = timens_for_children_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +struct time_namespace init_time_ns = { + .kref = KREF_INIT(3), + .user_ns = &init_user_ns, + .ns.inum = PROC_TIME_INIT_INO, + .ns.ops = &timens_operations, +}; + +static int __init time_ns_init(void) +{ + return 0; +} +subsys_initcall(time_ns_init); -- cgit v1.2.3 From af993f58d69ee9c1f421dfc87c3ed231c113989c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:53 +0000 Subject: time: Add timens_offsets to be used for tasks in time namespace Introduce offsets for time namespace. They will contain an adjustment needed to convert clocks to/from host's. A new namespace is created with the same offsets as the time namespace of the current process. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-5-dima@arista.com --- include/linux/time_namespace.h | 22 ++++++++++++++++++++++ kernel/time/namespace.c | 2 ++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 8c74cc12ad24..d7e3b4994e31 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -12,11 +12,17 @@ struct user_namespace; extern struct user_namespace init_user_ns; +struct timens_offsets { + struct timespec64 monotonic; + struct timespec64 boottime; +}; + struct time_namespace { struct kref kref; struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; + struct timens_offsets offsets; } __randomize_layout; extern struct time_namespace init_time_ns; @@ -39,6 +45,20 @@ static inline void put_time_ns(struct time_namespace *ns) kref_put(&ns->kref, free_time_ns); } +static inline void timens_add_monotonic(struct timespec64 *ts) +{ + struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets; + + *ts = timespec64_add(*ts, ns_offsets->monotonic); +} + +static inline void timens_add_boottime(struct timespec64 *ts) +{ + struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets; + + *ts = timespec64_add(*ts, ns_offsets->boottime); +} + #else static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { @@ -66,6 +86,8 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, return 0; } +static inline void timens_add_monotonic(struct timespec64 *ts) { } +static inline void timens_add_boottime(struct timespec64 *ts) { } #endif #endif /* _LINUX_TIMENS_H */ diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 2662a69e0382..c2a58e45fc4b 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -14,6 +14,7 @@ #include #include #include +#include static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { @@ -60,6 +61,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, ns->ucounts = ucounts; ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); + ns->offsets = old_ns->offsets; return ns; fail_free: -- cgit v1.2.3 From 89dd8eecfe961fab4924dcd14f80cf2ab2820044 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:01 +0000 Subject: time: Add do_timens_ktime_to_host() helper The helper subtracts namespace's clock offset from the given time and ensures that the result is within [0, KTIME_MAX]. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-13-dima@arista.com --- include/linux/time_namespace.h | 17 +++++++++++++++++ kernel/time/namespace.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'include') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index d7e3b4994e31..34ee110b5c35 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -59,6 +59,19 @@ static inline void timens_add_boottime(struct timespec64 *ts) *ts = timespec64_add(*ts, ns_offsets->boottime); } +ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, + struct timens_offsets *offsets); + +static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) +{ + struct time_namespace *ns = current->nsproxy->time_ns; + + if (likely(ns == &init_time_ns)) + return tim; + + return do_timens_ktime_to_host(clockid, tim, &ns->offsets); +} + #else static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { @@ -88,6 +101,10 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, static inline void timens_add_monotonic(struct timespec64 *ts) { } static inline void timens_add_boottime(struct timespec64 *ts) { } +static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) +{ + return tim; +} #endif #endif /* _LINUX_TIMENS_H */ diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index c2a58e45fc4b..1a0fbaa5d2d4 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -16,6 +16,42 @@ #include #include +ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, + struct timens_offsets *ns_offsets) +{ + ktime_t offset; + + switch (clockid) { + case CLOCK_MONOTONIC: + offset = timespec64_to_ktime(ns_offsets->monotonic); + break; + case CLOCK_BOOTTIME: + case CLOCK_BOOTTIME_ALARM: + offset = timespec64_to_ktime(ns_offsets->boottime); + break; + default: + return tim; + } + + /* + * Check that @tim value is in [offset, KTIME_MAX + offset] + * and subtract offset. + */ + if (tim < offset) { + /* + * User can specify @tim *absolute* value - if it's lesser than + * the time namespace's offset - it's already expired. + */ + tim = 0; + } else { + tim = ktime_sub(tim, offset); + if (unlikely(tim > KTIME_MAX)) + tim = KTIME_MAX; + } + + return tim; +} + static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); -- cgit v1.2.3 From ea2d1f7fce0f18b67f915c00c6a7a6860116bc92 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:05 +0000 Subject: hrtimers: Prepare hrtimer_nanosleep() for time namespaces clock_nanosleep() accepts absolute values of expiration time when TIMER_ABSTIME flag is set. This absolute value is inside the task's time namespace, and has to be converted to the host's time. There is timens_ktime_to_host() helper for converting time, but it accepts ktime argument. As a preparation, make hrtimer_nanosleep() accept a clock value in ktime instead of timespec64. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-17-dima@arista.com --- include/linux/hrtimer.h | 3 +-- kernel/time/hrtimer.c | 12 +++++++----- kernel/time/posix-stubs.c | 4 ++-- kernel/time/posix-timers.c | 4 +++- tools/perf/examples/bpf/5sec.c | 6 ++++-- 5 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1f98b52118f0..15c8ac313678 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -508,8 +508,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer, /* Precise sleep: */ extern int nanosleep_copyout(struct restart_block *, struct timespec64 *); -extern long hrtimer_nanosleep(const struct timespec64 *rqtp, - const enum hrtimer_mode mode, +extern long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid); extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8de90ea31280..d8b62f93fc8d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1910,8 +1910,8 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) return ret; } -long hrtimer_nanosleep(const struct timespec64 *rqtp, - const enum hrtimer_mode mode, const clockid_t clockid) +long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; @@ -1923,7 +1923,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp, slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); + hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; @@ -1958,7 +1958,8 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, + CLOCK_MONOTONIC); } #endif @@ -1978,7 +1979,8 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, + CLOCK_MONOTONIC); } #endif diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index bcbaa2045f5e..5745a138f254 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -147,7 +147,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -236,7 +236,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 473082b0b57f..75fee6e39e5a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1221,7 +1221,9 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { - return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ? + ktime_t texp = timespec64_to_ktime(*rqtp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/tools/perf/examples/bpf/5sec.c b/tools/perf/examples/bpf/5sec.c index b9c203219691..e6b6181c6dc6 100644 --- a/tools/perf/examples/bpf/5sec.c +++ b/tools/perf/examples/bpf/5sec.c @@ -41,9 +41,11 @@ #include -int probe(hrtimer_nanosleep, rqtp->tv_sec)(void *ctx, int err, long sec) +#define NSEC_PER_SEC 1000000000L + +int probe(hrtimer_nanosleep, rqtp)(void *ctx, int err, long long sec) { - return sec == 5; + return sec / NSEC_PER_SEC == 5ULL; } license(GPL); -- cgit v1.2.3 From 660fd04f9317172ae90f414c68b18a26ae88a829 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Nov 2019 01:27:09 +0000 Subject: lib/vdso: Prepare for time namespace support To support time namespaces in the vdso with a minimal impact on regular non time namespace affected tasks, the namespace handling needs to be hidden in a slow path. The most obvious place is vdso_seq_begin(). If a task belongs to a time namespace then the VVAR page which contains the system wide vdso data is replaced with a namespace specific page which has the same layout as the VVAR page. That page has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. The extra check in the case that vdso_data->seq is odd, e.g. a concurrent update of the vdso data is in progress, is not really affecting regular tasks which are not part of a time namespace as the task is spin waiting for the update to finish and vdso_data->seq to become even again. If a time namespace task hits that code path, it invokes the corresponding time getter function which retrieves the real VVAR page, reads host time and then adds the offset for the requested clock which is stored in the special VVAR page. If VDSO time namespace support is disabled the whole magic is compiled out. Initial testing shows that the disabled case is almost identical to the host case which does not take the slow timens path. With the special timens page installed the performance hit is constant time and in the range of 5-7%. For the vdso functions which are not using the sequence count an unconditional check for vdso_data->clock_mode is added which switches to the real vdso when the clock_mode is VCLOCK_TIMENS. [avagin: Make do_hres_timens() work with raw clocks too: choose vdso_data pointer by CS_RAW offset.] Suggested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-21-dima@arista.com --- include/linux/time.h | 6 ++ include/vdso/datapage.h | 19 ++++++- init/Kconfig | 1 + lib/vdso/Kconfig | 6 ++ lib/vdso/gettimeofday.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 169 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/time.h b/include/linux/time.h index 8e10b9dbd8c2..8ef5e5cc9f57 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -110,4 +110,10 @@ static inline bool itimerspec64_valid(const struct itimerspec64 *its) * Equivalent to !(time_before32(@t, @l) || time_after32(@t, @h)). */ #define time_between32(t, l, h) ((u32)(h) - (u32)(l) >= (u32)(t) - (u32)(l)) + +struct timens_offset { + s64 sec; + u64 nsec; +}; + #endif diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 2e302c0f41f7..c5f347cc5e55 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -21,6 +21,8 @@ #define CS_RAW 1 #define CS_BASES (CS_RAW + 1) +#define VCLOCK_TIMENS UINT_MAX + /** * struct vdso_timestamp - basetime per clock_id * @sec: seconds @@ -48,6 +50,7 @@ struct vdso_timestamp { * @mult: clocksource multiplier * @shift: clocksource shift * @basetime[clock_id]: basetime per clock_id + * @offset[clock_id]: time namespace offset per clock_id * @tz_minuteswest: minutes west of Greenwich * @tz_dsttime: type of DST correction * @hrtimer_res: hrtimer resolution @@ -55,6 +58,17 @@ struct vdso_timestamp { * * vdso_data will be accessed by 64 bit and compat code at the same time * so we should be careful before modifying this structure. + * + * @basetime is used to store the base time for the system wide time getter + * VVAR page. + * + * @offset is used by the special time namespace VVAR pages which are + * installed instead of the real VVAR page. These namespace pages must set + * @seq to 1 and @clock_mode to VLOCK_TIMENS to force the code into the + * time namespace slow path. The namespace aware functions retrieve the + * real system wide VVAR page, read host time and add the per clock offset. + * For clocks which are not affected by time namespace adjustment the + * offset must be zero. */ struct vdso_data { u32 seq; @@ -65,7 +79,10 @@ struct vdso_data { u32 mult; u32 shift; - struct vdso_timestamp basetime[VDSO_BASES]; + union { + struct vdso_timestamp basetime[VDSO_BASES]; + struct timens_offset offset[VDSO_BASES]; + }; s32 tz_minuteswest; s32 tz_dsttime; diff --git a/init/Kconfig b/init/Kconfig index b34314fc75f7..9b7f144a6d35 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1082,6 +1082,7 @@ config UTS_NS config TIME_NS bool "TIME namespace" + depends on GENERIC_VDSO_TIME_NS default y help In this namespace boottime and monotonic clocks can be set. diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 9fe698ff62ec..d883ac299508 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -24,4 +24,10 @@ config GENERIC_COMPAT_VDSO help This config option enables the compat VDSO layer. +config GENERIC_VDSO_TIME_NS + bool + help + Selected by architectures which support time namespaces in the + VDSO + endif diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index b453d2469b63..f342ac1fce77 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -38,15 +38,89 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) } #endif +#ifdef CONFIG_TIME_NS +static int do_hres_timens(const struct vdso_data *vdns, clockid_t clk, + struct __kernel_timespec *ts) +{ + const struct vdso_data *vd = __arch_get_timens_vdso_data(); + const struct timens_offset *offs = &vdns->offset[clk]; + const struct vdso_timestamp *vdso_ts; + u64 cycles, last, ns; + u32 seq; + s64 sec; + + if (clk != CLOCK_MONOTONIC_RAW) + vd = &vd[CS_HRES_COARSE]; + else + vd = &vd[CS_RAW]; + vdso_ts = &vd->basetime[clk]; + + do { + seq = vdso_read_begin(vd); + cycles = __arch_get_hw_counter(vd->clock_mode); + ns = vdso_ts->nsec; + last = vd->cycle_last; + if (unlikely((s64)cycles < 0)) + return -1; + + ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult); + ns >>= vd->shift; + sec = vdso_ts->sec; + } while (unlikely(vdso_read_retry(vd, seq))); + + /* Add the namespace offset */ + sec += offs->sec; + ns += offs->nsec; + + /* + * Do this outside the loop: a race inside the loop could result + * in __iter_div_u64_rem() being extremely slow. + */ + ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return 0; +} +#else +static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void) +{ + return NULL; +} + +static int do_hres_timens(const struct vdso_data *vdns, clockid_t clk, + struct __kernel_timespec *ts) +{ + return -EINVAL; +} +#endif + static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, - struct __kernel_timespec *ts) + struct __kernel_timespec *ts) { const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; u64 cycles, last, sec, ns; u32 seq; do { - seq = vdso_read_begin(vd); + /* + * Open coded to handle VCLOCK_TIMENS. Time namespace + * enabled tasks have a special VVAR page installed which + * has vd->seq set to 1 and vd->clock_mode set to + * VCLOCK_TIMENS. For non time namespace affected tasks + * this does not affect performance because if vd->seq is + * odd, i.e. a concurrent update is in progress the extra + * check for vd->clock_mode is just a few extra + * instructions while spin waiting for vd->seq to become + * even again. + */ + while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) { + if (IS_ENABLED(CONFIG_TIME_NS) && + vd->clock_mode == VCLOCK_TIMENS) + return do_hres_timens(vd, clk, ts); + cpu_relax(); + } + smp_rmb(); + cycles = __arch_get_hw_counter(vd->clock_mode); ns = vdso_ts->nsec; last = vd->cycle_last; @@ -68,6 +142,43 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, return 0; } +#ifdef CONFIG_TIME_NS +static int do_coarse_timens(const struct vdso_data *vdns, clockid_t clk, + struct __kernel_timespec *ts) +{ + const struct vdso_data *vd = __arch_get_timens_vdso_data(); + const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; + const struct timens_offset *offs = &vdns->offset[clk]; + u64 nsec; + s64 sec; + s32 seq; + + do { + seq = vdso_read_begin(vd); + sec = vdso_ts->sec; + nsec = vdso_ts->nsec; + } while (unlikely(vdso_read_retry(vd, seq))); + + /* Add the namespace offset */ + sec += offs->sec; + nsec += offs->nsec; + + /* + * Do this outside the loop: a race inside the loop could result + * in __iter_div_u64_rem() being extremely slow. + */ + ts->tv_sec = sec + __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); + ts->tv_nsec = nsec; + return 0; +} +#else +static int do_coarse_timens(const struct vdso_data *vdns, clockid_t clk, + struct __kernel_timespec *ts) +{ + return -1; +} +#endif + static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, struct __kernel_timespec *ts) { @@ -75,7 +186,18 @@ static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, u32 seq; do { - seq = vdso_read_begin(vd); + /* + * Open coded to handle VCLOCK_TIMENS. See comment in + * do_hres(). + */ + while ((seq = READ_ONCE(vd->seq)) & 1) { + if (IS_ENABLED(CONFIG_TIME_NS) && + vd->clock_mode == VCLOCK_TIMENS) + return do_coarse_timens(vd, clk, ts); + cpu_relax(); + } + smp_rmb(); + ts->tv_sec = vdso_ts->sec; ts->tv_nsec = vdso_ts->nsec; } while (unlikely(vdso_read_retry(vd, seq))); @@ -156,6 +278,10 @@ __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) } if (unlikely(tz != NULL)) { + if (IS_ENABLED(CONFIG_TIME_NS) && + vd->clock_mode == VCLOCK_TIMENS) + vd = __arch_get_timens_vdso_data(); + tz->tz_minuteswest = vd[CS_HRES_COARSE].tz_minuteswest; tz->tz_dsttime = vd[CS_HRES_COARSE].tz_dsttime; } @@ -167,7 +293,12 @@ __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time) { const struct vdso_data *vd = __arch_get_vdso_data(); - __kernel_old_time_t t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec); + __kernel_old_time_t t; + + if (IS_ENABLED(CONFIG_TIME_NS) && vd->clock_mode == VCLOCK_TIMENS) + vd = __arch_get_timens_vdso_data(); + + t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec); if (time) *time = t; @@ -189,6 +320,9 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) if (unlikely((u32) clock >= MAX_CLOCKS)) return -1; + if (IS_ENABLED(CONFIG_TIME_NS) && vd->clock_mode == VCLOCK_TIMENS) + vd = __arch_get_timens_vdso_data(); + hrtimer_res = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res); /* * Convert the clockid to a bitmask and use it to check which -- cgit v1.2.3 From 64b302ab66c5965702693e79690823ca120288b9 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:10 +0000 Subject: x86/vdso: Provide vdso_data offset on vvar_page VDSO support for time namespaces needs to set up a page with the same layout as VVAR. That timens page will be placed on position of VVAR page inside namespace. That page has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. To prepare the time namespace page the kernel needs to know the vdso_data offset. Provide arch_get_vdso_data() helper for locating vdso_data on VVAR page. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-22-dima@arista.com --- arch/x86/entry/vdso/vdso-layout.lds.S | 2 -- arch/x86/entry/vdso/vma.c | 11 +++++++++++ arch/x86/include/asm/vvar.h | 8 ++++---- arch/x86/kernel/vmlinux.lds.S | 4 +--- include/linux/time_namespace.h | 1 + 5 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 93c6dc7812d0..2330daad67c3 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -21,9 +21,7 @@ SECTIONS /* Place all vvars at the offsets in asm/vvar.h. */ #define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; -#define __VVAR_KERNEL_LDS #include -#undef __VVAR_KERNEL_LDS #undef EMIT_VVAR pvclock_page = vvar_start + PAGE_SIZE; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 76cbe54e0c39..04e3498c6c41 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -24,6 +24,17 @@ #include #include +#undef _ASM_X86_VVAR_H +#define EMIT_VVAR(name, offset) \ + const size_t name ## _offset = offset; +#include + +struct vdso_data *arch_get_vdso_data(void *vvar_page) +{ + return (struct vdso_data *)(vvar_page + _vdso_data_offset); +} +#undef EMIT_VVAR + #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; #endif diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 32f5d9a0b90e..ff2de3025388 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -19,10 +19,10 @@ #ifndef _ASM_X86_VVAR_H #define _ASM_X86_VVAR_H -#if defined(__VVAR_KERNEL_LDS) - -/* The kernel linker script defines its own magic to put vvars in the - * right place. +#ifdef EMIT_VVAR +/* + * EMIT_VVAR() is used by the kernel linker script to put vvars in the + * right place. Also, it's used by kernel code to import offsets values. */ #define DECLARE_VVAR(offset, type, name) \ EMIT_VVAR(name, offset) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 3a1a819da137..e3296aa028fe 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -193,12 +193,10 @@ SECTIONS __vvar_beginning_hack = .; /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ +#define EMIT_VVAR(name, offset) \ . = __vvar_beginning_hack + offset; \ *(.vvar_ ## name) -#define __VVAR_KERNEL_LDS #include -#undef __VVAR_KERNEL_LDS #undef EMIT_VVAR /* diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 34ee110b5c35..063a343d1d78 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -39,6 +39,7 @@ struct time_namespace *copy_time_ns(unsigned long flags, struct time_namespace *old_ns); void free_time_ns(struct kref *kref); int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); +struct vdso_data *arch_get_vdso_data(void *vvar_page); static inline void put_time_ns(struct time_namespace *ns) { -- cgit v1.2.3 From afaa7b5ac7c87479fb5a626f87d2157af30d6401 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:12 +0000 Subject: time: Allocate per-timens vvar page VDSO support for Time namespace needs to set up a page with the same layout as VVAR. That timens page will be placed on position of VVAR page inside namespace. That page contains time namespace clock offsets and it has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. Allocate the timens page during namespace creation. Setup the offsets when the first task enters the ns and freeze them to guarantee the pace of monotonic/boottime clocks and to avoid breakage of applications. The design decision is to have a global offset_lock which is used during namespace offsets setup and to freeze offsets when the first task joins the new time namespace. That is better in terms of memory usage compared to having a per namespace mutex that's used only during the setup period. Suggested-by: Andy Lutomirski Based-on-work-by: Thomas Gleixner Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-24-dima@arista.com --- include/linux/time_namespace.h | 3 ++ kernel/time/namespace.c | 104 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 106 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 063a343d1d78..6b7767f7df4a 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -23,6 +23,9 @@ struct time_namespace { struct ucounts *ucounts; struct ns_common ns; struct timens_offsets offsets; + struct page *vvar_page; + /* If set prevents changing offsets after any task joined namespace. */ + bool frozen_offsets; } __randomize_layout; extern struct time_namespace init_time_ns; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 1a0fbaa5d2d4..d705c15d0273 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -16,6 +16,8 @@ #include #include +#include + ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *ns_offsets) { @@ -90,16 +92,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, kref_init(&ns->kref); + ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!ns->vvar_page) + goto fail_free; + err = ns_alloc_inum(&ns->ns); if (err) - goto fail_free; + goto fail_free_page; ns->ucounts = ucounts; ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; + ns->frozen_offsets = false; return ns; +fail_free_page: + __free_page(ns->vvar_page); fail_free: kfree(ns); fail_dec: @@ -128,6 +137,93 @@ struct time_namespace *copy_time_ns(unsigned long flags, return clone_time_ns(user_ns, old_ns); } +static struct timens_offset offset_from_ts(struct timespec64 off) +{ + struct timens_offset ret; + + ret.sec = off.tv_sec; + ret.nsec = off.tv_nsec; + + return ret; +} + +/* + * A time namespace VVAR page has the same layout as the VVAR page which + * contains the system wide VDSO data. + * + * For a normal task the VVAR pages are installed in the normal ordering: + * VVAR + * PVCLOCK + * HVCLOCK + * TIMENS <- Not really required + * + * Now for a timens task the pages are installed in the following order: + * TIMENS + * PVCLOCK + * HVCLOCK + * VVAR + * + * The check for vdso_data->clock_mode is in the unlikely path of + * the seq begin magic. So for the non-timens case most of the time + * 'seq' is even, so the branch is not taken. + * + * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check + * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * update to finish and for 'seq' to become even anyway. + * + * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces + * the time namespace handling path. + */ +static void timens_setup_vdso_data(struct vdso_data *vdata, + struct time_namespace *ns) +{ + struct timens_offset *offset = vdata->offset; + struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); + struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); + + vdata->seq = 1; + vdata->clock_mode = VCLOCK_TIMENS; + offset[CLOCK_MONOTONIC] = monotonic; + offset[CLOCK_MONOTONIC_RAW] = monotonic; + offset[CLOCK_MONOTONIC_COARSE] = monotonic; + offset[CLOCK_BOOTTIME] = boottime; + offset[CLOCK_BOOTTIME_ALARM] = boottime; +} + +/* + * Protects possibly multiple offsets writers racing each other + * and tasks entering the namespace. + */ +static DEFINE_MUTEX(offset_lock); + +static void timens_set_vvar_page(struct task_struct *task, + struct time_namespace *ns) +{ + struct vdso_data *vdata; + unsigned int i; + + if (ns == &init_time_ns) + return; + + /* Fast-path, taken by every task in namespace except the first. */ + if (likely(ns->frozen_offsets)) + return; + + mutex_lock(&offset_lock); + /* Nothing to-do: vvar_page has been already initialized. */ + if (ns->frozen_offsets) + goto out; + + ns->frozen_offsets = true; + vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + + for (i = 0; i < CS_BASES; i++) + timens_setup_vdso_data(&vdata[i], ns); + +out: + mutex_unlock(&offset_lock); +} + void free_time_ns(struct kref *kref) { struct time_namespace *ns; @@ -136,6 +232,7 @@ void free_time_ns(struct kref *kref) dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); + __free_page(ns->vvar_page); kfree(ns); } @@ -192,6 +289,8 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; + timens_set_vvar_page(current, ns); + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; @@ -211,6 +310,8 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) if (nsproxy->time_ns == nsproxy->time_ns_for_children) return 0; + timens_set_vvar_page(tsk, ns); + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; @@ -246,6 +347,7 @@ struct time_namespace init_time_ns = { .user_ns = &init_user_ns, .ns.inum = PROC_TIME_INIT_INO, .ns.ops = &timens_operations, + .frozen_offsets = true, }; static int __init time_ns_init(void) -- cgit v1.2.3 From 70ddf65184ec1e8989322f35193e4fde7377f0cc Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:15 +0000 Subject: x86/vdso: Zap vvar pages when switching to a time namespace The VVAR page layout depends on whether a task belongs to the root or non-root time namespace. Whenever a task changes its namespace, the VVAR page tables are cleared and then they will be re-faulted with a corresponding layout. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-27-dima@arista.com --- arch/x86/entry/vdso/vma.c | 27 +++++++++++++++++++++++++++ include/linux/time_namespace.h | 9 +++++++++ kernel/time/namespace.c | 10 ++++++++++ 3 files changed, 46 insertions(+) (limited to 'include') diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index d2fd8a57af7d..c1b8496b5606 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -51,6 +51,7 @@ void __init init_vdso_image(const struct vdso_image *image) image->alt_len)); } +static const struct vm_special_mapping vvar_mapping; struct linux_binprm; static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, @@ -128,6 +129,32 @@ static struct page *find_timens_vvar_page(struct vm_area_struct *vma) return NULL; } + +/* + * The vvar page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_data() for details. + */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long size = vma->vm_end - vma->vm_start; + + if (vma_is_special_mapping(vma, &vvar_mapping)) + zap_page_range(vma, vma->vm_start, size); + } + + up_write(&mm->mmap_sem); + return 0; +} #else static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) { diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 6b7767f7df4a..04a2ba8b8a06 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -31,6 +31,9 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +extern int vdso_join_timens(struct task_struct *task, + struct time_namespace *ns); + static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { kref_get(&ns->kref); @@ -77,6 +80,12 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) } #else +static inline int vdso_join_timens(struct task_struct *task, + struct time_namespace *ns) +{ + return 0; +} + static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { return NULL; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index d705c15d0273..0732964803b9 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -281,6 +281,7 @@ static void timens_put(struct ns_common *ns) static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) { struct time_namespace *ns = to_time_ns(new); + int err; if (!current_is_single_threaded()) return -EUSERS; @@ -291,6 +292,10 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) timens_set_vvar_page(current, ns); + err = vdso_join_timens(current, ns); + if (err) + return err; + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; @@ -305,6 +310,7 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); + int err; /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) @@ -312,6 +318,10 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) timens_set_vvar_page(tsk, ns); + err = vdso_join_timens(tsk, ns); + if (err) + return err; + get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; -- cgit v1.2.3 From 04a8682a71becdb639ec9c0d82b315a2baef7a5d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:16 +0000 Subject: fs/proc: Introduce /proc/pid/timens_offsets API to set time namespace offsets for children processes, i.e.: echo "$clockid $offset_sec $offset_nsec" > /proc/self/timens_offsets Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-28-dima@arista.com --- fs/proc/base.c | 94 ++++++++++++++++++++++++++++++++++++++ include/linux/time_namespace.h | 10 ++++ kernel/time/namespace.c | 101 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+) (limited to 'include') diff --git a/fs/proc/base.c b/fs/proc/base.c index ebea9501afb8..5adc6390ac3a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include "internal.h" #include "fd.h" @@ -1533,6 +1534,96 @@ static const struct file_operations proc_pid_sched_autogroup_operations = { #endif /* CONFIG_SCHED_AUTOGROUP */ +#ifdef CONFIG_TIME_NS +static int timens_offsets_show(struct seq_file *m, void *v) +{ + struct task_struct *p; + + p = get_proc_task(file_inode(m->file)); + if (!p) + return -ESRCH; + proc_timens_show_offsets(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t timens_offsets_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file_inode(file); + struct proc_timens_offset offsets[2]; + char *kbuf = NULL, *pos, *next_line; + struct task_struct *p; + int ret, noffsets; + + /* Only allow < page size writes at the beginning of the file */ + if ((*ppos != 0) || (count >= PAGE_SIZE)) + return -EINVAL; + + /* Slurp in the user data */ + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + /* Parse the user data */ + ret = -EINVAL; + noffsets = 0; + for (pos = kbuf; pos; pos = next_line) { + struct proc_timens_offset *off = &offsets[noffsets]; + int err; + + /* Find the end of line and ensure we don't look past it */ + next_line = strchr(pos, '\n'); + if (next_line) { + *next_line = '\0'; + next_line++; + if (*next_line == '\0') + next_line = NULL; + } + + err = sscanf(pos, "%u %lld %lu", &off->clockid, + &off->val.tv_sec, &off->val.tv_nsec); + if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC) + goto out; + noffsets++; + if (noffsets == ARRAY_SIZE(offsets)) { + if (next_line) + count = next_line - kbuf; + break; + } + } + + ret = -ESRCH; + p = get_proc_task(inode); + if (!p) + goto out; + ret = proc_timens_set_offset(file, p, offsets, noffsets); + put_task_struct(p); + if (ret) + goto out; + + ret = count; +out: + kfree(kbuf); + return ret; +} + +static int timens_offsets_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timens_offsets_show, inode); +} + +static const struct file_operations proc_timens_offsets_operations = { + .open = timens_offsets_open, + .read = seq_read, + .write = timens_offsets_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_TIME_NS */ + static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -3015,6 +3106,9 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif +#ifdef CONFIG_TIME_NS + REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 04a2ba8b8a06..824d54e057eb 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -52,6 +52,16 @@ static inline void put_time_ns(struct time_namespace *ns) kref_put(&ns->kref, free_time_ns); } +void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m); + +struct proc_timens_offset { + int clockid; + struct timespec64 val; +}; + +int proc_timens_set_offset(struct file *file, struct task_struct *p, + struct proc_timens_offset *offsets, int n); + static inline void timens_add_monotonic(struct timespec64 *ts) { struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0732964803b9..12858507d75a 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -334,6 +335,106 @@ static struct user_namespace *timens_owner(struct ns_common *ns) return to_time_ns(ns)->user_ns; } +static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) +{ + seq_printf(m, "%d %lld %ld\n", clockid, ts->tv_sec, ts->tv_nsec); +} + +void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + + ns = timens_for_children_get(p); + if (!ns) + return; + time_ns = to_time_ns(ns); + + show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); + show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); + put_time_ns(time_ns); +} + +int proc_timens_set_offset(struct file *file, struct task_struct *p, + struct proc_timens_offset *offsets, int noffsets) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + struct timespec64 tp; + int i, err; + + ns = timens_for_children_get(p); + if (!ns) + return -ESRCH; + time_ns = to_time_ns(ns); + + if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { + put_time_ns(time_ns); + return -EPERM; + } + + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + ktime_get_ts64(&tp); + break; + case CLOCK_BOOTTIME: + ktime_get_boottime_ts64(&tp); + break; + default: + err = -EINVAL; + goto out; + } + + err = -ERANGE; + + if (off->val.tv_sec > KTIME_SEC_MAX || + off->val.tv_sec < -KTIME_SEC_MAX) + goto out; + + tp = timespec64_add(tp, off->val); + /* + * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is + * still unreachable. + */ + if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) + goto out; + } + + mutex_lock(&offset_lock); + if (time_ns->frozen_offsets) { + err = -EACCES; + goto out_unlock; + } + + err = 0; + /* Don't report errors after this line */ + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + struct timespec64 *offset = NULL; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + offset = &time_ns->offsets.monotonic; + break; + case CLOCK_BOOTTIME: + offset = &time_ns->offsets.boottime; + break; + } + + *offset = off->val; + } + +out_unlock: + mutex_unlock(&offset_lock); +out: + put_time_ns(time_ns); + + return err; +} + const struct proc_ns_operations timens_operations = { .name = "time", .type = CLONE_NEWTIME, -- cgit v1.2.3 From 0af3e137c144377fbaf5025ba784ff5ba7ad40c9 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Thu, 9 Jan 2020 17:06:49 +0100 Subject: clocksource/drivers/hyper-v: Untangle stimers and timesync from clocksources hyperv_timer.c exports hyperv_cs, which is used by stimers and the timesync mechanism. However, the clocksource dependency is not needed: these mechanisms only depend on the partition reference counter (which can be read via a MSR or via the TSC Reference Page). Introduce the (function) pointer hv_read_reference_counter, as an embodiment of the partition reference counter read, and export it in place of the hyperv_cs pointer. The latter can be removed. This should clarify that there's no relationship between Hyper-V stimers & timesync and the Linux clocksource abstractions. No functional or semantic change. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri Reviewed-by: Michael Kelley Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20200109160650.16150-2-parri.andrea@gmail.com --- drivers/clocksource/hyperv_timer.c | 36 +++++++++++++++++++++++------------- drivers/hv/hv_util.c | 8 ++++---- include/clocksource/hyperv_timer.h | 2 +- 3 files changed, 28 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 12d75b50a317..42748adccc98 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -66,7 +66,7 @@ static int hv_ce_set_next_event(unsigned long delta, { u64 current_tick; - current_tick = hyperv_cs->read(NULL); + current_tick = hv_read_reference_counter(); current_tick += delta; hv_init_timer(0, current_tick); return 0; @@ -304,8 +304,8 @@ EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); * Hyper-V and 32-bit x86. The TSC reference page version is preferred. */ -struct clocksource *hyperv_cs; -EXPORT_SYMBOL_GPL(hyperv_cs); +u64 (*hv_read_reference_counter)(void); +EXPORT_SYMBOL_GPL(hv_read_reference_counter); static union { struct ms_hyperv_tsc_page page; @@ -318,7 +318,7 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void) } EXPORT_SYMBOL_GPL(hv_get_tsc_page); -static u64 notrace read_hv_clock_tsc(struct clocksource *arg) +static u64 notrace read_hv_clock_tsc(void) { u64 current_tick = hv_read_tsc_page(hv_get_tsc_page()); @@ -328,9 +328,14 @@ static u64 notrace read_hv_clock_tsc(struct clocksource *arg) return current_tick; } +static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) +{ + return read_hv_clock_tsc(); +} + static u64 read_hv_sched_clock_tsc(void) { - return read_hv_clock_tsc(NULL) - hv_sched_clock_offset; + return read_hv_clock_tsc() - hv_sched_clock_offset; } static void suspend_hv_clock_tsc(struct clocksource *arg) @@ -359,14 +364,14 @@ static void resume_hv_clock_tsc(struct clocksource *arg) static struct clocksource hyperv_cs_tsc = { .name = "hyperv_clocksource_tsc_page", .rating = 400, - .read = read_hv_clock_tsc, + .read = read_hv_clock_tsc_cs, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, .suspend= suspend_hv_clock_tsc, .resume = resume_hv_clock_tsc, }; -static u64 notrace read_hv_clock_msr(struct clocksource *arg) +static u64 notrace read_hv_clock_msr(void) { u64 current_tick; /* @@ -378,15 +383,20 @@ static u64 notrace read_hv_clock_msr(struct clocksource *arg) return current_tick; } +static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) +{ + return read_hv_clock_msr(); +} + static u64 read_hv_sched_clock_msr(void) { - return read_hv_clock_msr(NULL) - hv_sched_clock_offset; + return read_hv_clock_msr() - hv_sched_clock_offset; } static struct clocksource hyperv_cs_msr = { .name = "hyperv_clocksource_msr", .rating = 400, - .read = read_hv_clock_msr, + .read = read_hv_clock_msr_cs, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -399,7 +409,7 @@ static bool __init hv_init_tsc_clocksource(void) if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) return false; - hyperv_cs = &hyperv_cs_tsc; + hv_read_reference_counter = read_hv_clock_tsc; phys_addr = virt_to_phys(hv_get_tsc_page()); /* @@ -417,7 +427,7 @@ static bool __init hv_init_tsc_clocksource(void) hv_set_clocksource_vdso(hyperv_cs_tsc); clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); - hv_sched_clock_offset = hyperv_cs->read(hyperv_cs); + hv_sched_clock_offset = hv_read_reference_counter(); hv_setup_sched_clock(read_hv_sched_clock_tsc); return true; @@ -439,10 +449,10 @@ void __init hv_init_clocksource(void) if (!(ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE)) return; - hyperv_cs = &hyperv_cs_msr; + hv_read_reference_counter = read_hv_clock_msr; clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); - hv_sched_clock_offset = hyperv_cs->read(hyperv_cs); + hv_sched_clock_offset = hv_read_reference_counter(); hv_setup_sched_clock(read_hv_sched_clock_msr); } EXPORT_SYMBOL_GPL(hv_init_clocksource); diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 766bd8457346..296f9098c9e4 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -211,7 +211,7 @@ static struct timespec64 hv_get_adj_host_time(void) unsigned long flags; spin_lock_irqsave(&host_ts.lock, flags); - reftime = hyperv_cs->read(hyperv_cs); + reftime = hv_read_reference_counter(); newtime = host_ts.host_time + (reftime - host_ts.ref_time); ts = ns_to_timespec64((newtime - WLTIMEDELTA) * 100); spin_unlock_irqrestore(&host_ts.lock, flags); @@ -250,7 +250,7 @@ static inline void adj_guesttime(u64 hosttime, u64 reftime, u8 adj_flags) */ spin_lock_irqsave(&host_ts.lock, flags); - cur_reftime = hyperv_cs->read(hyperv_cs); + cur_reftime = hv_read_reference_counter(); host_ts.host_time = hosttime; host_ts.ref_time = cur_reftime; @@ -315,7 +315,7 @@ static void timesync_onchannelcallback(void *context) sizeof(struct vmbuspipe_hdr) + sizeof(struct icmsg_hdr)]; adj_guesttime(timedatap->parenttime, - hyperv_cs->read(hyperv_cs), + hv_read_reference_counter(), timedatap->flags); } } @@ -524,7 +524,7 @@ static struct ptp_clock *hv_ptp_clock; static int hv_timesync_init(struct hv_util_service *srv) { /* TimeSync requires Hyper-V clocksource. */ - if (!hyperv_cs) + if (!hv_read_reference_counter) return -ENODEV; spin_lock_init(&host_ts.lock); diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h index 553e539469f0..34eef083c988 100644 --- a/include/clocksource/hyperv_timer.h +++ b/include/clocksource/hyperv_timer.h @@ -30,7 +30,7 @@ extern void hv_stimer_global_cleanup(void); extern void hv_stimer0_isr(void); #ifdef CONFIG_HYPERV_TIMER -extern struct clocksource *hyperv_cs; +extern u64 (*hv_read_reference_counter)(void); extern void hv_init_clocksource(void); extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); -- cgit v1.2.3 From fd928f3e32ba09381b287f8b732418434d932855 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 23 Jan 2020 21:58:48 -0800 Subject: alarmtimer: Make alarmtimer_get_rtcdev() a stub when CONFIG_RTC_CLASS=n The stubbed version of alarmtimer_get_rtcdev() is not exported. so this won't work if this function is used in a module when CONFIG_RTC_CLASS=n. Move the stub function to the header file and make it inline so that callers don't have to worry about linking against this symbol. rtcdev isn't used outside of this ifdef so it's not required to be redefined to NULL. Drop that while touching this area. Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200124055849.154411-4-swboyd@chromium.org --- include/linux/alarmtimer.h | 4 ++++ kernel/time/alarmtimer.c | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 74748e306f4b..05e758b8b894 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -60,7 +60,11 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval); u64 alarm_forward_now(struct alarm *alarm, ktime_t interval); ktime_t alarm_expires_remaining(const struct alarm *alarm); +#ifdef CONFIG_RTC_CLASS /* Provide way to access the rtc device being used by alarmtimers */ struct rtc_device *alarmtimer_get_rtcdev(void); +#else +static inline struct rtc_device *alarmtimer_get_rtcdev(void) { return NULL; } +#endif #endif diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 685ff57a1d87..2ffb466af77e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -143,11 +143,6 @@ static void alarmtimer_rtc_interface_remove(void) class_interface_unregister(&alarmtimer_rtc_interface); } #else -struct rtc_device *alarmtimer_get_rtcdev(void) -{ - return NULL; -} -#define rtcdev (NULL) static inline int alarmtimer_rtc_interface_setup(void) { return 0; } static inline void alarmtimer_rtc_interface_remove(void) { } static inline void alarmtimer_rtc_timer_init(void) { } -- cgit v1.2.3