summaryrefslogtreecommitdiffstats
path: root/include/linux
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-07-01 10:49:25 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-07-01 10:49:25 -0700
commit02201e3f1b46aed7c6348f406b7b40de80ba6de3 (patch)
tree2392c9098359725c195dd82a72b20ccedc1a1509 /include/linux
parent0890a264794f33df540fbaf274699146903b4e6b (diff)
parent20bdc2cfdbc484777b30b96fcdbb8994038f3ce1 (diff)
downloadblackbird-op-linux-02201e3f1b46aed7c6348f406b7b40de80ba6de3.tar.gz
blackbird-op-linux-02201e3f1b46aed7c6348f406b7b40de80ba6de3.zip
Merge tag 'modules-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux
Pull module updates from Rusty Russell: "Main excitement here is Peter Zijlstra's lockless rbtree optimization to speed module address lookup. He found some abusers of the module lock doing that too. A little bit of parameter work here too; including Dan Streetman's breaking up the big param mutex so writing a parameter can load another module (yeah, really). Unfortunately that broke the usual suspects, !CONFIG_MODULES and !CONFIG_SYSFS, so those fixes were appended too" * tag 'modules-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (26 commits) modules: only use mod->param_lock if CONFIG_MODULES param: fix module param locks when !CONFIG_SYSFS. rcu: merge fix for Convert ACCESS_ONCE() to READ_ONCE() and WRITE_ONCE() module: add per-module param_lock module: make perm const params: suppress unused variable error, warn once just in case code changes. modules: clarify CONFIG_MODULE_COMPRESS help, suggest 'N'. kernel/module.c: avoid ifdefs for sig_enforce declaration kernel/workqueue.c: remove ifdefs over wq_power_efficient kernel/params.c: export param_ops_bool_enable_only kernel/params.c: generalize bool_enable_only kernel/module.c: use generic module param operaters for sig_enforce kernel/params: constify struct kernel_param_ops uses sysfs: tightened sysfs permission checks module: Rework module_addr_{min,max} module: Use __module_address() for module_address_lookup() module: Make the mod_tree stuff conditional on PERF_EVENTS || TRACING module: Optimize __module_address() using a latched RB-tree rbtree: Implement generic latch_tree seqlock: Introduce raw_read_seqcount_latch() ...
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/compiler.h15
-rw-r--r--include/linux/kernel.h18
-rw-r--r--include/linux/module.h46
-rw-r--r--include/linux/moduleparam.h99
-rw-r--r--include/linux/rbtree.h16
-rw-r--r--include/linux/rbtree_augmented.h21
-rw-r--r--include/linux/rbtree_latch.h212
-rw-r--r--include/linux/rcupdate.h15
-rw-r--r--include/linux/seqlock.h81
9 files changed, 416 insertions, 107 deletions
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 26fc8bc77f85..7f8ad9593da7 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -475,6 +475,21 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
(volatile typeof(x) *)&(x); })
#define ACCESS_ONCE(x) (*__ACCESS_ONCE(x))
+/**
+ * lockless_dereference() - safely load a pointer for later dereference
+ * @p: The pointer to load
+ *
+ * Similar to rcu_dereference(), but for situations where the pointed-to
+ * object's lifetime is managed by something other than RCU. That
+ * "something other" might be reference counting or simple immortality.
+ */
+#define lockless_dereference(p) \
+({ \
+ typeof(p) _________p1 = READ_ONCE(p); \
+ smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
+ (_________p1); \
+})
+
/* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */
#ifdef CONFIG_KPROBES
# define __kprobes __attribute__((__section__(".kprobes.text")))
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5acf5b70866d..cfa9351c7536 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -813,13 +813,15 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
#endif
/* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */
-#define VERIFY_OCTAL_PERMISSIONS(perms) \
- (BUILD_BUG_ON_ZERO((perms) < 0) + \
- BUILD_BUG_ON_ZERO((perms) > 0777) + \
- /* User perms >= group perms >= other perms */ \
- BUILD_BUG_ON_ZERO(((perms) >> 6) < (((perms) >> 3) & 7)) + \
- BUILD_BUG_ON_ZERO((((perms) >> 3) & 7) < ((perms) & 7)) + \
- /* Other writable? Generally considered a bad idea. */ \
- BUILD_BUG_ON_ZERO((perms) & 2) + \
+#define VERIFY_OCTAL_PERMISSIONS(perms) \
+ (BUILD_BUG_ON_ZERO((perms) < 0) + \
+ BUILD_BUG_ON_ZERO((perms) > 0777) + \
+ /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */ \
+ BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) + \
+ BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) + \
+ /* USER_WRITABLE >= GROUP_WRITABLE */ \
+ BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) + \
+ /* OTHER_WRITABLE? Generally considered a bad idea. */ \
+ BUILD_BUG_ON_ZERO((perms) & 2) + \
(perms))
#endif
diff --git a/include/linux/module.h b/include/linux/module.h
index 7ffe0851d244..d67b1932cc59 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -17,6 +17,7 @@
#include <linux/moduleparam.h>
#include <linux/jump_label.h>
#include <linux/export.h>
+#include <linux/rbtree_latch.h>
#include <linux/percpu.h>
#include <asm/module.h>
@@ -210,6 +211,13 @@ enum module_state {
MODULE_STATE_UNFORMED, /* Still setting it up. */
};
+struct module;
+
+struct mod_tree_node {
+ struct module *mod;
+ struct latch_tree_node node;
+};
+
struct module {
enum module_state state;
@@ -232,6 +240,9 @@ struct module {
unsigned int num_syms;
/* Kernel parameters. */
+#ifdef CONFIG_SYSFS
+ struct mutex param_lock;
+#endif
struct kernel_param *kp;
unsigned int num_kp;
@@ -271,8 +282,15 @@ struct module {
/* Startup function. */
int (*init)(void);
- /* If this is non-NULL, vfree after init() returns */
- void *module_init;
+ /*
+ * If this is non-NULL, vfree() after init() returns.
+ *
+ * Cacheline align here, such that:
+ * module_init, module_core, init_size, core_size,
+ * init_text_size, core_text_size and mtn_core::{mod,node[0]}
+ * are on the same cacheline.
+ */
+ void *module_init ____cacheline_aligned;
/* Here is the actual code + data, vfree'd on unload. */
void *module_core;
@@ -283,6 +301,16 @@ struct module {
/* The size of the executable code in each section. */
unsigned int init_text_size, core_text_size;
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+ /*
+ * We want mtn_core::{mod,node[0]} to be in the same cacheline as the
+ * above entries such that a regular lookup will only touch one
+ * cacheline.
+ */
+ struct mod_tree_node mtn_core;
+ struct mod_tree_node mtn_init;
+#endif
+
/* Size of RO sections of the module (text+rodata) */
unsigned int init_ro_size, core_ro_size;
@@ -369,7 +397,7 @@ struct module {
ctor_fn_t *ctors;
unsigned int num_ctors;
#endif
-};
+} ____cacheline_aligned;
#ifndef MODULE_ARCH_INIT
#define MODULE_ARCH_INIT {}
#endif
@@ -423,14 +451,22 @@ struct symsearch {
bool unused;
};
-/* Search for an exported symbol by name. */
+/*
+ * Search for an exported symbol by name.
+ *
+ * Must be called with module_mutex held or preemption disabled.
+ */
const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
const unsigned long **crc,
bool gplok,
bool warn);
-/* Walk the exported symbol table */
+/*
+ * Walk the exported symbol table
+ *
+ * Must be called with module_mutex held or preemption disabled.
+ */
bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
struct module *owner,
void *data), void *data);
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 6480dcaca275..c12f2147c350 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -67,8 +67,9 @@ enum {
struct kernel_param {
const char *name;
+ struct module *mod;
const struct kernel_param_ops *ops;
- u16 perm;
+ const u16 perm;
s8 level;
u8 flags;
union {
@@ -108,7 +109,7 @@ struct kparam_array
*
* @perm is 0 if the the variable is not to appear in sysfs, or 0444
* for world-readable, 0644 for root-writable, etc. Note that if it
- * is writable, you may need to use kparam_block_sysfs_write() around
+ * is writable, you may need to use kernel_param_lock() around
* accesses (esp. charp, which can be kfreed when it changes).
*
* The @type is simply pasted to refer to a param_ops_##type and a
@@ -216,16 +217,16 @@ struct kparam_array
parameters. */
#define __module_param_call(prefix, name, ops, arg, perm, level, flags) \
/* Default value instead of permissions? */ \
- static const char __param_str_##name[] = prefix #name; \
+ static const char __param_str_##name[] = prefix #name; \
static struct kernel_param __moduleparam_const __param_##name \
__used \
__attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \
- = { __param_str_##name, ops, VERIFY_OCTAL_PERMISSIONS(perm), \
- level, flags, { arg } }
+ = { __param_str_##name, THIS_MODULE, ops, \
+ VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } }
/* Obsolete - use module_param_cb() */
#define module_param_call(name, set, get, arg, perm) \
- static struct kernel_param_ops __param_ops_##name = \
+ static const struct kernel_param_ops __param_ops_##name = \
{ .flags = 0, (void *)set, (void *)get }; \
__module_param_call(MODULE_PARAM_PREFIX, \
name, &__param_ops_##name, arg, \
@@ -238,58 +239,14 @@ __check_old_set_param(int (*oldset)(const char *, struct kernel_param *))
return 0;
}
-/**
- * kparam_block_sysfs_write - make sure a parameter isn't written via sysfs.
- * @name: the name of the parameter
- *
- * There's no point blocking write on a paramter that isn't writable via sysfs!
- */
-#define kparam_block_sysfs_write(name) \
- do { \
- BUG_ON(!(__param_##name.perm & 0222)); \
- __kernel_param_lock(); \
- } while (0)
-
-/**
- * kparam_unblock_sysfs_write - allows sysfs to write to a parameter again.
- * @name: the name of the parameter
- */
-#define kparam_unblock_sysfs_write(name) \
- do { \
- BUG_ON(!(__param_##name.perm & 0222)); \
- __kernel_param_unlock(); \
- } while (0)
-
-/**
- * kparam_block_sysfs_read - make sure a parameter isn't read via sysfs.
- * @name: the name of the parameter
- *
- * This also blocks sysfs writes.
- */
-#define kparam_block_sysfs_read(name) \
- do { \
- BUG_ON(!(__param_##name.perm & 0444)); \
- __kernel_param_lock(); \
- } while (0)
-
-/**
- * kparam_unblock_sysfs_read - allows sysfs to read a parameter again.
- * @name: the name of the parameter
- */
-#define kparam_unblock_sysfs_read(name) \
- do { \
- BUG_ON(!(__param_##name.perm & 0444)); \
- __kernel_param_unlock(); \
- } while (0)
-
#ifdef CONFIG_SYSFS
-extern void __kernel_param_lock(void);
-extern void __kernel_param_unlock(void);
+extern void kernel_param_lock(struct module *mod);
+extern void kernel_param_unlock(struct module *mod);
#else
-static inline void __kernel_param_lock(void)
+static inline void kernel_param_lock(struct module *mod)
{
}
-static inline void __kernel_param_unlock(void)
+static inline void kernel_param_unlock(struct module *mod)
{
}
#endif
@@ -386,64 +343,70 @@ static inline void destroy_params(const struct kernel_param *params,
#define __param_check(name, p, type) \
static inline type __always_unused *__check_##name(void) { return(p); }
-extern struct kernel_param_ops param_ops_byte;
+extern const struct kernel_param_ops param_ops_byte;
extern int param_set_byte(const char *val, const struct kernel_param *kp);
extern int param_get_byte(char *buffer, const struct kernel_param *kp);
#define param_check_byte(name, p) __param_check(name, p, unsigned char)
-extern struct kernel_param_ops param_ops_short;
+extern const struct kernel_param_ops param_ops_short;
extern int param_set_short(const char *val, const struct kernel_param *kp);
extern int param_get_short(char *buffer, const struct kernel_param *kp);
#define param_check_short(name, p) __param_check(name, p, short)
-extern struct kernel_param_ops param_ops_ushort;
+extern const struct kernel_param_ops param_ops_ushort;
extern int param_set_ushort(const char *val, const struct kernel_param *kp);
extern int param_get_ushort(char *buffer, const struct kernel_param *kp);
#define param_check_ushort(name, p) __param_check(name, p, unsigned short)
-extern struct kernel_param_ops param_ops_int;
+extern const struct kernel_param_ops param_ops_int;
extern int param_set_int(const char *val, const struct kernel_param *kp);
extern int param_get_int(char *buffer, const struct kernel_param *kp);
#define param_check_int(name, p) __param_check(name, p, int)
-extern struct kernel_param_ops param_ops_uint;
+extern const struct kernel_param_ops param_ops_uint;
extern int param_set_uint(const char *val, const struct kernel_param *kp);
extern int param_get_uint(char *buffer, const struct kernel_param *kp);
#define param_check_uint(name, p) __param_check(name, p, unsigned int)
-extern struct kernel_param_ops param_ops_long;
+extern const struct kernel_param_ops param_ops_long;
extern int param_set_long(const char *val, const struct kernel_param *kp);
extern int param_get_long(char *buffer, const struct kernel_param *kp);
#define param_check_long(name, p) __param_check(name, p, long)
-extern struct kernel_param_ops param_ops_ulong;
+extern const struct kernel_param_ops param_ops_ulong;
extern int param_set_ulong(const char *val, const struct kernel_param *kp);
extern int param_get_ulong(char *buffer, const struct kernel_param *kp);
#define param_check_ulong(name, p) __param_check(name, p, unsigned long)
-extern struct kernel_param_ops param_ops_ullong;
+extern const struct kernel_param_ops param_ops_ullong;
extern int param_set_ullong(const char *val, const struct kernel_param *kp);
extern int param_get_ullong(char *buffer, const struct kernel_param *kp);
#define param_check_ullong(name, p) __param_check(name, p, unsigned long long)
-extern struct kernel_param_ops param_ops_charp;
+extern const struct kernel_param_ops param_ops_charp;
extern int param_set_charp(const char *val, const struct kernel_param *kp);
extern int param_get_charp(char *buffer, const struct kernel_param *kp);
#define param_check_charp(name, p) __param_check(name, p, char *)
/* We used to allow int as well as bool. We're taking that away! */
-extern struct kernel_param_ops param_ops_bool;
+extern const struct kernel_param_ops param_ops_bool;
extern int param_set_bool(const char *val, const struct kernel_param *kp);
extern int param_get_bool(char *buffer, const struct kernel_param *kp);
#define param_check_bool(name, p) __param_check(name, p, bool)
-extern struct kernel_param_ops param_ops_invbool;
+extern const struct kernel_param_ops param_ops_bool_enable_only;
+extern int param_set_bool_enable_only(const char *val,
+ const struct kernel_param *kp);
+/* getter is the same as for the regular bool */
+#define param_check_bool_enable_only param_check_bool
+
+extern const struct kernel_param_ops param_ops_invbool;
extern int param_set_invbool(const char *val, const struct kernel_param *kp);
extern int param_get_invbool(char *buffer, const struct kernel_param *kp);
#define param_check_invbool(name, p) __param_check(name, p, bool)
/* An int, which can only be set like a bool (though it shows as an int). */
-extern struct kernel_param_ops param_ops_bint;
+extern const struct kernel_param_ops param_ops_bint;
extern int param_set_bint(const char *val, const struct kernel_param *kp);
#define param_get_bint param_get_int
#define param_check_bint param_check_int
@@ -487,9 +450,9 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp);
perm, -1, 0); \
__MODULE_PARM_TYPE(name, "array of " #type)
-extern struct kernel_param_ops param_array_ops;
+extern const struct kernel_param_ops param_array_ops;
-extern struct kernel_param_ops param_ops_string;
+extern const struct kernel_param_ops param_ops_string;
extern int param_set_copystring(const char *val, const struct kernel_param *);
extern int param_get_string(char *buffer, const struct kernel_param *kp);
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index fb31765e935a..830c4992088d 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -31,6 +31,7 @@
#include <linux/kernel.h>
#include <linux/stddef.h>
+#include <linux/rcupdate.h>
struct rb_node {
unsigned long __rb_parent_color;
@@ -73,11 +74,11 @@ extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);
/* Fast replacement of a single node without remove/rebalance/add/rebalance */
-extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
-static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
- struct rb_node ** rb_link)
+static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
+ struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
@@ -85,6 +86,15 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
*rb_link = node;
}
+static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
+ struct rb_node **rb_link)
+{
+ node->__rb_parent_color = (unsigned long)parent;
+ node->rb_left = node->rb_right = NULL;
+
+ rcu_assign_pointer(*rb_link, node);
+}
+
#define rb_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \
____ptr ? rb_entry(____ptr, type, member) : NULL; \
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 378c5ee75f78..14d7b831b63a 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -123,11 +123,11 @@ __rb_change_child(struct rb_node *old, struct rb_node *new,
{
if (parent) {
if (parent->rb_left == old)
- parent->rb_left = new;
+ WRITE_ONCE(parent->rb_left, new);
else
- parent->rb_right = new;
+ WRITE_ONCE(parent->rb_right, new);
} else
- root->rb_node = new;
+ WRITE_ONCE(root->rb_node, new);
}
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
@@ -137,7 +137,8 @@ static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
- struct rb_node *child = node->rb_right, *tmp = node->rb_left;
+ struct rb_node *child = node->rb_right;
+ struct rb_node *tmp = node->rb_left;
struct rb_node *parent, *rebalance;
unsigned long pc;
@@ -167,6 +168,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
tmp = parent;
} else {
struct rb_node *successor = child, *child2;
+
tmp = child->rb_left;
if (!tmp) {
/*
@@ -180,6 +182,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
*/
parent = successor;
child2 = successor->rb_right;
+
augment->copy(node, successor);
} else {
/*
@@ -201,19 +204,23 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
successor = tmp;
tmp = tmp->rb_left;
} while (tmp);
- parent->rb_left = child2 = successor->rb_right;
- successor->rb_right = child;
+ child2 = successor->rb_right;
+ WRITE_ONCE(parent->rb_left, child2);
+ WRITE_ONCE(successor->rb_right, child);
rb_set_parent(child, successor);
+
augment->copy(node, successor);
augment->propagate(parent, successor);
}
- successor->rb_left = tmp = node->rb_left;
+ tmp = node->rb_left;
+ WRITE_ONCE(successor->rb_left, tmp);
rb_set_parent(tmp, successor);
pc = node->__rb_parent_color;
tmp = __rb_parent(pc);
__rb_change_child(node, successor, tmp, root);
+
if (child2) {
successor->__rb_parent_color = pc;
rb_set_parent_color(child2, parent, RB_BLACK);
diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h
new file mode 100644
index 000000000000..4f3432c61d12
--- /dev/null
+++ b/include/linux/rbtree_latch.h
@@ -0,0 +1,212 @@
+/*
+ * Latched RB-trees
+ *
+ * Copyright (C) 2015 Intel Corp., Peter Zijlstra <peterz@infradead.org>
+ *
+ * Since RB-trees have non-atomic modifications they're not immediately suited
+ * for RCU/lockless queries. Even though we made RB-tree lookups non-fatal for
+ * lockless lookups; we cannot guarantee they return a correct result.
+ *
+ * The simplest solution is a seqlock + RB-tree, this will allow lockless
+ * lookups; but has the constraint (inherent to the seqlock) that read sides
+ * cannot nest in write sides.
+ *
+ * If we need to allow unconditional lookups (say as required for NMI context
+ * usage) we need a more complex setup; this data structure provides this by
+ * employing the latch technique -- see @raw_write_seqcount_latch -- to
+ * implement a latched RB-tree which does allow for unconditional lookups by
+ * virtue of always having (at least) one stable copy of the tree.
+ *
+ * However, while we have the guarantee that there is at all times one stable
+ * copy, this does not guarantee an iteration will not observe modifications.
+ * What might have been a stable copy at the start of the iteration, need not
+ * remain so for the duration of the iteration.
+ *
+ * Therefore, this does require a lockless RB-tree iteration to be non-fatal;
+ * see the comment in lib/rbtree.c. Note however that we only require the first
+ * condition -- not seeing partial stores -- because the latch thing isolates
+ * us from loops. If we were to interrupt a modification the lookup would be
+ * pointed at the stable tree and complete while the modification was halted.
+ */
+
+#ifndef RB_TREE_LATCH_H
+#define RB_TREE_LATCH_H
+
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+
+struct latch_tree_node {
+ struct rb_node node[2];
+};
+
+struct latch_tree_root {
+ seqcount_t seq;
+ struct rb_root tree[2];
+};
+
+/**
+ * latch_tree_ops - operators to define the tree order
+ * @less: used for insertion; provides the (partial) order between two elements.
+ * @comp: used for lookups; provides the order between the search key and an element.
+ *
+ * The operators are related like:
+ *
+ * comp(a->key,b) < 0 := less(a,b)
+ * comp(a->key,b) > 0 := less(b,a)
+ * comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
+ *
+ * If these operators define a partial order on the elements we make no
+ * guarantee on which of the elements matching the key is found. See
+ * latch_tree_find().
+ */
+struct latch_tree_ops {
+ bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b);
+ int (*comp)(void *key, struct latch_tree_node *b);
+};
+
+static __always_inline struct latch_tree_node *
+__lt_from_rb(struct rb_node *node, int idx)
+{
+ return container_of(node, struct latch_tree_node, node[idx]);
+}
+
+static __always_inline void
+__lt_insert(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx,
+ bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b))
+{
+ struct rb_root *root = &ltr->tree[idx];
+ struct rb_node **link = &root->rb_node;
+ struct rb_node *node = &ltn->node[idx];
+ struct rb_node *parent = NULL;
+ struct latch_tree_node *ltp;
+
+ while (*link) {
+ parent = *link;
+ ltp = __lt_from_rb(parent, idx);
+
+ if (less(ltn, ltp))
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+
+ rb_link_node_rcu(node, parent, link);
+ rb_insert_color(node, root);
+}
+
+static __always_inline void
+__lt_erase(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx)
+{
+ rb_erase(&ltn->node[idx], &ltr->tree[idx]);
+}
+
+static __always_inline struct latch_tree_node *
+__lt_find(void *key, struct latch_tree_root *ltr, int idx,
+ int (*comp)(void *key, struct latch_tree_node *node))
+{
+ struct rb_node *node = rcu_dereference_raw(ltr->tree[idx].rb_node);
+ struct latch_tree_node *ltn;
+ int c;
+
+ while (node) {
+ ltn = __lt_from_rb(node, idx);
+ c = comp(key, ltn);
+
+ if (c < 0)
+ node = rcu_dereference_raw(node->rb_left);
+ else if (c > 0)
+ node = rcu_dereference_raw(node->rb_right);
+ else
+ return ltn;
+ }
+
+ return NULL;
+}
+
+/**
+ * latch_tree_insert() - insert @node into the trees @root
+ * @node: nodes to insert
+ * @root: trees to insert @node into
+ * @ops: operators defining the node order
+ *
+ * It inserts @node into @root in an ordered fashion such that we can always
+ * observe one complete tree. See the comment for raw_write_seqcount_latch().
+ *
+ * The inserts use rcu_assign_pointer() to publish the element such that the
+ * tree structure is stored before we can observe the new @node.
+ *
+ * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be
+ * serialized.
+ */
+static __always_inline void
+latch_tree_insert(struct latch_tree_node *node,
+ struct latch_tree_root *root,
+ const struct latch_tree_ops *ops)
+{
+ raw_write_seqcount_latch(&root->seq);
+ __lt_insert(node, root, 0, ops->less);
+ raw_write_seqcount_latch(&root->seq);
+ __lt_insert(node, root, 1, ops->less);
+}
+
+/**
+ * latch_tree_erase() - removes @node from the trees @root
+ * @node: nodes to remote
+ * @root: trees to remove @node from
+ * @ops: operators defining the node order
+ *
+ * Removes @node from the trees @root in an ordered fashion such that we can
+ * always observe one complete tree. See the comment for
+ * raw_write_seqcount_latch().
+ *
+ * It is assumed that @node will observe one RCU quiescent state before being
+ * reused of freed.
+ *
+ * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be
+ * serialized.
+ */
+static __always_inline void
+latch_tree_erase(struct latch_tree_node *node,
+ struct latch_tree_root *root,
+ const struct latch_tree_ops *ops)
+{
+ raw_write_seqcount_latch(&root->seq);
+ __lt_erase(node, root, 0);
+ raw_write_seqcount_latch(&root->seq);
+ __lt_erase(node, root, 1);
+}
+
+/**
+ * latch_tree_find() - find the node matching @key in the trees @root
+ * @key: search key
+ * @root: trees to search for @key
+ * @ops: operators defining the node order
+ *
+ * Does a lockless lookup in the trees @root for the node matching @key.
+ *
+ * It is assumed that this is called while holding the appropriate RCU read
+ * side lock.
+ *
+ * If the operators define a partial order on the elements (there are multiple
+ * elements which have the same key value) it is undefined which of these
+ * elements will be found. Nor is it possible to iterate the tree to find
+ * further elements with the same key value.
+ *
+ * Returns: a pointer to the node matching @key or NULL.
+ */
+static __always_inline struct latch_tree_node *
+latch_tree_find(void *key, struct latch_tree_root *root,
+ const struct latch_tree_ops *ops)
+{
+ struct latch_tree_node *node;
+ unsigned int seq;
+
+ do {
+ seq = raw_read_seqcount_latch(&root->seq);
+ node = __lt_find(key, root, seq & 1, ops->comp);
+ } while (read_seqcount_retry(&root->seq, seq));
+
+ return node;
+}
+
+#endif /* RB_TREE_LATCH_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 33a056bb886f..4cf5f51b4c9c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -633,21 +633,6 @@ static inline void rcu_preempt_sleep_check(void)
#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
/**
- * lockless_dereference() - safely load a pointer for later dereference
- * @p: The pointer to load
- *
- * Similar to rcu_dereference(), but for situations where the pointed-to
- * object's lifetime is managed by something other than RCU. That
- * "something other" might be reference counting or simple immortality.
- */
-#define lockless_dereference(p) \
-({ \
- typeof(p) _________p1 = READ_ONCE(p); \
- smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
- (_________p1); \
-})
-
-/**
* rcu_assign_pointer() - assign to RCU-protected pointer
* @p: pointer to assign to
* @v: value to assign (publish)
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 486e685a226a..e0582106ef4f 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -35,6 +35,7 @@
#include <linux/spinlock.h>
#include <linux/preempt.h>
#include <linux/lockdep.h>
+#include <linux/compiler.h>
#include <asm/processor.h>
/*
@@ -274,9 +275,87 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s)
s->sequence++;
}
-/*
+static inline int raw_read_seqcount_latch(seqcount_t *s)
+{
+ return lockless_dereference(s->sequence);
+}
+
+/**
* raw_write_seqcount_latch - redirect readers to even/odd copy
* @s: pointer to seqcount_t
+ *
+ * The latch technique is a multiversion concurrency control method that allows
+ * queries during non-atomic modifications. If you can guarantee queries never
+ * interrupt the modification -- e.g. the concurrency is strictly between CPUs
+ * -- you most likely do not need this.
+ *
+ * Where the traditional RCU/lockless data structures rely on atomic
+ * modifications to ensure queries observe either the old or the new state the
+ * latch allows the same for non-atomic updates. The trade-off is doubling the
+ * cost of storage; we have to maintain two copies of the entire data
+ * structure.
+ *
+ * Very simply put: we first modify one copy and then the other. This ensures
+ * there is always one copy in a stable state, ready to give us an answer.
+ *
+ * The basic form is a data structure like:
+ *
+ * struct latch_struct {
+ * seqcount_t seq;
+ * struct data_struct data[2];
+ * };
+ *
+ * Where a modification, which is assumed to be externally serialized, does the
+ * following:
+ *
+ * void latch_modify(struct latch_struct *latch, ...)
+ * {
+ * smp_wmb(); <- Ensure that the last data[1] update is visible
+ * latch->seq++;
+ * smp_wmb(); <- Ensure that the seqcount update is visible
+ *
+ * modify(latch->data[0], ...);
+ *
+ * smp_wmb(); <- Ensure that the data[0] update is visible
+ * latch->seq++;
+ * smp_wmb(); <- Ensure that the seqcount update is visible
+ *
+ * modify(latch->data[1], ...);
+ * }
+ *
+ * The query will have a form like:
+ *
+ * struct entry *latch_query(struct latch_struct *latch, ...)
+ * {
+ * struct entry *entry;
+ * unsigned seq, idx;
+ *
+ * do {
+ * seq = lockless_dereference(latch->seq);
+ *
+ * idx = seq & 0x01;
+ * entry = data_query(latch->data[idx], ...);
+ *
+ * smp_rmb();
+ * } while (seq != latch->seq);
+ *
+ * return entry;
+ * }
+ *
+ * So during the modification, queries are first redirected to data[1]. Then we
+ * modify data[0]. When that is complete, we redirect queries back to data[0]
+ * and we can modify data[1].
+ *
+ * NOTE: The non-requirement for atomic modifications does _NOT_ include
+ * the publishing of new entries in the case where data is a dynamic
+ * data structure.
+ *
+ * An iteration might start in data[0] and get suspended long enough
+ * to miss an entire modification sequence, once it resumes it might
+ * observe the new entry.
+ *
+ * NOTE: When data is a dynamic data structure; one should use regular RCU
+ * patterns to manage the lifetimes of the objects within.
*/
static inline void raw_write_seqcount_latch(seqcount_t *s)
{
OpenPOWER on IntegriCloud