summary refs log tree commit diff stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Kconfig.locks 9
-rw-r--r-- kernel/events/core.c 34
-rw-r--r-- kernel/kprobes.c 14
-rw-r--r-- kernel/locking/mcs_spinlock.c 64
-rw-r--r-- kernel/locking/mcs_spinlock.h 9
-rw-r--r-- kernel/locking/mutex.c 2
-rw-r--r-- kernel/locking/rwsem-spinlock.c 28
-rw-r--r-- kernel/locking/rwsem-xadd.c 16
-rw-r--r-- kernel/locking/rwsem.c 2
-rw-r--r-- kernel/power/Kconfig 3
-rw-r--r-- kernel/power/main.c 25
-rw-r--r-- kernel/power/power.h 7
-rw-r--r-- kernel/power/process.c 1
-rw-r--r-- kernel/power/snapshot.c 494
-rw-r--r-- kernel/power/suspend.c 152
-rw-r--r-- kernel/power/suspend_test.c 12
-rw-r--r-- kernel/rcu/tree.c 140
-rw-r--r-- kernel/rcu/tree.h 6
-rw-r--r-- kernel/rcu/tree_plugin.h 2
-rw-r--r-- kernel/rcu/update.c 22
-rw-r--r-- kernel/sched/core.c 7
-rw-r--r-- kernel/sched/debug.c 2
-rw-r--r-- kernel/sched/idle.c 4
-rw-r--r-- kernel/time/alarmtimer.c 20
-rw-r--r-- kernel/trace/ftrace.c 4
-rw-r--r-- kernel/trace/ring_buffer.c 4
-rw-r--r-- kernel/trace/trace.c 20
-rw-r--r-- kernel/trace/trace_clock.c 9
-rw-r--r-- kernel/trace/trace_events.c 1
29 files changed, 766 insertions, 347 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 35536d9c0964..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,9 +220,16 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
endif
+config ARCH_SUPPORTS_ATOMIC_RMW
+ bool
+
config MUTEX_SPIN_ON_OWNER
def_bool y
- depends on SMP && !DEBUG_MUTEXES
+ depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
+config RWSEM_SPIN_ON_OWNER
+ def_bool y
+ depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
config ARCH_USE_QUEUE_RWLOCK
bool
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a33d9a2bcbd7..6b17ac1b0c2a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2320,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
next_parent = rcu_dereference(next_ctx->parent_ctx);
/* If neither context have a parent context; they cannot be clones. */
- if (!parent && !next_parent)
+ if (!parent || !next_parent)
goto unlock;
if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -7458,7 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- perf_remove_from_context(child_event, true);
+ /*
+ * Do not destroy the 'original' grouping; because of the context
+ * switch optimization the original events could've ended up in a
+ * random child task.
+ *
+ * If we were to destroy the original group, all group related
+ * operations would cease to function properly after this random
+ * child dies.
+ *
+ * Do destroy all inherited groups, we don't care about those
+ * and being thorough is better.
+ */
+ perf_remove_from_context(child_event, !!child_event->parent);
/*
* It can happen that the parent exits first, and has events
@@ -7474,7 +7486,7 @@ __perf_event_exit_task(struct perf_event *child_event,
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
struct perf_event *child_event, *next;
- struct perf_event_context *child_ctx;
+ struct perf_event_context *child_ctx, *parent_ctx;
unsigned long flags;
if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7499,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
raw_spin_lock(&child_ctx->lock);
task_ctx_sched_out(child_ctx);
child->perf_event_ctxp[ctxn] = NULL;
+
+ /*
+ * In order to avoid freeing: child_ctx->parent_ctx->task
+ * under perf_event_context::lock, grab another reference.
+ */
+ parent_ctx = child_ctx->parent_ctx;
+ if (parent_ctx)
+ get_ctx(parent_ctx);
+
/*
* If this context is a clone; unclone it so it can't get
* swapped to another process while we're removing all
@@ -7509,6 +7530,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
+ * Now that we no longer hold perf_event_context::lock, drop
+ * our extra child_ctx->parent_ctx reference.
+ */
+ if (parent_ctx)
+ put_ctx(parent_ctx);
+
+ /*
* Report the task dead after unscheduling the events so that we
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3214289df5a7..734e9a7d280b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2037,19 +2037,23 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
{
unsigned long *iter;
struct kprobe_blacklist_entry *ent;
- unsigned long offset = 0, size = 0;
+ unsigned long entry, offset = 0, size = 0;
for (iter = start; iter < end; iter++) {
- if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) {
- pr_err("Failed to find blacklist %p\n", (void *)*iter);
+ entry = arch_deref_entry_point((void *)*iter);
+
+ if (!kernel_text_address(entry) ||
+ !kallsyms_lookup_size_offset(entry, &size, &offset)) {
+ pr_err("Failed to find blacklist at %p\n",
+ (void *)entry);
continue;
}
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
return -ENOMEM;
- ent->start_addr = *iter;
- ent->end_addr = *iter + size;
+ ent->start_addr = entry;
+ ent->end_addr = entry + size;
INIT_LIST_HEAD(&ent->list);
list_add_tail(&ent->list, &kprobe_blacklist);
}
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..be9ee1559fca 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,21 +14,47 @@
* called from interrupt context and we have preemption disabled while
* spinning.
*/
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
+
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+ return cpu_nr + 1;
+}
+
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+ int cpu_nr = encoded_cpu_val - 1;
+
+ return per_cpu_ptr(&osq_node, cpu_nr);
+}
/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
*/
-static inline struct optimistic_spin_queue *
-osq_wait_next(struct optimistic_spin_queue **lock,
- struct optimistic_spin_queue *node,
- struct optimistic_spin_queue *prev)
+static inline struct optimistic_spin_node *
+osq_wait_next(struct optimistic_spin_queue *lock,
+ struct optimistic_spin_node *node,
+ struct optimistic_spin_node *prev)
{
- struct optimistic_spin_queue *next = NULL;
+ struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
+
+ /*
+ * If there is a prev node in queue, then the 'old' value will be
+ * the prev node's encoded CPU #; else it's set to OSQ_UNLOCKED_VAL, since
+ * if we're currently last in queue, the queue will become empty.
+ */
+ old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
for (;;) {
- if (*lock == node && cmpxchg(lock, node, prev) == node) {
+ if (atomic_read(&lock->tail) == curr &&
+ atomic_cmpxchg(&lock->tail, curr, old) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_queue **lock,
return next;
}
-bool osq_lock(struct optimistic_spin_queue **lock)
+bool osq_lock(struct optimistic_spin_queue *lock)
{
- struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_queue *prev, *next;
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *prev, *next;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
node->locked = 0;
node->next = NULL;
+ node->cpu = curr;
- node->prev = prev = xchg(lock, node);
- if (likely(prev == NULL))
+ old = atomic_xchg(&lock->tail, curr);
+ if (old == OSQ_UNLOCKED_VAL)
return true;
+ prev = decode_cpu(old);
+ node->prev = prev;
ACCESS_ONCE(prev->next) = node;
/*
@@ -149,20 +180,21 @@ unqueue:
return false;
}
-void osq_unlock(struct optimistic_spin_queue **lock)
+void osq_unlock(struct optimistic_spin_queue *lock)
{
- struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_queue *next;
+ struct optimistic_spin_node *node, *next;
+ int curr = encode_cpu(smp_processor_id());
/*
* Fast path for the uncontended case.
*/
- if (likely(cmpxchg(lock, node, NULL) == node))
+ if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
return;
/*
* Second most likely case.
*/
+ node = this_cpu_ptr(&osq_node);
next = xchg(&node->next, NULL);
if (next) {
ACCESS_ONCE(next->locked) = 1;
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..74356dc0ce29 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
* mutex_lock()/rwsem_down_{read,write}() etc.
*/
-struct optimistic_spin_queue {
- struct optimistic_spin_queue *next, *prev;
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # value */
};
-extern bool osq_lock(struct optimistic_spin_queue **lock);
-extern void osq_unlock(struct optimistic_spin_queue **lock);
+extern bool osq_lock(struct optimistic_spin_queue *lock);
+extern void osq_unlock(struct optimistic_spin_queue *lock);
#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33c6760..acca2c1a3c5e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- lock->osq = NULL;
+ osq_lock_init(&lock->osq);
#endif
debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
unsigned long flags;
if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
- ret = (sem->activity != 0);
+ ret = (sem->count != 0);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
}
return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
- sem->activity = 0;
+ sem->count = 0;
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
}
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
waiter = list_entry(next, struct rwsem_waiter, list);
} while (waiter->type != RWSEM_WAITING_FOR_WRITE);
- sem->activity += woken;
+ sem->count += woken;
out:
return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+ if (sem->count >= 0 && list_empty(&sem->wait_list)) {
/* granted */
- sem->activity++;
+ sem->count++;
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
goto out;
}
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+ if (sem->count >= 0 && list_empty(&sem->wait_list)) {
/* granted */
- sem->activity++;
+ sem->count++;
ret = 1;
}
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
* itself into sleep and waiting for system woke it or someone
* else in the head of the wait list up.
*/
- if (sem->activity == 0)
+ if (sem->count == 0)
break;
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
}
/* got the lock */
- sem->activity = -1;
+ sem->count = -1;
list_del(&waiter.list);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (sem->activity == 0) {
+ if (sem->count == 0) {
/* got the lock */
- sem->activity = -1;
+ sem->count = -1;
ret = 1;
}
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (--sem->activity == 0 && !list_empty(&sem->wait_list))
+ if (--sem->count == 0 && !list_empty(&sem->wait_list))
sem = __rwsem_wake_one_writer(sem);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- sem->activity = 0;
+ sem->count = 0;
if (!list_empty(&sem->wait_list))
sem = __rwsem_do_wake(sem, 1);
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- sem->activity = 1;
+ sem->count = 1;
if (!list_empty(&sem->wait_list))
sem = __rwsem_do_wake(sem, 0);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index dacc32142fcc..a2391ac135c8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -82,9 +82,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
sem->count = RWSEM_UNLOCKED_VALUE;
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
sem->owner = NULL;
- sem->osq = NULL;
+ osq_lock_init(&sem->osq);
#endif
}
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
return false;
}
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
* Try to acquire write lock before the writer has been put on wait queue.
*/
@@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
- bool on_cpu = true;
+ bool on_cpu = false;
if (need_resched())
- return 0;
+ return false;
rcu_read_lock();
owner = ACCESS_ONCE(sem->owner);
@@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
rcu_read_unlock();
/*
- * If sem->owner is not set, the rwsem owner may have
- * just acquired it and not set the owner yet or the rwsem
- * has been released.
+ * If sem->owner is not set, yet we have just recently entered the
+ * slowpath, then there is a possibility reader(s) may have the lock.
+ * To be safe, avoid spinning in these situations.
*/
return on_cpu;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 42f806de49d4..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,7 +12,7 @@
#include <linux/atomic.h>
-#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
sem->owner = current;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9a83d780facd..e4e4121fa327 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -253,9 +253,6 @@ config APM_EMULATION
anything, try disabling/enabling this option (or disabling/enabling
APM in your BIOS).
-config ARCH_HAS_OPP
- bool
-
config PM_OPP
bool
---help---
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 8e90f330f139..9a59d042ea84 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -296,8 +296,8 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
suspend_state_t i;
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
- if (pm_states[i].state)
- s += sprintf(s,"%s ", pm_states[i].label);
+ if (pm_states[i])
+ s += sprintf(s,"%s ", pm_states[i]);
#endif
if (hibernation_available())
@@ -311,8 +311,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
static suspend_state_t decode_state(const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
- suspend_state_t state = PM_SUSPEND_MIN;
- struct pm_sleep_state *s;
+ suspend_state_t state;
#endif
char *p;
int len;
@@ -325,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n)
return PM_SUSPEND_MAX;
#ifdef CONFIG_SUSPEND
- for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
- if (s->state && len == strlen(s->label)
- && !strncmp(buf, s->label, len))
- return s->state;
+ for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
+ const char *label = pm_states[state];
+
+ if (label && len == strlen(label) && !strncmp(buf, label, len))
+ return state;
+ }
#endif
return PM_SUSPEND_ON;
@@ -446,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
#ifdef CONFIG_SUSPEND
if (state < PM_SUSPEND_MAX)
- return sprintf(buf, "%s\n", pm_states[state].state ?
- pm_states[state].label : "error");
+ return sprintf(buf, "%s\n", pm_states[state] ?
+ pm_states[state] : "error");
#endif
#ifdef CONFIG_HIBERNATION
return sprintf(buf, "disk\n");
@@ -615,7 +616,6 @@ static struct attribute_group attr_group = {
.attrs = g,
};
-#ifdef CONFIG_PM_RUNTIME
struct workqueue_struct *pm_wq;
EXPORT_SYMBOL_GPL(pm_wq);
@@ -625,9 +625,6 @@ static int __init pm_start_workqueue(void)
return pm_wq ? 0 : -ENOMEM;
}
-#else
-static inline int pm_start_workqueue(void) { return 0; }
-#endif
static int __init pm_init(void)
{
diff --git a/kernel/power/power.h b/kernel/power/power.h
index c60f13b5270a..5d49dcac2537 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,13 +178,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
unsigned int, char *);
#ifdef CONFIG_SUSPEND
-struct pm_sleep_state {
- const char *label;
- suspend_state_t state;
-};
-
/* kernel/power/suspend.c */
-extern struct pm_sleep_state pm_states[];
+extern const char *pm_states[];
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0ca8d83e2369..4ee194eb524b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -186,6 +186,7 @@ void thaw_processes(void)
printk("Restarting tasks ... ");
+ __usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
read_lock(&tasklist_lock);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1ea328aafdc9..4fc5c32422b3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -248,33 +248,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
* information is stored (in the form of a block of bitmap)
* It also contains the pfns that correspond to the start and end of
* the represented memory area.
+ *
+ * The memory bitmap is organized as a radix tree to guarantee fast random
+ * access to the bits. There is one radix tree for each zone (as returned
+ * from create_mem_extents).
+ *
+ * One radix tree is represented by one struct mem_zone_bm_rtree. There are
+ * two linked lists for the nodes of the tree, one for the inner nodes and
+ * one for the leaf nodes. The linked leaf nodes are used for fast linear
+ * access of the memory bitmap.
+ *
+ * The struct rtree_node represents one node of the radix tree.
*/
#define BM_END_OF_MAP (~0UL)
#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
+#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3)
+#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1)
-struct bm_block {
- struct list_head hook; /* hook into a list of bitmap blocks */
- unsigned long start_pfn; /* pfn represented by the first bit */
- unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
- unsigned long *data; /* bitmap representing pages */
+/*
+ * struct rtree_node is a wrapper struct to link the nodes
+ * of the rtree together for easy linear iteration over
+ * bits and easy freeing
+ */
+struct rtree_node {
+ struct list_head list;
+ unsigned long *data;
};
-static inline unsigned long bm_block_bits(struct bm_block *bb)
-{
- return bb->end_pfn - bb->start_pfn;
-}
+/*
+ * struct mem_zone_bm_rtree represents a bitmap used for one
+ * populated memory zone.
+ */
+struct mem_zone_bm_rtree {
+ struct list_head list; /* Link Zones together */
+ struct list_head nodes; /* Radix Tree inner nodes */
+ struct list_head leaves; /* Radix Tree leaves */
+ unsigned long start_pfn; /* Zone start page frame */
+ unsigned long end_pfn; /* Zone end page frame + 1 */
+ struct rtree_node *rtree; /* Radix Tree Root */
+ int levels; /* Number of Radix Tree Levels */
+ unsigned int blocks; /* Number of Bitmap Blocks */
+};
/* strcut bm_position is used for browsing memory bitmaps */
struct bm_position {
- struct bm_block *block;
- int bit;
+ struct mem_zone_bm_rtree *zone;
+ struct rtree_node *node;
+ unsigned long node_pfn;
+ int node_bit;
};
struct memory_bitmap {
- struct list_head blocks; /* list of bitmap blocks */
+ struct list_head zones;
struct linked_page *p_list; /* list of pages used to store zone
* bitmap objects and bitmap block
* objects
@@ -284,38 +312,178 @@ struct memory_bitmap {
/* Functions that operate on memory bitmaps */
-static void memory_bm_position_reset(struct memory_bitmap *bm)
+#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long))
+#if BITS_PER_LONG == 32
+#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2)
+#else
+#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3)
+#endif
+#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
+
+/*
+ * alloc_rtree_node - Allocate a new node and add it to the radix tree.
+ *
+ * This function is used to allocate inner nodes as well as the
+ * leaf nodes of the radix tree. It also adds the node to the
+ * corresponding linked list passed in by the *list parameter.
+ */
+static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
+ struct chain_allocator *ca,
+ struct list_head *list)
{
- bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
- bm->cur.bit = 0;
-}
+ struct rtree_node *node;
-static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+ node = chain_alloc(ca, sizeof(struct rtree_node));
+ if (!node)
+ return NULL;
-/**
- * create_bm_block_list - create a list of block bitmap objects
- * @pages - number of pages to track
- * @list - list to put the allocated blocks into
- * @ca - chain allocator to be used for allocating memory
+ node->data = get_image_page(gfp_mask, safe_needed);
+ if (!node->data)
+ return NULL;
+
+ list_add_tail(&node->list, list);
+
+ return node;
+}
+
+/*
+ * add_rtree_block - Add a new leaf node to the radix tree
+ *
+ * The leaf nodes need to be allocated in order to keep the leaves
+ * linked list in order. This is guaranteed by the zone->blocks
+ * counter.
*/
-static int create_bm_block_list(unsigned long pages,
- struct list_head *list,
- struct chain_allocator *ca)
+static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
+ int safe_needed, struct chain_allocator *ca)
{
- unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
+ struct rtree_node *node, *block, **dst;
+ unsigned int levels_needed, block_nr;
+ int i;
- while (nr_blocks-- > 0) {
- struct bm_block *bb;
+ block_nr = zone->blocks;
+ levels_needed = 0;
- bb = chain_alloc(ca, sizeof(struct bm_block));
- if (!bb)
+ /* How many levels do we need for this block nr? */
+ while (block_nr) {
+ levels_needed += 1;
+ block_nr >>= BM_RTREE_LEVEL_SHIFT;
+ }
+
+ /* Make sure the rtree has enough levels */
+ for (i = zone->levels; i < levels_needed; i++) {
+ node = alloc_rtree_node(gfp_mask, safe_needed, ca,
+ &zone->nodes);
+ if (!node)
return -ENOMEM;
- list_add(&bb->hook, list);
+
+ node->data[0] = (unsigned long)zone->rtree;
+ zone->rtree = node;
+ zone->levels += 1;
+ }
+
+ /* Allocate new block */
+ block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves);
+ if (!block)
+ return -ENOMEM;
+
+ /* Now walk the rtree to insert the block */
+ node = zone->rtree;
+ dst = &zone->rtree;
+ block_nr = zone->blocks;
+ for (i = zone->levels; i > 0; i--) {
+ int index;
+
+ if (!node) {
+ node = alloc_rtree_node(gfp_mask, safe_needed, ca,
+ &zone->nodes);
+ if (!node)
+ return -ENOMEM;
+ *dst = node;
+ }
+
+ index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
+ index &= BM_RTREE_LEVEL_MASK;
+ dst = (struct rtree_node **)&((*dst)->data[index]);
+ node = *dst;
}
+ zone->blocks += 1;
+ *dst = block;
+
return 0;
}
+static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
+ int clear_nosave_free);
+
+/*
+ * create_zone_bm_rtree - create a radix tree for one zone
+ *
+ * Allocates the mem_zone_bm_rtree structure and initializes it.
+ * This function also allocates and builds the radix tree for the
+ * zone.
+ */
+static struct mem_zone_bm_rtree *
+create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
+ struct chain_allocator *ca,
+ unsigned long start, unsigned long end)
+{
+ struct mem_zone_bm_rtree *zone;
+ unsigned int i, nr_blocks;
+ unsigned long pages;
+
+ pages = end - start;
+ zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree));
+ if (!zone)
+ return NULL;
+
+ INIT_LIST_HEAD(&zone->nodes);
+ INIT_LIST_HEAD(&zone->leaves);
+ zone->start_pfn = start;
+ zone->end_pfn = end;
+ nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
+
+ for (i = 0; i < nr_blocks; i++) {
+ if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) {
+ free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR);
+ return NULL;
+ }
+ }
+
+ return zone;
+}
+
+/*
+ * free_zone_bm_rtree - Free the memory of the radix tree
+ *
+ * Free all node pages of the radix tree. The mem_zone_bm_rtree
+ * structure itself is not freed here nor are the rtree_node
+ * structs.
+ */
+static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
+ int clear_nosave_free)
+{
+ struct rtree_node *node;
+
+ list_for_each_entry(node, &zone->nodes, list)
+ free_image_page(node->data, clear_nosave_free);
+
+ list_for_each_entry(node, &zone->leaves, list)
+ free_image_page(node->data, clear_nosave_free);
+}
+
+static void memory_bm_position_reset(struct memory_bitmap *bm)
+{
+ bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
+ list);
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
+ struct rtree_node, list);
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
+}
+
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+
struct mem_extent {
struct list_head hook;
unsigned long start;
@@ -407,40 +575,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
int error;
chain_init(&ca, gfp_mask, safe_needed);
- INIT_LIST_HEAD(&bm->blocks);
+ INIT_LIST_HEAD(&bm->zones);
error = create_mem_extents(&mem_extents, gfp_mask);
if (error)
return error;
list_for_each_entry(ext, &mem_extents, hook) {
- struct bm_block *bb;
- unsigned long pfn = ext->start;
- unsigned long pages = ext->end - ext->start;
-
- bb = list_entry(bm->blocks.prev, struct bm_block, hook);
+ struct mem_zone_bm_rtree *zone;
- error = create_bm_block_list(pages, bm->blocks.prev, &ca);
- if (error)
+ zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca,
+ ext->start, ext->end);
+ if (!zone) {
+ error = -ENOMEM;
goto Error;
-
- list_for_each_entry_continue(bb, &bm->blocks, hook) {
- bb->data = get_image_page(gfp_mask, safe_needed);
- if (!bb->data) {
- error = -ENOMEM;
- goto Error;
- }
-
- bb->start_pfn = pfn;
- if (pages >= BM_BITS_PER_BLOCK) {
- pfn += BM_BITS_PER_BLOCK;
- pages -= BM_BITS_PER_BLOCK;
- } else {
- /* This is executed only once in the loop */
- pfn += pages;
- }
- bb->end_pfn = pfn;
}
+ list_add_tail(&zone->list, &bm->zones);
}
bm->p_list = ca.chain;
@@ -460,51 +610,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
*/
static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
{
- struct bm_block *bb;
+ struct mem_zone_bm_rtree *zone;
- list_for_each_entry(bb, &bm->blocks, hook)
- if (bb->data)
- free_image_page(bb->data, clear_nosave_free);
+ list_for_each_entry(zone, &bm->zones, list)
+ free_zone_bm_rtree(zone, clear_nosave_free);
free_list_of_pages(bm->p_list, clear_nosave_free);
- INIT_LIST_HEAD(&bm->blocks);
+ INIT_LIST_HEAD(&bm->zones);
}
/**
- * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds
- * to given pfn. The cur_zone_bm member of @bm and the cur_block member
- * of @bm->cur_zone_bm are updated.
+ * memory_bm_find_bit - Find the bit for pfn in the memory
+ * bitmap
+ *
+ * Find the bit in the bitmap @bm that corresponds to given pfn.
+ * The cur.zone, cur.node and cur.node_pfn members of @bm are
+ * updated.
+ * It walks the radix tree to find the page which contains the bit for
+ * pfn and returns the bit position in **addr and *bit_nr.
*/
static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
- void **addr, unsigned int *bit_nr)
+ void **addr, unsigned int *bit_nr)
{
- struct bm_block *bb;
+ struct mem_zone_bm_rtree *curr, *zone;
+ struct rtree_node *node;
+ int i, block_nr;
+ zone = bm->cur.zone;
+
+ if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
+ goto zone_found;
+
+ zone = NULL;
+
+ /* Find the right zone */
+ list_for_each_entry(curr, &bm->zones, list) {
+ if (pfn >= curr->start_pfn && pfn < curr->end_pfn) {
+ zone = curr;
+ break;
+ }
+ }
+
+ if (!zone)
+ return -EFAULT;
+
+zone_found:
/*
- * Check if the pfn corresponds to the current bitmap block and find
- * the block where it fits if this is not the case.
+ * We have a zone. Now walk the radix tree to find the leaf
+ * node for our pfn.
*/
- bb = bm->cur.block;
- if (pfn < bb->start_pfn)
- list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
- if (pfn >= bb->start_pfn)
- break;
- if (pfn >= bb->end_pfn)
- list_for_each_entry_continue(bb, &bm->blocks, hook)
- if (pfn >= bb->start_pfn && pfn < bb->end_pfn)
- break;
+ node = bm->cur.node;
+ if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
+ goto node_found;
- if (&bb->hook == &bm->blocks)
- return -EFAULT;
+ node = zone->rtree;
+ block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
+
+ for (i = zone->levels; i > 0; i--) {
+ int index;
+
+ index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
+ index &= BM_RTREE_LEVEL_MASK;
+ BUG_ON(node->data[index] == 0);
+ node = (struct rtree_node *)node->data[index];
+ }
+
+node_found:
+ /* Update last position */
+ bm->cur.zone = zone;
+ bm->cur.node = node;
+ bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+
+ /* Set return values */
+ *addr = node->data;
+ *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK;
- /* The block has been found */
- bm->cur.block = bb;
- pfn -= bb->start_pfn;
- bm->cur.bit = pfn + 1;
- *bit_nr = pfn;
- *addr = bb->data;
return 0;
}
@@ -528,6 +710,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
error = memory_bm_find_bit(bm, pfn, &addr, &bit);
if (!error)
set_bit(bit, addr);
+
return error;
}
@@ -542,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
clear_bit(bit, addr);
}
+static void memory_bm_clear_current(struct memory_bitmap *bm)
+{
+ int bit;
+
+ bit = max(bm->cur.node_bit - 1, 0);
+ clear_bit(bit, bm->cur.node->data);
+}
+
static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
@@ -561,38 +752,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
return !memory_bm_find_bit(bm, pfn, &addr, &bit);
}
-/**
- * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
- * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
- * returned.
+/*
+ * rtree_next_node - Jumps to the next leaf node
+ *
+ * Sets the position to the beginning of the next node in the
+ * memory bitmap. This is either the next node in the current
+ * zone's radix tree or the first node in the radix tree of the
+ * next zone.
*
- * It is required to run memory_bm_position_reset() before the first call to
- * this function.
+ * Returns true if there is a next node, false otherwise.
*/
+static bool rtree_next_node(struct memory_bitmap *bm)
+{
+ bm->cur.node = list_entry(bm->cur.node->list.next,
+ struct rtree_node, list);
+ if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ bm->cur.node_pfn += BM_BITS_PER_BLOCK;
+ bm->cur.node_bit = 0;
+ touch_softlockup_watchdog();
+ return true;
+ }
+
+ /* No more nodes, goto next zone */
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
+ struct mem_zone_bm_rtree, list);
+ if (&bm->cur.zone->list != &bm->zones) {
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
+ struct rtree_node, list);
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
+ return true;
+ }
+ /* No more zones */
+ return false;
+}
+
+/**
+ * memory_bm_next_pfn - Find the next set bit in the bitmap @bm
+ *
+ * Starting from the last returned position, this function searches
+ * for the next set bit in the memory bitmap and returns the
+ * corresponding pfn. If no more bits are set, BM_END_OF_MAP is returned.
+ *
+ * It is required to run memory_bm_position_reset() before the
+ * first call to this function.
+ */
static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
- struct bm_block *bb;
+ unsigned long bits, pfn, pages;
int bit;
- bb = bm->cur.block;
do {
- bit = bm->cur.bit;
- bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
- if (bit < bm_block_bits(bb))
- goto Return_pfn;
-
- bb = list_entry(bb->hook.next, struct bm_block, hook);
- bm->cur.block = bb;
- bm->cur.bit = 0;
- } while (&bb->hook != &bm->blocks);
+ pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
+ bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
+ bit = find_next_bit(bm->cur.node->data, bits,
+ bm->cur.node_bit);
+ if (bit < bits) {
+ pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
+ bm->cur.node_bit = bit + 1;
+ return pfn;
+ }
+ } while (rtree_next_node(bm));
- memory_bm_position_reset(bm);
return BM_END_OF_MAP;
-
- Return_pfn:
- bm->cur.bit = bit + 1;
- return bb->start_pfn + bit;
}
/**
@@ -816,12 +1039,17 @@ void free_basic_memory_bitmaps(void)
unsigned int snapshot_additional_pages(struct zone *zone)
{
- unsigned int res;
+ unsigned int rtree, nodes;
+
+ rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+ rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node),
+ LINKED_PAGE_DATA_SIZE);
+ while (nodes > 1) {
+ nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL);
+ rtree += nodes;
+ }
- res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
- res += DIV_ROUND_UP(res * sizeof(struct bm_block),
- LINKED_PAGE_DATA_SIZE);
- return 2 * res;
+ return 2 * rtree;
}
#ifdef CONFIG_HIGHMEM
@@ -1094,23 +1322,35 @@ static struct memory_bitmap copy_bm;
void swsusp_free(void)
{
- struct zone *zone;
- unsigned long pfn, max_zone_pfn;
+ unsigned long fb_pfn, fr_pfn;
- for_each_populated_zone(zone) {
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
-
- if (swsusp_page_is_forbidden(page) &&
- swsusp_page_is_free(page)) {
- swsusp_unset_page_forbidden(page);
- swsusp_unset_page_free(page);
- __free_page(page);
- }
- }
+ memory_bm_position_reset(forbidden_pages_map);
+ memory_bm_position_reset(free_pages_map);
+
+loop:
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+
+ /*
+ * Find the next bit set in both bitmaps. This is guaranteed to
+ * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP.
+ */
+ do {
+ if (fb_pfn < fr_pfn)
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+ if (fr_pfn < fb_pfn)
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ } while (fb_pfn != fr_pfn);
+
+ if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
+ struct page *page = pfn_to_page(fr_pfn);
+
+ memory_bm_clear_current(forbidden_pages_map);
+ memory_bm_clear_current(free_pages_map);
+ __free_page(page);
+ goto loop;
}
+
nr_copy_pages = 0;
nr_meta_pages = 0;
restore_pblist = NULL;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4dd8822f732a..9a071bea80eb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,20 +31,11 @@
#include "power.h"
-struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
- [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE },
- [PM_SUSPEND_STANDBY] = { .label = "standby", },
- [PM_SUSPEND_MEM] = { .label = "mem", },
-};
+static const char *pm_labels[] = { "mem", "standby", "freeze", };
+const char *pm_states[PM_SUSPEND_MAX];
static const struct platform_suspend_ops *suspend_ops;
static const struct platform_freeze_ops *freeze_ops;
-
-static bool need_suspend_ops(suspend_state_t state)
-{
- return state > PM_SUSPEND_FREEZE;
-}
-
static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
static bool suspend_freeze_wake;
@@ -97,10 +88,7 @@ static bool relative_states;
static int __init sleep_states_setup(char *str)
{
relative_states = !strncmp(str, "1", 1);
- if (relative_states) {
- pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
- pm_states[PM_SUSPEND_FREEZE].state = 0;
- }
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
return 1;
}
@@ -113,20 +101,20 @@ __setup("relative_sleep_states=", sleep_states_setup);
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
suspend_state_t i;
- int j = PM_SUSPEND_MAX - 1;
+ int j = 0;
lock_system_sleep();
suspend_ops = ops;
for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
- if (valid_state(i))
- pm_states[j--].state = i;
- else if (!relative_states)
- pm_states[j--].state = 0;
+ if (valid_state(i)) {
+ pm_states[i] = pm_labels[j++];
+ } else if (!relative_states) {
+ pm_states[i] = NULL;
+ j++;
+ }
- pm_states[j--].state = PM_SUSPEND_FREEZE;
- while (j >= PM_SUSPEND_MIN)
- pm_states[j--].state = 0;
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
unlock_system_sleep();
}
@@ -145,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state)
}
EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
+static bool sleep_state_supported(suspend_state_t state)
+{
+ return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
+}
+
+static int platform_suspend_prepare(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
+ suspend_ops->prepare() : 0;
+}
+
+static int platform_suspend_prepare_late(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
+ suspend_ops->prepare_late() : 0;
+}
+
+static void platform_suspend_wake(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
+ suspend_ops->wake();
+}
+
+static void platform_suspend_finish(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
+ suspend_ops->finish();
+}
+
+static int platform_suspend_begin(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
+ return freeze_ops->begin();
+ else if (suspend_ops->begin)
+ return suspend_ops->begin(state);
+ else
+ return 0;
+}
+
+static void platform_suspend_end(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
+ freeze_ops->end();
+ else if (suspend_ops->end)
+ suspend_ops->end();
+}
+
+static void platform_suspend_recover(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
+ suspend_ops->recover();
+}
+
+static bool platform_suspend_again(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
+ suspend_ops->suspend_again() : false;
+}
+
static int suspend_test(int level)
{
#ifdef CONFIG_PM_DEBUG
@@ -168,7 +215,7 @@ static int suspend_prepare(suspend_state_t state)
{
int error;
- if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
+ if (!sleep_state_supported(state))
return -EPERM;
pm_prepare_console();
@@ -214,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
{
int error;
- if (need_suspend_ops(state) && suspend_ops->prepare) {
- error = suspend_ops->prepare();
- if (error)
- goto Platform_finish;
- }
+ error = platform_suspend_prepare(state);
+ if (error)
+ goto Platform_finish;
error = dpm_suspend_end(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down\n");
goto Platform_finish;
}
-
- if (need_suspend_ops(state) && suspend_ops->prepare_late) {
- error = suspend_ops->prepare_late();
- if (error)
- goto Platform_wake;
- }
+ error = platform_suspend_prepare_late(state);
+ if (error)
+ goto Platform_wake;
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
@@ -278,15 +320,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
ftrace_start();
Platform_wake:
- if (need_suspend_ops(state) && suspend_ops->wake)
- suspend_ops->wake();
-
+ platform_suspend_wake(state);
dpm_resume_start(PMSG_RESUME);
Platform_finish:
- if (need_suspend_ops(state) && suspend_ops->finish)
- suspend_ops->finish();
-
+ platform_suspend_finish(state);
return error;
}
@@ -299,18 +337,13 @@ int suspend_devices_and_enter(suspend_state_t state)
int error;
bool wakeup = false;
- if (need_suspend_ops(state) && !suspend_ops)
+ if (!sleep_state_supported(state))
return -ENOSYS;
- if (need_suspend_ops(state) && suspend_ops->begin) {
- error = suspend_ops->begin(state);
- if (error)
- goto Close;
- } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) {
- error = freeze_ops->begin();
- if (error)
- goto Close;
- }
+ error = platform_suspend_begin(state);
+ if (error)
+ goto Close;
+
suspend_console();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
@@ -324,25 +357,20 @@ int suspend_devices_and_enter(suspend_state_t state)
do {
error = suspend_enter(state, &wakeup);
- } while (!error && !wakeup && need_suspend_ops(state)
- && suspend_ops->suspend_again && suspend_ops->suspend_again());
+ } while (!error && !wakeup && platform_suspend_again(state));
Resume_devices:
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
resume_console();
- Close:
- if (need_suspend_ops(state) && suspend_ops->end)
- suspend_ops->end();
- else if (state == PM_SUSPEND_FREEZE && freeze_ops->end)
- freeze_ops->end();
+ Close:
+ platform_suspend_end(state);
return error;
Recover_platform:
- if (need_suspend_ops(state) && suspend_ops->recover)
- suspend_ops->recover();
+ platform_suspend_recover(state);
goto Resume_devices;
}
@@ -395,7 +423,7 @@ static int enter_state(suspend_state_t state)
printk("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
- pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
error = suspend_prepare(state);
if (error)
goto Unlock;
@@ -404,7 +432,7 @@ static int enter_state(suspend_state_t state)
goto Finish;
trace_suspend_resume(TPS("suspend_enter"), state, false);
- pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
+ pr_debug("PM: Entering %s sleep\n", pm_states[state]);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 269b097e78ea..2f524928b6aa 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
}
if (state == PM_SUSPEND_MEM) {
- printk(info_test, pm_states[state].label);
+ printk(info_test, pm_states[state]);
status = pm_suspend(state);
if (status == -ENODEV)
state = PM_SUSPEND_STANDBY;
}
if (state == PM_SUSPEND_STANDBY) {
- printk(info_test, pm_states[state].label);
+ printk(info_test, pm_states[state]);
status = pm_suspend(state);
}
if (status < 0)
@@ -141,8 +141,8 @@ static int __init setup_test_suspend(char *value)
/* "=mem" ==> "mem" */
value++;
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
- if (!strcmp(pm_states[i].label, value)) {
- test_state = pm_states[i].state;
+ if (!strcmp(pm_states[i], value)) {
+ test_state = i;
return 0;
}
@@ -162,8 +162,8 @@ static int __init test_suspend(void)
/* PM is initialized by now; is that state testable? */
if (test_state == PM_SUSPEND_ON)
goto done;
- if (!pm_states[test_state].state) {
- printk(warn_bad_state, pm_states[test_state].label);
+ if (!pm_states[test_state]) {
+ printk(warn_bad_state, pm_states[test_state]);
goto done;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f1ba77363fbb..625d0b0cd75a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu)
rdp->passed_quiesce = 1;
}
+static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
+
+static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
+ .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
+ .dynticks = ATOMIC_INIT(1),
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
+ .dynticks_idle = ATOMIC_INIT(1),
+#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+};
+
+/*
+ * Let the RCU core know that this CPU has gone through the scheduler,
+ * which is a quiescent state. This is called when the need for a
+ * quiescent state is urgent, so we burn an atomic operation and full
+ * memory barriers to let the RCU core know about it, regardless of what
+ * this CPU might (or might not) do in the near future.
+ *
+ * We inform the RCU core by emulating a zero-duration dyntick-idle
+ * period, which we in turn do by incrementing the ->dynticks counter
+ * by two.
+ */
+static void rcu_momentary_dyntick_idle(void)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp;
+ int resched_mask;
+ struct rcu_state *rsp;
+
+ local_irq_save(flags);
+
+ /*
+ * Yes, we can lose flag-setting operations. This is OK, because
+ * the flag will be set again after some delay.
+ */
+ resched_mask = raw_cpu_read(rcu_sched_qs_mask);
+ raw_cpu_write(rcu_sched_qs_mask, 0);
+
+ /* Find the flavor that needs a quiescent state. */
+ for_each_rcu_flavor(rsp) {
+ rdp = raw_cpu_ptr(rsp->rda);
+ if (!(resched_mask & rsp->flavor_mask))
+ continue;
+ smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
+ if (ACCESS_ONCE(rdp->mynode->completed) !=
+ ACCESS_ONCE(rdp->cond_resched_completed))
+ continue;
+
+ /*
+ * Pretend to be momentarily idle for the quiescent state.
+ * This allows the grace-period kthread to record the
+ * quiescent state, with no need for this CPU to do anything
+ * further.
+ */
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ smp_mb__before_atomic(); /* Earlier stuff before QS. */
+ atomic_add(2, &rdtp->dynticks); /* QS. */
+ smp_mb__after_atomic(); /* Later stuff after QS. */
+ break;
+ }
+ local_irq_restore(flags);
+}
+
/*
* Note a context switch. This is a quiescent state for RCU-sched,
* and requires special handling for preemptible RCU.
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu)
trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs(cpu);
rcu_preempt_note_context_switch(cpu);
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ rcu_momentary_dyntick_idle();
trace_rcu_utilization(TPS("End context switch"));
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
-static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
- .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
- .dynticks = ATOMIC_INIT(1),
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
- .dynticks_idle = ATOMIC_INIT(1),
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-};
-
static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
static long qhimark = 10000; /* If this many pending, ignore blimit. */
static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
+/*
+ * How long the grace period must be before we start recruiting
+ * quiescent-state help from rcu_note_context_switch().
+ */
+static ulong jiffies_till_sched_qs = HZ / 20;
+module_param(jiffies_till_sched_qs, ulong, 0644);
+
static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp,
@@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
bool *isidle, unsigned long *maxj)
{
unsigned int curr;
+ int *rcrmp;
unsigned int snap;
curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
}
/*
- * There is a possibility that a CPU in adaptive-ticks state
- * might run in the kernel with the scheduling-clock tick disabled
- * for an extended time period. Invoke rcu_kick_nohz_cpu() to
- * force the CPU to restart the scheduling-clock tick in this
- * CPU is in this state.
- */
- rcu_kick_nohz_cpu(rdp->cpu);
-
- /*
- * Alternatively, the CPU might be running in the kernel
- * for an extended period of time without a quiescent state.
- * Attempt to force the CPU through the scheduler to gain the
- * needed quiescent state, but only if the grace period has gone
- * on for an uncommonly long time. If there are many stuck CPUs,
- * we will beat on the first one until it gets unstuck, then move
- * to the next. Only do this for the primary flavor of RCU.
+ * A CPU running for an extended time within the kernel can
+ * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
+ * even context-switching back and forth between a pair of
+ * in-kernel CPU-bound tasks cannot advance grace periods.
+ * So if the grace period is old enough, make the CPU pay attention.
+ * Note that the unsynchronized assignments to the per-CPU
+ * rcu_sched_qs_mask variable are safe. Yes, setting of
+ * bits can be lost, but they will be set again on the next
+ * force-quiescent-state pass. So lost bit sets do not result
+ * in incorrect behavior, merely in a grace period lasting
+ * a few jiffies longer than it might otherwise. Because
+ * there are at most four threads involved, and because the
+ * updates are only once every few jiffies, the probability of
+ * lossage (and thus of slight grace-period extension) is
+ * quite low.
+ *
+ * Note that if the jiffies_till_sched_qs boot/sysfs parameter
+ * is set too high, we override with half of the RCU CPU stall
+ * warning delay.
*/
- if (rdp->rsp == rcu_state_p &&
+ rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
+ if (ULONG_CMP_GE(jiffies,
+ rdp->rsp->gp_start + jiffies_till_sched_qs) ||
ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
- rdp->rsp->jiffies_resched += 5;
- resched_cpu(rdp->cpu);
+ if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+ ACCESS_ONCE(rdp->cond_resched_completed) =
+ ACCESS_ONCE(rdp->mynode->completed);
+ smp_mb(); /* ->cond_resched_completed before *rcrmp. */
+ ACCESS_ONCE(*rcrmp) =
+ ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+ resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ rdp->rsp->jiffies_resched += 5; /* Enable beating. */
+ } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+ /* Time to beat on that CPU again! */
+ resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+ }
}
return 0;
@@ -3491,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
"rcu_node_fqs_1",
"rcu_node_fqs_2",
"rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static u8 fl_mask = 0x1;
int cpustride = 1;
int i;
int j;
@@ -3509,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
for (i = 1; i < rcu_num_lvls; i++)
rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
rcu_init_levelspread(rsp);
+ rsp->flavor_mask = fl_mask;
+ fl_mask <<= 1;
/* Initialize the elements themselves, starting from the leaves. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf2c1e669691..0f69a79c5b7d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -307,6 +307,9 @@ struct rcu_data {
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
unsigned long offline_fqs; /* Kicked due to being offline. */
+ unsigned long cond_resched_completed;
+ /* Grace period that needs help */
+ /* from cond_resched(). */
/* 5) __rcu_pending() statistics. */
unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -392,6 +395,7 @@ struct rcu_state {
struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
+ u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
void (*func)(struct rcu_head *head));
@@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
-static void rcu_kick_nohz_cpu(int cpu);
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index cbc2c45265e2..02ac0fb186b8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
* if an adaptive-ticks CPU is failing to respond to the current grace
* period and has not be idle from an RCU perspective, kick it.
*/
-static void rcu_kick_nohz_cpu(int cpu)
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(cpu))
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a2aeb4df0f60..bc7883570530 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -200,12 +200,12 @@ void wait_rcu_gp(call_rcu_func_t crf)
EXPORT_SYMBOL_GPL(wait_rcu_gp);
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-static inline void debug_init_rcu_head(struct rcu_head *head)
+void init_rcu_head(struct rcu_head *head)
{
debug_object_init(head, &rcuhead_debug_descr);
}
-static inline void debug_rcu_head_free(struct rcu_head *head)
+void destroy_rcu_head(struct rcu_head *head)
{
debug_object_free(head, &rcuhead_debug_descr);
}
@@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void)
early_initcall(check_cpu_stall_init);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
-
-/*
- * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
- */
-
-DEFINE_PER_CPU(int, rcu_cond_resched_count);
-
-/*
- * Report a set of RCU quiescent states, for use by cond_resched()
- * and friends. Out of line due to being called infrequently.
- */
-void rcu_resched(void)
-{
- preempt_disable();
- __this_cpu_write(rcu_cond_resched_count, 0);
- rcu_note_context_switch(smp_processor_id());
- preempt_enable();
-}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b494fe..bc1638b33449 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4147,7 +4147,6 @@ static void __cond_resched(void)
int __sched _cond_resched(void)
{
- rcu_cond_resched();
if (should_resched()) {
__cond_resched();
return 1;
@@ -4166,18 +4165,15 @@ EXPORT_SYMBOL(_cond_resched);
*/
int __cond_resched_lock(spinlock_t *lock)
{
- bool need_rcu_resched = rcu_should_resched();
int resched = should_resched();
int ret = 0;
lockdep_assert_held(lock);
- if (spin_needbreak(lock) || resched || need_rcu_resched) {
+ if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
if (resched)
__cond_resched();
- else if (unlikely(need_rcu_resched))
- rcu_resched();
else
cpu_relax();
ret = 1;
@@ -4191,7 +4187,6 @@ int __sched __cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
- rcu_cond_resched(); /* BH disabled OK, just recording QSes. */
if (should_resched()) {
local_bh_enable();
__cond_resched();
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 695f9773bb60..627b3c34b821 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
- do_div(avg_atom, nr_switches);
+ avg_atom = div64_ul(avg_atom, nr_switches);
else
avg_atom = -1LL;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index cf009fb0bc25..658a58dc30f4 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -147,8 +147,6 @@ use_default:
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
goto use_default;
- trace_cpu_idle_rcuidle(next_state, dev->cpu);
-
/*
* Enter the idle state previously returned by the governor decision.
* This function will block until an interrupt occurs and will take
@@ -156,8 +154,6 @@ use_default:
*/
entered_state = cpuidle_enter(drv, dev, next_state);
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
-
if (broadcast)
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65a430d..fe75444ae7ec 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
struct itimerspec *new_setting,
struct itimerspec *old_setting)
{
+ ktime_t exp;
+
if (!rtcdev)
return -ENOTSUPP;
+ if (flags & ~TIMER_ABSTIME)
+ return -EINVAL;
+
if (old_setting)
alarm_timer_get(timr, old_setting);
@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
/* start the timer */
timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
- alarm_start(&timr->it.alarm.alarmtimer,
- timespec_to_ktime(new_setting->it_value));
+ exp = timespec_to_ktime(new_setting->it_value);
+ /* Convert (if necessary) to absolute time */
+ if (flags != TIMER_ABSTIME) {
+ ktime_t now;
+
+ now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
+ exp = ktime_add(now, exp);
+ }
+
+ alarm_start(&timr->it.alarm.alarmtimer, exp);
return 0;
}
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
if (!alarmtimer_get_rtcdev())
return -ENOTSUPP;
+ if (flags & ~TIMER_ABSTIME)
+ return -EINVAL;
+
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5b372e3ed675..ac9d1dad630b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -265,12 +265,12 @@ static void update_ftrace_function(void)
func = ftrace_ops_list_func;
}
+ update_function_graph_func();
+
/* If there's no change, then do nothing more here */
if (ftrace_trace_function == func)
return;
- update_function_graph_func();
-
/*
* If we are using the list function, it doesn't care
* about the function_trace_ops.
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7c56c3d06943..ff7027199a9a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -616,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *work;
- if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
- (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
- return POLLIN | POLLRDNORM;
-
if (cpu == RING_BUFFER_ALL_CPUS)
work = &buffer->irq_work;
else {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f243444a3772..291397e66669 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -466,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size)
struct print_entry *entry;
unsigned long irq_flags;
int alloc;
+ int pc;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ pc = preempt_count();
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -475,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
local_save_flags(irq_flags);
buffer = global_trace.trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
- irq_flags, preempt_count());
+ irq_flags, pc);
if (!event)
return 0;
@@ -492,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
entry->buf[size] = '\0';
__buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, irq_flags, 4, pc);
return size;
}
@@ -509,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str)
struct bputs_entry *entry;
unsigned long irq_flags;
int size = sizeof(struct bputs_entry);
+ int pc;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ pc = preempt_count();
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -516,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str)
local_save_flags(irq_flags);
buffer = global_trace.trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
- irq_flags, preempt_count());
+ irq_flags, pc);
if (!event)
return 0;
@@ -525,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, irq_flags, 4, pc);
return 1;
}
@@ -809,7 +823,7 @@ static struct {
{ trace_clock_local, "local", 1 },
{ trace_clock_global, "global", 1 },
{ trace_clock_counter, "counter", 0 },
- { trace_clock_jiffies, "uptime", 1 },
+ { trace_clock_jiffies, "uptime", 0 },
{ trace_clock, "perf", 1 },
ARCH_TRACE_CLOCKS
};
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 26dc348332b7..57b67b1f24d1 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -59,13 +59,14 @@ u64 notrace trace_clock(void)
/*
* trace_jiffy_clock(): Simply use jiffies as a clock counter.
+ * Note that this use of jiffies_64 is not completely safe on
+ * 32-bit systems. But the window is tiny, and the effect if
+ * we are affected is that we will have an obviously bogus
+ * timestamp on a trace event - i.e. not life threatening.
*/
u64 notrace trace_clock_jiffies(void)
{
- u64 jiffy = jiffies - INITIAL_JIFFIES;
-
- /* Return nsecs */
- return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
+ return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
}
/*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f99e0b3bca8c..2de53628689f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
list_del(&file->list);
remove_subsystem(file->system);
+ free_event_filter(file->filter);
kmem_cache_free(file_cachep, file);
}
OpenPOWER on IntegriCloud