summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile15
-rw-r--r--kernel/acct.c125
-rw-r--r--kernel/audit.c34
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/auditfilter.c272
-rw-r--r--kernel/auditsc.c254
-rw-r--r--kernel/capability.c10
-rw-r--r--kernel/compat.c40
-rw-r--r--kernel/configs.c1
-rw-r--r--kernel/cpu.c219
-rw-r--r--kernel/cpuset.c179
-rw-r--r--kernel/delayacct.c162
-rw-r--r--kernel/exec_domain.c1
-rw-r--r--kernel/exit.c114
-rw-r--r--kernel/fork.c119
-rw-r--r--kernel/futex.c1095
-rw-r--r--kernel/futex_compat.c48
-rw-r--r--kernel/hrtimer.c47
-rw-r--r--kernel/irq/Makefile2
-rw-r--r--kernel/irq/autoprobe.c56
-rw-r--r--kernel/irq/chip.c533
-rw-r--r--kernel/irq/handle.c165
-rw-r--r--kernel/irq/internals.h46
-rw-r--r--kernel/irq/manage.c209
-rw-r--r--kernel/irq/migration.c20
-rw-r--r--kernel/irq/proc.c30
-rw-r--r--kernel/irq/resend.c77
-rw-r--r--kernel/irq/spurious.c37
-rw-r--r--kernel/kallsyms.c4
-rw-r--r--kernel/kexec.c14
-rw-r--r--kernel/kfifo.c28
-rw-r--r--kernel/kmod.c20
-rw-r--r--kernel/kprobes.c59
-rw-r--r--kernel/ksysfs.c1
-rw-r--r--kernel/kthread.c63
-rw-r--r--kernel/lockdep.c2724
-rw-r--r--kernel/lockdep_internals.h78
-rw-r--r--kernel/lockdep_proc.c345
-rw-r--r--kernel/module.c198
-rw-r--r--kernel/mutex-debug.c404
-rw-r--r--kernel/mutex-debug.h111
-rw-r--r--kernel/mutex.c95
-rw-r--r--kernel/mutex.h25
-rw-r--r--kernel/panic.c16
-rw-r--r--kernel/params.c16
-rw-r--r--kernel/pid.c18
-rw-r--r--kernel/posix-cpu-timers.c101
-rw-r--r--kernel/posix-timers.c21
-rw-r--r--kernel/power/Kconfig58
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/disk.c13
-rw-r--r--kernel/power/main.c44
-rw-r--r--kernel/power/pm.c37
-rw-r--r--kernel/power/power.h63
-rw-r--r--kernel/power/process.c26
-rw-r--r--kernel/power/smp.c62
-rw-r--r--kernel/power/snapshot.c1241
-rw-r--r--kernel/power/swap.c288
-rw-r--r--kernel/power/swsusp.c32
-rw-r--r--kernel/power/user.c17
-rw-r--r--kernel/printk.c87
-rw-r--r--kernel/profile.c19
-rw-r--r--kernel/ptrace.c84
-rw-r--r--kernel/rcupdate.c24
-rw-r--r--kernel/rcutorture.c201
-rw-r--r--kernel/relay.c38
-rw-r--r--kernel/resource.c128
-rw-r--r--kernel/rtmutex-debug.c242
-rw-r--r--kernel/rtmutex-debug.h33
-rw-r--r--kernel/rtmutex-tester.c441
-rw-r--r--kernel/rtmutex.c990
-rw-r--r--kernel/rtmutex.h26
-rw-r--r--kernel/rtmutex_common.h123
-rw-r--r--kernel/rwsem.c147
-rw-r--r--kernel/sched.c2086
-rw-r--r--kernel/signal.c79
-rw-r--r--kernel/softirq.c171
-rw-r--r--kernel/softlockup.c11
-rw-r--r--kernel/spinlock.c96
-rw-r--r--kernel/stacktrace.c24
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--kernel/sys.c45
-rw-r--r--kernel/sysctl.c218
-rw-r--r--kernel/taskstats.c564
-rw-r--r--kernel/time.c2
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/clocksource.c349
-rw-r--r--kernel/time/jiffies.c73
-rw-r--r--kernel/timer.c534
-rw-r--r--kernel/unwind.c941
-rw-r--r--kernel/wait.c9
-rw-r--r--kernel/workqueue.c182
92 files changed, 14786 insertions, 3322 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f6ef00f4f90f..d62ec66c1af2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,19 +8,30 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o
+ hrtimer.o rwsem.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += time/
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+obj-$(CONFIG_LOCKDEP) += lockdep.o
+ifeq ($(CONFIG_PROC_FS),y)
+obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
+endif
obj-$(CONFIG_FUTEX) += futex.o
ifeq ($(CONFIG_COMPAT),y)
obj-$(CONFIG_FUTEX) += futex_compat.o
endif
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_STACK_UNWIND) += unwind.o
obj-$(CONFIG_PM) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
@@ -37,6 +48,8 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 6802020e0ceb..f4330acead46 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -43,7 +43,6 @@
* a struct file opened for write. Fixed. 2/6/2000, AV.
*/
-#include <linux/config.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/acct.h>
@@ -75,7 +74,7 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
-static void do_acct_process(long, struct file *);
+static void do_acct_process(struct file *);
/*
* This structure is used so that all the data protected by lock
@@ -196,7 +195,7 @@ static void acct_file_reopen(struct file *file)
if (old_acct) {
mnt_unpin(old_acct->f_vfsmnt);
spin_unlock(&acct_globals.lock);
- do_acct_process(0, old_acct);
+ do_acct_process(old_acct);
filp_close(old_acct, NULL);
spin_lock(&acct_globals.lock);
}
@@ -419,16 +418,15 @@ static u32 encode_float(u64 value)
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
-static void do_acct_process(long exitcode, struct file *file)
+static void do_acct_process(struct file *file)
{
+ struct pacct_struct *pacct = &current->signal->pacct;
acct_t ac;
mm_segment_t fs;
- unsigned long vsize;
unsigned long flim;
u64 elapsed;
u64 run_time;
struct timespec uptime;
- unsigned long jiffies;
/*
* First check to see if there is enough free_space to continue
@@ -469,12 +467,6 @@ static void do_acct_process(long exitcode, struct file *file)
#endif
do_div(elapsed, AHZ);
ac.ac_btime = xtime.tv_sec - elapsed;
- jiffies = cputime_to_jiffies(cputime_add(current->utime,
- current->signal->utime));
- ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
- jiffies = cputime_to_jiffies(cputime_add(current->stime,
- current->signal->stime));
- ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
/* we really need to bite the bullet and change layout */
ac.ac_uid = current->uid;
ac.ac_gid = current->gid;
@@ -491,42 +483,27 @@ static void do_acct_process(long exitcode, struct file *file)
ac.ac_ppid = current->parent->tgid;
#endif
- read_lock(&tasklist_lock); /* pin current->signal */
+ mutex_lock(&tty_mutex);
+ /* FIXME: Whoever is responsible for current->signal locking needs
+ to use the same locking all over the kernel and document it */
+ read_lock(&tasklist_lock);
ac.ac_tty = current->signal->tty ?
old_encode_dev(tty_devnum(current->signal->tty)) : 0;
read_unlock(&tasklist_lock);
-
- ac.ac_flag = 0;
- if (current->flags & PF_FORKNOEXEC)
- ac.ac_flag |= AFORK;
- if (current->flags & PF_SUPERPRIV)
- ac.ac_flag |= ASU;
- if (current->flags & PF_DUMPCORE)
- ac.ac_flag |= ACORE;
- if (current->flags & PF_SIGNALED)
- ac.ac_flag |= AXSIG;
-
- vsize = 0;
- if (current->mm) {
- struct vm_area_struct *vma;
- down_read(&current->mm->mmap_sem);
- vma = current->mm->mmap;
- while (vma) {
- vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
- up_read(&current->mm->mmap_sem);
- }
- vsize = vsize / 1024;
- ac.ac_mem = encode_comp_t(vsize);
+ mutex_unlock(&tty_mutex);
+
+ spin_lock_irq(&current->sighand->siglock);
+ ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+ ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+ ac.ac_flag = pacct->ac_flag;
+ ac.ac_mem = encode_comp_t(pacct->ac_mem);
+ ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
+ ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
+ ac.ac_exitcode = pacct->ac_exitcode;
+ spin_unlock_irq(&current->sighand->siglock);
ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
- ac.ac_minflt = encode_comp_t(current->signal->min_flt +
- current->min_flt);
- ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
- current->maj_flt);
ac.ac_swaps = encode_comp_t(0);
- ac.ac_exitcode = exitcode;
/*
* Kernel segment override to datasegment and write it
@@ -546,12 +523,64 @@ static void do_acct_process(long exitcode, struct file *file)
}
/**
+ * acct_init_pacct - initialize a new pacct_struct
+ * @pacct: per-process accounting info struct to initialize
+ */
+void acct_init_pacct(struct pacct_struct *pacct)
+{
+ memset(pacct, 0, sizeof(struct pacct_struct));
+ pacct->ac_utime = pacct->ac_stime = cputime_zero;
+}
+
+/**
+ * acct_collect - collect accounting information into pacct_struct
+ * @exitcode: task exit code
+ * @group_dead: not 0, if this thread is the last one in the process.
+ */
+void acct_collect(long exitcode, int group_dead)
+{
+ struct pacct_struct *pacct = &current->signal->pacct;
+ unsigned long vsize = 0;
+
+ if (group_dead && current->mm) {
+ struct vm_area_struct *vma;
+ down_read(&current->mm->mmap_sem);
+ vma = current->mm->mmap;
+ while (vma) {
+ vsize += vma->vm_end - vma->vm_start;
+ vma = vma->vm_next;
+ }
+ up_read(&current->mm->mmap_sem);
+ }
+
+ spin_lock_irq(&current->sighand->siglock);
+ if (group_dead)
+ pacct->ac_mem = vsize / 1024;
+ if (thread_group_leader(current)) {
+ pacct->ac_exitcode = exitcode;
+ if (current->flags & PF_FORKNOEXEC)
+ pacct->ac_flag |= AFORK;
+ }
+ if (current->flags & PF_SUPERPRIV)
+ pacct->ac_flag |= ASU;
+ if (current->flags & PF_DUMPCORE)
+ pacct->ac_flag |= ACORE;
+ if (current->flags & PF_SIGNALED)
+ pacct->ac_flag |= AXSIG;
+ pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
+ pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+ pacct->ac_minflt += current->min_flt;
+ pacct->ac_majflt += current->maj_flt;
+ spin_unlock_irq(&current->sighand->siglock);
+}
+
+/**
* acct_process - now just a wrapper around do_acct_process
* @exitcode: task exit code
*
* handles process accounting for an exiting task
*/
-void acct_process(long exitcode)
+void acct_process(void)
{
struct file *file = NULL;
@@ -570,7 +599,7 @@ void acct_process(long exitcode)
get_file(file);
spin_unlock(&acct_globals.lock);
- do_acct_process(exitcode, file);
+ do_acct_process(file);
fput(file);
}
@@ -599,9 +628,7 @@ void acct_update_integrals(struct task_struct *tsk)
*/
void acct_clear_integrals(struct task_struct *tsk)
{
- if (tsk) {
- tsk->acct_stimexpd = 0;
- tsk->acct_rss_mem1 = 0;
- tsk->acct_vm_mem1 = 0;
- }
+ tsk->acct_stimexpd = 0;
+ tsk->acct_rss_mem1 = 0;
+ tsk->acct_vm_mem1 = 0;
}
diff --git a/kernel/audit.c b/kernel/audit.c
index 7dfac7031bd7..f9889ee77825 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -244,7 +244,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
char *ctx = NULL;
u32 len;
int rc;
- if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+ if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
return rc;
else
audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -267,7 +267,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
char *ctx = NULL;
u32 len;
int rc;
- if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+ if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
return rc;
else
audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -293,7 +293,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
char *ctx = NULL;
u32 len;
int rc;
- if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+ if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
return rc;
else
audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -321,7 +321,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
char *ctx = NULL;
u32 len;
int rc;
- if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+ if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
return rc;
else
audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -445,7 +445,7 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
* Check for appropriate CAP_AUDIT_ capabilities on incoming audit
* control messages.
*/
-static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
+static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
{
int err = 0;
@@ -459,13 +459,13 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
case AUDIT_DEL:
case AUDIT_DEL_RULE:
case AUDIT_SIGNAL_INFO:
- if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
+ if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
- if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
+ if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
err = -EPERM;
break;
default: /* bad msg */
@@ -488,7 +488,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
char *ctx;
u32 len;
- err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
+ err = audit_netlink_ok(skb, msg_type);
if (err)
return err;
@@ -538,7 +538,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (status_get->mask & AUDIT_STATUS_PID) {
int old = audit_pid;
if (sid) {
- if ((err = selinux_ctxid_to_string(
+ if ((err = selinux_sid_to_string(
sid, &ctx, &len)))
return err;
else
@@ -576,7 +576,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
"user pid=%d uid=%u auid=%u",
pid, uid, loginuid);
if (sid) {
- if (selinux_ctxid_to_string(
+ if (selinux_sid_to_string(
sid, &ctx, &len)) {
audit_log_format(ab,
" ssid=%u", sid);
@@ -614,7 +614,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
loginuid, sid);
break;
case AUDIT_SIGNAL_INFO:
- err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+ err = selinux_sid_to_string(audit_sig_sid, &ctx, &len);
if (err)
return err;
sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
@@ -690,9 +690,7 @@ static const struct inotify_operations audit_inotify_ops = {
/* Initialize audit support at boot time. */
static int __init audit_init(void)
{
-#ifdef CONFIG_AUDITSYSCALL
int i;
-#endif
printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
audit_default ? "enabled" : "disabled");
@@ -717,10 +715,10 @@ static int __init audit_init(void)
audit_ih = inotify_init(&audit_inotify_ops);
if (IS_ERR(audit_ih))
audit_panic("cannot initialize inotify handle");
+#endif
for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
INIT_LIST_HEAD(&audit_inode_hash[i]);
-#endif
return 0;
}
@@ -818,7 +816,7 @@ err:
*/
unsigned int audit_serial(void)
{
- static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+ static DEFINE_SPINLOCK(serial_lock);
static unsigned int serial = 0;
unsigned long flags;
@@ -1030,6 +1028,9 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
struct sk_buff *skb;
static const unsigned char *hex = "0123456789ABCDEF";
+ if (!ab)
+ return;
+
BUG_ON(!ab->skb);
skb = ab->skb;
avail = skb_tailroom(skb);
@@ -1062,6 +1063,9 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
unsigned char *ptr;
struct sk_buff *skb;
+ if (!ab)
+ return;
+
BUG_ON(!ab->skb);
skb = ab->skb;
avail = skb_tailroom(skb);
diff --git a/kernel/audit.h b/kernel/audit.h
index 8323e4132a33..a3370232a390 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -81,6 +81,7 @@ struct audit_krule {
u32 mask[AUDIT_BITMASK_SIZE];
u32 buflen; /* for data alloc on list rules */
u32 field_count;
+ char *filterkey; /* ties events to rules */
struct audit_field *fields;
struct audit_field *inode_f; /* quick access to an inode field */
struct audit_watch *watch; /* associated watch */
@@ -103,6 +104,7 @@ static inline int audit_hash_ino(u32 ino)
return (ino & (AUDIT_INODE_BUCKETS-1));
}
+extern int audit_match_class(int class, unsigned syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
extern int audit_compare_dname_path(const char *dname, const char *path,
int *dirlen);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4c99d2c586ed..1a58a81fb09d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -141,6 +141,7 @@ static inline void audit_free_rule(struct audit_entry *e)
selinux_audit_rule_free(f->se_rule);
}
kfree(e->rule.fields);
+ kfree(e->rule.filterkey);
kfree(e);
}
@@ -278,6 +279,38 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
return 0;
}
+static __u32 *classes[AUDIT_SYSCALL_CLASSES];
+
+int __init audit_register_class(int class, unsigned *list)
+{
+ __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ while (*list != ~0U) {
+ unsigned n = *list++;
+ if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) {
+ kfree(p);
+ return -EINVAL;
+ }
+ p[AUDIT_WORD(n)] |= AUDIT_BIT(n);
+ }
+ if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) {
+ kfree(p);
+ return -EINVAL;
+ }
+ classes[class] = p;
+ return 0;
+}
+
+int audit_match_class(int class, unsigned syscall)
+{
+ if (unlikely(syscall >= AUDIT_BITMASK_SIZE * sizeof(__u32)))
+ return 0;
+ if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class]))
+ return 0;
+ return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall);
+}
+
/* Common user-space to kernel rule translation. */
static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
{
@@ -321,6 +354,22 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
entry->rule.mask[i] = rule->mask[i];
+ for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) {
+ int bit = AUDIT_BITMASK_SIZE * 32 - i - 1;
+ __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)];
+ __u32 *class;
+
+ if (!(*p & AUDIT_BIT(bit)))
+ continue;
+ *p &= ~AUDIT_BIT(bit);
+ class = classes[i];
+ if (class) {
+ int j;
+ for (j = 0; j < AUDIT_BITMASK_SIZE; j++)
+ entry->rule.mask[j] |= class[j];
+ }
+ }
+
return entry;
exit_err:
@@ -364,6 +413,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
case AUDIT_PERS:
case AUDIT_ARCH:
case AUDIT_MSGTYPE:
+ case AUDIT_PPID:
case AUDIT_DEVMAJOR:
case AUDIT_DEVMINOR:
case AUDIT_EXIT:
@@ -373,6 +423,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
case AUDIT_ARG2:
case AUDIT_ARG3:
break;
+ case AUDIT_PERM:
+ if (f->val & ~15)
+ goto exit_free;
+ break;
case AUDIT_INODE:
err = audit_to_inode(&entry->rule, f);
if (err)
@@ -402,6 +456,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
case AUDIT_EQUAL:
break;
default:
+ err = -EINVAL;
goto exit_free;
}
}
@@ -469,11 +524,16 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_ARG2:
case AUDIT_ARG3:
break;
- case AUDIT_SE_USER:
- case AUDIT_SE_ROLE:
- case AUDIT_SE_TYPE:
- case AUDIT_SE_SEN:
- case AUDIT_SE_CLR:
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
str = audit_unpack_string(&bufp, &remain, f->val);
if (IS_ERR(str))
goto exit_free;
@@ -511,6 +571,20 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
if (err)
goto exit_free;
break;
+ case AUDIT_FILTERKEY:
+ err = -EINVAL;
+ if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
+ goto exit_free;
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str))
+ goto exit_free;
+ entry->rule.buflen += f->val;
+ entry->rule.filterkey = str;
+ break;
+ case AUDIT_PERM:
+ if (f->val & ~15)
+ goto exit_free;
+ break;
default:
goto exit_free;
}
@@ -524,6 +598,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_EQUAL:
break;
default:
+ err = -EINVAL;
goto exit_free;
}
}
@@ -600,11 +675,16 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->fields[i] = f->type;
data->fieldflags[i] = f->op;
switch(f->type) {
- case AUDIT_SE_USER:
- case AUDIT_SE_ROLE:
- case AUDIT_SE_TYPE:
- case AUDIT_SE_SEN:
- case AUDIT_SE_CLR:
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
data->buflen += data->values[i] =
audit_pack_string(&bufp, f->se_str);
break;
@@ -612,6 +692,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->buflen += data->values[i] =
audit_pack_string(&bufp, krule->watch->path);
break;
+ case AUDIT_FILTERKEY:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp, krule->filterkey);
+ break;
default:
data->values[i] = f->val;
}
@@ -639,11 +723,16 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
return 1;
switch(a->fields[i].type) {
- case AUDIT_SE_USER:
- case AUDIT_SE_ROLE:
- case AUDIT_SE_TYPE:
- case AUDIT_SE_SEN:
- case AUDIT_SE_CLR:
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
return 1;
break;
@@ -651,6 +740,11 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
if (strcmp(a->watch->path, b->watch->path))
return 1;
break;
+ case AUDIT_FILTERKEY:
+ /* both filterkeys exist based on above type compare */
+ if (strcmp(a->filterkey, b->filterkey))
+ return 1;
+ break;
default:
if (a->fields[i].val != b->fields[i].val)
return 1;
@@ -730,6 +824,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
u32 fcount = old->field_count;
struct audit_entry *entry;
struct audit_krule *new;
+ char *fk;
int i, err = 0;
entry = audit_init_entry(fcount);
@@ -753,13 +848,25 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
* the originals will all be freed when the old rule is freed. */
for (i = 0; i < fcount; i++) {
switch (new->fields[i].type) {
- case AUDIT_SE_USER:
- case AUDIT_SE_ROLE:
- case AUDIT_SE_TYPE:
- case AUDIT_SE_SEN:
- case AUDIT_SE_CLR:
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
err = audit_dupe_selinux_field(&new->fields[i],
&old->fields[i]);
+ break;
+ case AUDIT_FILTERKEY:
+ fk = kstrdup(old->filterkey, GFP_KERNEL);
+ if (unlikely(!fk))
+ err = -ENOMEM;
+ else
+ new->filterkey = fk;
}
if (err) {
audit_free_rule(entry);
@@ -824,7 +931,7 @@ static void audit_update_watch(struct audit_parent *parent,
}
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "audit updated rules specifying watch=");
+ audit_log_format(ab, "audit updated rules specifying path=");
audit_log_untrustedstring(ab, owatch->path);
audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
audit_log_end(ab);
@@ -847,19 +954,28 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
struct audit_watch *w, *nextw;
struct audit_krule *r, *nextr;
struct audit_entry *e;
+ struct audit_buffer *ab;
mutex_lock(&audit_filter_mutex);
parent->flags |= AUDIT_PARENT_INVALID;
list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, "audit implicitly removed rule path=");
+ audit_log_untrustedstring(ab, w->path);
+ if (r->filterkey) {
+ audit_log_format(ab, " key=");
+ audit_log_untrustedstring(ab, r->filterkey);
+ } else
+ audit_log_format(ab, " key=(null)");
+ audit_log_format(ab, " list=%d", r->listnr);
+ audit_log_end(ab);
+
list_del(&r->rlist);
list_del_rcu(&e->list);
call_rcu(&e->rcu, audit_free_rule_rcu);
-
- audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
- "audit implicitly removed rule from list=%d\n",
- AUDIT_FILTER_EXIT);
}
audit_remove_watch(w);
}
@@ -1047,6 +1163,14 @@ static inline int audit_add_rule(struct audit_entry *entry,
struct audit_watch *watch = entry->rule.watch;
struct nameidata *ndp, *ndw;
int h, err, putnd_needed = 0;
+#ifdef CONFIG_AUDITSYSCALL
+ int dont_count = 0;
+
+ /* If either of these, don't count towards total */
+ if (entry->rule.listnr == AUDIT_FILTER_USER ||
+ entry->rule.listnr == AUDIT_FILTER_TYPE)
+ dont_count = 1;
+#endif
if (inode_f) {
h = audit_hash_ino(inode_f->val);
@@ -1087,6 +1211,10 @@ static inline int audit_add_rule(struct audit_entry *entry,
} else {
list_add_tail_rcu(&entry->list, list);
}
+#ifdef CONFIG_AUDITSYSCALL
+ if (!dont_count)
+ audit_n_rules++;
+#endif
mutex_unlock(&audit_filter_mutex);
if (putnd_needed)
@@ -1111,6 +1239,14 @@ static inline int audit_del_rule(struct audit_entry *entry,
struct audit_watch *watch, *tmp_watch = entry->rule.watch;
LIST_HEAD(inotify_list);
int h, ret = 0;
+#ifdef CONFIG_AUDITSYSCALL
+ int dont_count = 0;
+
+ /* If either of these, don't count towards total */
+ if (entry->rule.listnr == AUDIT_FILTER_USER ||
+ entry->rule.listnr == AUDIT_FILTER_TYPE)
+ dont_count = 1;
+#endif
if (inode_f) {
h = audit_hash_ino(inode_f->val);
@@ -1148,6 +1284,10 @@ static inline int audit_del_rule(struct audit_entry *entry,
list_del_rcu(&e->list);
call_rcu(&e->rcu, audit_free_rule_rcu);
+#ifdef CONFIG_AUDITSYSCALL
+ if (!dont_count)
+ audit_n_rules--;
+#endif
mutex_unlock(&audit_filter_mutex);
if (!list_empty(&inotify_list))
@@ -1245,6 +1385,34 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
skb_queue_tail(q, skb);
}
+/* Log rule additions and removals */
+static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
+ struct audit_krule *rule, int res)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (!ab)
+ return;
+ audit_log_format(ab, "auid=%u", loginuid);
+ if (sid) {
+ char *ctx = NULL;
+ u32 len;
+ if (selinux_sid_to_string(sid, &ctx, &len))
+ audit_log_format(ab, " ssid=%u", sid);
+ else
+ audit_log_format(ab, " subj=%s", ctx);
+ kfree(ctx);
+ }
+ audit_log_format(ab, " %s rule key=", action);
+ if (rule->filterkey)
+ audit_log_untrustedstring(ab, rule->filterkey);
+ else
+ audit_log_format(ab, "(null)");
+ audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
+ audit_log_end(ab);
+}
+
/**
* audit_receive_filter - apply all rules to the specified message type
* @type: audit message type
@@ -1304,24 +1472,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
err = audit_add_rule(entry,
&audit_filter_list[entry->rule.listnr]);
-
- if (sid) {
- char *ctx = NULL;
- u32 len;
- if (selinux_ctxid_to_string(sid, &ctx, &len)) {
- /* Maybe call audit_panic? */
- audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
- "auid=%u ssid=%u add rule to list=%d res=%d",
- loginuid, sid, entry->rule.listnr, !err);
- } else
- audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
- "auid=%u subj=%s add rule to list=%d res=%d",
- loginuid, ctx, entry->rule.listnr, !err);
- kfree(ctx);
- } else
- audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
- "auid=%u add rule to list=%d res=%d",
- loginuid, entry->rule.listnr, !err);
+ audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err);
if (err)
audit_free_rule(entry);
@@ -1337,24 +1488,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
err = audit_del_rule(entry,
&audit_filter_list[entry->rule.listnr]);
-
- if (sid) {
- char *ctx = NULL;
- u32 len;
- if (selinux_ctxid_to_string(sid, &ctx, &len)) {
- /* Maybe call audit_panic? */
- audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
- "auid=%u ssid=%u remove rule from list=%d res=%d",
- loginuid, sid, entry->rule.listnr, !err);
- } else
- audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
- "auid=%u subj=%s remove rule from list=%d res=%d",
- loginuid, ctx, entry->rule.listnr, !err);
- kfree(ctx);
- } else
- audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
- "auid=%u remove rule from list=%d res=%d",
- loginuid, entry->rule.listnr, !err);
+ audit_log_rule_change(loginuid, sid, "remove", &entry->rule,
+ !err);
audit_free_rule(entry);
break;
@@ -1514,11 +1649,16 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule)
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
switch (f->type) {
- case AUDIT_SE_USER:
- case AUDIT_SE_ROLE:
- case AUDIT_SE_TYPE:
- case AUDIT_SE_SEN:
- case AUDIT_SE_CLR:
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
return 1;
}
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9ebd96fda295..105147631753 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -85,6 +85,9 @@ extern int audit_enabled;
/* Indicates that audit should log the full pathname. */
#define AUDIT_NAME_FULL -1
+/* number of audit rules */
+int audit_n_rules;
+
/* When fs/namei.c:getname() is called, we store the pointer in name and
* we don't let putname() free it (instead we free all of the saved
* pointers at syscall exit time).
@@ -174,6 +177,7 @@ struct audit_aux_data_path {
/* The per-task audit context. */
struct audit_context {
+ int dummy; /* must be the first element */
int in_syscall; /* 1 if task is in a syscall */
enum audit_state state;
unsigned int serial; /* serial number for record */
@@ -186,6 +190,7 @@ struct audit_context {
int auditable; /* 1 if record should be written */
int name_count;
struct audit_names names[AUDIT_NAMES];
+ char * filterkey; /* key for rule that triggered record */
struct dentry * pwd;
struct vfsmount * pwdmnt;
struct audit_context *previous; /* For nested syscalls */
@@ -204,6 +209,54 @@ struct audit_context {
#endif
};
+#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
+static inline int open_arg(int flags, int mask)
+{
+ int n = ACC_MODE(flags);
+ if (flags & (O_TRUNC | O_CREAT))
+ n |= AUDIT_PERM_WRITE;
+ return n & mask;
+}
+
+static int audit_match_perm(struct audit_context *ctx, int mask)
+{
+ unsigned n = ctx->major;
+ switch (audit_classify_syscall(ctx->arch, n)) {
+ case 0: /* native */
+ if ((mask & AUDIT_PERM_WRITE) &&
+ audit_match_class(AUDIT_CLASS_WRITE, n))
+ return 1;
+ if ((mask & AUDIT_PERM_READ) &&
+ audit_match_class(AUDIT_CLASS_READ, n))
+ return 1;
+ if ((mask & AUDIT_PERM_ATTR) &&
+ audit_match_class(AUDIT_CLASS_CHATTR, n))
+ return 1;
+ return 0;
+ case 1: /* 32bit on biarch */
+ if ((mask & AUDIT_PERM_WRITE) &&
+ audit_match_class(AUDIT_CLASS_WRITE_32, n))
+ return 1;
+ if ((mask & AUDIT_PERM_READ) &&
+ audit_match_class(AUDIT_CLASS_READ_32, n))
+ return 1;
+ if ((mask & AUDIT_PERM_ATTR) &&
+ audit_match_class(AUDIT_CLASS_CHATTR_32, n))
+ return 1;
+ return 0;
+ case 2: /* open */
+ return mask & ACC_MODE(ctx->argv[1]);
+ case 3: /* openat */
+ return mask & ACC_MODE(ctx->argv[2]);
+ case 4: /* socketcall */
+ return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
+ case 5: /* execve */
+ return mask & AUDIT_PERM_EXEC;
+ default:
+ return 0;
+ }
+}
+
/* Determine if any context name data matches a rule's watch data */
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
* otherwise. */
@@ -320,11 +373,11 @@ static int audit_filter_rules(struct task_struct *tsk,
if (ctx)
result = audit_comparator(ctx->loginuid, f->op, f->val);
break;
- case AUDIT_SE_USER:
- case AUDIT_SE_ROLE:
- case AUDIT_SE_TYPE:
- case AUDIT_SE_SEN:
- case AUDIT_SE_CLR:
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
/* NOTE: this may return negative values indicating
a temporary error. We simply treat this as a
match for now to avoid losing information that
@@ -332,7 +385,7 @@ static int audit_filter_rules(struct task_struct *tsk,
logged upon error */
if (f->se_rule) {
if (need_sid) {
- selinux_task_ctxid(tsk, &sid);
+ selinux_get_task_sid(tsk, &sid);
need_sid = 0;
}
result = selinux_audit_rule_match(sid, f->type,
@@ -341,6 +394,46 @@ static int audit_filter_rules(struct task_struct *tsk,
ctx);
}
break;
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR
+ also applies here */
+ if (f->se_rule) {
+ /* Find files that match */
+ if (name) {
+ result = selinux_audit_rule_match(
+ name->osid, f->type, f->op,
+ f->se_rule, ctx);
+ } else if (ctx) {
+ for (j = 0; j < ctx->name_count; j++) {
+ if (selinux_audit_rule_match(
+ ctx->names[j].osid,
+ f->type, f->op,
+ f->se_rule, ctx)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ /* Find ipc objects that match */
+ if (ctx) {
+ struct audit_aux_data *aux;
+ for (aux = ctx->aux; aux;
+ aux = aux->next) {
+ if (aux->type == AUDIT_IPC) {
+ struct audit_aux_data_ipcctl *axi = (void *)aux;
+ if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ }
+ }
+ break;
case AUDIT_ARG0:
case AUDIT_ARG1:
case AUDIT_ARG2:
@@ -348,11 +441,20 @@ static int audit_filter_rules(struct task_struct *tsk,
if (ctx)
result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
break;
+ case AUDIT_FILTERKEY:
+ /* ignore this field for filtering */
+ result = 1;
+ break;
+ case AUDIT_PERM:
+ result = audit_match_perm(ctx, f->val);
+ break;
}
if (!result)
return 0;
}
+ if (rule->filterkey)
+ ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
switch (rule->action) {
case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
@@ -467,7 +569,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
context->return_valid = return_valid;
context->return_code = return_code;
- if (context->in_syscall && !context->auditable) {
+ if (context->in_syscall && !context->dummy && !context->auditable) {
enum audit_state state;
state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
@@ -483,17 +585,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
}
get_context:
- context->pid = tsk->pid;
- context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
- context->uid = tsk->uid;
- context->gid = tsk->gid;
- context->euid = tsk->euid;
- context->suid = tsk->suid;
- context->fsuid = tsk->fsuid;
- context->egid = tsk->egid;
- context->sgid = tsk->sgid;
- context->fsgid = tsk->fsgid;
- context->personality = tsk->personality;
+
tsk->audit_context = NULL;
return context;
}
@@ -627,6 +719,7 @@ static inline void audit_free_context(struct audit_context *context)
}
audit_free_names(context);
audit_free_aux(context);
+ kfree(context->filterkey);
kfree(context);
context = previous;
} while (context);
@@ -658,8 +751,7 @@ static void audit_log_task_context(struct audit_buffer *ab)
return;
error_path:
- if (ctx)
- kfree(ctx);
+ kfree(ctx);
audit_panic("error in audit_log_task_context");
return;
}
@@ -702,6 +794,17 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
const char *tty;
/* tsk == current */
+ context->pid = tsk->pid;
+ context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
+ context->uid = tsk->uid;
+ context->gid = tsk->gid;
+ context->euid = tsk->euid;
+ context->suid = tsk->suid;
+ context->fsuid = tsk->fsuid;
+ context->egid = tsk->egid;
+ context->sgid = tsk->sgid;
+ context->fsgid = tsk->fsgid;
+ context->personality = tsk->personality;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
if (!ab)
@@ -714,6 +817,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_format(ab, " success=%s exit=%ld",
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
+
+ mutex_lock(&tty_mutex);
if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
tty = tsk->signal->tty->name;
else
@@ -735,7 +840,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
context->gid,
context->euid, context->suid, context->fsuid,
context->egid, context->sgid, context->fsgid, tty);
+
+ mutex_unlock(&tty_mutex);
+
audit_log_task_info(ab, tsk);
+ if (context->filterkey) {
+ audit_log_format(ab, " key=");
+ audit_log_untrustedstring(ab, context->filterkey);
+ } else
+ audit_log_format(ab, " key=(null)");
audit_log_end(ab);
for (aux = context->aux; aux; aux = aux->next) {
@@ -790,7 +903,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
if (axi->osid != 0) {
char *ctx = NULL;
u32 len;
- if (selinux_ctxid_to_string(
+ if (selinux_sid_to_string(
axi->osid, &ctx, &len)) {
audit_log_format(ab, " osid=%u",
axi->osid);
@@ -897,7 +1010,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
if (n->osid != 0) {
char *ctx = NULL;
u32 len;
- if (selinux_ctxid_to_string(
+ if (selinux_sid_to_string(
n->osid, &ctx, &len)) {
audit_log_format(ab, " osid=%u", n->osid);
call_panic = 2;
@@ -1014,7 +1127,8 @@ void audit_syscall_entry(int arch, int major,
context->argv[3] = a4;
state = context->state;
- if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
+ context->dummy = !audit_n_rules;
+ if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT))
state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
if (likely(state == AUDIT_DISABLED))
return;
@@ -1061,6 +1175,8 @@ void audit_syscall_exit(int valid, long return_code)
} else {
audit_free_names(context);
audit_free_aux(context);
+ kfree(context->filterkey);
+ context->filterkey = NULL;
tsk->audit_context = context;
}
}
@@ -1145,14 +1261,18 @@ void audit_putname(const char *name)
#endif
}
-static void audit_inode_context(int idx, const struct inode *inode)
+/* Copy inode data into an audit_names. */
+static void audit_copy_inode(struct audit_names *name, const struct inode *inode)
{
- struct audit_context *context = current->audit_context;
-
- selinux_get_inode_sid(inode, &context->names[idx].osid);
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ selinux_get_inode_sid(inode, &name->osid);
}
-
/**
* audit_inode - store the inode and device from a lookup
* @name: name being audited
@@ -1186,20 +1306,14 @@ void __audit_inode(const char *name, const struct inode *inode)
++context->ino_count;
#endif
}
- context->names[idx].ino = inode->i_ino;
- context->names[idx].dev = inode->i_sb->s_dev;
- context->names[idx].mode = inode->i_mode;
- context->names[idx].uid = inode->i_uid;
- context->names[idx].gid = inode->i_gid;
- context->names[idx].rdev = inode->i_rdev;
- audit_inode_context(idx, inode);
+ audit_copy_inode(&context->names[idx], inode);
}
/**
* audit_inode_child - collect inode info for created/removed objects
* @dname: inode's dentry name
* @inode: inode being audited
- * @pino: inode number of dentry parent
+ * @parent: inode of dentry parent
*
* For syscalls that create or remove filesystem objects, audit_inode
* can only collect information for the filesystem object's parent.
@@ -1210,7 +1324,7 @@ void __audit_inode(const char *name, const struct inode *inode)
* unsuccessful attempts.
*/
void __audit_inode_child(const char *dname, const struct inode *inode,
- unsigned long pino)
+ const struct inode *parent)
{
int idx;
struct audit_context *context = current->audit_context;
@@ -1224,7 +1338,7 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
if (!dname)
goto update_context;
for (idx = 0; idx < context->name_count; idx++)
- if (context->names[idx].ino == pino) {
+ if (context->names[idx].ino == parent->i_ino) {
const char *name = context->names[idx].name;
if (!name)
@@ -1248,16 +1362,47 @@ update_context:
context->names[idx].name_len = AUDIT_NAME_FULL;
context->names[idx].name_put = 0; /* don't call __putname() */
- if (inode) {
- context->names[idx].ino = inode->i_ino;
- context->names[idx].dev = inode->i_sb->s_dev;
- context->names[idx].mode = inode->i_mode;
- context->names[idx].uid = inode->i_uid;
- context->names[idx].gid = inode->i_gid;
- context->names[idx].rdev = inode->i_rdev;
- audit_inode_context(idx, inode);
- } else
- context->names[idx].ino = (unsigned long)-1;
+ if (!inode)
+ context->names[idx].ino = (unsigned long)-1;
+ else
+ audit_copy_inode(&context->names[idx], inode);
+
+ /* A parent was not found in audit_names, so copy the inode data for the
+ * provided parent. */
+ if (!found_name) {
+ idx = context->name_count++;
+#if AUDIT_DEBUG
+ context->ino_count++;
+#endif
+ audit_copy_inode(&context->names[idx], parent);
+ }
+}
+
+/**
+ * audit_inode_update - update inode info for last collected name
+ * @inode: inode being audited
+ *
+ * When open() is called on an existing object with the O_CREAT flag, the inode
+ * data audit initially collects is incorrect. This additional hook ensures
+ * audit has the inode data for the actual object to be opened.
+ */
+void __audit_inode_update(const struct inode *inode)
+{
+ struct audit_context *context = current->audit_context;
+ int idx;
+
+ if (!context->in_syscall || !inode)
+ return;
+
+ if (context->name_count == 0) {
+ context->name_count++;
+#if AUDIT_DEBUG
+ context->ino_count++;
+#endif
+ }
+ idx = context->name_count - 1;
+
+ audit_copy_inode(&context->names[idx], inode);
}
/**
@@ -1367,7 +1512,7 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
* @mqdes: MQ descriptor
* @msg_len: Message length
* @msg_prio: Message priority
- * @abs_timeout: Message timeout in absolute time
+ * @u_abs_timeout: Message timeout in absolute time
*
* Returns 0 for success or NULL context or < 0 on error.
*/
@@ -1409,8 +1554,8 @@ int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
* __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
* @mqdes: MQ descriptor
* @msg_len: Message length
- * @msg_prio: Message priority
- * @abs_timeout: Message timeout in absolute time
+ * @u_msg_prio: Message priority
+ * @u_abs_timeout: Message timeout in absolute time
*
* Returns 0 for success or NULL context or < 0 on error.
*/
@@ -1558,7 +1703,6 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
* @uid: msgq user id
* @gid: msgq group id
* @mode: msgq mode (permissions)
- * @ipcp: in-kernel IPC permissions
*
* Returns 0 for success or NULL context or < 0 on error.
*/
@@ -1589,7 +1733,7 @@ int audit_bprm(struct linux_binprm *bprm)
unsigned long p, next;
void *to;
- if (likely(!audit_enabled || !context))
+ if (likely(!audit_enabled || !context || context->dummy))
return 0;
ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
@@ -1627,7 +1771,7 @@ int audit_socketcall(int nargs, unsigned long *args)
struct audit_aux_data_socketcall *ax;
struct audit_context *context = current->audit_context;
- if (likely(!context))
+ if (likely(!context || context->dummy))
return 0;
ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
@@ -1655,7 +1799,7 @@ int audit_sockaddr(int len, void *a)
struct audit_aux_data_sockaddr *ax;
struct audit_context *context = current->audit_context;
- if (likely(!context))
+ if (likely(!context || context->dummy))
return 0;
ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL);
diff --git a/kernel/capability.c b/kernel/capability.c
index 1a4d8a40d3f9..edb845a6e84a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -46,7 +46,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
int ret = 0;
pid_t pid;
__u32 version;
- task_t *target;
+ struct task_struct *target;
struct __user_cap_data_struct data;
if (get_user(version, &header->version))
@@ -96,7 +96,7 @@ static inline int cap_set_pg(int pgrp, kernel_cap_t *effective,
kernel_cap_t *inheritable,
kernel_cap_t *permitted)
{
- task_t *g, *target;
+ struct task_struct *g, *target;
int ret = -EPERM;
int found = 0;
@@ -128,12 +128,12 @@ static inline int cap_set_all(kernel_cap_t *effective,
kernel_cap_t *inheritable,
kernel_cap_t *permitted)
{
- task_t *g, *target;
+ struct task_struct *g, *target;
int ret = -EPERM;
int found = 0;
do_each_thread(g, target) {
- if (target == current || target->pid == 1)
+ if (target == current || is_init(target))
continue;
found = 1;
if (security_capset_check(target, effective, inheritable,
@@ -172,7 +172,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
{
kernel_cap_t inheritable, permitted, effective;
__u32 version;
- task_t *target;
+ struct task_struct *target;
int ret;
pid_t pid;
diff --git a/kernel/compat.c b/kernel/compat.c
index 2f672332430f..75573e5d27b0 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -22,6 +22,7 @@
#include <linux/security.h>
#include <linux/timex.h>
#include <linux/migrate.h>
+#include <linux/posix-timers.h>
#include <asm/uaccess.h>
@@ -601,6 +602,30 @@ long compat_sys_clock_getres(clockid_t which_clock,
return err;
}
+static long compat_clock_nanosleep_restart(struct restart_block *restart)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec tu;
+ struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1);
+
+ restart->arg1 = (unsigned long) &tu;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = clock_nanosleep_restart(restart);
+ set_fs(oldfs);
+
+ if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+ put_compat_timespec(&tu, rmtp))
+ return -EFAULT;
+
+ if (err == -ERESTART_RESTARTBLOCK) {
+ restart->fn = compat_clock_nanosleep_restart;
+ restart->arg1 = (unsigned long) rmtp;
+ }
+ return err;
+}
+
long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
struct compat_timespec __user *rqtp,
struct compat_timespec __user *rmtp)
@@ -608,6 +633,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
long err;
mm_segment_t oldfs;
struct timespec in, out;
+ struct restart_block *restart;
if (get_compat_timespec(&in, rqtp))
return -EFAULT;
@@ -618,9 +644,16 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
(struct timespec __user *) &in,
(struct timespec __user *) &out);
set_fs(oldfs);
+
if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
put_compat_timespec(&out, rmtp))
return -EFAULT;
+
+ if (err == -ERESTART_RESTARTBLOCK) {
+ restart = &current_thread_info()->restart_block;
+ restart->fn = compat_clock_nanosleep_restart;
+ restart->arg1 = (unsigned long) rmtp;
+ }
return err;
}
@@ -730,17 +763,10 @@ void
sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
{
switch (_NSIG_WORDS) {
-#if defined (__COMPAT_ENDIAN_SWAP__)
- case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
- case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
- case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
- case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
-#else
case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
-#endif
}
}
diff --git a/kernel/configs.c b/kernel/configs.c
index 009e1ebdcb88..f9e31974f4ad 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -23,7 +23,6 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index fe2b8d0bfe4c..32c96628463e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -13,66 +13,66 @@
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
/* This protects CPUs going up and down... */
-static DECLARE_MUTEX(cpucontrol);
+static DEFINE_MUTEX(cpu_add_remove_lock);
+static DEFINE_MUTEX(cpu_bitmask_lock);
-static BLOCKING_NOTIFIER_HEAD(cpu_chain);
+static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
-#ifdef CONFIG_HOTPLUG_CPU
-static struct task_struct *lock_cpu_hotplug_owner;
-static int lock_cpu_hotplug_depth;
-
-static int __lock_cpu_hotplug(int interruptible)
-{
- int ret = 0;
+/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+ * Should always be manipulated under cpu_add_remove_lock
+ */
+static int cpu_hotplug_disabled;
- if (lock_cpu_hotplug_owner != current) {
- if (interruptible)
- ret = down_interruptible(&cpucontrol);
- else
- down(&cpucontrol);
- }
+#ifdef CONFIG_HOTPLUG_CPU
- /*
- * Set only if we succeed in locking
- */
- if (!ret) {
- lock_cpu_hotplug_depth++;
- lock_cpu_hotplug_owner = current;
- }
-
- return ret;
-}
+/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
+static struct task_struct *recursive;
+static int recursive_depth;
void lock_cpu_hotplug(void)
{
- __lock_cpu_hotplug(0);
+ struct task_struct *tsk = current;
+
+ if (tsk == recursive) {
+ static int warnings = 10;
+ if (warnings) {
+ printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
+ WARN_ON(1);
+ warnings--;
+ }
+ recursive_depth++;
+ return;
+ }
+ mutex_lock(&cpu_bitmask_lock);
+ recursive = tsk;
}
EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
void unlock_cpu_hotplug(void)
{
- if (--lock_cpu_hotplug_depth == 0) {
- lock_cpu_hotplug_owner = NULL;
- up(&cpucontrol);
+ WARN_ON(recursive != current);
+ if (recursive_depth) {
+ recursive_depth--;
+ return;
}
+ mutex_unlock(&cpu_bitmask_lock);
+ recursive = NULL;
}
EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
-int lock_cpu_hotplug_interruptible(void)
-{
- return __lock_cpu_hotplug(1);
-}
-EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
#endif /* CONFIG_HOTPLUG_CPU */
/* Need to know about CPUs going up/down? */
-int register_cpu_notifier(struct notifier_block *nb)
+int __cpuinit register_cpu_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&cpu_chain, nb);
}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
EXPORT_SYMBOL(register_cpu_notifier);
void unregister_cpu_notifier(struct notifier_block *nb)
@@ -81,7 +81,6 @@ void unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_cpu_notifier);
-#ifdef CONFIG_HOTPLUG_CPU
static inline void check_for_tasks(int cpu)
{
struct task_struct *p;
@@ -114,32 +113,25 @@ static int take_cpu_down(void *unused)
return 0;
}
-int cpu_down(unsigned int cpu)
+/* Requires cpu_add_remove_lock to be held */
+static int _cpu_down(unsigned int cpu)
{
int err;
struct task_struct *p;
cpumask_t old_allowed, tmp;
- if ((err = lock_cpu_hotplug_interruptible()) != 0)
- return err;
+ if (num_online_cpus() == 1)
+ return -EBUSY;
- if (num_online_cpus() == 1) {
- err = -EBUSY;
- goto out;
- }
-
- if (!cpu_online(cpu)) {
- err = -EINVAL;
- goto out;
- }
+ if (!cpu_online(cpu))
+ return -EINVAL;
err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
(void *)(long)cpu);
if (err == NOTIFY_BAD) {
printk("%s: attempt to take down CPU %u failed\n",
__FUNCTION__, cpu);
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
/* Ensure that we are not runnable on dying cpu */
@@ -148,7 +140,10 @@ int cpu_down(unsigned int cpu)
cpu_clear(cpu, tmp);
set_cpus_allowed(current, tmp);
+ mutex_lock(&cpu_bitmask_lock);
p = __stop_machine_run(take_cpu_down, NULL, cpu);
+ mutex_unlock(&cpu_bitmask_lock);
+
if (IS_ERR(p)) {
/* CPU didn't die: tell everyone. Can't complain. */
if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
@@ -184,24 +179,32 @@ out_thread:
err = kthread_stop(p);
out_allowed:
set_cpus_allowed(current, old_allowed);
-out:
- unlock_cpu_hotplug();
+ return err;
+}
+
+int cpu_down(unsigned int cpu)
+{
+ int err = 0;
+
+ mutex_lock(&cpu_add_remove_lock);
+ if (cpu_hotplug_disabled)
+ err = -EBUSY;
+ else
+ err = _cpu_down(cpu);
+
+ mutex_unlock(&cpu_add_remove_lock);
return err;
}
#endif /*CONFIG_HOTPLUG_CPU*/
-int __devinit cpu_up(unsigned int cpu)
+/* Requires cpu_add_remove_lock to be held */
+static int __devinit _cpu_up(unsigned int cpu)
{
int ret;
void *hcpu = (void *)(long)cpu;
- if ((ret = lock_cpu_hotplug_interruptible()) != 0)
- return ret;
-
- if (cpu_online(cpu) || !cpu_present(cpu)) {
- ret = -EINVAL;
- goto out;
- }
+ if (cpu_online(cpu) || !cpu_present(cpu))
+ return -EINVAL;
ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
if (ret == NOTIFY_BAD) {
@@ -212,7 +215,9 @@ int __devinit cpu_up(unsigned int cpu)
}
/* Arch-specific enabling code. */
+ mutex_lock(&cpu_bitmask_lock);
ret = __cpu_up(cpu);
+ mutex_unlock(&cpu_bitmask_lock);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
@@ -224,7 +229,95 @@ out_notify:
if (ret != 0)
blocking_notifier_call_chain(&cpu_chain,
CPU_UP_CANCELED, hcpu);
-out:
- unlock_cpu_hotplug();
+
return ret;
}
+
+int __devinit cpu_up(unsigned int cpu)
+{
+ int err = 0;
+
+ mutex_lock(&cpu_add_remove_lock);
+ if (cpu_hotplug_disabled)
+ err = -EBUSY;
+ else
+ err = _cpu_up(cpu);
+
+ mutex_unlock(&cpu_add_remove_lock);
+ return err;
+}
+
+#ifdef CONFIG_SUSPEND_SMP
+static cpumask_t frozen_cpus;
+
+int disable_nonboot_cpus(void)
+{
+ int cpu, first_cpu, error;
+
+ mutex_lock(&cpu_add_remove_lock);
+ first_cpu = first_cpu(cpu_present_map);
+ if (!cpu_online(first_cpu)) {
+ error = _cpu_up(first_cpu);
+ if (error) {
+ printk(KERN_ERR "Could not bring CPU%d up.\n",
+ first_cpu);
+ goto out;
+ }
+ }
+ error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
+ if (error) {
+ printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
+ goto out;
+ }
+ /* We take down all of the non-boot CPUs in one shot to avoid races
+ * with the userspace trying to use the CPU hotplug at the same time
+ */
+ cpus_clear(frozen_cpus);
+ printk("Disabling non-boot CPUs ...\n");
+ for_each_online_cpu(cpu) {
+ if (cpu == first_cpu)
+ continue;
+ error = _cpu_down(cpu);
+ if (!error) {
+ cpu_set(cpu, frozen_cpus);
+ printk("CPU%d is down\n", cpu);
+ } else {
+ printk(KERN_ERR "Error taking CPU%d down: %d\n",
+ cpu, error);
+ break;
+ }
+ }
+ if (!error) {
+ BUG_ON(num_online_cpus() > 1);
+ /* Make sure the CPUs won't be enabled by someone else */
+ cpu_hotplug_disabled = 1;
+ } else {
+ printk(KERN_ERR "Non-boot CPUs are not disabled");
+ }
+out:
+ mutex_unlock(&cpu_add_remove_lock);
+ return error;
+}
+
+void enable_nonboot_cpus(void)
+{
+ int cpu, error;
+
+ /* Allow everyone to use the CPU hotplug again */
+ mutex_lock(&cpu_add_remove_lock);
+ cpu_hotplug_disabled = 0;
+ mutex_unlock(&cpu_add_remove_lock);
+
+ printk("Enabling non-boot CPUs ...\n");
+ for_each_cpu_mask(cpu, frozen_cpus) {
+ error = cpu_up(cpu);
+ if (!error) {
+ printk("CPU%d is up\n", cpu);
+ continue;
+ }
+ printk(KERN_WARNING "Error taking CPU%d up: %d\n",
+ cpu, error);
+ }
+ cpus_clear(frozen_cpus);
+}
+#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b602f73fb38d..8c3c400cce91 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -18,7 +18,6 @@
* distribution for more details.
*/
-#include <linux/config.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
@@ -241,7 +240,7 @@ static struct super_block *cpuset_sb;
* A cpuset can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cpusets is empty. Since all
* tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
* always has either children cpusets and/or using tasks. So we don't
* need a special hack to ensure that top_cpuset cannot be deleted.
*
@@ -290,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode)
inode->i_mode = mode;
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
- inode->i_blksize = PAGE_CACHE_SIZE;
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
@@ -763,6 +761,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
*
* Call with manage_mutex held. May nest a call to the
* lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * Must not be called holding callback_mutex, because we must
+ * not call lock_cpu_hotplug() while holding callback_mutex.
*/
static void update_cpu_domains(struct cpuset *cur)
@@ -782,7 +782,7 @@ static void update_cpu_domains(struct cpuset *cur)
if (is_cpu_exclusive(c))
cpus_andnot(pspan, pspan, c->cpus_allowed);
}
- if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+ if (!is_cpu_exclusive(cur)) {
cpus_or(pspan, pspan, cur->cpus_allowed);
if (cpus_equal(pspan, cur->cpus_allowed))
return;
@@ -815,6 +815,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
struct cpuset trialcs;
int retval, cpus_unchanged;
+ /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
trialcs = *cs;
retval = cpulist_parse(buf, trialcs.cpus_allowed);
if (retval < 0)
@@ -908,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
int fudge;
int retval;
+ /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
trialcs = *cs;
retval = nodelist_parse(buf, trialcs.mems_allowed);
if (retval < 0)
@@ -1064,7 +1072,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
}
/*
- * Frequency meter - How fast is some event occuring?
+ * Frequency meter - How fast is some event occurring?
*
* These routines manage a digitally filtered, constant time based,
* event frequency meter. There are four routines:
@@ -1217,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
task_lock(tsk);
oldcs = tsk->cpuset;
- if (!oldcs) {
+ /*
+ * After getting 'oldcs' cpuset ptr, be sure still not exiting.
+ * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
+ * then fail this attach_task(), to avoid breaking top_cpuset.count.
+ */
+ if (tsk->flags & PF_EXITING) {
task_unlock(tsk);
mutex_unlock(&callback_mutex);
put_task_struct(tsk);
@@ -1918,6 +1931,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
}
+/*
+ * Locking note on the strange update_flag() call below:
+ *
+ * If the cpuset being removed is marked cpu_exclusive, then simulate
+ * turning cpu_exclusive off, which will call update_cpu_domains().
+ * The lock_cpu_hotplug() call in update_cpu_domains() must not be
+ * made while holding callback_mutex. Elsewhere the kernel nests
+ * callback_mutex inside lock_cpu_hotplug() calls. So the reverse
+ * nesting would risk an ABBA deadlock.
+ */
+
static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cpuset *cs = dentry->d_fsdata;
@@ -1937,11 +1961,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
mutex_unlock(&manage_mutex);
return -EBUSY;
}
+ if (is_cpu_exclusive(cs)) {
+ int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
+ if (retval < 0) {
+ mutex_unlock(&manage_mutex);
+ return retval;
+ }
+ }
parent = cs->parent;
mutex_lock(&callback_mutex);
set_bit(CS_REMOVED, &cs->flags);
- if (is_cpu_exclusive(cs))
- update_cpu_domains(cs);
list_del(&cs->sibling); /* delete my sibling from parent->children */
spin_lock(&cs->dentry->d_lock);
d = dget(cs->dentry);
@@ -2016,6 +2045,104 @@ out:
return err;
}
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then the guarantee_online_cpus()
+ * or guarantee_online_mems() code will use that emptied cpusets
+ * parent online CPUs or nodes. Cpusets that were already empty of
+ * CPUs or nodes are left empty.
+ *
+ * This routine is intentionally inefficient in a couple of regards.
+ * It will check all cpusets in a subtree even if the top cpuset of
+ * the subtree has no offline CPUs or nodes. It checks both CPUs and
+ * nodes, even though the caller could have been coded to know that
+ * only one of CPUs or nodes needed to be checked on a given call.
+ * This was done to minimize text size rather than cpu cycles.
+ *
+ * Call with both manage_mutex and callback_mutex held.
+ *
+ * Recursive, on depth of cpuset subtree.
+ */
+
+static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+{
+ struct cpuset *c;
+
+ /* Each of our child cpusets mems must be online */
+ list_for_each_entry(c, &cur->children, sibling) {
+ guarantee_online_cpus_mems_in_subtree(c);
+ if (!cpus_empty(c->cpus_allowed))
+ guarantee_online_cpus(c, &c->cpus_allowed);
+ if (!nodes_empty(c->mems_allowed))
+ guarantee_online_mems(c, &c->mems_allowed);
+ }
+}
+
+/*
+ * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
+ * cpu_online_map and node_online_map. Force the top cpuset to track
+ * whats online after any CPU or memory node hotplug or unplug event.
+ *
+ * To ensure that we don't remove a CPU or node from the top cpuset
+ * that is currently in use by a child cpuset (which would violate
+ * the rule that cpusets must be subsets of their parent), we first
+ * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ *
+ * Since there are two callers of this routine, one for CPU hotplug
+ * events and one for memory node hotplug events, we could have coded
+ * two separate routines here. We code it as a single common routine
+ * in order to minimize text size.
+ */
+
+static void common_cpu_mem_hotplug_unplug(void)
+{
+ mutex_lock(&manage_mutex);
+ mutex_lock(&callback_mutex);
+
+ guarantee_online_cpus_mems_in_subtree(&top_cpuset);
+ top_cpuset.cpus_allowed = cpu_online_map;
+ top_cpuset.mems_allowed = node_online_map;
+
+ mutex_unlock(&callback_mutex);
+ mutex_unlock(&manage_mutex);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period. This is necessary in order to make cpusets transparent
+ * (of no affect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
+ */
+
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+ unsigned long phase, void *cpu)
+{
+ common_cpu_mem_hotplug_unplug();
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+void cpuset_track_online_nodes()
+{
+ common_cpu_mem_hotplug_unplug();
+}
+#endif
+
/**
* cpuset_init_smp - initialize cpus_allowed
*
@@ -2026,6 +2153,8 @@ void __init cpuset_init_smp(void)
{
top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_online_map;
+
+ hotcpu_notifier(cpuset_handle_cpuhp, 0);
}
/**
@@ -2195,7 +2324,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
int i;
for (i = 0; zl->zones[i]; i++) {
- int nid = zl->zones[i]->zone_pgdat->node_id;
+ int nid = zone_to_nid(zl->zones[i]);
if (node_isset(nid, current->mems_allowed))
return 1;
@@ -2266,9 +2395,9 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
const struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
- if (in_interrupt())
+ if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- node = z->zone_pgdat->node_id;
+ node = zone_to_nid(z);
might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
@@ -2370,7 +2499,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
int cpuset_excl_nodes_overlap(const struct task_struct *p)
{
const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
- int overlap = 0; /* do cpusets overlap? */
+ int overlap = 1; /* do cpusets overlap? */
task_lock(current);
if (current->flags & PF_EXITING) {
@@ -2442,31 +2571,43 @@ void __cpuset_memory_pressure_bump(void)
*/
static int proc_cpuset_show(struct seq_file *m, void *v)
{
+ struct pid *pid;
struct task_struct *tsk;
char *buf;
- int retval = 0;
+ int retval;
+ retval = -ENOMEM;
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!buf)
- return -ENOMEM;
+ goto out;
+
+ retval = -ESRCH;
+ pid = m->private;
+ tsk = get_pid_task(pid, PIDTYPE_PID);
+ if (!tsk)
+ goto out_free;
- tsk = m->private;
+ retval = -EINVAL;
mutex_lock(&manage_mutex);
+
retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
if (retval < 0)
- goto out;
+ goto out_unlock;
seq_puts(m, buf);
seq_putc(m, '\n');
-out:
+out_unlock:
mutex_unlock(&manage_mutex);
+ put_task_struct(tsk);
+out_free:
kfree(buf);
+out:
return retval;
}
static int cpuset_open(struct inode *inode, struct file *file)
{
- struct task_struct *tsk = PROC_I(inode)->task;
- return single_open(file, proc_cpuset_show, tsk);
+ struct pid *pid = PROC_I(inode)->pid;
+ return single_open(file, proc_cpuset_show, pid);
}
struct file_operations proc_cpuset_operations = {
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
new file mode 100644
index 000000000000..36752f124c6a
--- /dev/null
+++ b/kernel/delayacct.c
@@ -0,0 +1,162 @@
+/* delayacct.c - per-task delay accounting
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sysctl.h>
+#include <linux/delayacct.h>
+
+int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
+kmem_cache_t *delayacct_cache;
+
+static int __init delayacct_setup_disable(char *str)
+{
+ delayacct_on = 0;
+ return 1;
+}
+__setup("nodelayacct", delayacct_setup_disable);
+
+void delayacct_init(void)
+{
+ delayacct_cache = kmem_cache_create("delayacct_cache",
+ sizeof(struct task_delay_info),
+ 0,
+ SLAB_PANIC,
+ NULL, NULL);
+ delayacct_tsk_init(&init_task);
+}
+
+void __delayacct_tsk_init(struct task_struct *tsk)
+{
+ tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
+ if (tsk->delays)
+ spin_lock_init(&tsk->delays->lock);
+}
+
+/*
+ * Start accounting for a delay statistic using
+ * its starting timestamp (@start)
+ */
+
+static inline void delayacct_start(struct timespec *start)
+{
+ do_posix_clock_monotonic_gettime(start);
+}
+
+/*
+ * Finish delay accounting for a statistic using
+ * its timestamps (@start, @end), accumalator (@total) and @count
+ */
+
+static void delayacct_end(struct timespec *start, struct timespec *end,
+ u64 *total, u32 *count)
+{
+ struct timespec ts;
+ s64 ns;
+
+ do_posix_clock_monotonic_gettime(end);
+ ts = timespec_sub(*end, *start);
+ ns = timespec_to_ns(&ts);
+ if (ns < 0)
+ return;
+
+ spin_lock(&current->delays->lock);
+ *total += ns;
+ (*count)++;
+ spin_unlock(&current->delays->lock);
+}
+
+void __delayacct_blkio_start(void)
+{
+ delayacct_start(&current->delays->blkio_start);
+}
+
+void __delayacct_blkio_end(void)
+{
+ if (current->delays->flags & DELAYACCT_PF_SWAPIN)
+ /* Swapin block I/O */
+ delayacct_end(&current->delays->blkio_start,
+ &current->delays->blkio_end,
+ &current->delays->swapin_delay,
+ &current->delays->swapin_count);
+ else /* Other block I/O */
+ delayacct_end(&current->delays->blkio_start,
+ &current->delays->blkio_end,
+ &current->delays->blkio_delay,
+ &current->delays->blkio_count);
+}
+
+int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+{
+ s64 tmp;
+ struct timespec ts;
+ unsigned long t1,t2,t3;
+
+ /* Though tsk->delays accessed later, early exit avoids
+ * unnecessary returning of other data
+ */
+ if (!tsk->delays)
+ goto done;
+
+ tmp = (s64)d->cpu_run_real_total;
+ cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+ tmp += timespec_to_ns(&ts);
+ d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
+
+ /*
+ * No locking available for sched_info (and too expensive to add one)
+ * Mitigate by taking snapshot of values
+ */
+ t1 = tsk->sched_info.pcnt;
+ t2 = tsk->sched_info.run_delay;
+ t3 = tsk->sched_info.cpu_time;
+
+ d->cpu_count += t1;
+
+ jiffies_to_timespec(t2, &ts);
+ tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
+ d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
+
+ tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000;
+ d->cpu_run_virtual_total =
+ (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
+
+ /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
+
+ spin_lock(&tsk->delays->lock);
+ tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
+ d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
+ tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
+ d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+ d->blkio_count += tsk->delays->blkio_count;
+ d->swapin_count += tsk->delays->swapin_count;
+ spin_unlock(&tsk->delays->lock);
+
+done:
+ return 0;
+}
+
+__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
+{
+ __u64 ret;
+
+ spin_lock(&tsk->delays->lock);
+ ret = nsec_to_clock_t(tsk->delays->blkio_delay +
+ tsk->delays->swapin_delay);
+ spin_unlock(&tsk->delays->lock);
+ return ret;
+}
+
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c01cead2cfd6..3c2eaea66b1e 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -7,7 +7,6 @@
* 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
*/
-#include <linux/config.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index a3baf92462bd..2e4c13cba95a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -4,7 +4,6 @@
* Copyright (C) 1991, 1992 Linus Torvalds
*/
-#include <linux/config.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
@@ -26,6 +25,8 @@
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/mempolicy.h>
+#include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
#include <linux/cpuset.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
@@ -36,6 +37,7 @@
#include <linux/compat.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
+#include <linux/resource.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -45,8 +47,6 @@
extern void sem_exit (void);
extern struct task_struct *child_reaper;
-int getrusage(struct task_struct *, int, struct rusage __user *);
-
static void exit_mm(struct task_struct * tsk);
static void __unhash_process(struct task_struct *p)
@@ -136,14 +136,10 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
void release_task(struct task_struct * p)
{
+ struct task_struct *leader;
int zap_leader;
- task_t *leader;
- struct dentry *proc_dentry;
-
repeat:
atomic_dec(&p->user->processes);
- spin_lock(&p->proc_lock);
- proc_dentry = proc_pid_unhash(p);
write_lock_irq(&tasklist_lock);
ptrace_unlink(p);
BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
@@ -172,8 +168,7 @@ repeat:
sched_exit(p);
write_unlock_irq(&tasklist_lock);
- spin_unlock(&p->proc_lock);
- proc_pid_flush(proc_dentry);
+ proc_flush_task(p);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
@@ -216,7 +211,7 @@ out:
*
* "I ask you, have you ever known what it is to be an orphan?"
*/
-static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
+static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
{
struct task_struct *p;
int ret = 1;
@@ -224,7 +219,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
if (p == ignored_task
|| p->exit_state
- || p->real_parent->pid == 1)
+ || is_init(p->real_parent))
continue;
if (process_group(p->real_parent) != pgrp
&& p->real_parent->signal->session == p->signal->session) {
@@ -254,17 +249,6 @@ static int has_stopped_jobs(int pgrp)
do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
if (p->state != TASK_STOPPED)
continue;
-
- /* If p is stopped by a debugger on a signal that won't
- stop it, then don't count p as stopped. This isn't
- perfect but it's a good approximation. */
- if (unlikely (p->ptrace)
- && p->exit_code != SIGSTOP
- && p->exit_code != SIGTSTP
- && p->exit_code != SIGTTOU
- && p->exit_code != SIGTTIN)
- continue;
-
retval = 1;
break;
} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
@@ -297,9 +281,7 @@ static void reparent_to_init(void)
/* Set the exit signal to SIGCHLD so we signal init on exit */
current->exit_signal = SIGCHLD;
- if ((current->policy == SCHED_NORMAL ||
- current->policy == SCHED_BATCH)
- && (task_nice(current) < 0))
+ if (!has_rt_policy(current) && (task_nice(current) < 0))
set_user_nice(current, 0);
/* cpus_allowed? */
/* rt_priority? */
@@ -492,6 +474,18 @@ void fastcall put_files_struct(struct files_struct *files)
EXPORT_SYMBOL(put_files_struct);
+void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
+{
+ struct files_struct *old;
+
+ old = tsk->files;
+ task_lock(tsk);
+ tsk->files = files;
+ task_unlock(tsk);
+ put_files_struct(old);
+}
+EXPORT_SYMBOL(reset_files_struct);
+
static inline void __exit_files(struct task_struct *tsk)
{
struct files_struct * files = tsk->files;
@@ -589,7 +583,8 @@ static void exit_mm(struct task_struct * tsk)
mmput(mm);
}
-static inline void choose_new_parent(task_t *p, task_t *reaper)
+static inline void
+choose_new_parent(struct task_struct *p, struct task_struct *reaper)
{
/*
* Make sure we're not reparenting to ourselves and that
@@ -599,7 +594,8 @@ static inline void choose_new_parent(task_t *p, task_t *reaper)
p->real_parent = reaper;
}
-static void reparent_thread(task_t *p, task_t *father, int traced)
+static void
+reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
{
/* We don't want people slaying init. */
if (p->exit_signal != -1)
@@ -663,8 +659,8 @@ static void reparent_thread(task_t *p, task_t *father, int traced)
* group, and if no such member exists, give it to
* the global child reaper process (ie "init")
*/
-static void forget_original_parent(struct task_struct * father,
- struct list_head *to_release)
+static void
+forget_original_parent(struct task_struct *father, struct list_head *to_release)
{
struct task_struct *p, *reaper = father;
struct list_head *_p, *_n;
@@ -687,7 +683,7 @@ static void forget_original_parent(struct task_struct * father,
*/
list_for_each_safe(_p, _n, &father->children) {
int ptrace;
- p = list_entry(_p,struct task_struct,sibling);
+ p = list_entry(_p, struct task_struct, sibling);
ptrace = p->ptrace;
@@ -716,7 +712,7 @@ static void forget_original_parent(struct task_struct * father,
list_add(&p->ptrace_list, to_release);
}
list_for_each_safe(_p, _n, &father->ptrace_children) {
- p = list_entry(_p,struct task_struct,ptrace_list);
+ p = list_entry(_p, struct task_struct, ptrace_list);
choose_new_parent(p, reaper);
reparent_thread(p, father, 1);
}
@@ -836,7 +832,7 @@ static void exit_notify(struct task_struct *tsk)
list_for_each_safe(_p, _n, &ptrace_dead) {
list_del_init(_p);
- t = list_entry(_p,struct task_struct,ptrace_list);
+ t = list_entry(_p, struct task_struct, ptrace_list);
release_task(t);
}
@@ -848,7 +844,9 @@ static void exit_notify(struct task_struct *tsk)
fastcall NORET_TYPE void do_exit(long code)
{
struct task_struct *tsk = current;
+ struct taskstats *tidstats;
int group_dead;
+ unsigned int mycpu;
profile_task_exit(tsk);
@@ -886,6 +884,8 @@ fastcall NORET_TYPE void do_exit(long code)
current->comm, current->pid,
preempt_count());
+ taskstats_exit_alloc(&tidstats, &mycpu);
+
acct_update_integrals(tsk);
if (tsk->mm) {
update_hiwater_rss(tsk->mm);
@@ -895,18 +895,23 @@ fastcall NORET_TYPE void do_exit(long code)
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
- acct_process(code);
}
+ acct_collect(code, group_dead);
if (unlikely(tsk->robust_list))
exit_robust_list(tsk);
-#ifdef CONFIG_COMPAT
+#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
if (unlikely(tsk->compat_robust_list))
compat_exit_robust_list(tsk);
#endif
if (unlikely(tsk->audit_context))
audit_free(tsk);
+ taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
+ taskstats_exit_free(tidstats);
+
exit_mm(tsk);
+ if (group_dead)
+ acct_process();
exit_sem(tsk);
__exit_files(tsk);
__exit_fs(tsk);
@@ -930,9 +935,17 @@ fastcall NORET_TYPE void do_exit(long code)
tsk->mempolicy = NULL;
#endif
/*
- * If DEBUG_MUTEXES is on, make sure we are holding no locks:
+ * This must happen late, after the PID is not
+ * hashed anymore:
*/
- mutex_debug_check_no_locks_held(tsk);
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+ if (unlikely(current->pi_state_cache))
+ kfree(current->pi_state_cache);
+ /*
+ * Make sure we are holding no locks:
+ */
+ debug_check_no_locks_held(tsk);
if (tsk->io_context)
exit_io_context();
@@ -940,15 +953,15 @@ fastcall NORET_TYPE void do_exit(long code)
if (tsk->splice_pipe)
__free_pipe_info(tsk->splice_pipe);
- /* PF_DEAD causes final put_task_struct after we schedule. */
preempt_disable();
- BUG_ON(tsk->flags & PF_DEAD);
- tsk->flags |= PF_DEAD;
+ /* causes final put_task_struct in finish_task_switch(). */
+ tsk->state = TASK_DEAD;
schedule();
BUG();
/* Avoid "noreturn function does return". */
- for (;;) ;
+ for (;;)
+ cpu_relax(); /* For when BUG is null */
}
EXPORT_SYMBOL_GPL(do_exit);
@@ -957,7 +970,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
{
if (comp)
complete(comp);
-
+
do_exit(code);
}
@@ -1007,7 +1020,7 @@ asmlinkage void sys_exit_group(int error_code)
do_group_exit((error_code & 0xff) << 8);
}
-static int eligible_child(pid_t pid, int options, task_t *p)
+static int eligible_child(pid_t pid, int options, struct task_struct *p)
{
if (pid > 0) {
if (p->pid != pid)
@@ -1039,7 +1052,7 @@ static int eligible_child(pid_t pid, int options, task_t *p)
* Do not consider thread group leaders that are
* in a non-empty thread group:
*/
- if (current->tgid != p->tgid && delay_group_leader(p))
+ if (delay_group_leader(p))
return 2;
if (security_task_wait(p))
@@ -1048,12 +1061,13 @@ static int eligible_child(pid_t pid, int options, task_t *p)
return 1;
}
-static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,
+static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
int why, int status,
struct siginfo __user *infop,
struct rusage __user *rusagep)
{
int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
+
put_task_struct(p);
if (!retval)
retval = put_user(SIGCHLD, &infop->si_signo);
@@ -1078,7 +1092,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_zombie(task_t *p, int noreap,
+static int wait_task_zombie(struct task_struct *p, int noreap,
struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
@@ -1240,8 +1254,8 @@ static int wait_task_zombie(task_t *p, int noreap,
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
- struct siginfo __user *infop,
+static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
+ int noreap, struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
int retval, exit_code;
@@ -1355,7 +1369,7 @@ bail_ref:
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_continued(task_t *p, int noreap,
+static int wait_task_continued(struct task_struct *p, int noreap,
struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
@@ -1441,7 +1455,7 @@ repeat:
int ret;
list_for_each(_p,&tsk->children) {
- p = list_entry(_p,struct task_struct,sibling);
+ p = list_entry(_p, struct task_struct, sibling);
ret = eligible_child(pid, options, p);
if (!ret)
diff --git a/kernel/fork.c b/kernel/fork.c
index 49adc0e8d47c..1c999f3e0b47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -11,7 +11,6 @@
* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
*/
-#include <linux/config.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/unistd.h>
@@ -44,6 +43,9 @@
#include <linux/rmap.h>
#include <linux/acct.h>
#include <linux/cn_proc.h>
+#include <linux/delayacct.h>
+#include <linux/taskstats_kern.h>
+#include <linux/random.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -62,9 +64,7 @@ int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
- __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
-
-EXPORT_SYMBOL(tasklist_lock);
+__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
int nr_processes(void)
{
@@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep;
void free_task(struct task_struct *tsk)
{
free_thread_info(tsk->thread_info);
+ rt_mutex_debug_task_free(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -117,6 +118,7 @@ void __put_task_struct(struct task_struct *tsk)
security_task_free(tsk);
free_uid(tsk->user);
put_group_info(tsk->group_info);
+ delayacct_tsk_free(tsk);
if (!profile_handoff_task(tsk))
free_task(tsk);
@@ -174,10 +176,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
tsk->thread_info = ti;
setup_thread_stack(tsk, orig);
+#ifdef CONFIG_CC_STACKPROTECTOR
+ tsk->stack_canary = get_random_int();
+#endif
+
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
atomic_set(&tsk->fs_excl, 0);
+#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
+#endif
tsk->splice_pipe = NULL;
return tsk;
}
@@ -193,7 +201,10 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
down_write(&oldmm->mmap_sem);
flush_cache_mm(oldmm);
- down_write(&mm->mmap_sem);
+ /*
+ * Not linked in yet - no deadlock potential:
+ */
+ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
mm->locked_vm = 0;
mm->mmap = NULL;
@@ -817,6 +828,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
if (clone_flags & CLONE_THREAD) {
atomic_inc(&current->signal->count);
atomic_inc(&current->signal->live);
+ taskstats_tgid_alloc(current->signal);
return 0;
}
sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -861,6 +873,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
INIT_LIST_HEAD(&sig->cpu_timers[0]);
INIT_LIST_HEAD(&sig->cpu_timers[1]);
INIT_LIST_HEAD(&sig->cpu_timers[2]);
+ taskstats_tgid_init(sig);
task_lock(current->group_leader);
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -874,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
tsk->it_prof_expires =
secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
}
+ acct_init_pacct(&sig->pacct);
return 0;
}
@@ -881,6 +895,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
void __cleanup_signal(struct signal_struct *sig)
{
exit_thread_group_keys(sig);
+ taskstats_tgid_free(sig);
kmem_cache_free(signal_cachep, sig);
}
@@ -912,6 +927,15 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
return current->pid;
}
+static inline void rt_mutex_init_task(struct task_struct *p)
+{
+#ifdef CONFIG_RT_MUTEXES
+ spin_lock_init(&p->pi_lock);
+ plist_head_init(&p->pi_waiters, &p->pi_lock);
+ p->pi_blocked_on = NULL;
+#endif
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -920,13 +944,13 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static task_t *copy_process(unsigned long clone_flags,
- unsigned long stack_start,
- struct pt_regs *regs,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr,
- int pid)
+static struct task_struct *copy_process(unsigned long clone_flags,
+ unsigned long stack_start,
+ struct pt_regs *regs,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr,
+ int pid)
{
int retval;
struct task_struct *p = NULL;
@@ -958,6 +982,10 @@ static task_t *copy_process(unsigned long clone_flags,
if (!p)
goto fork_out;
+#ifdef CONFIG_TRACE_IRQFLAGS
+ DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
+ DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
+#endif
retval = -EAGAIN;
if (atomic_read(&p->user->processes) >=
p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
@@ -985,20 +1013,18 @@ static task_t *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_put_domain;
p->did_exec = 0;
+ delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
p->pid = pid;
retval = -EFAULT;
if (clone_flags & CLONE_PARENT_SETTID)
if (put_user(p->pid, parent_tidptr))
- goto bad_fork_cleanup;
-
- p->proc_dentry = NULL;
+ goto bad_fork_cleanup_delays_binfmt;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
- spin_lock_init(&p->proc_lock);
clear_tsk_thread_flag(p, TIF_SIGPENDING);
init_sigpending(&p->pending);
@@ -1035,6 +1061,32 @@ static task_t *copy_process(unsigned long clone_flags,
}
mpol_fix_fork_child_flag(p);
#endif
+#ifdef CONFIG_TRACE_IRQFLAGS
+ p->irq_events = 0;
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ p->hardirqs_enabled = 1;
+#else
+ p->hardirqs_enabled = 0;
+#endif
+ p->hardirq_enable_ip = 0;
+ p->hardirq_enable_event = 0;
+ p->hardirq_disable_ip = _THIS_IP_;
+ p->hardirq_disable_event = 0;
+ p->softirqs_enabled = 1;
+ p->softirq_enable_ip = _THIS_IP_;
+ p->softirq_enable_event = 0;
+ p->softirq_disable_ip = 0;
+ p->softirq_disable_event = 0;
+ p->hardirq_context = 0;
+ p->softirq_context = 0;
+#endif
+#ifdef CONFIG_LOCKDEP
+ p->lockdep_depth = 0; /* no locks held yet */
+ p->curr_chain_key = 0;
+ p->lockdep_recursion = 0;
+#endif
+
+ rt_mutex_init_task(p);
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
@@ -1078,6 +1130,9 @@ static task_t *copy_process(unsigned long clone_flags,
#ifdef CONFIG_COMPAT
p->compat_robust_list = NULL;
#endif
+ INIT_LIST_HEAD(&p->pi_state_list);
+ p->pi_state_cache = NULL;
+
/*
* sigaltstack should be cleared when sharing the same VM
*/
@@ -1095,7 +1150,6 @@ static task_t *copy_process(unsigned long clone_flags,
/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */
-
p->parent_exec_id = p->self_exec_id;
/* ok, now we should be set up.. */
@@ -1118,6 +1172,9 @@ static task_t *copy_process(unsigned long clone_flags,
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
+ /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
+ p->ioprio = current->ioprio;
+
/*
* The task hasn't been attached yet, so its cpus_allowed mask will
* not be changed, nor will its assigned CPU.
@@ -1158,18 +1215,6 @@ static task_t *copy_process(unsigned long clone_flags,
}
if (clone_flags & CLONE_THREAD) {
- /*
- * Important: if an exit-all has been started then
- * do not create this new thread - the whole thread
- * group is supposed to exit anyway.
- */
- if (current->signal->flags & SIGNAL_GROUP_EXIT) {
- spin_unlock(&current->sighand->siglock);
- write_unlock_irq(&tasklist_lock);
- retval = -EAGAIN;
- goto bad_fork_cleanup_namespace;
- }
-
p->group_leader = current->group_leader;
list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
@@ -1189,11 +1234,6 @@ static task_t *copy_process(unsigned long clone_flags,
}
}
- /*
- * inherit ioprio
- */
- p->ioprio = current->ioprio;
-
if (likely(p->pid)) {
add_parent(p);
if (unlikely(p->ptrace & PT_PTRACED))
@@ -1246,7 +1286,8 @@ bad_fork_cleanup_policy:
bad_fork_cleanup_cpuset:
#endif
cpuset_exit(p);
-bad_fork_cleanup:
+bad_fork_cleanup_delays_binfmt:
+ delayacct_tsk_free(p);
if (p->binfmt)
module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
@@ -1267,9 +1308,9 @@ struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
return regs;
}
-task_t * __devinit fork_idle(int cpu)
+struct task_struct * __devinit fork_idle(int cpu)
{
- task_t *task;
+ struct task_struct *task;
struct pt_regs regs;
task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
@@ -1356,8 +1397,10 @@ long do_fork(unsigned long clone_flags,
if (clone_flags & CLONE_VFORK) {
wait_for_completion(&vfork);
- if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
+ if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
+ current->ptrace_message = nr;
ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
+ }
}
} else {
free_pid(pid);
diff --git a/kernel/futex.c b/kernel/futex.c
index e1a380c77a5a..4b6770e9806d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
* (C) Copyright 2006 Red Hat Inc, All Rights Reserved
* Thanks to Thomas Gleixner for suggestions, analysis and fixes.
*
+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
#include <linux/signal.h>
#include <asm/futex.h>
+#include "rtmutex_common.h"
+
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
@@ -63,7 +69,7 @@ union futex_key {
int offset;
} shared;
struct {
- unsigned long uaddr;
+ unsigned long address;
struct mm_struct *mm;
int offset;
} private;
@@ -75,6 +81,27 @@ union futex_key {
};
/*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+ /*
+ * list of 'owned' pi_state instances - these have to be
+ * cleaned up in do_exit() if the task exits prematurely:
+ */
+ struct list_head list;
+
+ /*
+ * The PI object:
+ */
+ struct rt_mutex pi_mutex;
+
+ struct task_struct *owner;
+ atomic_t refcount;
+
+ union futex_key key;
+};
+
+/*
* We use this hashed waitqueue instead of a normal wait_queue_t, so
* we can wake only the relevant ones (hashed queues may be shared).
*
@@ -87,15 +114,19 @@ struct futex_q {
struct list_head list;
wait_queue_head_t waiters;
- /* Which hash list lock to use. */
+ /* Which hash list lock to use: */
spinlock_t *lock_ptr;
- /* Key which the futex is hashed on. */
+ /* Key which the futex is hashed on: */
union futex_key key;
- /* For fd, sigio sent using these. */
+ /* For fd, sigio sent using these: */
int fd;
struct file *filp;
+
+ /* Optional priority inheritance state: */
+ struct futex_pi_state *pi_state;
+ struct task_struct *task;
};
/*
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
*
* Should be called with &current->mm->mmap_sem but NOT any spinlocks.
*/
-static int get_futex_key(unsigned long uaddr, union futex_key *key)
+static int get_futex_key(u32 __user *uaddr, union futex_key *key)
{
+ unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct page *page;
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
/*
* The futex address must be "naturally" aligned.
*/
- key->both.offset = uaddr % PAGE_SIZE;
+ key->both.offset = address % PAGE_SIZE;
if (unlikely((key->both.offset % sizeof(u32)) != 0))
return -EINVAL;
- uaddr -= key->both.offset;
+ address -= key->both.offset;
/*
* The futex is hashed differently depending on whether
* it's in a shared or private mapping. So check vma first.
*/
- vma = find_extend_vma(mm, uaddr);
+ vma = find_extend_vma(mm, address);
if (unlikely(!vma))
return -EFAULT;
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
*/
if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
key->private.mm = mm;
- key->private.uaddr = uaddr;
+ key->private.address = address;
return 0;
}
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
key->shared.inode = vma->vm_file->f_dentry->d_inode;
key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
- key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
+ key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
+ vma->vm_pgoff);
return 0;
}
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
* from swap. But that's a lot of code to duplicate here
* for a rare case, so we simply fetch the page.
*/
- err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
+ err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
if (err >= 0) {
key->shared.pgoff =
page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -246,18 +278,259 @@ static void drop_key_refs(union futex_key *key)
}
}
-static inline int get_futex_value_locked(int *dest, int __user *from)
+static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
{
int ret;
inc_preempt_count();
- ret = __copy_from_user_inatomic(dest, from, sizeof(int));
+ ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
dec_preempt_count();
return ret ? -EFAULT : 0;
}
/*
+ * Fault handling. Called with current->mm->mmap_sem held.
+ */
+static int futex_handle_fault(unsigned long address, int attempt)
+{
+ struct vm_area_struct * vma;
+ struct mm_struct *mm = current->mm;
+
+ if (attempt > 2 || !(vma = find_vma(mm, address)) ||
+ vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
+ return -EFAULT;
+
+ switch (handle_mm_fault(mm, vma, address, 1)) {
+ case VM_FAULT_MINOR:
+ current->min_flt++;
+ break;
+ case VM_FAULT_MAJOR:
+ current->maj_flt++;
+ break;
+ default:
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/*
+ * PI code:
+ */
+static int refill_pi_state_cache(void)
+{
+ struct futex_pi_state *pi_state;
+
+ if (likely(current->pi_state_cache))
+ return 0;
+
+ pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+
+ if (!pi_state)
+ return -ENOMEM;
+
+ memset(pi_state, 0, sizeof(*pi_state));
+ INIT_LIST_HEAD(&pi_state->list);
+ /* pi_mutex gets initialized later */
+ pi_state->owner = NULL;
+ atomic_set(&pi_state->refcount, 1);
+
+ current->pi_state_cache = pi_state;
+
+ return 0;
+}
+
+static struct futex_pi_state * alloc_pi_state(void)
+{
+ struct futex_pi_state *pi_state = current->pi_state_cache;
+
+ WARN_ON(!pi_state);
+ current->pi_state_cache = NULL;
+
+ return pi_state;
+}
+
+static void free_pi_state(struct futex_pi_state *pi_state)
+{
+ if (!atomic_dec_and_test(&pi_state->refcount))
+ return;
+
+ /*
+ * If pi_state->owner is NULL, the owner is most probably dying
+ * and has cleaned up the pi_state already
+ */
+ if (pi_state->owner) {
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+ }
+
+ if (current->pi_state_cache)
+ kfree(pi_state);
+ else {
+ /*
+ * pi_state->list is already empty.
+ * clear pi_state->owner.
+ * refcount is at 0 - put it back to 1.
+ */
+ pi_state->owner = NULL;
+ atomic_set(&pi_state->refcount, 1);
+ current->pi_state_cache = pi_state;
+ }
+}
+
+/*
+ * Look up the task based on what TID userspace gave us.
+ * We dont trust it.
+ */
+static struct task_struct * futex_find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+ if ((current->euid != p->euid) && (current->euid != p->uid)) {
+ p = NULL;
+ goto out_unlock;
+ }
+ if (p->exit_state != 0) {
+ p = NULL;
+ goto out_unlock;
+ }
+ get_task_struct(p);
+out_unlock:
+ rcu_read_unlock();
+
+ return p;
+}
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+void exit_pi_state_list(struct task_struct *curr)
+{
+ struct list_head *next, *head = &curr->pi_state_list;
+ struct futex_pi_state *pi_state;
+ struct futex_hash_bucket *hb;
+ union futex_key key;
+
+ /*
+ * We are a ZOMBIE and nobody can enqueue itself on
+ * pi_state_list anymore, but we have to be careful
+ * versus waiters unqueueing themselves:
+ */
+ spin_lock_irq(&curr->pi_lock);
+ while (!list_empty(head)) {
+
+ next = head->next;
+ pi_state = list_entry(next, struct futex_pi_state, list);
+ key = pi_state->key;
+ hb = hash_futex(&key);
+ spin_unlock_irq(&curr->pi_lock);
+
+ spin_lock(&hb->lock);
+
+ spin_lock_irq(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
+ if (head->next != next) {
+ spin_unlock(&hb->lock);
+ continue;
+ }
+
+ WARN_ON(pi_state->owner != curr);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+ spin_unlock_irq(&curr->pi_lock);
+
+ rt_mutex_unlock(&pi_state->pi_mutex);
+
+ spin_unlock(&hb->lock);
+
+ spin_lock_irq(&curr->pi_lock);
+ }
+ spin_unlock_irq(&curr->pi_lock);
+}
+
+static int
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+{
+ struct futex_pi_state *pi_state = NULL;
+ struct futex_q *this, *next;
+ struct list_head *head;
+ struct task_struct *p;
+ pid_t pid;
+
+ head = &hb->chain;
+
+ list_for_each_entry_safe(this, next, head, list) {
+ if (match_futex(&this->key, &me->key)) {
+ /*
+ * Another waiter already exists - bump up
+ * the refcount and return its pi_state:
+ */
+ pi_state = this->pi_state;
+ /*
+ * Userspace might have messed up non PI and PI futexes
+ */
+ if (unlikely(!pi_state))
+ return -EINVAL;
+
+ WARN_ON(!atomic_read(&pi_state->refcount));
+
+ atomic_inc(&pi_state->refcount);
+ me->pi_state = pi_state;
+
+ return 0;
+ }
+ }
+
+ /*
+ * We are the first waiter - try to look up the real owner and attach
+ * the new pi_state to it, but bail out when the owner died bit is set
+ * and TID = 0:
+ */
+ pid = uval & FUTEX_TID_MASK;
+ if (!pid && (uval & FUTEX_OWNER_DIED))
+ return -ESRCH;
+ p = futex_find_get_task(pid);
+ if (!p)
+ return -ESRCH;
+
+ pi_state = alloc_pi_state();
+
+ /*
+ * Initialize the pi_mutex in locked state and make 'p'
+ * the owner of it:
+ */
+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+ /* Store the key for possible exit cleanups: */
+ pi_state->key = me->key;
+
+ spin_lock_irq(&p->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &p->pi_state_list);
+ pi_state->owner = p;
+ spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ me->pi_state = pi_state;
+
+ return 0;
+}
+
+/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
*/
@@ -284,16 +557,105 @@ static void wake_futex(struct futex_q *q)
q->lock_ptr = NULL;
}
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+{
+ struct task_struct *new_owner;
+ struct futex_pi_state *pi_state = this->pi_state;
+ u32 curval, newval;
+
+ if (!pi_state)
+ return -EINVAL;
+
+ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+ /*
+ * This happens when we have stolen the lock and the original
+ * pending owner did not enqueue itself back on the rt_mutex.
+ * Thats not a tragedy. We know that way, that a lock waiter
+ * is on the fly. We make the futex_q waiter the pending owner.
+ */
+ if (!new_owner)
+ new_owner = this->task;
+
+ /*
+ * We pass it to the next owner. (The WAITERS bit is always
+ * kept enabled while there is PI state around. We must also
+ * preserve the owner died bit.)
+ */
+ if (!(uval & FUTEX_OWNER_DIED)) {
+ newval = FUTEX_WAITERS | new_owner->pid;
+
+ inc_preempt_count();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ dec_preempt_count();
+ if (curval == -EFAULT)
+ return -EFAULT;
+ if (curval != uval)
+ return -EINVAL;
+ }
+
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+
+ spin_lock_irq(&new_owner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &new_owner->pi_state_list);
+ pi_state->owner = new_owner;
+ spin_unlock_irq(&new_owner->pi_lock);
+
+ rt_mutex_unlock(&pi_state->pi_mutex);
+
+ return 0;
+}
+
+static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
+{
+ u32 oldval;
+
+ /*
+ * There is no waiter, so we unlock the futex. The owner died
+ * bit has not to be preserved here. We are the owner:
+ */
+ inc_preempt_count();
+ oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
+ dec_preempt_count();
+
+ if (oldval == -EFAULT)
+ return oldval;
+ if (oldval != uval)
+ return -EAGAIN;
+
+ return 0;
+}
+
+/*
+ * Express the locking dependencies for lockdep:
+ */
+static inline void
+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ if (hb1 <= hb2) {
+ spin_lock(&hb1->lock);
+ if (hb1 < hb2)
+ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+ } else { /* hb1 > hb2 */
+ spin_lock(&hb2->lock);
+ spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
-static int futex_wake(unsigned long uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, int nr_wake)
{
- union futex_key key;
- struct futex_hash_bucket *bh;
- struct list_head *head;
+ struct futex_hash_bucket *hb;
struct futex_q *this, *next;
+ struct list_head *head;
+ union futex_key key;
int ret;
down_read(&current->mm->mmap_sem);
@@ -302,19 +664,23 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
if (unlikely(ret != 0))
goto out;
- bh = hash_futex(&key);
- spin_lock(&bh->lock);
- head = &bh->chain;
+ hb = hash_futex(&key);
+ spin_lock(&hb->lock);
+ head = &hb->chain;
list_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key)) {
+ if (this->pi_state) {
+ ret = -EINVAL;
+ break;
+ }
wake_futex(this);
if (++ret >= nr_wake)
break;
}
}
- spin_unlock(&bh->lock);
+ spin_unlock(&hb->lock);
out:
up_read(&current->mm->mmap_sem);
return ret;
@@ -324,10 +690,12 @@ out:
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
-static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
+static int
+futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+ int nr_wake, int nr_wake2, int op)
{
union futex_key key1, key2;
- struct futex_hash_bucket *bh1, *bh2;
+ struct futex_hash_bucket *hb1, *hb2;
struct list_head *head;
struct futex_q *this, *next;
int ret, op_ret, attempt = 0;
@@ -342,27 +710,25 @@ retryfull:
if (unlikely(ret != 0))
goto out;
- bh1 = hash_futex(&key1);
- bh2 = hash_futex(&key2);
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
retry:
- if (bh1 < bh2)
- spin_lock(&bh1->lock);
- spin_lock(&bh2->lock);
- if (bh1 > bh2)
- spin_lock(&bh1->lock);
+ double_lock_hb(hb1, hb2);
- op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
- int dummy;
+ u32 dummy;
- spin_unlock(&bh1->lock);
- if (bh1 != bh2)
- spin_unlock(&bh2->lock);
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
#ifndef CONFIG_MMU
- /* we don't get EFAULT from MMU faults if we don't have an MMU,
- * but we might get them from range checking */
+ /*
+ * we don't get EFAULT from MMU faults if we don't have an MMU,
+ * but we might get them from range checking
+ */
ret = op_ret;
goto out;
#endif
@@ -372,47 +738,36 @@ retry:
goto out;
}
- /* futex_atomic_op_inuser needs to both read and write
+ /*
+ * futex_atomic_op_inuser needs to both read and write
* *(int __user *)uaddr2, but we can't modify it
* non-atomically. Therefore, if get_user below is not
* enough, we need to handle the fault ourselves, while
- * still holding the mmap_sem. */
+ * still holding the mmap_sem.
+ */
if (attempt++) {
- struct vm_area_struct * vma;
- struct mm_struct *mm = current->mm;
-
- ret = -EFAULT;
- if (attempt >= 2 ||
- !(vma = find_vma(mm, uaddr2)) ||
- vma->vm_start > uaddr2 ||
- !(vma->vm_flags & VM_WRITE))
- goto out;
-
- switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
- case VM_FAULT_MINOR:
- current->min_flt++;
- break;
- case VM_FAULT_MAJOR:
- current->maj_flt++;
- break;
- default:
+ if (futex_handle_fault((unsigned long)uaddr2,
+ attempt)) {
+ ret = -EFAULT;
goto out;
}
goto retry;
}
- /* If we would have faulted, release mmap_sem,
- * fault it in and start all over again. */
+ /*
+ * If we would have faulted, release mmap_sem,
+ * fault it in and start all over again.
+ */
up_read(&current->mm->mmap_sem);
- ret = get_user(dummy, (int __user *)uaddr2);
+ ret = get_user(dummy, uaddr2);
if (ret)
return ret;
goto retryfull;
}
- head = &bh1->chain;
+ head = &hb1->chain;
list_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key1)) {
@@ -423,7 +778,7 @@ retry:
}
if (op_ret > 0) {
- head = &bh2->chain;
+ head = &hb2->chain;
op_ret = 0;
list_for_each_entry_safe(this, next, head, list) {
@@ -436,9 +791,9 @@ retry:
ret += op_ret;
}
- spin_unlock(&bh1->lock);
- if (bh1 != bh2)
- spin_unlock(&bh2->lock);
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
out:
up_read(&current->mm->mmap_sem);
return ret;
@@ -448,11 +803,11 @@ out:
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
-static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
- int nr_wake, int nr_requeue, int *valp)
+static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+ int nr_wake, int nr_requeue, u32 *cmpval)
{
union futex_key key1, key2;
- struct futex_hash_bucket *bh1, *bh2;
+ struct futex_hash_bucket *hb1, *hb2;
struct list_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
@@ -467,68 +822,68 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
if (unlikely(ret != 0))
goto out;
- bh1 = hash_futex(&key1);
- bh2 = hash_futex(&key2);
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
- if (bh1 < bh2)
- spin_lock(&bh1->lock);
- spin_lock(&bh2->lock);
- if (bh1 > bh2)
- spin_lock(&bh1->lock);
+ double_lock_hb(hb1, hb2);
- if (likely(valp != NULL)) {
- int curval;
+ if (likely(cmpval != NULL)) {
+ u32 curval;
- ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
+ ret = get_futex_value_locked(&curval, uaddr1);
if (unlikely(ret)) {
- spin_unlock(&bh1->lock);
- if (bh1 != bh2)
- spin_unlock(&bh2->lock);
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
- /* If we would have faulted, release mmap_sem, fault
+ /*
+ * If we would have faulted, release mmap_sem, fault
* it in and start all over again.
*/
up_read(&current->mm->mmap_sem);
- ret = get_user(curval, (int __user *)uaddr1);
+ ret = get_user(curval, uaddr1);
if (!ret)
goto retry;
return ret;
}
- if (curval != *valp) {
+ if (curval != *cmpval) {
ret = -EAGAIN;
goto out_unlock;
}
}
- head1 = &bh1->chain;
+ head1 = &hb1->chain;
list_for_each_entry_safe(this, next, head1, list) {
if (!match_futex (&this->key, &key1))
continue;
if (++ret <= nr_wake) {
wake_futex(this);
} else {
- list_move_tail(&this->list, &bh2->chain);
- this->lock_ptr = &bh2->lock;
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(head1 != &hb2->chain)) {
+ list_move_tail(&this->list, &hb2->chain);
+ this->lock_ptr = &hb2->lock;
+ }
this->key = key2;
get_key_refs(&key2);
drop_count++;
if (ret - nr_wake >= nr_requeue)
break;
- /* Make sure to stop if key1 == key2 */
- if (head1 == &bh2->chain && head1 != &next->list)
- head1 = &this->list;
}
}
out_unlock:
- spin_unlock(&bh1->lock);
- if (bh1 != bh2)
- spin_unlock(&bh2->lock);
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
/* drop_key_refs() must be called outside the spinlocks. */
while (--drop_count >= 0)
@@ -543,7 +898,7 @@ out:
static inline struct futex_hash_bucket *
queue_lock(struct futex_q *q, int fd, struct file *filp)
{
- struct futex_hash_bucket *bh;
+ struct futex_hash_bucket *hb;
q->fd = fd;
q->filp = filp;
@@ -551,23 +906,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
init_waitqueue_head(&q->waiters);
get_key_refs(&q->key);
- bh = hash_futex(&q->key);
- q->lock_ptr = &bh->lock;
+ hb = hash_futex(&q->key);
+ q->lock_ptr = &hb->lock;
- spin_lock(&bh->lock);
- return bh;
+ spin_lock(&hb->lock);
+ return hb;
}
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
- list_add_tail(&q->list, &bh->chain);
- spin_unlock(&bh->lock);
+ list_add_tail(&q->list, &hb->chain);
+ q->task = current;
+ spin_unlock(&hb->lock);
}
static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
- spin_unlock(&bh->lock);
+ spin_unlock(&hb->lock);
drop_key_refs(&q->key);
}
@@ -579,20 +935,22 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
/* The key must be already stored in q->key. */
static void queue_me(struct futex_q *q, int fd, struct file *filp)
{
- struct futex_hash_bucket *bh;
- bh = queue_lock(q, fd, filp);
- __queue_me(q, bh);
+ struct futex_hash_bucket *hb;
+
+ hb = queue_lock(q, fd, filp);
+ __queue_me(q, hb);
}
/* Return 1 if we were still queued (ie. 0 means we were woken) */
static int unqueue_me(struct futex_q *q)
{
- int ret = 0;
spinlock_t *lock_ptr;
+ int ret = 0;
/* In the common case we don't take the spinlock, which is nice. */
retry:
lock_ptr = q->lock_ptr;
+ barrier();
if (lock_ptr != 0) {
spin_lock(lock_ptr);
/*
@@ -614,6 +972,9 @@ static int unqueue_me(struct futex_q *q)
}
WARN_ON(list_empty(&q->list));
list_del(&q->list);
+
+ BUG_ON(q->pi_state);
+
spin_unlock(lock_ptr);
ret = 1;
}
@@ -622,21 +983,42 @@ static int unqueue_me(struct futex_q *q)
return ret;
}
-static int futex_wait(unsigned long uaddr, int val, unsigned long time)
+/*
+ * PI futexes can not be requeued and must remove themself from the
+ * hash bucket. The hash bucket lock is held on entry and dropped here.
+ */
+static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
+{
+ WARN_ON(list_empty(&q->list));
+ list_del(&q->list);
+
+ BUG_ON(!q->pi_state);
+ free_pi_state(q->pi_state);
+ q->pi_state = NULL;
+
+ spin_unlock(&hb->lock);
+
+ drop_key_refs(&q->key);
+}
+
+static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
{
- DECLARE_WAITQUEUE(wait, current);
- int ret, curval;
+ struct task_struct *curr = current;
+ DECLARE_WAITQUEUE(wait, curr);
+ struct futex_hash_bucket *hb;
struct futex_q q;
- struct futex_hash_bucket *bh;
+ u32 uval;
+ int ret;
+ q.pi_state = NULL;
retry:
- down_read(&current->mm->mmap_sem);
+ down_read(&curr->mm->mmap_sem);
ret = get_futex_key(uaddr, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
- bh = queue_lock(&q, -1, NULL);
+ hb = queue_lock(&q, -1, NULL);
/*
* Access the page AFTER the futex is queued.
@@ -658,37 +1040,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
* We hold the mmap semaphore, so the mapping cannot have changed
* since we looked it up in get_futex_key.
*/
-
- ret = get_futex_value_locked(&curval, (int __user *)uaddr);
+ ret = get_futex_value_locked(&uval, uaddr);
if (unlikely(ret)) {
- queue_unlock(&q, bh);
+ queue_unlock(&q, hb);
- /* If we would have faulted, release mmap_sem, fault it in and
+ /*
+ * If we would have faulted, release mmap_sem, fault it in and
* start all over again.
*/
- up_read(&current->mm->mmap_sem);
+ up_read(&curr->mm->mmap_sem);
- ret = get_user(curval, (int __user *)uaddr);
+ ret = get_user(uval, uaddr);
if (!ret)
goto retry;
return ret;
}
- if (curval != val) {
- ret = -EWOULDBLOCK;
- queue_unlock(&q, bh);
- goto out_release_sem;
- }
+ ret = -EWOULDBLOCK;
+ if (uval != val)
+ goto out_unlock_release_sem;
/* Only actually queue if *uaddr contained val. */
- __queue_me(&q, bh);
+ __queue_me(&q, hb);
/*
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
- */
- up_read(&current->mm->mmap_sem);
+ */
+ up_read(&curr->mm->mmap_sem);
/*
* There might have been scheduling since the queue_me(), as we
@@ -720,12 +1100,367 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
return 0;
if (time == 0)
return -ETIMEDOUT;
- /* We expect signal_pending(current), but another thread may
- * have handled it for us already. */
+ /*
+ * We expect signal_pending(current), but another thread may
+ * have handled it for us already.
+ */
return -EINTR;
+ out_unlock_release_sem:
+ queue_unlock(&q, hb);
+
out_release_sem:
+ up_read(&curr->mm->mmap_sem);
+ return ret;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block, it does PI, etc. (Due to
+ * races the kernel might see a 0 value of the futex too.)
+ */
+static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
+ long nsec, int trylock)
+{
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct task_struct *curr = current;
+ struct futex_hash_bucket *hb;
+ u32 uval, newval, curval;
+ struct futex_q q;
+ int ret, attempt = 0;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+ if (sec != MAX_SCHEDULE_TIMEOUT) {
+ to = &timeout;
+ hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+ hrtimer_init_sleeper(to, current);
+ to->timer.expires = ktime_set(sec, nsec);
+ }
+
+ q.pi_state = NULL;
+ retry:
+ down_read(&curr->mm->mmap_sem);
+
+ ret = get_futex_key(uaddr, &q.key);
+ if (unlikely(ret != 0))
+ goto out_release_sem;
+
+ hb = queue_lock(&q, -1, NULL);
+
+ retry_locked:
+ /*
+ * To avoid races, we attempt to take the lock here again
+ * (by doing a 0 -> TID atomic cmpxchg), while holding all
+ * the locks. It will most likely not succeed.
+ */
+ newval = current->pid;
+
+ inc_preempt_count();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
+ dec_preempt_count();
+
+ if (unlikely(curval == -EFAULT))
+ goto uaddr_faulted;
+
+ /* We own the lock already */
+ if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
+ if (!detect && 0)
+ force_sig(SIGKILL, current);
+ ret = -EDEADLK;
+ goto out_unlock_release_sem;
+ }
+
+ /*
+ * Surprise - we got the lock. Just return
+ * to userspace:
+ */
+ if (unlikely(!curval))
+ goto out_unlock_release_sem;
+
+ uval = curval;
+ newval = uval | FUTEX_WAITERS;
+
+ inc_preempt_count();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ dec_preempt_count();
+
+ if (unlikely(curval == -EFAULT))
+ goto uaddr_faulted;
+ if (unlikely(curval != uval))
+ goto retry_locked;
+
+ /*
+ * We dont have the lock. Look up the PI state (or create it if
+ * we are the first waiter):
+ */
+ ret = lookup_pi_state(uval, hb, &q);
+
+ if (unlikely(ret)) {
+ /*
+ * There were no waiters and the owner task lookup
+ * failed. When the OWNER_DIED bit is set, then we
+ * know that this is a robust futex and we actually
+ * take the lock. This is safe as we are protected by
+ * the hash bucket lock. We also set the waiters bit
+ * unconditionally here, to simplify glibc handling of
+ * multiple tasks racing to acquire the lock and
+ * cleanup the problems which were left by the dead
+ * owner.
+ */
+ if (curval & FUTEX_OWNER_DIED) {
+ uval = newval;
+ newval = current->pid |
+ FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+ inc_preempt_count();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr,
+ uval, newval);
+ dec_preempt_count();
+
+ if (unlikely(curval == -EFAULT))
+ goto uaddr_faulted;
+ if (unlikely(curval != uval))
+ goto retry_locked;
+ ret = 0;
+ }
+ goto out_unlock_release_sem;
+ }
+
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+ __queue_me(&q, hb);
+
+ /*
+ * Now the futex is queued and we have checked the data, we
+ * don't want to hold mmap_sem while we sleep.
+ */
+ up_read(&curr->mm->mmap_sem);
+
+ WARN_ON(!q.pi_state);
+ /*
+ * Block on the PI mutex:
+ */
+ if (!trylock)
+ ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
+ else {
+ ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ }
+
+ down_read(&curr->mm->mmap_sem);
+ spin_lock(q.lock_ptr);
+
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case.
+ */
+ if (!ret && q.pi_state->owner != curr) {
+ u32 newtid = current->pid | FUTEX_WAITERS;
+
+ /* Owner died? */
+ if (q.pi_state->owner != NULL) {
+ spin_lock_irq(&q.pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&q.pi_state->list));
+ list_del_init(&q.pi_state->list);
+ spin_unlock_irq(&q.pi_state->owner->pi_lock);
+ } else
+ newtid |= FUTEX_OWNER_DIED;
+
+ q.pi_state->owner = current;
+
+ spin_lock_irq(&current->pi_lock);
+ WARN_ON(!list_empty(&q.pi_state->list));
+ list_add(&q.pi_state->list, &current->pi_state_list);
+ spin_unlock_irq(&current->pi_lock);
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q, hb);
+ up_read(&curr->mm->mmap_sem);
+ /*
+ * We own it, so we have to replace the pending owner
+ * TID. This must be atomic as we have preserve the
+ * owner died bit here.
+ */
+ ret = get_user(uval, uaddr);
+ while (!ret) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+ curval = futex_atomic_cmpxchg_inatomic(uaddr,
+ uval, newval);
+ if (curval == -EFAULT)
+ ret = -EFAULT;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+ } else {
+ /*
+ * Catch the rare case, where the lock was released
+ * when we were on the way back before we locked
+ * the hash bucket.
+ */
+ if (ret && q.pi_state->owner == curr) {
+ if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+ ret = 0;
+ }
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q, hb);
+ up_read(&curr->mm->mmap_sem);
+ }
+
+ if (!detect && ret == -EDEADLK && 0)
+ force_sig(SIGKILL, current);
+
+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
+
+ out_unlock_release_sem:
+ queue_unlock(&q, hb);
+
+ out_release_sem:
+ up_read(&curr->mm->mmap_sem);
+ return ret;
+
+ uaddr_faulted:
+ /*
+ * We have to r/w *(int __user *)uaddr, but we can't modify it
+ * non-atomically. Therefore, if get_user below is not
+ * enough, we need to handle the fault ourselves, while
+ * still holding the mmap_sem.
+ */
+ if (attempt++) {
+ if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+ ret = -EFAULT;
+ goto out_unlock_release_sem;
+ }
+ goto retry_locked;
+ }
+
+ queue_unlock(&q, hb);
+ up_read(&curr->mm->mmap_sem);
+
+ ret = get_user(uval, uaddr);
+ if (!ret && (uval != -EFAULT))
+ goto retry;
+
+ return ret;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+static int futex_unlock_pi(u32 __user *uaddr)
+{
+ struct futex_hash_bucket *hb;
+ struct futex_q *this, *next;
+ u32 uval;
+ struct list_head *head;
+ union futex_key key;
+ int ret, attempt = 0;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -EFAULT;
+ /*
+ * We release only a lock we actually own:
+ */
+ if ((uval & FUTEX_TID_MASK) != current->pid)
+ return -EPERM;
+ /*
+ * First take all the futex related locks:
+ */
+ down_read(&current->mm->mmap_sem);
+
+ ret = get_futex_key(uaddr, &key);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb = hash_futex(&key);
+ spin_lock(&hb->lock);
+
+retry_locked:
+ /*
+ * To avoid races, try to do the TID -> 0 atomic transition
+ * again. If it succeeds then we can return without waking
+ * anyone else up:
+ */
+ if (!(uval & FUTEX_OWNER_DIED)) {
+ inc_preempt_count();
+ uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+ dec_preempt_count();
+ }
+
+ if (unlikely(uval == -EFAULT))
+ goto pi_faulted;
+ /*
+ * Rare case: we managed to release the lock atomically,
+ * no need to wake anyone else up:
+ */
+ if (unlikely(uval == current->pid))
+ goto out_unlock;
+
+ /*
+ * Ok, other tasks may need to be woken up - check waiters
+ * and do the wakeup if necessary:
+ */
+ head = &hb->chain;
+
+ list_for_each_entry_safe(this, next, head, list) {
+ if (!match_futex (&this->key, &key))
+ continue;
+ ret = wake_futex_pi(uaddr, uval, this);
+ /*
+ * The atomic access to the futex value
+ * generated a pagefault, so retry the
+ * user-access and the wakeup:
+ */
+ if (ret == -EFAULT)
+ goto pi_faulted;
+ goto out_unlock;
+ }
+ /*
+ * No waiters - kernel unlocks the futex:
+ */
+ if (!(uval & FUTEX_OWNER_DIED)) {
+ ret = unlock_futex_pi(uaddr, uval);
+ if (ret == -EFAULT)
+ goto pi_faulted;
+ }
+
+out_unlock:
+ spin_unlock(&hb->lock);
+out:
up_read(&current->mm->mmap_sem);
+
+ return ret;
+
+pi_faulted:
+ /*
+ * We have to r/w *(int __user *)uaddr, but we can't modify it
+ * non-atomically. Therefore, if get_user below is not
+ * enough, we need to handle the fault ourselves, while
+ * still holding the mmap_sem.
+ */
+ if (attempt++) {
+ if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+ goto retry_locked;
+ }
+
+ spin_unlock(&hb->lock);
+ up_read(&current->mm->mmap_sem);
+
+ ret = get_user(uval, uaddr);
+ if (!ret && (uval != -EFAULT))
+ goto retry;
+
return ret;
}
@@ -735,6 +1470,7 @@ static int futex_close(struct inode *inode, struct file *filp)
unqueue_me(q);
kfree(q);
+
return 0;
}
@@ -766,7 +1502,7 @@ static struct file_operations futex_fops = {
* Signal allows caller to avoid the race which would occur if they
* set the sigio stuff up afterwards.
*/
-static int futex_fd(unsigned long uaddr, int signal)
+static int futex_fd(u32 __user *uaddr, int signal)
{
struct futex_q *q;
struct file *filp;
@@ -803,6 +1539,7 @@ static int futex_fd(unsigned long uaddr, int signal)
err = -ENOMEM;
goto error;
}
+ q->pi_state = NULL;
down_read(&current->mm->mmap_sem);
err = get_futex_key(uaddr, &q->key);
@@ -840,7 +1577,7 @@ error:
* Implementation: user-space maintains a per-thread list of locks it
* is holding. Upon do_exit(), the kernel carefully walks this list,
* and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
* always manipulated with the lock held, so the list is private and
* per-thread. Userspace also maintains a per-thread 'list_op_pending'
* field, to allow the kernel to clean up if the thread dies after
@@ -887,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
struct task_struct *p;
ret = -ESRCH;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
p = find_task_by_pid(pid);
if (!p)
goto err_unlock;
@@ -896,7 +1633,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
!capable(CAP_SYS_PTRACE))
goto err_unlock;
head = p->robust_list;
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
}
if (put_user(sizeof(*head), len_ptr))
@@ -904,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
return put_user(head, head_ptr);
err_unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return ret;
}
@@ -913,9 +1650,9 @@ err_unlock:
* Process a futex-list entry, check whether it's owned by the
* dying task, and do notification if so:
*/
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
- u32 uval;
+ u32 uval, nval, mval;
retry:
if (get_user(uval, uaddr))
@@ -932,17 +1669,45 @@ retry:
* thread-death.) The rest of the cleanup is done in
* userspace.
*/
- if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
- uval | FUTEX_OWNER_DIED) != uval)
+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+ nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
+
+ if (nval == -EFAULT)
+ return -1;
+
+ if (nval != uval)
goto retry;
- if (uval & FUTEX_WAITERS)
- futex_wake((unsigned long)uaddr, 1);
+ /*
+ * Wake robust non-PI futexes here. The wakeup of
+ * PI futexes happens in exit_pi_state():
+ */
+ if (!pi) {
+ if (uval & FUTEX_WAITERS)
+ futex_wake(uaddr, 1);
+ }
}
return 0;
}
/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+ struct robust_list __user **head, int *pi)
+{
+ unsigned long uentry;
+
+ if (get_user(uentry, (unsigned long *)head))
+ return -EFAULT;
+
+ *entry = (void *)(uentry & ~1UL);
+ *pi = uentry & 1;
+
+ return 0;
+}
+
+/*
* Walk curr->robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
*
@@ -952,14 +1717,14 @@ void exit_robust_list(struct task_struct *curr)
{
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
unsigned long futex_offset;
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
- if (get_user(entry, &head->list.next))
+ if (fetch_robust_entry(&entry, &head->list.next, &pi))
return;
/*
* Fetch the relative futex offset:
@@ -970,24 +1735,25 @@ void exit_robust_list(struct task_struct *curr)
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
- if (get_user(pending, &head->list_op_pending))
+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
return;
+
if (pending)
- handle_futex_death((void *)pending + futex_offset, curr);
+ handle_futex_death((void *)pending + futex_offset, curr, pip);
while (entry != &head->list) {
/*
* A pending lock might already be on the list, so
- * dont process it twice:
+ * don't process it twice:
*/
if (entry != pending)
if (handle_futex_death((void *)entry + futex_offset,
- curr))
+ curr, pi))
return;
/*
* Fetch the next entry in the list:
*/
- if (get_user(entry, &entry->next))
+ if (fetch_robust_entry(&entry, &entry->next, &pi))
return;
/*
* Avoid excessively long or circular lists:
@@ -999,8 +1765,8 @@ void exit_robust_list(struct task_struct *curr)
}
}
-long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
- unsigned long uaddr2, int val2, int val3)
+long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+ u32 __user *uaddr2, u32 val2, u32 val3)
{
int ret;
@@ -1024,6 +1790,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
case FUTEX_WAKE_OP:
ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
break;
+ case FUTEX_LOCK_PI:
+ ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
+ break;
+ case FUTEX_UNLOCK_PI:
+ ret = futex_unlock_pi(uaddr);
+ break;
+ case FUTEX_TRYLOCK_PI:
+ ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
+ break;
default:
ret = -ENOSYS;
}
@@ -1031,29 +1806,33 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
}
-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
+asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
struct timespec __user *utime, u32 __user *uaddr2,
- int val3)
+ u32 val3)
{
struct timespec t;
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
- int val2 = 0;
+ u32 val2 = 0;
- if (utime && (op == FUTEX_WAIT)) {
+ if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
if (copy_from_user(&t, utime, sizeof(t)) != 0)
return -EFAULT;
if (!timespec_valid(&t))
return -EINVAL;
- timeout = timespec_to_jiffies(&t) + 1;
+ if (op == FUTEX_WAIT)
+ timeout = timespec_to_jiffies(&t) + 1;
+ else {
+ timeout = t.tv_sec;
+ val2 = t.tv_nsec;
+ }
}
/*
* requeue parameter in 'utime' if op == FUTEX_REQUEUE.
*/
- if (op >= FUTEX_REQUEUE)
- val2 = (int) (unsigned long) utime;
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ val2 = (u32) (unsigned long) utime;
- return do_futex((unsigned long)uaddr, op, val, timeout,
- (unsigned long)uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
}
static int futexfs_get_sb(struct file_system_type *fs_type,
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 1ab6a0ea3d14..c5cca3f65cb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -12,6 +12,23 @@
#include <asm/uaccess.h>
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ compat_uptr_t *head, int *pi)
+{
+ if (get_user(*uentry, head))
+ return -EFAULT;
+
+ *entry = compat_ptr((*uentry) & ~1);
+ *pi = (unsigned int)(*uentry) & 1;
+
+ return 0;
+}
+
/*
* Walk curr->robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
@@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr)
{
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
compat_uptr_t uentry, upending;
- unsigned int limit = ROBUST_LIST_LIMIT;
compat_long_t futex_offset;
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
- if (get_user(uentry, &head->list.next))
+ if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
return;
- entry = compat_ptr(uentry);
/*
* Fetch the relative futex offset:
*/
@@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr)
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
- if (get_user(upending, &head->list_op_pending))
+ if (fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
return;
- pending = compat_ptr(upending);
if (upending)
- handle_futex_death((void *)pending + futex_offset, curr);
+ handle_futex_death((void *)pending + futex_offset, curr, pip);
while (compat_ptr(uentry) != &head->list) {
/*
@@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr)
*/
if (entry != pending)
if (handle_futex_death((void *)entry + futex_offset,
- curr))
+ curr, pi))
return;
/*
* Fetch the next entry in the list:
*/
- if (get_user(uentry, (compat_uptr_t *)&entry->next))
+ if (fetch_robust_entry(&uentry, &entry,
+ (compat_uptr_t *)&entry->next, &pi))
return;
- entry = compat_ptr(uentry);
/*
* Avoid excessively long or circular lists:
*/
@@ -129,16 +145,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
int val2 = 0;
- if (utime && (op == FUTEX_WAIT)) {
+ if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
if (get_compat_timespec(&t, utime))
return -EFAULT;
if (!timespec_valid(&t))
return -EINVAL;
- timeout = timespec_to_jiffies(&t) + 1;
+ if (op == FUTEX_WAIT)
+ timeout = timespec_to_jiffies(&t) + 1;
+ else {
+ timeout = t.tv_sec;
+ val2 = t.tv_nsec;
+ }
}
- if (op >= FUTEX_REQUEUE)
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
val2 = (int) (unsigned long) utime;
- return do_futex((unsigned long)uaddr, op, val, timeout,
- (unsigned long)uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 18324305724a..d0ba190dfeb6 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
/**
* ktime_get_ts - get the monotonic clock in timespec format
- *
* @ts: pointer to timespec variable
*
* The function calculates the monotonic clock from the realtime
@@ -188,7 +187,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
{
struct hrtimer_base *new_base;
- new_base = &__get_cpu_var(hrtimer_bases[base->index]);
+ new_base = &__get_cpu_var(hrtimer_bases)[base->index];
if (base != new_base) {
/*
@@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
# ifndef CONFIG_KTIME_SCALAR
/**
* ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
- *
* @kt: addend
* @nsec: the scalar nsec value to add
*
@@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
/**
* hrtimer_forward - forward the timer expiry
- *
* @timer: hrtimer to forward
* @now: forward past this time
* @interval: the interval to forward
@@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
/**
* hrtimer_start - (re)start an relative timer on the current CPU
- *
* @timer: the timer to be added
* @tim: expiry time
* @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
@@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
/**
* hrtimer_try_to_cancel - try to deactivate a timer
- *
* @timer: hrtimer to stop
*
* Returns:
* 0 when the timer was not active
* 1 when the timer was active
* -1 when the timer is currently excuting the callback function and
- * can not be stopped
+ * cannot be stopped
*/
int hrtimer_try_to_cancel(struct hrtimer *timer)
{
@@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
/**
* hrtimer_cancel - cancel a timer and wait for the handler to finish.
- *
* @timer: the timer to be cancelled
*
* Returns:
@@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
/**
* hrtimer_get_remaining - get remaining time for the timer
- *
* @timer: the timer to read
*/
ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
@@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void)
/**
* hrtimer_init - initialize a timer to the given clock
- *
* @timer: the timer to be initialized
* @clock_id: the clock to be used
* @mode: timer mode abs/rel
@@ -576,7 +568,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
memset(timer, 0, sizeof(struct hrtimer));
- bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
+ bases = __raw_get_cpu_var(hrtimer_bases);
if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS)
clock_id = CLOCK_MONOTONIC;
@@ -588,7 +580,6 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
/**
* hrtimer_get_res - get the timer resolution for a clock
- *
* @which_clock: which clock to query
* @tp: pointer to timespec variable to store the resolution
*
@@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
{
struct hrtimer_base *bases;
- bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
+ bases = __raw_get_cpu_var(hrtimer_bases);
*tp = ktime_to_timespec(bases[which_clock].resolution);
return 0;
@@ -678,7 +669,7 @@ static int hrtimer_wakeup(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task)
+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
{
sl->timer.function = hrtimer_wakeup;
sl->task = task;
@@ -702,7 +693,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
return t->task == NULL;
}
-static long __sched nanosleep_restart(struct restart_block *restart)
+long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
struct hrtimer_sleeper t;
struct timespec __user *rmtp;
@@ -711,13 +702,13 @@ static long __sched nanosleep_restart(struct restart_block *restart)
restart->fn = do_no_restart_syscall;
- hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
- t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
+ hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS);
+ t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
if (do_nanosleep(&t, HRTIMER_ABS))
return 0;
- rmtp = (struct timespec __user *) restart->arg2;
+ rmtp = (struct timespec __user *) restart->arg1;
if (rmtp) {
time = ktime_sub(t.timer.expires, t.timer.base->get_time());
if (time.tv64 <= 0)
@@ -727,7 +718,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
return -EFAULT;
}
- restart->fn = nanosleep_restart;
+ restart->fn = hrtimer_nanosleep_restart;
/* The other values in restart are already filled in */
return -ERESTART_RESTARTBLOCK;
@@ -760,11 +751,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
}
restart = &current_thread_info()->restart_block;
- restart->fn = nanosleep_restart;
- restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
- restart->arg1 = t.timer.expires.tv64 >> 32;
- restart->arg2 = (unsigned long) rmtp;
- restart->arg3 = (unsigned long) t.timer.base->index;
+ restart->fn = hrtimer_nanosleep_restart;
+ restart->arg0 = (unsigned long) t.timer.base->index;
+ restart->arg1 = (unsigned long) rmtp;
+ restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF;
+ restart->arg3 = t.timer.expires.tv64 >> 32;
return -ERESTART_RESTARTBLOCK;
}
@@ -791,8 +782,10 @@ static void __devinit init_hrtimers_cpu(int cpu)
struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu);
int i;
- for (i = 0; i < MAX_HRTIMER_BASES; i++, base++)
+ for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) {
spin_lock_init(&base->lock);
+ lockdep_set_class(&base->lock, &base->lock_key);
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -842,7 +835,7 @@ static void migrate_hrtimers(int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int hrtimer_cpu_notify(struct notifier_block *self,
+static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -866,7 +859,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block hrtimers_nb = {
+static struct notifier_block __cpuinitdata hrtimers_nb = {
.notifier_call = hrtimer_cpu_notify,
};
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 9f77f50d8143..1dab0ac3f797 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,5 @@
-obj-y := handle.o manage.o spurious.o
+obj-y := handle.o manage.o spurious.o resend.o chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 3467097ca61a..533068cfb607 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -11,12 +11,14 @@
#include <linux/interrupt.h>
#include <linux/delay.h>
+#include "internals.h"
+
/*
* Autodetection depends on the fact that any interrupt that
* comes in on to an unassigned handler will get stuck with
* "IRQ_WAITING" cleared and the interrupt disabled.
*/
-static DECLARE_MUTEX(probe_sem);
+static DEFINE_MUTEX(probing_active);
/**
* probe_irq_on - begin an interrupt autodetect
@@ -27,11 +29,11 @@ static DECLARE_MUTEX(probe_sem);
*/
unsigned long probe_irq_on(void)
{
- unsigned long val;
- irq_desc_t *desc;
+ struct irq_desc *desc;
+ unsigned long mask;
unsigned int i;
- down(&probe_sem);
+ mutex_lock(&probing_active);
/*
* something may have generated an irq long ago and we want to
* flush such a longstanding irq before considering it as spurious.
@@ -40,8 +42,21 @@ unsigned long probe_irq_on(void)
desc = irq_desc + i;
spin_lock_irq(&desc->lock);
- if (!irq_desc[i].action)
- irq_desc[i].handler->startup(i);
+ if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
+ /*
+ * An old-style architecture might still have
+ * the handle_bad_irq handler there:
+ */
+ compat_irq_chip_set_default_handler(desc);
+
+ /*
+ * Some chips need to know about probing in
+ * progress:
+ */
+ if (desc->chip->set_type)
+ desc->chip->set_type(i, IRQ_TYPE_PROBE);
+ desc->chip->startup(i);
+ }
spin_unlock_irq(&desc->lock);
}
@@ -57,9 +72,9 @@ unsigned long probe_irq_on(void)
desc = irq_desc + i;
spin_lock_irq(&desc->lock);
- if (!desc->action) {
+ if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
- if (desc->handler->startup(i))
+ if (desc->chip->startup(i))
desc->status |= IRQ_PENDING;
}
spin_unlock_irq(&desc->lock);
@@ -73,11 +88,11 @@ unsigned long probe_irq_on(void)
/*
* Now filter out any obviously spurious interrupts
*/
- val = 0;
+ mask = 0;
for (i = 0; i < NR_IRQS; i++) {
- irq_desc_t *desc = irq_desc + i;
unsigned int status;
+ desc = irq_desc + i;
spin_lock_irq(&desc->lock);
status = desc->status;
@@ -85,17 +100,16 @@ unsigned long probe_irq_on(void)
/* It triggered already - consider it spurious. */
if (!(status & IRQ_WAITING)) {
desc->status = status & ~IRQ_AUTODETECT;
- desc->handler->shutdown(i);
+ desc->chip->shutdown(i);
} else
if (i < 32)
- val |= 1 << i;
+ mask |= 1 << i;
}
spin_unlock_irq(&desc->lock);
}
- return val;
+ return mask;
}
-
EXPORT_SYMBOL(probe_irq_on);
/**
@@ -117,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val)
mask = 0;
for (i = 0; i < NR_IRQS; i++) {
- irq_desc_t *desc = irq_desc + i;
+ struct irq_desc *desc = irq_desc + i;
unsigned int status;
spin_lock_irq(&desc->lock);
@@ -128,11 +142,11 @@ unsigned int probe_irq_mask(unsigned long val)
mask |= 1 << i;
desc->status = status & ~IRQ_AUTODETECT;
- desc->handler->shutdown(i);
+ desc->chip->shutdown(i);
}
spin_unlock_irq(&desc->lock);
}
- up(&probe_sem);
+ mutex_unlock(&probing_active);
return mask & val;
}
@@ -160,7 +174,7 @@ int probe_irq_off(unsigned long val)
int i, irq_found = 0, nr_irqs = 0;
for (i = 0; i < NR_IRQS; i++) {
- irq_desc_t *desc = irq_desc + i;
+ struct irq_desc *desc = irq_desc + i;
unsigned int status;
spin_lock_irq(&desc->lock);
@@ -173,16 +187,16 @@ int probe_irq_off(unsigned long val)
nr_irqs++;
}
desc->status = status & ~IRQ_AUTODETECT;
- desc->handler->shutdown(i);
+ desc->chip->shutdown(i);
}
spin_unlock_irq(&desc->lock);
}
- up(&probe_sem);
+ mutex_unlock(&probing_active);
if (nr_irqs > 1)
irq_found = -irq_found;
+
return irq_found;
}
-
EXPORT_SYMBOL(probe_irq_off);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
new file mode 100644
index 000000000000..736cb0bd498f
--- /dev/null
+++ b/kernel/irq/chip.c
@@ -0,0 +1,533 @@
+/*
+ * linux/kernel/irq/chip.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the core interrupt handling code, for irq-chip
+ * based architectures.
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+/**
+ * set_irq_chip - set the irq chip for an irq
+ * @irq: irq number
+ * @chip: pointer to irq chip description structure
+ */
+int set_irq_chip(unsigned int irq, struct irq_chip *chip)
+{
+ struct irq_desc *desc;
+ unsigned long flags;
+
+ if (irq >= NR_IRQS) {
+ printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (!chip)
+ chip = &no_irq_chip;
+
+ desc = irq_desc + irq;
+ spin_lock_irqsave(&desc->lock, flags);
+ irq_chip_set_defaults(chip);
+ desc->chip = chip;
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(set_irq_chip);
+
+/**
+ * set_irq_type - set the irq type for an irq
+ * @irq: irq number
+ * @type: interrupt type - see include/linux/interrupt.h
+ */
+int set_irq_type(unsigned int irq, unsigned int type)
+{
+ struct irq_desc *desc;
+ unsigned long flags;
+ int ret = -ENXIO;
+
+ if (irq >= NR_IRQS) {
+ printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
+ return -ENODEV;
+ }
+
+ desc = irq_desc + irq;
+ if (desc->chip->set_type) {
+ spin_lock_irqsave(&desc->lock, flags);
+ ret = desc->chip->set_type(irq, type);
+ spin_unlock_irqrestore(&desc->lock, flags);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(set_irq_type);
+
+/**
+ * set_irq_data - set irq type data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to interrupt specific data
+ *
+ * Set the hardware irq controller data for an irq
+ */
+int set_irq_data(unsigned int irq, void *data)
+{
+ struct irq_desc *desc;
+ unsigned long flags;
+
+ if (irq >= NR_IRQS) {
+ printk(KERN_ERR
+ "Trying to install controller data for IRQ%d\n", irq);
+ return -EINVAL;
+ }
+
+ desc = irq_desc + irq;
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->handler_data = data;
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(set_irq_data);
+
+/**
+ * set_irq_chip_data - set irq chip data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to chip specific data
+ *
+ * Set the hardware irq chip data for an irq
+ */
+int set_irq_chip_data(unsigned int irq, void *data)
+{
+ struct irq_desc *desc = irq_desc + irq;
+ unsigned long flags;
+
+ if (irq >= NR_IRQS || !desc->chip) {
+ printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->chip_data = data;
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(set_irq_chip_data);
+
+/*
+ * default enable function
+ */
+static void default_enable(unsigned int irq)
+{
+ struct irq_desc *desc = irq_desc + irq;
+
+ desc->chip->unmask(irq);
+ desc->status &= ~IRQ_MASKED;
+}
+
+/*
+ * default disable function
+ */
+static void default_disable(unsigned int irq)
+{
+ struct irq_desc *desc = irq_desc + irq;
+
+ if (!(desc->status & IRQ_DELAYED_DISABLE))
+ desc->chip->mask(irq);
+}
+
+/*
+ * default startup function
+ */
+static unsigned int default_startup(unsigned int irq)
+{
+ irq_desc[irq].chip->enable(irq);
+
+ return 0;
+}
+
+/*
+ * Fixup enable/disable function pointers
+ */
+void irq_chip_set_defaults(struct irq_chip *chip)
+{
+ if (!chip->enable)
+ chip->enable = default_enable;
+ if (!chip->disable)
+ chip->disable = default_disable;
+ if (!chip->startup)
+ chip->startup = default_startup;
+ if (!chip->shutdown)
+ chip->shutdown = chip->disable;
+ if (!chip->name)
+ chip->name = chip->typename;
+}
+
+static inline void mask_ack_irq(struct irq_desc *desc, int irq)
+{
+ if (desc->chip->mask_ack)
+ desc->chip->mask_ack(irq);
+ else {
+ desc->chip->mask(irq);
+ desc->chip->ack(irq);
+ }
+}
+
+/**
+ * handle_simple_irq - Simple and software-decoded IRQs.
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ * @regs: pointer to a register structure
+ *
+ * Simple interrupts are either sent from a demultiplexing interrupt
+ * handler or come from hardware, where no interrupt hardware control
+ * is necessary.
+ *
+ * Note: The caller is expected to handle the ack, clear, mask and
+ * unmask issues if necessary.
+ */
+void fastcall
+handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+{
+ struct irqaction *action;
+ irqreturn_t action_ret;
+ const unsigned int cpu = smp_processor_id();
+
+ spin_lock(&desc->lock);
+
+ if (unlikely(desc->status & IRQ_INPROGRESS))
+ goto out_unlock;
+ desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ kstat_cpu(cpu).irqs[irq]++;
+
+ action = desc->action;
+ if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ goto out_unlock;
+
+ desc->status |= IRQ_INPROGRESS;
+ spin_unlock(&desc->lock);
+
+ action_ret = handle_IRQ_event(irq, regs, action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret, regs);
+
+ spin_lock(&desc->lock);
+ desc->status &= ~IRQ_INPROGRESS;
+out_unlock:
+ spin_unlock(&desc->lock);
+}
+
+/**
+ * handle_level_irq - Level type irq handler
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ * @regs: pointer to a register structure
+ *
+ * Level type interrupts are active as long as the hardware line has
+ * the active level. This may require to mask the interrupt and unmask
+ * it after the associated handler has acknowledged the device, so the
+ * interrupt line is back to inactive.
+ */
+void fastcall
+handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+{
+ unsigned int cpu = smp_processor_id();
+ struct irqaction *action;
+ irqreturn_t action_ret;
+
+ spin_lock(&desc->lock);
+ mask_ack_irq(desc, irq);
+
+ if (unlikely(desc->status & IRQ_INPROGRESS))
+ goto out_unlock;
+ desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ kstat_cpu(cpu).irqs[irq]++;
+
+ /*
+ * If its disabled or no action available
+ * keep it masked and get out of here
+ */
+ action = desc->action;
+ if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
+ desc->status |= IRQ_PENDING;
+ goto out_unlock;
+ }
+
+ desc->status |= IRQ_INPROGRESS;
+ desc->status &= ~IRQ_PENDING;
+ spin_unlock(&desc->lock);
+
+ action_ret = handle_IRQ_event(irq, regs, action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret, regs);
+
+ spin_lock(&desc->lock);
+ desc->status &= ~IRQ_INPROGRESS;
+ if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+ desc->chip->unmask(irq);
+out_unlock:
+ spin_unlock(&desc->lock);
+}
+
+/**
+ * handle_fasteoi_irq - irq handler for transparent controllers
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ * @regs: pointer to a register structure
+ *
+ * Only a single callback will be issued to the chip: an ->eoi()
+ * call when the interrupt has been serviced. This enables support
+ * for modern forms of interrupt handlers, which handle the flow
+ * details in hardware, transparently.
+ */
+void fastcall
+handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc,
+ struct pt_regs *regs)
+{
+ unsigned int cpu = smp_processor_id();
+ struct irqaction *action;
+ irqreturn_t action_ret;
+
+ spin_lock(&desc->lock);
+
+ if (unlikely(desc->status & IRQ_INPROGRESS))
+ goto out;
+
+ desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ kstat_cpu(cpu).irqs[irq]++;
+
+ /*
+ * If its disabled or no action available
+ * keep it masked and get out of here
+ */
+ action = desc->action;
+ if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
+ desc->status |= IRQ_PENDING;
+ goto out;
+ }
+
+ desc->status |= IRQ_INPROGRESS;
+ desc->status &= ~IRQ_PENDING;
+ spin_unlock(&desc->lock);
+
+ action_ret = handle_IRQ_event(irq, regs, action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret, regs);
+
+ spin_lock(&desc->lock);
+ desc->status &= ~IRQ_INPROGRESS;
+out:
+ desc->chip->eoi(irq);
+
+ spin_unlock(&desc->lock);
+}
+
+/**
+ * handle_edge_irq - edge type IRQ handler
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ * @regs: pointer to a register structure
+ *
+ * Interrupt occures on the falling and/or rising edge of a hardware
+ * signal. The occurence is latched into the irq controller hardware
+ * and must be acked in order to be reenabled. After the ack another
+ * interrupt can happen on the same source even before the first one
+ * is handled by the assosiacted event handler. If this happens it
+ * might be necessary to disable (mask) the interrupt depending on the
+ * controller hardware. This requires to reenable the interrupt inside
+ * of the loop which handles the interrupts which have arrived while
+ * the handler was running. If all pending interrupts are handled, the
+ * loop is left.
+ */
+void fastcall
+handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+{
+ const unsigned int cpu = smp_processor_id();
+
+ spin_lock(&desc->lock);
+
+ desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+
+ /*
+ * If we're currently running this IRQ, or its disabled,
+ * we shouldn't process the IRQ. Mark it pending, handle
+ * the necessary masking and go out
+ */
+ if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
+ !desc->action)) {
+ desc->status |= (IRQ_PENDING | IRQ_MASKED);
+ mask_ack_irq(desc, irq);
+ goto out_unlock;
+ }
+
+ kstat_cpu(cpu).irqs[irq]++;
+
+ /* Start handling the irq */
+ desc->chip->ack(irq);
+
+ /* Mark the IRQ currently in progress.*/
+ desc->status |= IRQ_INPROGRESS;
+
+ do {
+ struct irqaction *action = desc->action;
+ irqreturn_t action_ret;
+
+ if (unlikely(!action)) {
+ desc->chip->mask(irq);
+ goto out_unlock;
+ }
+
+ /*
+ * When another irq arrived while we were handling
+ * one, we could have masked the irq.
+ * Renable it, if it was not disabled in meantime.
+ */
+ if (unlikely((desc->status &
+ (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
+ (IRQ_PENDING | IRQ_MASKED))) {
+ desc->chip->unmask(irq);
+ desc->status &= ~IRQ_MASKED;
+ }
+
+ desc->status &= ~IRQ_PENDING;
+ spin_unlock(&desc->lock);
+ action_ret = handle_IRQ_event(irq, regs, action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret, regs);
+ spin_lock(&desc->lock);
+
+ } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
+
+ desc->status &= ~IRQ_INPROGRESS;
+out_unlock:
+ spin_unlock(&desc->lock);
+}
+
+#ifdef CONFIG_SMP
+/**
+ * handle_percpu_IRQ - Per CPU local irq handler
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ * @regs: pointer to a register structure
+ *
+ * Per CPU interrupts on SMP machines without locking requirements
+ */
+void fastcall
+handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+{
+ irqreturn_t action_ret;
+
+ kstat_this_cpu.irqs[irq]++;
+
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
+
+ action_ret = handle_IRQ_event(irq, regs, desc->action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret, regs);
+
+ if (desc->chip->eoi)
+ desc->chip->eoi(irq);
+}
+
+#endif /* CONFIG_SMP */
+
+void
+__set_irq_handler(unsigned int irq,
+ void fastcall (*handle)(unsigned int, irq_desc_t *,
+ struct pt_regs *),
+ int is_chained)
+{
+ struct irq_desc *desc;
+ unsigned long flags;
+
+ if (irq >= NR_IRQS) {
+ printk(KERN_ERR
+ "Trying to install type control for IRQ%d\n", irq);
+ return;
+ }
+
+ desc = irq_desc + irq;
+
+ if (!handle)
+ handle = handle_bad_irq;
+
+ if (desc->chip == &no_irq_chip) {
+ printk(KERN_WARNING "Trying to install %sinterrupt handler "
+ "for IRQ%d\n", is_chained ? "chained " : " ", irq);
+ /*
+ * Some ARM implementations install a handler for really dumb
+ * interrupt hardware without setting an irq_chip. This worked
+ * with the ARM no_irq_chip but the check in setup_irq would
+ * prevent us to setup the interrupt at all. Switch it to
+ * dummy_irq_chip for easy transition.
+ */
+ desc->chip = &dummy_irq_chip;
+ }
+
+ spin_lock_irqsave(&desc->lock, flags);
+
+ /* Uninstall? */
+ if (handle == handle_bad_irq) {
+ if (desc->chip != &no_irq_chip) {
+ desc->chip->mask(irq);
+ desc->chip->ack(irq);
+ }
+ desc->status |= IRQ_DISABLED;
+ desc->depth = 1;
+ }
+ desc->handle_irq = handle;
+
+ if (handle != handle_bad_irq && is_chained) {
+ desc->status &= ~IRQ_DISABLED;
+ desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
+ desc->depth = 0;
+ desc->chip->unmask(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+void
+set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
+ void fastcall (*handle)(unsigned int,
+ struct irq_desc *,
+ struct pt_regs *))
+{
+ set_irq_chip(irq, chip);
+ __set_irq_handler(irq, handle, 0);
+}
+
+/*
+ * Get a descriptive string for the highlevel handler, for
+ * /proc/interrupts output:
+ */
+const char *
+handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
+ struct pt_regs *))
+{
+ if (handle == handle_level_irq)
+ return "level ";
+ if (handle == handle_fasteoi_irq)
+ return "fasteoi";
+ if (handle == handle_edge_irq)
+ return "edge ";
+ if (handle == handle_simple_irq)
+ return "simple ";
+#ifdef CONFIG_SMP
+ if (handle == handle_percpu_irq)
+ return "percpu ";
+#endif
+ if (handle == handle_bad_irq)
+ return "bad ";
+
+ return NULL;
+}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 0f6530117105..4c6cdbaed661 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -1,9 +1,13 @@
/*
* linux/kernel/irq/handle.c
*
- * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
* This file contains the core interrupt handling code.
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ *
*/
#include <linux/irq.h>
@@ -14,11 +18,27 @@
#include "internals.h"
+/**
+ * handle_bad_irq - handle spurious and unhandled irqs
+ * @irq: the interrupt number
+ * @desc: description of the interrupt
+ * @regs: pointer to a register structure
+ *
+ * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
+ */
+void fastcall
+handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
+{
+ print_irq_desc(irq, desc);
+ kstat_this_cpu.irqs[irq]++;
+ ack_bad_irq(irq);
+}
+
/*
* Linux has a controller-independent interrupt architecture.
* Every controller has a 'controller-template', that is used
* by the main code to do the right thing. Each driver-visible
- * interrupt source is transparently wired to the apropriate
+ * interrupt source is transparently wired to the appropriate
* controller. Thus drivers need not be aware of the
* interrupt-controller.
*
@@ -28,41 +48,68 @@
*
* Controller mappings for all interrupt sources:
*/
-irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
+struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
[0 ... NR_IRQS-1] = {
.status = IRQ_DISABLED,
- .handler = &no_irq_type,
- .lock = SPIN_LOCK_UNLOCKED
+ .chip = &no_irq_chip,
+ .handle_irq = handle_bad_irq,
+ .depth = 1,
+ .lock = SPIN_LOCK_UNLOCKED,
+#ifdef CONFIG_SMP
+ .affinity = CPU_MASK_ALL
+#endif
}
};
/*
- * Generic 'no controller' code
+ * What should we do if we get a hw irq event on an illegal vector?
+ * Each architecture has to answer this themself.
*/
-static void end_none(unsigned int irq) { }
-static void enable_none(unsigned int irq) { }
-static void disable_none(unsigned int irq) { }
-static void shutdown_none(unsigned int irq) { }
-static unsigned int startup_none(unsigned int irq) { return 0; }
-
-static void ack_none(unsigned int irq)
+static void ack_bad(unsigned int irq)
{
- /*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themself.
- */
+ print_irq_desc(irq, irq_desc + irq);
ack_bad_irq(irq);
}
-struct hw_interrupt_type no_irq_type = {
- .typename = "none",
- .startup = startup_none,
- .shutdown = shutdown_none,
- .enable = enable_none,
- .disable = disable_none,
- .ack = ack_none,
- .end = end_none,
- .set_affinity = NULL
+/*
+ * NOP functions
+ */
+static void noop(unsigned int irq)
+{
+}
+
+static unsigned int noop_ret(unsigned int irq)
+{
+ return 0;
+}
+
+/*
+ * Generic no controller implementation
+ */
+struct irq_chip no_irq_chip = {
+ .name = "none",
+ .startup = noop_ret,
+ .shutdown = noop,
+ .enable = noop,
+ .disable = noop,
+ .ack = ack_bad,
+ .end = noop,
+};
+
+/*
+ * Generic dummy implementation which can be used for
+ * real dumb interrupt sources
+ */
+struct irq_chip dummy_irq_chip = {
+ .name = "dummy",
+ .startup = noop_ret,
+ .shutdown = noop,
+ .enable = noop,
+ .disable = noop,
+ .ack = noop,
+ .mask = noop,
+ .unmask = noop,
+ .end = noop,
};
/*
@@ -73,17 +120,24 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
return IRQ_NONE;
}
-/*
- * Have got an event to handle:
+/**
+ * handle_IRQ_event - irq action chain handler
+ * @irq: the interrupt number
+ * @regs: pointer to a register structure
+ * @action: the interrupt action chain for this irq
+ *
+ * Handles the action chain of an irq event
*/
-fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
- struct irqaction *action)
+irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
+ struct irqaction *action)
{
irqreturn_t ret, retval = IRQ_NONE;
unsigned int status = 0;
- if (!(action->flags & SA_INTERRUPT))
- local_irq_enable();
+ handle_dynamic_tick(action);
+
+ if (!(action->flags & IRQF_DISABLED))
+ local_irq_enable_in_hardirq();
do {
ret = action->handler(irq, action->dev_id, regs);
@@ -93,22 +147,30 @@ fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
action = action->next;
} while (action);
- if (status & SA_SAMPLE_RANDOM)
+ if (status & IRQF_SAMPLE_RANDOM)
add_interrupt_randomness(irq);
local_irq_disable();
return retval;
}
-/*
- * do_IRQ handles all normal device IRQ's (the special
+#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
+/**
+ * __do_IRQ - original all in one highlevel IRQ handler
+ * @irq: the interrupt number
+ * @regs: pointer to a register structure
+ *
+ * __do_IRQ handles all normal device IRQ's (the special
* SMP cross-CPU interrupts have their own specific
* handlers).
+ *
+ * This is the original x86 implementation which is used for every
+ * interrupt type.
*/
fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
{
- irq_desc_t *desc = irq_desc + irq;
- struct irqaction * action;
+ struct irq_desc *desc = irq_desc + irq;
+ struct irqaction *action;
unsigned int status;
kstat_this_cpu.irqs[irq]++;
@@ -118,16 +180,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
/*
* No locking required for CPU-local interrupts:
*/
- if (desc->handler->ack)
- desc->handler->ack(irq);
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
action_ret = handle_IRQ_event(irq, regs, desc->action);
- desc->handler->end(irq);
+ desc->chip->end(irq);
return 1;
}
spin_lock(&desc->lock);
- if (desc->handler->ack)
- desc->handler->ack(irq);
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
/*
* REPLAY is when Linux resends an IRQ that was dropped earlier
* WAITING is used by probe to mark irqs that are being tested
@@ -187,9 +249,26 @@ out:
* The ->end() handler has to deal with interrupts which got
* disabled while the handler was running.
*/
- desc->handler->end(irq);
+ desc->chip->end(irq);
spin_unlock(&desc->lock);
return 1;
}
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
+void early_init_irq_lock_class(void)
+{
+ int i;
+
+ for (i = 0; i < NR_IRQS; i++)
+ lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class);
+}
+#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 46feba630266..08a849a22447 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -4,6 +4,12 @@
extern int noirqdebug;
+/* Set default functions for irq_chip structures: */
+extern void irq_chip_set_defaults(struct irq_chip *chip);
+
+/* Set default handler: */
+extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
+
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq);
extern void register_handler_proc(unsigned int irq, struct irqaction *action);
@@ -16,3 +22,43 @@ static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif
+/*
+ * Debugging printout:
+ */
+
+#include <linux/kallsyms.h>
+
+#define P(f) if (desc->status & f) printk("%14s set\n", #f)
+
+static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
+{
+ printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
+ irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
+ printk("->handle_irq(): %p, ", desc->handle_irq);
+ print_symbol("%s\n", (unsigned long)desc->handle_irq);
+ printk("->chip(): %p, ", desc->chip);
+ print_symbol("%s\n", (unsigned long)desc->chip);
+ printk("->action(): %p\n", desc->action);
+ if (desc->action) {
+ printk("->action->handler(): %p, ", desc->action->handler);
+ print_symbol("%s\n", (unsigned long)desc->action->handler);
+ }
+
+ P(IRQ_INPROGRESS);
+ P(IRQ_DISABLED);
+ P(IRQ_PENDING);
+ P(IRQ_REPLAY);
+ P(IRQ_AUTODETECT);
+ P(IRQ_WAITING);
+ P(IRQ_LEVEL);
+ P(IRQ_MASKED);
+#ifdef CONFIG_IRQ_PER_CPU
+ P(IRQ_PER_CPU);
+#endif
+ P(IRQ_NOPROBE);
+ P(IRQ_NOREQUEST);
+ P(IRQ_NOAUTOEN);
+}
+
+#undef P
+
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1279e3499534..92be519eff26 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1,12 +1,12 @@
/*
* linux/kernel/irq/manage.c
*
- * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006 Thomas Gleixner
*
* This file contains driver APIs to the irq subsystem.
*/
-#include <linux/config.h>
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/random.h>
@@ -16,12 +16,6 @@
#ifdef CONFIG_SMP
-cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
-
-#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
-cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
-#endif
-
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
* @irq: interrupt number to wait for
@@ -42,7 +36,6 @@ void synchronize_irq(unsigned int irq)
while (desc->status & IRQ_INPROGRESS)
cpu_relax();
}
-
EXPORT_SYMBOL(synchronize_irq);
#endif
@@ -60,7 +53,7 @@ EXPORT_SYMBOL(synchronize_irq);
*/
void disable_irq_nosync(unsigned int irq)
{
- irq_desc_t *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_desc + irq;
unsigned long flags;
if (irq >= NR_IRQS)
@@ -69,11 +62,10 @@ void disable_irq_nosync(unsigned int irq)
spin_lock_irqsave(&desc->lock, flags);
if (!desc->depth++) {
desc->status |= IRQ_DISABLED;
- desc->handler->disable(irq);
+ desc->chip->disable(irq);
}
spin_unlock_irqrestore(&desc->lock, flags);
}
-
EXPORT_SYMBOL(disable_irq_nosync);
/**
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(disable_irq_nosync);
*/
void disable_irq(unsigned int irq)
{
- irq_desc_t *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_desc + irq;
if (irq >= NR_IRQS)
return;
@@ -99,7 +91,6 @@ void disable_irq(unsigned int irq)
if (desc->action)
synchronize_irq(irq);
}
-
EXPORT_SYMBOL(disable_irq);
/**
@@ -114,7 +105,7 @@ EXPORT_SYMBOL(disable_irq);
*/
void enable_irq(unsigned int irq)
{
- irq_desc_t *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_desc + irq;
unsigned long flags;
if (irq >= NR_IRQS)
@@ -123,17 +114,15 @@ void enable_irq(unsigned int irq)
spin_lock_irqsave(&desc->lock, flags);
switch (desc->depth) {
case 0:
+ printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
WARN_ON(1);
break;
case 1: {
unsigned int status = desc->status & ~IRQ_DISABLED;
- desc->status = status;
- if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
- desc->status = status | IRQ_REPLAY;
- hw_resend_irq(desc->handler,irq);
- }
- desc->handler->enable(irq);
+ /* Prevent probing on this irq: */
+ desc->status = status | IRQ_NOPROBE;
+ check_irq_resend(desc, irq);
/* fall-through */
}
default:
@@ -141,9 +130,53 @@ void enable_irq(unsigned int irq)
}
spin_unlock_irqrestore(&desc->lock, flags);
}
-
EXPORT_SYMBOL(enable_irq);
+/**
+ * set_irq_wake - control irq power management wakeup
+ * @irq: interrupt to control
+ * @on: enable/disable power management wakeup
+ *
+ * Enable/disable power management wakeup mode, which is
+ * disabled by default. Enables and disables must match,
+ * just as they match for non-wakeup mode support.
+ *
+ * Wakeup mode lets this IRQ wake the system from sleep
+ * states like "suspend to RAM".
+ */
+int set_irq_wake(unsigned int irq, unsigned int on)
+{
+ struct irq_desc *desc = irq_desc + irq;
+ unsigned long flags;
+ int ret = -ENXIO;
+ int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;
+
+ /* wakeup-capable irqs can be shared between drivers that
+ * don't need to have the same sleep mode behaviors.
+ */
+ spin_lock_irqsave(&desc->lock, flags);
+ if (on) {
+ if (desc->wake_depth++ == 0)
+ desc->status |= IRQ_WAKEUP;
+ else
+ set_wake = NULL;
+ } else {
+ if (desc->wake_depth == 0) {
+ printk(KERN_WARNING "Unbalanced IRQ %d "
+ "wake disable\n", irq);
+ WARN_ON(1);
+ } else if (--desc->wake_depth == 0)
+ desc->status &= ~IRQ_WAKEUP;
+ else
+ set_wake = NULL;
+ }
+ if (set_wake)
+ ret = desc->chip->set_wake(irq, on);
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(set_irq_wake);
+
/*
* Internal function that tells the architecture code whether a
* particular irq has been exclusively allocated or is available
@@ -153,22 +186,33 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
{
struct irqaction *action;
- if (irq >= NR_IRQS)
+ if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST)
return 0;
action = irq_desc[irq].action;
if (action)
- if (irqflags & action->flags & SA_SHIRQ)
+ if (irqflags & action->flags & IRQF_SHARED)
action = NULL;
return !action;
}
+void compat_irq_chip_set_default_handler(struct irq_desc *desc)
+{
+ /*
+ * If the architecture still has not overriden
+ * the flow handler then zap the default. This
+ * should catch incorrect flow-type setting.
+ */
+ if (desc->handle_irq == &handle_bad_irq)
+ desc->handle_irq = NULL;
+}
+
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
*/
-int setup_irq(unsigned int irq, struct irqaction * new)
+int setup_irq(unsigned int irq, struct irqaction *new)
{
struct irq_desc *desc = irq_desc + irq;
struct irqaction *old, **p;
@@ -178,14 +222,14 @@ int setup_irq(unsigned int irq, struct irqaction * new)
if (irq >= NR_IRQS)
return -EINVAL;
- if (desc->handler == &no_irq_type)
+ if (desc->chip == &no_irq_chip)
return -ENOSYS;
/*
* Some drivers like serial.c use request_irq() heavily,
* so we have to be careful not to interfere with a
* running system.
*/
- if (new->flags & SA_SAMPLE_RANDOM) {
+ if (new->flags & IRQF_SAMPLE_RANDOM) {
/*
* This function might sleep, we want to call it first,
* outside of the atomic block.
@@ -200,16 +244,24 @@ int setup_irq(unsigned int irq, struct irqaction * new)
/*
* The following block of code has to be executed atomically
*/
- spin_lock_irqsave(&desc->lock,flags);
+ spin_lock_irqsave(&desc->lock, flags);
p = &desc->action;
- if ((old = *p) != NULL) {
- /* Can't share interrupts unless both agree to */
- if (!(old->flags & new->flags & SA_SHIRQ))
+ old = *p;
+ if (old) {
+ /*
+ * Can't share interrupts unless both agree to and are
+ * the same type (level, edge, polarity). So both flag
+ * fields must have IRQF_SHARED set and the bits which
+ * set the trigger type must match.
+ */
+ if (!((old->flags & new->flags) & IRQF_SHARED) ||
+ ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK))
goto mismatch;
-#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
+#if defined(CONFIG_IRQ_PER_CPU)
/* All handlers must agree on per-cpuness */
- if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU))
+ if ((old->flags & IRQF_PERCPU) !=
+ (new->flags & IRQF_PERCPU))
goto mismatch;
#endif
@@ -222,20 +274,45 @@ int setup_irq(unsigned int irq, struct irqaction * new)
}
*p = new;
-#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
- if (new->flags & SA_PERCPU_IRQ)
+#if defined(CONFIG_IRQ_PER_CPU)
+ if (new->flags & IRQF_PERCPU)
desc->status |= IRQ_PER_CPU;
#endif
if (!shared) {
- desc->depth = 0;
- desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
- IRQ_WAITING | IRQ_INPROGRESS);
- if (desc->handler->startup)
- desc->handler->startup(irq);
- else
- desc->handler->enable(irq);
+ irq_chip_set_defaults(desc->chip);
+
+ /* Setup the type (level, edge polarity) if configured: */
+ if (new->flags & IRQF_TRIGGER_MASK) {
+ if (desc->chip && desc->chip->set_type)
+ desc->chip->set_type(irq,
+ new->flags & IRQF_TRIGGER_MASK);
+ else
+ /*
+ * IRQF_TRIGGER_* but the PIC does not support
+ * multiple flow-types?
+ */
+ printk(KERN_WARNING "No IRQF_TRIGGER set_type "
+ "function for IRQ %d (%s)\n", irq,
+ desc->chip ? desc->chip->name :
+ "unknown");
+ } else
+ compat_irq_chip_set_default_handler(desc);
+
+ desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
+ IRQ_INPROGRESS);
+
+ if (!(desc->status & IRQ_NOAUTOEN)) {
+ desc->depth = 0;
+ desc->status &= ~IRQ_DISABLED;
+ if (desc->chip->startup)
+ desc->chip->startup(irq);
+ else
+ desc->chip->enable(irq);
+ } else
+ /* Undo nested disables: */
+ desc->depth = 1;
}
- spin_unlock_irqrestore(&desc->lock,flags);
+ spin_unlock_irqrestore(&desc->lock, flags);
new->irq = irq;
register_irq_proc(irq);
@@ -246,8 +323,8 @@ int setup_irq(unsigned int irq, struct irqaction * new)
mismatch:
spin_unlock_irqrestore(&desc->lock, flags);
- if (!(new->flags & SA_PROBEIRQ)) {
- printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__);
+ if (!(new->flags & IRQF_PROBE_SHARED)) {
+ printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
dump_stack();
}
return -EBUSY;
@@ -278,10 +355,10 @@ void free_irq(unsigned int irq, void *dev_id)
return;
desc = irq_desc + irq;
- spin_lock_irqsave(&desc->lock,flags);
+ spin_lock_irqsave(&desc->lock, flags);
p = &desc->action;
for (;;) {
- struct irqaction * action = *p;
+ struct irqaction *action = *p;
if (action) {
struct irqaction **pp = p;
@@ -295,18 +372,18 @@ void free_irq(unsigned int irq, void *dev_id)
/* Currently used only by UML, might disappear one day.*/
#ifdef CONFIG_IRQ_RELEASE_METHOD
- if (desc->handler->release)
- desc->handler->release(irq, dev_id);
+ if (desc->chip->release)
+ desc->chip->release(irq, dev_id);
#endif
if (!desc->action) {
desc->status |= IRQ_DISABLED;
- if (desc->handler->shutdown)
- desc->handler->shutdown(irq);
+ if (desc->chip->shutdown)
+ desc->chip->shutdown(irq);
else
- desc->handler->disable(irq);
+ desc->chip->disable(irq);
}
- spin_unlock_irqrestore(&desc->lock,flags);
+ spin_unlock_irqrestore(&desc->lock, flags);
unregister_handler_proc(irq, action);
/* Make sure it's not being used on another CPU */
@@ -314,12 +391,11 @@ void free_irq(unsigned int irq, void *dev_id)
kfree(action);
return;
}
- printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
- spin_unlock_irqrestore(&desc->lock,flags);
+ printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
+ spin_unlock_irqrestore(&desc->lock, flags);
return;
}
}
-
EXPORT_SYMBOL(free_irq);
/**
@@ -346,28 +422,36 @@ EXPORT_SYMBOL(free_irq);
*
* Flags:
*
- * SA_SHIRQ Interrupt is shared
- * SA_INTERRUPT Disable local interrupts while processing
- * SA_SAMPLE_RANDOM The interrupt can be used for entropy
+ * IRQF_SHARED Interrupt is shared
+ * IRQF_DISABLED Disable local interrupts while processing
+ * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
*
*/
int request_irq(unsigned int irq,
irqreturn_t (*handler)(int, void *, struct pt_regs *),
- unsigned long irqflags, const char * devname, void *dev_id)
+ unsigned long irqflags, const char *devname, void *dev_id)
{
- struct irqaction * action;
+ struct irqaction *action;
int retval;
+#ifdef CONFIG_LOCKDEP
+ /*
+ * Lockdep wants atomic interrupt handlers:
+ */
+ irqflags |= SA_INTERRUPT;
+#endif
/*
* Sanity-check: shared interrupts must pass in a real dev-ID,
* otherwise we'll have trouble later trying to figure out
* which interrupt is which (messes up the interrupt freeing
* logic etc).
*/
- if ((irqflags & SA_SHIRQ) && !dev_id)
+ if ((irqflags & IRQF_SHARED) && !dev_id)
return -EINVAL;
if (irq >= NR_IRQS)
return -EINVAL;
+ if (irq_desc[irq].status & IRQ_NOREQUEST)
+ return -EINVAL;
if (!handler)
return -EINVAL;
@@ -390,6 +474,5 @@ int request_irq(unsigned int irq,
return retval;
}
-
EXPORT_SYMBOL(request_irq);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index a12d00eb5e7c..a57ebe9fa6f6 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -3,19 +3,19 @@
void set_pending_irq(unsigned int irq, cpumask_t mask)
{
- irq_desc_t *desc = irq_desc + irq;
+ struct irq_desc *desc = irq_desc + irq;
unsigned long flags;
spin_lock_irqsave(&desc->lock, flags);
desc->move_irq = 1;
- pending_irq_cpumask[irq] = mask;
+ irq_desc[irq].pending_mask = mask;
spin_unlock_irqrestore(&desc->lock, flags);
}
void move_native_irq(int irq)
{
+ struct irq_desc *desc = irq_desc + irq;
cpumask_t tmp;
- irq_desc_t *desc = irq_descp(irq);
if (likely(!desc->move_irq))
return;
@@ -30,15 +30,15 @@ void move_native_irq(int irq)
desc->move_irq = 0;
- if (unlikely(cpus_empty(pending_irq_cpumask[irq])))
+ if (unlikely(cpus_empty(irq_desc[irq].pending_mask)))
return;
- if (!desc->handler->set_affinity)
+ if (!desc->chip->set_affinity)
return;
assert_spin_locked(&desc->lock);
- cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
+ cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map);
/*
* If there was a valid mask to work with, please
@@ -51,12 +51,12 @@ void move_native_irq(int irq)
*/
if (likely(!cpus_empty(tmp))) {
if (likely(!(desc->status & IRQ_DISABLED)))
- desc->handler->disable(irq);
+ desc->chip->disable(irq);
- desc->handler->set_affinity(irq,tmp);
+ desc->chip->set_affinity(irq,tmp);
if (likely(!(desc->status & IRQ_DISABLED)))
- desc->handler->enable(irq);
+ desc->chip->enable(irq);
}
- cpus_clear(pending_irq_cpumask[irq]);
+ cpus_clear(irq_desc[irq].pending_mask);
}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index afacd6f585fa..607c7809ad01 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,15 +12,10 @@
#include "internals.h"
-static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
+static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
-/*
- * The /proc/irq/<irq>/smp_affinity values:
- */
-static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
-
#ifdef CONFIG_GENERIC_PENDING_IRQ
void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
{
@@ -36,15 +31,15 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
{
set_balance_irq_affinity(irq, mask_val);
- irq_affinity[irq] = mask_val;
- irq_desc[irq].handler->set_affinity(irq, mask_val);
+ irq_desc[irq].affinity = mask_val;
+ irq_desc[irq].chip->set_affinity(irq, mask_val);
}
#endif
static int irq_affinity_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
- int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]);
+ int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity);
if (count - len < 2)
return -EINVAL;
@@ -59,7 +54,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
unsigned int irq = (int)(long)data, full_count = count, err;
cpumask_t new_value, tmp;
- if (!irq_desc[irq].handler->set_affinity || no_irq_affinity)
+ if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
return -EIO;
err = cpumask_parse(buffer, count, new_value);
@@ -102,7 +97,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
{
char name [MAX_NAMELEN];
- if (!irq_dir[irq] || action->dir || !action->name ||
+ if (!irq_desc[irq].dir || action->dir || !action->name ||
!name_unique(irq, action))
return;
@@ -110,7 +105,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
snprintf(name, MAX_NAMELEN, "%s", action->name);
/* create /proc/irq/1234/handler/ */
- action->dir = proc_mkdir(name, irq_dir[irq]);
+ action->dir = proc_mkdir(name, irq_desc[irq].dir);
}
#undef MAX_NAMELEN
@@ -122,22 +117,22 @@ void register_irq_proc(unsigned int irq)
char name [MAX_NAMELEN];
if (!root_irq_dir ||
- (irq_desc[irq].handler == &no_irq_type) ||
- irq_dir[irq])
+ (irq_desc[irq].chip == &no_irq_chip) ||
+ irq_desc[irq].dir)
return;
memset(name, 0, MAX_NAMELEN);
sprintf(name, "%d", irq);
/* create /proc/irq/1234 */
- irq_dir[irq] = proc_mkdir(name, root_irq_dir);
+ irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
#ifdef CONFIG_SMP
{
struct proc_dir_entry *entry;
/* create /proc/irq/<irq>/smp_affinity */
- entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
+ entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
if (entry) {
entry->nlink = 1;
@@ -145,7 +140,6 @@ void register_irq_proc(unsigned int irq)
entry->read_proc = irq_affinity_read_proc;
entry->write_proc = irq_affinity_write_proc;
}
- smp_affinity_entry[irq] = entry;
}
#endif
}
@@ -155,7 +149,7 @@ void register_irq_proc(unsigned int irq)
void unregister_handler_proc(unsigned int irq, struct irqaction *action)
{
if (action->dir)
- remove_proc_entry(action->dir->name, irq_dir[irq]);
+ remove_proc_entry(action->dir->name, irq_desc[irq].dir);
}
void init_irq_proc(void)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
new file mode 100644
index 000000000000..35f10f7ff94a
--- /dev/null
+++ b/kernel/irq/resend.c
@@ -0,0 +1,77 @@
+/*
+ * linux/kernel/irq/resend.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner
+ *
+ * This file contains the IRQ-resend code
+ *
+ * If the interrupt is waiting to be processed, we try to re-run it.
+ * We can't directly run it from here since the caller might be in an
+ * interrupt-protected region. Not all irq controller chips can
+ * retrigger interrupts at the hardware level, so in those cases
+ * we allow the resending of IRQs via a tasklet.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+#ifdef CONFIG_HARDIRQS_SW_RESEND
+
+/* Bitmap to handle software resend of interrupts: */
+static DECLARE_BITMAP(irqs_resend, NR_IRQS);
+
+/*
+ * Run software resends of IRQ's
+ */
+static void resend_irqs(unsigned long arg)
+{
+ struct irq_desc *desc;
+ int irq;
+
+ while (!bitmap_empty(irqs_resend, NR_IRQS)) {
+ irq = find_first_bit(irqs_resend, NR_IRQS);
+ clear_bit(irq, irqs_resend);
+ desc = irq_desc + irq;
+ local_irq_disable();
+ desc->handle_irq(irq, desc, NULL);
+ local_irq_enable();
+ }
+}
+
+/* Tasklet to handle resend: */
+static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
+
+#endif
+
+/*
+ * IRQ resend
+ *
+ * Is called with interrupts disabled and desc->lock held.
+ */
+void check_irq_resend(struct irq_desc *desc, unsigned int irq)
+{
+ unsigned int status = desc->status;
+
+ /*
+ * Make sure the interrupt is enabled, before resending it:
+ */
+ desc->chip->enable(irq);
+
+ if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
+ desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
+
+ if (!desc->chip || !desc->chip->retrigger ||
+ !desc->chip->retrigger(irq)) {
+#ifdef CONFIG_HARDIRQS_SW_RESEND
+ /* Set it pending and activate the softirq: */
+ set_bit(irq, irqs_resend);
+ tasklet_schedule(&resend_tasklet);
+#endif
+ }
+ }
+}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index b2fb3c18d06b..417e98092cf2 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -16,39 +16,39 @@ static int irqfixup __read_mostly;
/*
* Recovery handler for misrouted interrupts.
*/
-
static int misrouted_irq(int irq, struct pt_regs *regs)
{
int i;
- irq_desc_t *desc;
int ok = 0;
int work = 0; /* Did we do work for a real IRQ */
- for(i = 1; i < NR_IRQS; i++) {
+ for (i = 1; i < NR_IRQS; i++) {
+ struct irq_desc *desc = irq_desc + i;
struct irqaction *action;
if (i == irq) /* Already tried */
continue;
- desc = &irq_desc[i];
+
spin_lock(&desc->lock);
- action = desc->action;
/* Already running on another processor */
if (desc->status & IRQ_INPROGRESS) {
/*
* Already running: If it is shared get the other
* CPU to go looking for our mystery interrupt too
*/
- if (desc->action && (desc->action->flags & SA_SHIRQ))
+ if (desc->action && (desc->action->flags & IRQF_SHARED))
desc->status |= IRQ_PENDING;
spin_unlock(&desc->lock);
continue;
}
/* Honour the normal IRQ locking */
desc->status |= IRQ_INPROGRESS;
+ action = desc->action;
spin_unlock(&desc->lock);
+
while (action) {
/* Only shared IRQ handlers are safe to call */
- if (action->flags & SA_SHIRQ) {
+ if (action->flags & IRQF_SHARED) {
if (action->handler(i, action->dev_id, regs) ==
IRQ_HANDLED)
ok = 1;
@@ -62,9 +62,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
/*
* While we were looking for a fixup someone queued a real
- * IRQ clashing with our walk
+ * IRQ clashing with our walk:
*/
-
while ((desc->status & IRQ_PENDING) && action) {
/*
* Perform real IRQ processing for the IRQ we deferred
@@ -80,8 +79,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
* If we did actual work for the real IRQ line we must let the
* IRQ controller clean up too
*/
- if(work)
- desc->handler->end(i);
+ if (work && desc->chip && desc->chip->end)
+ desc->chip->end(i);
spin_unlock(&desc->lock);
}
/* So the caller can adjust the irq error counts */
@@ -100,7 +99,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
*/
static void
-__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+__report_bad_irq(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret)
{
struct irqaction *action;
@@ -113,6 +113,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
}
dump_stack();
printk(KERN_ERR "handlers:\n");
+
action = desc->action;
while (action) {
printk(KERN_ERR "[<%p>]", action->handler);
@@ -123,7 +124,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
}
}
-static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+static void
+report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
{
static int count = 100;
@@ -133,8 +135,8 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio
}
}
-void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
- struct pt_regs *regs)
+void note_interrupt(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret, struct pt_regs *regs)
{
if (unlikely(action_ret != IRQ_HANDLED)) {
desc->irqs_unhandled++;
@@ -166,7 +168,8 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
*/
printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
desc->status |= IRQ_DISABLED;
- desc->handler->disable(irq);
+ desc->depth = 1;
+ desc->chip->disable(irq);
}
desc->irqs_unhandled = 0;
}
@@ -177,6 +180,7 @@ int __init noirqdebug_setup(char *str)
{
noirqdebug = 1;
printk(KERN_INFO "IRQ lockup detection disabled\n");
+
return 1;
}
@@ -187,6 +191,7 @@ static int __init irqfixup_setup(char *str)
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
+
return 1;
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 39277dd6bf90..ab16a5a4cfe9 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter)
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
- &iter->value,
- &iter->type, iter->name);
+ &iter->value, &iter->type,
+ iter->name, sizeof(iter->name));
if (iter->owner == NULL)
return 0;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 58f0f382597c..fcdd5d2bc3f4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -40,7 +40,7 @@ struct resource crashk_res = {
int kexec_should_crash(struct task_struct *p)
{
- if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+ if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
return 1;
return 0;
}
@@ -995,7 +995,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
image = xchg(dest_image, image);
out:
- xchg(&kexec_lock, 0); /* Release the mutex */
+ locked = xchg(&kexec_lock, 0); /* Release the mutex */
+ BUG_ON(!locked);
kimage_free(image);
return result;
@@ -1042,7 +1043,6 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
void crash_kexec(struct pt_regs *regs)
{
- struct kimage *image;
int locked;
@@ -1056,14 +1056,14 @@ void crash_kexec(struct pt_regs *regs)
*/
locked = xchg(&kexec_lock, 1);
if (!locked) {
- image = xchg(&kexec_crash_image, NULL);
- if (image) {
+ if (kexec_crash_image) {
struct pt_regs fixed_regs;
crash_setup_regs(&fixed_regs, regs);
machine_crash_shutdown(&fixed_regs);
- machine_kexec(image);
+ machine_kexec(kexec_crash_image);
}
- xchg(&kexec_lock, 0);
+ locked = xchg(&kexec_lock, 0);
+ BUG_ON(!locked);
}
}
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 64ab045c3d9d..5d1d907378a2 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -122,6 +122,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
len = min(len, fifo->size - fifo->in + fifo->out);
+ /*
+ * Ensure that we sample the fifo->out index -before- we
+ * start putting bytes into the kfifo.
+ */
+
+ smp_mb();
+
/* first put the data starting from fifo->in to buffer end */
l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
@@ -129,6 +136,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
/* then put the rest (if any) at the beginning of the buffer */
memcpy(fifo->buffer, buffer + l, len - l);
+ /*
+ * Ensure that we add the bytes to the kfifo -before-
+ * we update the fifo->in index.
+ */
+
+ smp_wmb();
+
fifo->in += len;
return len;
@@ -154,6 +168,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
len = min(len, fifo->in - fifo->out);
+ /*
+ * Ensure that we sample the fifo->in index -before- we
+ * start removing bytes from the kfifo.
+ */
+
+ smp_rmb();
+
/* first get the data from fifo->out until the end of the buffer */
l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
@@ -161,6 +182,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
/* then get the rest (if any) from the beginning of the buffer */
memcpy(buffer + l, fifo->buffer, len - l);
+ /*
+ * Ensure that we remove the bytes from the kfifo -before-
+ * we update the fifo->out index.
+ */
+
+ smp_mb();
+
fifo->out += len;
return len;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 20a997c73c3d..842f8015d7fd 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -20,7 +20,6 @@
*/
#define __KERNEL_SYSCALLS__
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
@@ -177,6 +176,8 @@ static int wait_for_helper(void *data)
if (pid < 0) {
sub_info->retval = pid;
} else {
+ int ret;
+
/*
* Normally it is bogus to call wait4() from in-kernel because
* wait4() wants to write the exit code to a userspace address.
@@ -186,7 +187,15 @@ static int wait_for_helper(void *data)
*
* Thus the __user pointer cast is valid here.
*/
- sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL);
+ sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+ /*
+ * If ret is 0, either ____call_usermodehelper failed and the
+ * real error code is already in sub_info->retval or
+ * sub_info->retval is 0 anyway, so don't mess with it then.
+ */
+ if (ret)
+ sub_info->retval = ret;
}
complete(sub_info->complete);
@@ -198,11 +207,12 @@ static void __call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
pid_t pid;
+ int wait = sub_info->wait;
/* CLONE_VFORK: wait until the usermode helper has execve'd
* successfully We need the data structures to stay around
* until that is done. */
- if (sub_info->wait)
+ if (wait)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
else
@@ -212,7 +222,7 @@ static void __call_usermodehelper(void *data)
if (pid < 0) {
sub_info->retval = pid;
complete(sub_info->complete);
- } else if (!sub_info->wait)
+ } else if (!wait)
complete(sub_info->complete);
}
@@ -234,7 +244,7 @@ static void __call_usermodehelper(void *data)
int call_usermodehelper_keys(char *path, char **argv, char **envp,
struct key *session_keyring, int wait)
{
- DECLARE_COMPLETION(done);
+ DECLARE_COMPLETION_ONSTACK(done);
struct subprocess_info sub_info = {
.complete = &done,
.path = path,
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1fbf466a29aa..3f57dfdc8f92 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,11 +47,17 @@
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
+static atomic_t kprobe_count;
DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
+static struct notifier_block kprobe_page_fault_nb = {
+ .notifier_call = kprobe_exceptions_notify,
+ .priority = 0x7fffffff /* we need to notified first */
+};
+
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
* kprobe->ainsn.insn points to the copy of the instruction to be
@@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
*/
static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
{
- struct kprobe *kp;
-
if (p->break_handler) {
- list_for_each_entry_rcu(kp, &old_p->list, list) {
- if (kp->break_handler)
- return -EEXIST;
- }
+ if (old_p->break_handler)
+ return -EEXIST;
list_add_tail_rcu(&p->list, &old_p->list);
+ old_p->break_handler = aggr_break_handler;
} else
list_add_rcu(&p->list, &old_p->list);
+ if (p->post_handler && !old_p->post_handler)
+ old_p->post_handler = aggr_post_handler;
return 0;
}
@@ -388,11 +393,14 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
copy_kprobe(p, ap);
+ flush_insn_slot(ap);
ap->addr = p->addr;
ap->pre_handler = aggr_pre_handler;
- ap->post_handler = aggr_post_handler;
ap->fault_handler = aggr_fault_handler;
- ap->break_handler = aggr_break_handler;
+ if (p->post_handler)
+ ap->post_handler = aggr_post_handler;
+ if (p->break_handler)
+ ap->break_handler = aggr_break_handler;
INIT_LIST_HEAD(&ap->list);
list_add_rcu(&p->list, &ap->list);
@@ -464,6 +472,8 @@ static int __kprobes __register_kprobe(struct kprobe *p,
old_p = get_kprobe(p->addr);
if (old_p) {
ret = register_aggr_kprobe(old_p, p);
+ if (!ret)
+ atomic_inc(&kprobe_count);
goto out;
}
@@ -474,6 +484,10 @@ static int __kprobes __register_kprobe(struct kprobe *p,
hlist_add_head_rcu(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+ if (atomic_add_return(1, &kprobe_count) == \
+ (ARCH_INACTIVE_KPROBE_COUNT + 1))
+ register_page_fault_notifier(&kprobe_page_fault_nb);
+
arch_arm_kprobe(p);
out:
@@ -536,14 +550,40 @@ valid_p:
kfree(old_p);
}
arch_remove_kprobe(p);
+ } else {
+ mutex_lock(&kprobe_mutex);
+ if (p->break_handler)
+ old_p->break_handler = NULL;
+ if (p->post_handler){
+ list_for_each_entry_rcu(list_p, &old_p->list, list){
+ if (list_p->post_handler){
+ cleanup_p = 2;
+ break;
+ }
+ }
+ if (cleanup_p == 0)
+ old_p->post_handler = NULL;
+ }
+ mutex_unlock(&kprobe_mutex);
}
+
+ /* Call unregister_page_fault_notifier()
+ * if no probes are active
+ */
+ mutex_lock(&kprobe_mutex);
+ if (atomic_add_return(-1, &kprobe_count) == \
+ ARCH_INACTIVE_KPROBE_COUNT)
+ unregister_page_fault_notifier(&kprobe_page_fault_nb);
+ mutex_unlock(&kprobe_mutex);
+ return;
}
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
- .priority = 0x7fffffff /* we need to notified first */
+ .priority = 0x7fffffff /* we need to be notified first */
};
+
int __kprobes register_jprobe(struct jprobe *jp)
{
/* Todo: Verify probepoint is a function entry point */
@@ -652,6 +692,7 @@ static int __init init_kprobes(void)
INIT_HLIST_HEAD(&kprobe_table[i]);
INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
}
+ atomic_set(&kprobe_count, 0);
err = arch_init_kprobes();
if (!err)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 9e28478a17a5..e0ffe4ab0917 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -8,7 +8,6 @@
*
*/
-#include <linux/config.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c5f3c6613b6d..4f9c60ef95e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -45,6 +45,13 @@ struct kthread_stop_info
static DEFINE_MUTEX(kthread_stop_lock);
static struct kthread_stop_info kthread_stop_info;
+/**
+ * kthread_should_stop - should this kthread return now?
+ *
+ * When someone calls kthread_stop on your kthread, it will be woken
+ * and this will return true. You should then return, and your return
+ * value will be passed through to kthread_stop().
+ */
int kthread_should_stop(void)
{
return (kthread_stop_info.k == current);
@@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create)
complete(&create->done);
}
+/**
+ * kthread_create - create a kthread.
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @namefmt: printf-style name for the thread.
+ *
+ * Description: This helper function creates and names a kernel
+ * thread. The thread will be stopped: use wake_up_process() to start
+ * it. See also kthread_run(), kthread_create_on_cpu().
+ *
+ * When woken, the thread will run @threadfn() with @data as its
+ * argument. @threadfn can either call do_exit() directly if it is a
+ * standalone thread for which noone will call kthread_stop(), or
+ * return when 'kthread_should_stop()' is true (which means
+ * kthread_stop() has been called). The return value should be zero
+ * or a negative error number; it will be passed to kthread_stop().
+ *
+ * Returns a task_struct or ERR_PTR(-ENOMEM).
+ */
struct task_struct *kthread_create(int (*threadfn)(void *data),
void *data,
const char namefmt[],
@@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create);
+/**
+ * kthread_bind - bind a just-created kthread to a cpu.
+ * @k: thread created by kthread_create().
+ * @cpu: cpu (might not be online, must be possible) for @k to run on.
+ *
+ * Description: This function is equivalent to set_cpus_allowed(),
+ * except that @cpu doesn't need to be online, and the thread must be
+ * stopped (i.e., just returned from kthread_create().
+ */
void kthread_bind(struct task_struct *k, unsigned int cpu)
{
BUG_ON(k->state != TASK_INTERRUPTIBLE);
@@ -166,14 +201,21 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
}
EXPORT_SYMBOL(kthread_bind);
+/**
+ * kthread_stop - stop a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_stop() for @k to return true, wakes it, and
+ * waits for it to exit. Your threadfn() must not call do_exit()
+ * itself if you use this function! This can also be called after
+ * kthread_create() instead of calling wake_up_process(): the thread
+ * will exit without calling threadfn().
+ *
+ * Returns the result of threadfn(), or %-EINTR if wake_up_process()
+ * was never called.
+ */
int kthread_stop(struct task_struct *k)
{
- return kthread_stop_sem(k, NULL);
-}
-EXPORT_SYMBOL(kthread_stop);
-
-int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
-{
int ret;
mutex_lock(&kthread_stop_lock);
@@ -187,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
/* Now set kthread_should_stop() to true, and wake it up. */
kthread_stop_info.k = k;
- if (s)
- up(s);
- else
- wake_up_process(k);
+ wake_up_process(k);
put_task_struct(k);
/* Once it dies, reset stop ptr, gather result and we're done. */
@@ -201,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
return ret;
}
-EXPORT_SYMBOL(kthread_stop_sem);
+EXPORT_SYMBOL(kthread_stop);
static __init int helper_init(void)
{
@@ -210,5 +249,5 @@ static __init int helper_init(void)
return 0;
}
-core_initcall(helper_init);
+core_initcall(helper_init);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
new file mode 100644
index 000000000000..e596525669ed
--- /dev/null
+++ b/kernel/lockdep.c
@@ -0,0 +1,2724 @@
+/*
+ * kernel/lockdep.c
+ *
+ * Runtime locking correctness validator
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * this code maps all the lock dependencies as they occur in a live kernel
+ * and will warn about the following classes of locking bugs:
+ *
+ * - lock inversion scenarios
+ * - circular lock dependencies
+ * - hardirq/softirq safe/unsafe locking bugs
+ *
+ * Bugs are reported even if the current locking scenario does not cause
+ * any deadlock at this point.
+ *
+ * I.e. if anytime in the past two locks were taken in a different order,
+ * even if it happened for another task, even if those were different
+ * locks (but of the same class as this lock), this code will detect it.
+ *
+ * Thanks to Arjan van de Ven for coming up with the initial idea of
+ * mapping lock dependencies runtime.
+ */
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/stacktrace.h>
+#include <linux/debug_locks.h>
+#include <linux/irqflags.h>
+#include <linux/utsname.h>
+
+#include <asm/sections.h>
+
+#include "lockdep_internals.h"
+
+/*
+ * hash_lock: protects the lockdep hashes and class/list/hash allocators.
+ *
+ * This is one of the rare exceptions where it's justified
+ * to use a raw spinlock - we really dont want the spinlock
+ * code to recurse back into the lockdep code.
+ */
+static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+static int lockdep_initialized;
+
+unsigned long nr_list_entries;
+static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+
+/*
+ * Allocate a lockdep entry. (assumes hash_lock held, returns
+ * with NULL on failure)
+ */
+static struct lock_list *alloc_list_entry(void)
+{
+ if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return NULL;
+ }
+ return list_entries + nr_list_entries++;
+}
+
+/*
+ * All data structures here are protected by the global debug_lock.
+ *
+ * Mutex key structs only get allocated, once during bootup, and never
+ * get freed - this significantly simplifies the debugging code.
+ */
+unsigned long nr_lock_classes;
+static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+
+/*
+ * We keep a global list of all lock classes. The list only grows,
+ * never shrinks. The list is only accessed with the lockdep
+ * spinlock lock held.
+ */
+LIST_HEAD(all_lock_classes);
+
+/*
+ * The lockdep classes are in a hash-table as well, for fast lookup:
+ */
+#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
+#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS)
+#define CLASSHASH_MASK (CLASSHASH_SIZE - 1)
+#define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK)
+#define classhashentry(key) (classhash_table + __classhashfn((key)))
+
+static struct list_head classhash_table[CLASSHASH_SIZE];
+
+unsigned long nr_lock_chains;
+static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+
+/*
+ * We put the lock dependency chains into a hash-table as well, to cache
+ * their existence:
+ */
+#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1)
+#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS)
+#define CHAINHASH_MASK (CHAINHASH_SIZE - 1)
+#define __chainhashfn(chain) \
+ (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK)
+#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
+
+static struct list_head chainhash_table[CHAINHASH_SIZE];
+
+/*
+ * The hash key of the lock dependency chains is a hash itself too:
+ * it's a hash of all locks taken up to that lock, including that lock.
+ * It's a 64-bit hash, because it's important for the keys to be
+ * unique.
+ */
+#define iterate_chain_key(key1, key2) \
+ (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
+ ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
+ (key2))
+
+void lockdep_off(void)
+{
+ current->lockdep_recursion++;
+}
+
+EXPORT_SYMBOL(lockdep_off);
+
+void lockdep_on(void)
+{
+ current->lockdep_recursion--;
+}
+
+EXPORT_SYMBOL(lockdep_on);
+
+int lockdep_internal(void)
+{
+ return current->lockdep_recursion != 0;
+}
+
+EXPORT_SYMBOL(lockdep_internal);
+
+/*
+ * Debugging switches:
+ */
+
+#define VERBOSE 0
+#ifdef VERBOSE
+# define VERY_VERBOSE 0
+#endif
+
+#if VERBOSE
+# define HARDIRQ_VERBOSE 1
+# define SOFTIRQ_VERBOSE 1
+#else
+# define HARDIRQ_VERBOSE 0
+# define SOFTIRQ_VERBOSE 0
+#endif
+
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
+/*
+ * Quick filtering for interesting events:
+ */
+static int class_filter(struct lock_class *class)
+{
+#if 0
+ /* Example */
+ if (class->name_version == 1 &&
+ !strcmp(class->name, "lockname"))
+ return 1;
+ if (class->name_version == 1 &&
+ !strcmp(class->name, "&struct->lockfield"))
+ return 1;
+#endif
+ /* Allow everything else. 0 would be filter everything else */
+ return 1;
+}
+#endif
+
+static int verbose(struct lock_class *class)
+{
+#if VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+
+static int hardirq_verbose(struct lock_class *class)
+{
+#if HARDIRQ_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+static int softirq_verbose(struct lock_class *class)
+{
+#if SOFTIRQ_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+#endif
+
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the hash_lock.
+ */
+unsigned long nr_stack_trace_entries;
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+
+static int save_trace(struct stack_trace *trace)
+{
+ trace->nr_entries = 0;
+ trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+ trace->entries = stack_trace + nr_stack_trace_entries;
+
+ trace->skip = 3;
+ trace->all_contexts = 0;
+
+ /* Make sure to not recurse in case the the unwinder needs to tak
+e locks. */
+ lockdep_off();
+ save_stack_trace(trace, NULL);
+ lockdep_on();
+
+ trace->max_entries = trace->nr_entries;
+
+ nr_stack_trace_entries += trace->nr_entries;
+ if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES))
+ return 0;
+
+ if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
+ __raw_spin_unlock(&hash_lock);
+ if (debug_locks_off()) {
+ printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ dump_stack();
+ }
+ return 0;
+ }
+
+ return 1;
+}
+
+unsigned int nr_hardirq_chains;
+unsigned int nr_softirq_chains;
+unsigned int nr_process_chains;
+unsigned int max_lockdep_depth;
+unsigned int max_recursion_depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * We cannot printk in early bootup code. Not even early_printk()
+ * might work. So we mark any initialization errors and printk
+ * about it later on, in lockdep_info().
+ */
+static int lockdep_init_error;
+
+/*
+ * Various lockdep statistics:
+ */
+atomic_t chain_lookup_hits;
+atomic_t chain_lookup_misses;
+atomic_t hardirqs_on_events;
+atomic_t hardirqs_off_events;
+atomic_t redundant_hardirqs_on;
+atomic_t redundant_hardirqs_off;
+atomic_t softirqs_on_events;
+atomic_t softirqs_off_events;
+atomic_t redundant_softirqs_on;
+atomic_t redundant_softirqs_off;
+atomic_t nr_unused_locks;
+atomic_t nr_cyclic_checks;
+atomic_t nr_cyclic_check_recursions;
+atomic_t nr_find_usage_forwards_checks;
+atomic_t nr_find_usage_forwards_recursions;
+atomic_t nr_find_usage_backwards_checks;
+atomic_t nr_find_usage_backwards_recursions;
+# define debug_atomic_inc(ptr) atomic_inc(ptr)
+# define debug_atomic_dec(ptr) atomic_dec(ptr)
+# define debug_atomic_read(ptr) atomic_read(ptr)
+#else
+# define debug_atomic_inc(ptr) do { } while (0)
+# define debug_atomic_dec(ptr) do { } while (0)
+# define debug_atomic_read(ptr) 0
+#endif
+
+/*
+ * Locking printouts:
+ */
+
+static const char *usage_str[] =
+{
+ [LOCK_USED] = "initial-use ",
+ [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W",
+ [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W",
+ [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W",
+ [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W",
+ [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R",
+ [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R",
+ [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R",
+ [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R",
+};
+
+const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
+{
+ unsigned long offs, size;
+ char *modname;
+
+ return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str);
+}
+
+void
+get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4)
+{
+ *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.';
+
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
+ *c1 = '+';
+ else
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
+ *c1 = '-';
+
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
+ *c2 = '+';
+ else
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
+ *c2 = '-';
+
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+ *c3 = '-';
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) {
+ *c3 = '+';
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+ *c3 = '?';
+ }
+
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+ *c4 = '-';
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) {
+ *c4 = '+';
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+ *c4 = '?';
+ }
+}
+
+static void print_lock_name(struct lock_class *class)
+{
+ char str[128], c1, c2, c3, c4;
+ const char *name;
+
+ get_usage_chars(class, &c1, &c2, &c3, &c4);
+
+ name = class->name;
+ if (!name) {
+ name = __get_key_name(class->key, str);
+ printk(" (%s", name);
+ } else {
+ printk(" (%s", name);
+ if (class->name_version > 1)
+ printk("#%d", class->name_version);
+ if (class->subclass)
+ printk("/%d", class->subclass);
+ }
+ printk("){%c%c%c%c}", c1, c2, c3, c4);
+}
+
+static void print_lockdep_cache(struct lockdep_map *lock)
+{
+ const char *name;
+ char str[128];
+
+ name = lock->name;
+ if (!name)
+ name = __get_key_name(lock->key->subkeys, str);
+
+ printk("%s", name);
+}
+
+static void print_lock(struct held_lock *hlock)
+{
+ print_lock_name(hlock->class);
+ printk(", at: ");
+ print_ip_sym(hlock->acquire_ip);
+}
+
+static void lockdep_print_held_locks(struct task_struct *curr)
+{
+ int i, depth = curr->lockdep_depth;
+
+ if (!depth) {
+ printk("no locks held by %s/%d.\n", curr->comm, curr->pid);
+ return;
+ }
+ printk("%d lock%s held by %s/%d:\n",
+ depth, depth > 1 ? "s" : "", curr->comm, curr->pid);
+
+ for (i = 0; i < depth; i++) {
+ printk(" #%d: ", i);
+ print_lock(curr->held_locks + i);
+ }
+}
+
+static void print_lock_class_header(struct lock_class *class, int depth)
+{
+ int bit;
+
+ printk("%*s->", depth, "");
+ print_lock_name(class);
+ printk(" ops: %lu", class->ops);
+ printk(" {\n");
+
+ for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+ if (class->usage_mask & (1 << bit)) {
+ int len = depth;
+
+ len += printk("%*s %s", depth, "", usage_str[bit]);
+ len += printk(" at:\n");
+ print_stack_trace(class->usage_traces + bit, len);
+ }
+ }
+ printk("%*s }\n", depth, "");
+
+ printk("%*s ... key at: ",depth,"");
+ print_ip_sym((unsigned long)class->key);
+}
+
+/*
+ * printk all lock dependencies starting at <entry>:
+ */
+static void print_lock_dependencies(struct lock_class *class, int depth)
+{
+ struct lock_list *entry;
+
+ if (DEBUG_LOCKS_WARN_ON(depth >= 20))
+ return;
+
+ print_lock_class_header(class, depth);
+
+ list_for_each_entry(entry, &class->locks_after, entry) {
+ DEBUG_LOCKS_WARN_ON(!entry->class);
+ print_lock_dependencies(entry->class, depth + 1);
+
+ printk("%*s ... acquired at:\n",depth,"");
+ print_stack_trace(&entry->trace, 2);
+ printk("\n");
+ }
+}
+
+/*
+ * Add a new dependency to the head of the list:
+ */
+static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
+ struct list_head *head, unsigned long ip)
+{
+ struct lock_list *entry;
+ /*
+ * Lock not present yet - get a new dependency struct and
+ * add it to the list:
+ */
+ entry = alloc_list_entry();
+ if (!entry)
+ return 0;
+
+ entry->class = this;
+ save_trace(&entry->trace);
+
+ /*
+ * Since we never remove from the dependency list, the list can
+ * be walked lockless by other CPUs, it's only allocation
+ * that must be protected by the spinlock. But this also means
+ * we must make new entries visible only once writes to the
+ * entry become visible - hence the RCU op:
+ */
+ list_add_tail_rcu(&entry->entry, head);
+
+ return 1;
+}
+
+/*
+ * Recursive, forwards-direction lock-dependency checking, used for
+ * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
+ * checking.
+ *
+ * (to keep the stackframe of the recursive functions small we
+ * use these global variables, and we also mark various helper
+ * functions as noinline.)
+ */
+static struct held_lock *check_source, *check_target;
+
+/*
+ * Print a dependency chain entry (this is only done when a deadlock
+ * has been detected):
+ */
+static noinline int
+print_circular_bug_entry(struct lock_list *target, unsigned int depth)
+{
+ if (debug_locks_silent)
+ return 0;
+ printk("\n-> #%u", depth);
+ print_lock_name(target->class);
+ printk(":\n");
+ print_stack_trace(&target->trace, 6);
+
+ return 0;
+}
+
+static void print_kernel_version(void)
+{
+ printk("%s %.*s\n", system_utsname.release,
+ (int)strcspn(system_utsname.version, " "),
+ system_utsname.version);
+}
+
+/*
+ * When a circular dependency is detected, print the
+ * header first:
+ */
+static noinline int
+print_circular_bug_header(struct lock_list *entry, unsigned int depth)
+{
+ struct task_struct *curr = current;
+
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n=======================================================\n");
+ printk( "[ INFO: possible circular locking dependency detected ]\n");
+ print_kernel_version();
+ printk( "-------------------------------------------------------\n");
+ printk("%s/%d is trying to acquire lock:\n",
+ curr->comm, curr->pid);
+ print_lock(check_source);
+ printk("\nbut task is already holding lock:\n");
+ print_lock(check_target);
+ printk("\nwhich lock already depends on the new lock.\n\n");
+ printk("\nthe existing dependency chain (in reverse order) is:\n");
+
+ print_circular_bug_entry(entry, depth);
+
+ return 0;
+}
+
+static noinline int print_circular_bug_tail(void)
+{
+ struct task_struct *curr = current;
+ struct lock_list this;
+
+ if (debug_locks_silent)
+ return 0;
+
+ this.class = check_source->class;
+ save_trace(&this.trace);
+ print_circular_bug_entry(&this, 0);
+
+ printk("\nother info that might help us debug this:\n\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static int noinline print_infinite_recursion_bug(void)
+{
+ __raw_spin_unlock(&hash_lock);
+ DEBUG_LOCKS_WARN_ON(1);
+
+ return 0;
+}
+
+/*
+ * Prove that the dependency graph starting at <entry> can not
+ * lead to <target>. Print an error and return 0 if it does.
+ */
+static noinline int
+check_noncircular(struct lock_class *source, unsigned int depth)
+{
+ struct lock_list *entry;
+
+ debug_atomic_inc(&nr_cyclic_check_recursions);
+ if (depth > max_recursion_depth)
+ max_recursion_depth = depth;
+ if (depth >= 20)
+ return print_infinite_recursion_bug();
+ /*
+ * Check this lock's dependency list:
+ */
+ list_for_each_entry(entry, &source->locks_after, entry) {
+ if (entry->class == check_target->class)
+ return print_circular_bug_header(entry, depth+1);
+ debug_atomic_inc(&nr_cyclic_checks);
+ if (!check_noncircular(entry->class, depth+1))
+ return print_circular_bug_entry(entry, depth+1);
+ }
+ return 1;
+}
+
+static int very_verbose(struct lock_class *class)
+{
+#if VERY_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+#ifdef CONFIG_TRACE_IRQFLAGS
+
+/*
+ * Forwards and backwards subgraph searching, for the purposes of
+ * proving that two subgraphs can be connected by a new dependency
+ * without creating any illegal irq-safe -> irq-unsafe lock dependency.
+ */
+static enum lock_usage_bit find_usage_bit;
+static struct lock_class *forwards_match, *backwards_match;
+
+/*
+ * Find a node in the forwards-direction dependency sub-graph starting
+ * at <source> that matches <find_usage_bit>.
+ *
+ * Return 2 if such a node exists in the subgraph, and put that node
+ * into <forwards_match>.
+ *
+ * Return 1 otherwise and keep <forwards_match> unchanged.
+ * Return 0 on error.
+ */
+static noinline int
+find_usage_forwards(struct lock_class *source, unsigned int depth)
+{
+ struct lock_list *entry;
+ int ret;
+
+ if (depth > max_recursion_depth)
+ max_recursion_depth = depth;
+ if (depth >= 20)
+ return print_infinite_recursion_bug();
+
+ debug_atomic_inc(&nr_find_usage_forwards_checks);
+ if (source->usage_mask & (1 << find_usage_bit)) {
+ forwards_match = source;
+ return 2;
+ }
+
+ /*
+ * Check this lock's dependency list:
+ */
+ list_for_each_entry(entry, &source->locks_after, entry) {
+ debug_atomic_inc(&nr_find_usage_forwards_recursions);
+ ret = find_usage_forwards(entry->class, depth+1);
+ if (ret == 2 || ret == 0)
+ return ret;
+ }
+ return 1;
+}
+
+/*
+ * Find a node in the backwards-direction dependency sub-graph starting
+ * at <source> that matches <find_usage_bit>.
+ *
+ * Return 2 if such a node exists in the subgraph, and put that node
+ * into <backwards_match>.
+ *
+ * Return 1 otherwise and keep <backwards_match> unchanged.
+ * Return 0 on error.
+ */
+static noinline int
+find_usage_backwards(struct lock_class *source, unsigned int depth)
+{
+ struct lock_list *entry;
+ int ret;
+
+ if (depth > max_recursion_depth)
+ max_recursion_depth = depth;
+ if (depth >= 20)
+ return print_infinite_recursion_bug();
+
+ debug_atomic_inc(&nr_find_usage_backwards_checks);
+ if (source->usage_mask & (1 << find_usage_bit)) {
+ backwards_match = source;
+ return 2;
+ }
+
+ /*
+ * Check this lock's dependency list:
+ */
+ list_for_each_entry(entry, &source->locks_before, entry) {
+ debug_atomic_inc(&nr_find_usage_backwards_recursions);
+ ret = find_usage_backwards(entry->class, depth+1);
+ if (ret == 2 || ret == 0)
+ return ret;
+ }
+ return 1;
+}
+
+static int
+print_bad_irq_dependency(struct task_struct *curr,
+ struct held_lock *prev,
+ struct held_lock *next,
+ enum lock_usage_bit bit1,
+ enum lock_usage_bit bit2,
+ const char *irqclass)
+{
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n======================================================\n");
+ printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
+ irqclass, irqclass);
+ print_kernel_version();
+ printk( "------------------------------------------------------\n");
+ printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
+ curr->comm, curr->pid,
+ curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
+ curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
+ curr->hardirqs_enabled,
+ curr->softirqs_enabled);
+ print_lock(next);
+
+ printk("\nand this task is already holding:\n");
+ print_lock(prev);
+ printk("which would create a new lock dependency:\n");
+ print_lock_name(prev->class);
+ printk(" ->");
+ print_lock_name(next->class);
+ printk("\n");
+
+ printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
+ irqclass);
+ print_lock_name(backwards_match);
+ printk("\n... which became %s-irq-safe at:\n", irqclass);
+
+ print_stack_trace(backwards_match->usage_traces + bit1, 1);
+
+ printk("\nto a %s-irq-unsafe lock:\n", irqclass);
+ print_lock_name(forwards_match);
+ printk("\n... which became %s-irq-unsafe at:\n", irqclass);
+ printk("...");
+
+ print_stack_trace(forwards_match->usage_traces + bit2, 1);
+
+ printk("\nother info that might help us debug this:\n\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass);
+ print_lock_dependencies(backwards_match, 0);
+
+ printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass);
+ print_lock_dependencies(forwards_match, 0);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static int
+check_usage(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next, enum lock_usage_bit bit_backwards,
+ enum lock_usage_bit bit_forwards, const char *irqclass)
+{
+ int ret;
+
+ find_usage_bit = bit_backwards;
+ /* fills in <backwards_match> */
+ ret = find_usage_backwards(prev->class, 0);
+ if (!ret || ret == 1)
+ return ret;
+
+ find_usage_bit = bit_forwards;
+ ret = find_usage_forwards(next->class, 0);
+ if (!ret || ret == 1)
+ return ret;
+ /* ret == 2 */
+ return print_bad_irq_dependency(curr, prev, next,
+ bit_backwards, bit_forwards, irqclass);
+}
+
+#endif
+
+static int
+print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+ debug_locks_off();
+ __raw_spin_unlock(&hash_lock);
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n=============================================\n");
+ printk( "[ INFO: possible recursive locking detected ]\n");
+ print_kernel_version();
+ printk( "---------------------------------------------\n");
+ printk("%s/%d is trying to acquire lock:\n",
+ curr->comm, curr->pid);
+ print_lock(next);
+ printk("\nbut task is already holding lock:\n");
+ print_lock(prev);
+
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Check whether we are holding such a class already.
+ *
+ * (Note that this has to be done separately, because the graph cannot
+ * detect such classes of deadlocks.)
+ *
+ * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
+ */
+static int
+check_deadlock(struct task_struct *curr, struct held_lock *next,
+ struct lockdep_map *next_instance, int read)
+{
+ struct held_lock *prev;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ prev = curr->held_locks + i;
+ if (prev->class != next->class)
+ continue;
+ /*
+ * Allow read-after-read recursion of the same
+ * lock class (i.e. read_lock(lock)+read_lock(lock)):
+ */
+ if ((read == 2) && prev->read)
+ return 2;
+ return print_deadlock_bug(curr, prev, next);
+ }
+ return 1;
+}
+
+/*
+ * There was a chain-cache miss, and we are about to add a new dependency
+ * to a previous lock. We recursively validate the following rules:
+ *
+ * - would the adding of the <prev> -> <next> dependency create a
+ * circular dependency in the graph? [== circular deadlock]
+ *
+ * - does the new prev->next dependency connect any hardirq-safe lock
+ * (in the full backwards-subgraph starting at <prev>) with any
+ * hardirq-unsafe lock (in the full forwards-subgraph starting at
+ * <next>)? [== illegal lock inversion with hardirq contexts]
+ *
+ * - does the new prev->next dependency connect any softirq-safe lock
+ * (in the full backwards-subgraph starting at <prev>) with any
+ * softirq-unsafe lock (in the full forwards-subgraph starting at
+ * <next>)? [== illegal lock inversion with softirq contexts]
+ *
+ * any of these scenarios could lead to a deadlock.
+ *
+ * Then if all the validations pass, we add the forwards and backwards
+ * dependency.
+ */
+static int
+check_prev_add(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+ struct lock_list *entry;
+ int ret;
+
+ /*
+ * Prove that the new <prev> -> <next> dependency would not
+ * create a circular dependency in the graph. (We do this by
+ * forward-recursing into the graph starting at <next>, and
+ * checking whether we can reach <prev>.)
+ *
+ * We are using global variables to control the recursion, to
+ * keep the stackframe size of the recursive functions low:
+ */
+ check_source = next;
+ check_target = prev;
+ if (!(check_noncircular(next->class, 0)))
+ return print_circular_bug_tail();
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ /*
+ * Prove that the new dependency does not connect a hardirq-safe
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
+ LOCK_ENABLED_HARDIRQS, "hard"))
+ return 0;
+
+ /*
+ * Prove that the new dependency does not connect a hardirq-safe-read
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
+ LOCK_ENABLED_HARDIRQS, "hard-read"))
+ return 0;
+
+ /*
+ * Prove that the new dependency does not connect a softirq-safe
+ * lock with a softirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
+ LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 0;
+ /*
+ * Prove that the new dependency does not connect a softirq-safe-read
+ * lock with a softirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
+ LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 0;
+#endif
+ /*
+ * For recursive read-locks we do all the dependency checks,
+ * but we dont store read-triggered dependencies (only
+ * write-triggered dependencies). This ensures that only the
+ * write-side dependencies matter, and that if for example a
+ * write-lock never takes any other locks, then the reads are
+ * equivalent to a NOP.
+ */
+ if (next->read == 2 || prev->read == 2)
+ return 1;
+ /*
+ * Is the <prev> -> <next> dependency already present?
+ *
+ * (this may occur even though this is a new chain: consider
+ * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3
+ * chains - the second one will be new, but L1 already has
+ * L2 added to its dependency list, due to the first chain.)
+ */
+ list_for_each_entry(entry, &prev->class->locks_after, entry) {
+ if (entry->class == next->class)
+ return 2;
+ }
+
+ /*
+ * Ok, all validations passed, add the new lock
+ * to the previous lock's dependency list:
+ */
+ ret = add_lock_to_list(prev->class, next->class,
+ &prev->class->locks_after, next->acquire_ip);
+ if (!ret)
+ return 0;
+ /*
+ * Return value of 2 signals 'dependency already added',
+ * in that case we dont have to add the backlink either.
+ */
+ if (ret == 2)
+ return 2;
+ ret = add_lock_to_list(next->class, prev->class,
+ &next->class->locks_before, next->acquire_ip);
+
+ /*
+ * Debugging printouts:
+ */
+ if (verbose(prev->class) || verbose(next->class)) {
+ __raw_spin_unlock(&hash_lock);
+ printk("\n new dependency: ");
+ print_lock_name(prev->class);
+ printk(" => ");
+ print_lock_name(next->class);
+ printk("\n");
+ dump_stack();
+ __raw_spin_lock(&hash_lock);
+ }
+ return 1;
+}
+
+/*
+ * Add the dependency to all directly-previous locks that are 'relevant'.
+ * The ones that are relevant are (in increasing distance from curr):
+ * all consecutive trylock entries and the final non-trylock entry - or
+ * the end of this context's lock-chain - whichever comes first.
+ */
+static int
+check_prevs_add(struct task_struct *curr, struct held_lock *next)
+{
+ int depth = curr->lockdep_depth;
+ struct held_lock *hlock;
+
+ /*
+ * Debugging checks.
+ *
+ * Depth must not be zero for a non-head lock:
+ */
+ if (!depth)
+ goto out_bug;
+ /*
+ * At least two relevant locks must exist for this
+ * to be a head:
+ */
+ if (curr->held_locks[depth].irq_context !=
+ curr->held_locks[depth-1].irq_context)
+ goto out_bug;
+
+ for (;;) {
+ hlock = curr->held_locks + depth-1;
+ /*
+ * Only non-recursive-read entries get new dependencies
+ * added:
+ */
+ if (hlock->read != 2) {
+ check_prev_add(curr, hlock, next);
+ /*
+ * Stop after the first non-trylock entry,
+ * as non-trylock entries have added their
+ * own direct dependencies already, so this
+ * lock is connected to them indirectly:
+ */
+ if (!hlock->trylock)
+ break;
+ }
+ depth--;
+ /*
+ * End of lock-stack?
+ */
+ if (!depth)
+ break;
+ /*
+ * Stop the search if we cross into another context:
+ */
+ if (curr->held_locks[depth].irq_context !=
+ curr->held_locks[depth-1].irq_context)
+ break;
+ }
+ return 1;
+out_bug:
+ __raw_spin_unlock(&hash_lock);
+ DEBUG_LOCKS_WARN_ON(1);
+
+ return 0;
+}
+
+
+/*
+ * Is this the address of a static object:
+ */
+static int static_obj(void *obj)
+{
+ unsigned long start = (unsigned long) &_stext,
+ end = (unsigned long) &_end,
+ addr = (unsigned long) obj;
+#ifdef CONFIG_SMP
+ int i;
+#endif
+
+ /*
+ * static variable?
+ */
+ if ((addr >= start) && (addr < end))
+ return 1;
+
+#ifdef CONFIG_SMP
+ /*
+ * percpu var?
+ */
+ for_each_possible_cpu(i) {
+ start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
+ end = (unsigned long) &__per_cpu_end + per_cpu_offset(i);
+
+ if ((addr >= start) && (addr < end))
+ return 1;
+ }
+#endif
+
+ /*
+ * module var?
+ */
+ return is_module_address(addr);
+}
+
+/*
+ * To make lock name printouts unique, we calculate a unique
+ * class->name_version generation counter:
+ */
+static int count_matching_names(struct lock_class *new_class)
+{
+ struct lock_class *class;
+ int count = 0;
+
+ if (!new_class->name)
+ return 0;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ if (new_class->key - new_class->subclass == class->key)
+ return class->name_version;
+ if (class->name && !strcmp(class->name, new_class->name))
+ count = max(count, class->name_version);
+ }
+
+ return count + 1;
+}
+
+extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void);
+
+/*
+ * Register a lock's class in the hash-table, if the class is not present
+ * yet. Otherwise we look it up. We cache the result in the lock object
+ * itself, so actual lookup of the hash should be once per lock object.
+ */
+static inline struct lock_class *
+look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
+{
+ struct lockdep_subclass_key *key;
+ struct list_head *hash_head;
+ struct lock_class *class;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ /*
+ * If the architecture calls into lockdep before initializing
+ * the hashes then we'll warn about it later. (we cannot printk
+ * right now)
+ */
+ if (unlikely(!lockdep_initialized)) {
+ lockdep_init();
+ lockdep_init_error = 1;
+ }
+#endif
+
+ /*
+ * Static locks do not have their class-keys yet - for them the key
+ * is the lock object itself:
+ */
+ if (unlikely(!lock->key))
+ lock->key = (void *)lock;
+
+ /*
+ * NOTE: the class-key must be unique. For dynamic locks, a static
+ * lock_class_key variable is passed in through the mutex_init()
+ * (or spin_lock_init()) call - which acts as the key. For static
+ * locks we use the lock object itself as the key.
+ */
+ if (sizeof(struct lock_class_key) > sizeof(struct lock_class))
+ __error_too_big_MAX_LOCKDEP_SUBCLASSES();
+
+ key = lock->key->subkeys + subclass;
+
+ hash_head = classhashentry(key);
+
+ /*
+ * We can walk the hash lockfree, because the hash only
+ * grows, and we are careful when adding entries to the end:
+ */
+ list_for_each_entry(class, hash_head, hash_entry)
+ if (class->key == key)
+ return class;
+
+ return NULL;
+}
+
+/*
+ * Register a lock's class in the hash-table, if the class is not present
+ * yet. Otherwise we look it up. We cache the result in the lock object
+ * itself, so actual lookup of the hash should be once per lock object.
+ */
+static inline struct lock_class *
+register_lock_class(struct lockdep_map *lock, unsigned int subclass)
+{
+ struct lockdep_subclass_key *key;
+ struct list_head *hash_head;
+ struct lock_class *class;
+
+ class = look_up_lock_class(lock, subclass);
+ if (likely(class))
+ return class;
+
+ /*
+ * Debug-check: all keys must be persistent!
+ */
+ if (!static_obj(lock->key)) {
+ debug_locks_off();
+ printk("INFO: trying to register non-static key.\n");
+ printk("the code is fine but needs lockdep annotation.\n");
+ printk("turning off the locking correctness validator.\n");
+ dump_stack();
+
+ return NULL;
+ }
+
+ key = lock->key->subkeys + subclass;
+ hash_head = classhashentry(key);
+
+ __raw_spin_lock(&hash_lock);
+ /*
+ * We have to do the hash-walk again, to avoid races
+ * with another CPU:
+ */
+ list_for_each_entry(class, hash_head, hash_entry)
+ if (class->key == key)
+ goto out_unlock_set;
+ /*
+ * Allocate a new key from the static array, and add it to
+ * the hash:
+ */
+ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return NULL;
+ }
+ class = lock_classes + nr_lock_classes++;
+ debug_atomic_inc(&nr_unused_locks);
+ class->key = key;
+ class->name = lock->name;
+ class->subclass = subclass;
+ INIT_LIST_HEAD(&class->lock_entry);
+ INIT_LIST_HEAD(&class->locks_before);
+ INIT_LIST_HEAD(&class->locks_after);
+ class->name_version = count_matching_names(class);
+ /*
+ * We use RCU's safe list-add method to make
+ * parallel walking of the hash-list safe:
+ */
+ list_add_tail_rcu(&class->hash_entry, hash_head);
+
+ if (verbose(class)) {
+ __raw_spin_unlock(&hash_lock);
+ printk("\nnew class %p: %s", class->key, class->name);
+ if (class->name_version > 1)
+ printk("#%d", class->name_version);
+ printk("\n");
+ dump_stack();
+ __raw_spin_lock(&hash_lock);
+ }
+out_unlock_set:
+ __raw_spin_unlock(&hash_lock);
+
+ if (!subclass)
+ lock->class_cache = class;
+
+ DEBUG_LOCKS_WARN_ON(class->subclass != subclass);
+
+ return class;
+}
+
+/*
+ * Look up a dependency chain. If the key is not present yet then
+ * add it and return 0 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 1.
+ */
+static inline int lookup_chain_cache(u64 chain_key)
+{
+ struct list_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+ /*
+ * We can walk it lock-free, because entries only get added
+ * to the hash:
+ */
+ list_for_each_entry(chain, hash_head, entry) {
+ if (chain->chain_key == chain_key) {
+cache_hit:
+ debug_atomic_inc(&chain_lookup_hits);
+ /*
+ * In the debugging case, force redundant checking
+ * by returning 1:
+ */
+#ifdef CONFIG_DEBUG_LOCKDEP
+ __raw_spin_lock(&hash_lock);
+ return 1;
+#endif
+ return 0;
+ }
+ }
+ /*
+ * Allocate a new chain entry from the static array, and add
+ * it to the hash:
+ */
+ __raw_spin_lock(&hash_lock);
+ /*
+ * We have to walk the chain again locked - to avoid duplicates:
+ */
+ list_for_each_entry(chain, hash_head, entry) {
+ if (chain->chain_key == chain_key) {
+ __raw_spin_unlock(&hash_lock);
+ goto cache_hit;
+ }
+ }
+ if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return 0;
+ }
+ chain = lock_chains + nr_lock_chains++;
+ chain->chain_key = chain_key;
+ list_add_tail_rcu(&chain->entry, hash_head);
+ debug_atomic_inc(&chain_lookup_misses);
+#ifdef CONFIG_TRACE_IRQFLAGS
+ if (current->hardirq_context)
+ nr_hardirq_chains++;
+ else {
+ if (current->softirq_context)
+ nr_softirq_chains++;
+ else
+ nr_process_chains++;
+ }
+#else
+ nr_process_chains++;
+#endif
+
+ return 1;
+}
+
+/*
+ * We are building curr_chain_key incrementally, so double-check
+ * it from scratch, to make sure that it's done correctly:
+ */
+static void check_chain_key(struct task_struct *curr)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ struct held_lock *hlock, *prev_hlock = NULL;
+ unsigned int i, id;
+ u64 chain_key = 0;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+ if (chain_key != hlock->prev_chain_key) {
+ debug_locks_off();
+ printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n",
+ curr->lockdep_depth, i,
+ (unsigned long long)chain_key,
+ (unsigned long long)hlock->prev_chain_key);
+ WARN_ON(1);
+ return;
+ }
+ id = hlock->class - lock_classes;
+ DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS);
+ if (prev_hlock && (prev_hlock->irq_context !=
+ hlock->irq_context))
+ chain_key = 0;
+ chain_key = iterate_chain_key(chain_key, id);
+ prev_hlock = hlock;
+ }
+ if (chain_key != curr->curr_chain_key) {
+ debug_locks_off();
+ printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n",
+ curr->lockdep_depth, i,
+ (unsigned long long)chain_key,
+ (unsigned long long)curr->curr_chain_key);
+ WARN_ON(1);
+ }
+#endif
+}
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+
+/*
+ * print irq inversion bug:
+ */
+static int
+print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
+ struct held_lock *this, int forwards,
+ const char *irqclass)
+{
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n=========================================================\n");
+ printk( "[ INFO: possible irq lock inversion dependency detected ]\n");
+ print_kernel_version();
+ printk( "---------------------------------------------------------\n");
+ printk("%s/%d just changed the state of lock:\n",
+ curr->comm, curr->pid);
+ print_lock(this);
+ if (forwards)
+ printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
+ else
+ printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass);
+ print_lock_name(other);
+ printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
+
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nthe first lock's dependencies:\n");
+ print_lock_dependencies(this->class, 0);
+
+ printk("\nthe second lock's dependencies:\n");
+ print_lock_dependencies(other, 0);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Prove that in the forwards-direction subgraph starting at <this>
+ * there is no lock matching <mask>:
+ */
+static int
+check_usage_forwards(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit bit, const char *irqclass)
+{
+ int ret;
+
+ find_usage_bit = bit;
+ /* fills in <forwards_match> */
+ ret = find_usage_forwards(this->class, 0);
+ if (!ret || ret == 1)
+ return ret;
+
+ return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass);
+}
+
+/*
+ * Prove that in the backwards-direction subgraph starting at <this>
+ * there is no lock matching <mask>:
+ */
+static int
+check_usage_backwards(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit bit, const char *irqclass)
+{
+ int ret;
+
+ find_usage_bit = bit;
+ /* fills in <backwards_match> */
+ ret = find_usage_backwards(this->class, 0);
+ if (!ret || ret == 1)
+ return ret;
+
+ return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
+}
+
+static inline void print_irqtrace_events(struct task_struct *curr)
+{
+ printk("irq event stamp: %u\n", curr->irq_events);
+ printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
+ print_ip_sym(curr->hardirq_enable_ip);
+ printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event);
+ print_ip_sym(curr->hardirq_disable_ip);
+ printk("softirqs last enabled at (%u): ", curr->softirq_enable_event);
+ print_ip_sym(curr->softirq_enable_ip);
+ printk("softirqs last disabled at (%u): ", curr->softirq_disable_event);
+ print_ip_sym(curr->softirq_disable_ip);
+}
+
+#else
+static inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+static int
+print_usage_bug(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
+{
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n=================================\n");
+ printk( "[ INFO: inconsistent lock state ]\n");
+ print_kernel_version();
+ printk( "---------------------------------\n");
+
+ printk("inconsistent {%s} -> {%s} usage.\n",
+ usage_str[prev_bit], usage_str[new_bit]);
+
+ printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
+ curr->comm, curr->pid,
+ trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+ trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
+ trace_hardirqs_enabled(curr),
+ trace_softirqs_enabled(curr));
+ print_lock(this);
+
+ printk("{%s} state was registered at:\n", usage_str[prev_bit]);
+ print_stack_trace(this->class->usage_traces + prev_bit, 1);
+
+ print_irqtrace_events(curr);
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Print out an error if an invalid bit is set:
+ */
+static inline int
+valid_state(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
+{
+ if (unlikely(this->class->usage_mask & (1 << bad_bit)))
+ return print_usage_bug(curr, this, bad_bit, new_bit);
+ return 1;
+}
+
+#define STRICT_READ_CHECKS 1
+
+/*
+ * Mark a lock with a usage bit, and validate the state transition:
+ */
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit, unsigned long ip)
+{
+ unsigned int new_mask = 1 << new_bit, ret = 1;
+
+ /*
+ * If already set then do not dirty the cacheline,
+ * nor do any checks:
+ */
+ if (likely(this->class->usage_mask & new_mask))
+ return 1;
+
+ __raw_spin_lock(&hash_lock);
+ /*
+ * Make sure we didnt race:
+ */
+ if (unlikely(this->class->usage_mask & new_mask)) {
+ __raw_spin_unlock(&hash_lock);
+ return 1;
+ }
+
+ this->class->usage_mask |= new_mask;
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ if (new_bit == LOCK_ENABLED_HARDIRQS ||
+ new_bit == LOCK_ENABLED_HARDIRQS_READ)
+ ip = curr->hardirq_enable_ip;
+ else if (new_bit == LOCK_ENABLED_SOFTIRQS ||
+ new_bit == LOCK_ENABLED_SOFTIRQS_READ)
+ ip = curr->softirq_enable_ip;
+#endif
+ if (!save_trace(this->class->usage_traces + new_bit))
+ return 0;
+
+ switch (new_bit) {
+#ifdef CONFIG_TRACE_IRQFLAGS
+ case LOCK_USED_IN_HARDIRQ:
+ if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
+ return 0;
+ if (!valid_state(curr, this, new_bit,
+ LOCK_ENABLED_HARDIRQS_READ))
+ return 0;
+ /*
+ * just marked it hardirq-safe, check that this lock
+ * took no hardirq-unsafe lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_HARDIRQS, "hard"))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it hardirq-safe, check that this lock
+ * took no hardirq-unsafe-read lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
+ return 0;
+#endif
+ if (hardirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_USED_IN_SOFTIRQ:
+ if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
+ return 0;
+ if (!valid_state(curr, this, new_bit,
+ LOCK_ENABLED_SOFTIRQS_READ))
+ return 0;
+ /*
+ * just marked it softirq-safe, check that this lock
+ * took no softirq-unsafe lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it softirq-safe, check that this lock
+ * took no softirq-unsafe-read lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
+ return 0;
+#endif
+ if (softirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_USED_IN_HARDIRQ_READ:
+ if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
+ return 0;
+ /*
+ * just marked it hardirq-read-safe, check that this lock
+ * took no hardirq-unsafe lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_HARDIRQS, "hard"))
+ return 0;
+ if (hardirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_USED_IN_SOFTIRQ_READ:
+ if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
+ return 0;
+ /*
+ * just marked it softirq-read-safe, check that this lock
+ * took no softirq-unsafe lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 0;
+ if (softirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_ENABLED_HARDIRQS:
+ if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
+ return 0;
+ if (!valid_state(curr, this, new_bit,
+ LOCK_USED_IN_HARDIRQ_READ))
+ return 0;
+ /*
+ * just marked it hardirq-unsafe, check that no hardirq-safe
+ * lock in the system ever took it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_HARDIRQ, "hard"))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it hardirq-unsafe, check that no
+ * hardirq-safe-read lock in the system ever took
+ * it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
+ return 0;
+#endif
+ if (hardirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_ENABLED_SOFTIRQS:
+ if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
+ return 0;
+ if (!valid_state(curr, this, new_bit,
+ LOCK_USED_IN_SOFTIRQ_READ))
+ return 0;
+ /*
+ * just marked it softirq-unsafe, check that no softirq-safe
+ * lock in the system ever took it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_SOFTIRQ, "soft"))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it softirq-unsafe, check that no
+ * softirq-safe-read lock in the system ever took
+ * it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
+ return 0;
+#endif
+ if (softirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_ENABLED_HARDIRQS_READ:
+ if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it hardirq-read-unsafe, check that no
+ * hardirq-safe lock in the system ever took it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_HARDIRQ, "hard"))
+ return 0;
+#endif
+ if (hardirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_ENABLED_SOFTIRQS_READ:
+ if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it softirq-read-unsafe, check that no
+ * softirq-safe lock in the system ever took it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_SOFTIRQ, "soft"))
+ return 0;
+#endif
+ if (softirq_verbose(this->class))
+ ret = 2;
+ break;
+#endif
+ case LOCK_USED:
+ /*
+ * Add it to the global list of classes:
+ */
+ list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
+ debug_atomic_dec(&nr_unused_locks);
+ break;
+ default:
+ debug_locks_off();
+ WARN_ON(1);
+ return 0;
+ }
+
+ __raw_spin_unlock(&hash_lock);
+
+ /*
+ * We must printk outside of the hash_lock:
+ */
+ if (ret == 2) {
+ printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
+ print_lock(this);
+ print_irqtrace_events(curr);
+ dump_stack();
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+/*
+ * Mark all held locks with a usage bit:
+ */
+static int
+mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip)
+{
+ enum lock_usage_bit usage_bit;
+ struct held_lock *hlock;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+
+ if (hardirq) {
+ if (hlock->read)
+ usage_bit = LOCK_ENABLED_HARDIRQS_READ;
+ else
+ usage_bit = LOCK_ENABLED_HARDIRQS;
+ } else {
+ if (hlock->read)
+ usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
+ else
+ usage_bit = LOCK_ENABLED_SOFTIRQS;
+ }
+ if (!mark_lock(curr, hlock, usage_bit, ip))
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Debugging helper: via this flag we know that we are in
+ * 'early bootup code', and will warn about any invalid irqs-on event:
+ */
+static int early_boot_irqs_enabled;
+
+void early_boot_irqs_off(void)
+{
+ early_boot_irqs_enabled = 0;
+}
+
+void early_boot_irqs_on(void)
+{
+ early_boot_irqs_enabled = 1;
+}
+
+/*
+ * Hardirqs will be enabled:
+ */
+void trace_hardirqs_on(void)
+{
+ struct task_struct *curr = current;
+ unsigned long ip;
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled)))
+ return;
+
+ if (unlikely(curr->hardirqs_enabled)) {
+ debug_atomic_inc(&redundant_hardirqs_on);
+ return;
+ }
+ /* we'll do an OFF -> ON transition: */
+ curr->hardirqs_enabled = 1;
+ ip = (unsigned long) __builtin_return_address(0);
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+ if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+ return;
+ /*
+ * We are going to turn hardirqs on, so set the
+ * usage bit for all held locks:
+ */
+ if (!mark_held_locks(curr, 1, ip))
+ return;
+ /*
+ * If we have softirqs enabled, then set the usage
+ * bit for all held locks. (disabled hardirqs prevented
+ * this bit from being set before)
+ */
+ if (curr->softirqs_enabled)
+ if (!mark_held_locks(curr, 0, ip))
+ return;
+
+ curr->hardirq_enable_ip = ip;
+ curr->hardirq_enable_event = ++curr->irq_events;
+ debug_atomic_inc(&hardirqs_on_events);
+}
+
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+/*
+ * Hardirqs were disabled:
+ */
+void trace_hardirqs_off(void)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->hardirqs_enabled) {
+ /*
+ * We have done an ON -> OFF transition:
+ */
+ curr->hardirqs_enabled = 0;
+ curr->hardirq_disable_ip = _RET_IP_;
+ curr->hardirq_disable_event = ++curr->irq_events;
+ debug_atomic_inc(&hardirqs_off_events);
+ } else
+ debug_atomic_inc(&redundant_hardirqs_off);
+}
+
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+/*
+ * Softirqs will be enabled:
+ */
+void trace_softirqs_on(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->softirqs_enabled) {
+ debug_atomic_inc(&redundant_softirqs_on);
+ return;
+ }
+
+ /*
+ * We'll do an OFF -> ON transition:
+ */
+ curr->softirqs_enabled = 1;
+ curr->softirq_enable_ip = ip;
+ curr->softirq_enable_event = ++curr->irq_events;
+ debug_atomic_inc(&softirqs_on_events);
+ /*
+ * We are going to turn softirqs on, so set the
+ * usage bit for all held locks, if hardirqs are
+ * enabled too:
+ */
+ if (curr->hardirqs_enabled)
+ mark_held_locks(curr, 0, ip);
+}
+
+/*
+ * Softirqs were disabled:
+ */
+void trace_softirqs_off(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->softirqs_enabled) {
+ /*
+ * We have done an ON -> OFF transition:
+ */
+ curr->softirqs_enabled = 0;
+ curr->softirq_disable_ip = ip;
+ curr->softirq_disable_event = ++curr->irq_events;
+ debug_atomic_inc(&softirqs_off_events);
+ DEBUG_LOCKS_WARN_ON(!softirq_count());
+ } else
+ debug_atomic_inc(&redundant_softirqs_off);
+}
+
+#endif
+
+/*
+ * Initialize a lock instance's lock-class mapping info:
+ */
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key)
+{
+ if (unlikely(!debug_locks))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(!key))
+ return;
+ if (DEBUG_LOCKS_WARN_ON(!name))
+ return;
+ /*
+ * Sanity check, the lock-class key must be persistent:
+ */
+ if (!static_obj(key)) {
+ printk("BUG: key %p not in .data!\n", key);
+ DEBUG_LOCKS_WARN_ON(1);
+ return;
+ }
+ lock->name = name;
+ lock->key = key;
+ lock->class_cache = NULL;
+}
+
+EXPORT_SYMBOL_GPL(lockdep_init_map);
+
+/*
+ * This gets called for every mutex_lock*()/spin_lock*() operation.
+ * We maintain the dependency maps and validate the locking attempt:
+ */
+static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check, int hardirqs_off,
+ unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct lock_class *class = NULL;
+ struct held_lock *hlock;
+ unsigned int depth, id;
+ int chain_head = 0;
+ u64 chain_key;
+
+ if (unlikely(!debug_locks))
+ return 0;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
+ if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+ debug_locks_off();
+ printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return 0;
+ }
+
+ if (!subclass)
+ class = lock->class_cache;
+ /*
+ * Not cached yet or subclass?
+ */
+ if (unlikely(!class)) {
+ class = register_lock_class(lock, subclass);
+ if (!class)
+ return 0;
+ }
+ debug_atomic_inc((atomic_t *)&class->ops);
+ if (very_verbose(class)) {
+ printk("\nacquire class [%p] %s", class->key, class->name);
+ if (class->name_version > 1)
+ printk("#%d", class->name_version);
+ printk("\n");
+ dump_stack();
+ }
+
+ /*
+ * Add the lock to the list of currently held locks.
+ * (we dont increase the depth just yet, up until the
+ * dependency checks are done)
+ */
+ depth = curr->lockdep_depth;
+ if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
+ return 0;
+
+ hlock = curr->held_locks + depth;
+
+ hlock->class = class;
+ hlock->acquire_ip = ip;
+ hlock->instance = lock;
+ hlock->trylock = trylock;
+ hlock->read = read;
+ hlock->check = check;
+ hlock->hardirqs_off = hardirqs_off;
+
+ if (check != 2)
+ goto out_calc_hash;
+#ifdef CONFIG_TRACE_IRQFLAGS
+ /*
+ * If non-trylock use in a hardirq or softirq context, then
+ * mark the lock as used in these contexts:
+ */
+ if (!trylock) {
+ if (read) {
+ if (curr->hardirq_context)
+ if (!mark_lock(curr, hlock,
+ LOCK_USED_IN_HARDIRQ_READ, ip))
+ return 0;
+ if (curr->softirq_context)
+ if (!mark_lock(curr, hlock,
+ LOCK_USED_IN_SOFTIRQ_READ, ip))
+ return 0;
+ } else {
+ if (curr->hardirq_context)
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip))
+ return 0;
+ if (curr->softirq_context)
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip))
+ return 0;
+ }
+ }
+ if (!hardirqs_off) {
+ if (read) {
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_HARDIRQS_READ, ip))
+ return 0;
+ if (curr->softirqs_enabled)
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_SOFTIRQS_READ, ip))
+ return 0;
+ } else {
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_HARDIRQS, ip))
+ return 0;
+ if (curr->softirqs_enabled)
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_SOFTIRQS, ip))
+ return 0;
+ }
+ }
+#endif
+ /* mark it as used: */
+ if (!mark_lock(curr, hlock, LOCK_USED, ip))
+ return 0;
+out_calc_hash:
+ /*
+ * Calculate the chain hash: it's the combined has of all the
+ * lock keys along the dependency chain. We save the hash value
+ * at every step so that we can get the current hash easily
+ * after unlock. The chain hash is then used to cache dependency
+ * results.
+ *
+ * The 'key ID' is what is the most compact key value to drive
+ * the hash, not class->key.
+ */
+ id = class - lock_classes;
+ if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+ return 0;
+
+ chain_key = curr->curr_chain_key;
+ if (!depth) {
+ if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+ return 0;
+ chain_head = 1;
+ }
+
+ hlock->prev_chain_key = chain_key;
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ /*
+ * Keep track of points where we cross into an interrupt context:
+ */
+ hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
+ curr->softirq_context;
+ if (depth) {
+ struct held_lock *prev_hlock;
+
+ prev_hlock = curr->held_locks + depth-1;
+ /*
+ * If we cross into another context, reset the
+ * hash key (this also prevents the checking and the
+ * adding of the dependency to 'prev'):
+ */
+ if (prev_hlock->irq_context != hlock->irq_context) {
+ chain_key = 0;
+ chain_head = 1;
+ }
+ }
+#endif
+ chain_key = iterate_chain_key(chain_key, id);
+ curr->curr_chain_key = chain_key;
+
+ /*
+ * Trylock needs to maintain the stack of held locks, but it
+ * does not add new dependencies, because trylock can be done
+ * in any order.
+ *
+ * We look up the chain_key and do the O(N^2) check and update of
+ * the dependencies only if this is a new dependency chain.
+ * (If lookup_chain_cache() returns with 1 it acquires
+ * hash_lock for us)
+ */
+ if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) {
+ /*
+ * Check whether last held lock:
+ *
+ * - is irq-safe, if this lock is irq-unsafe
+ * - is softirq-safe, if this lock is hardirq-unsafe
+ *
+ * And check whether the new lock's dependency graph
+ * could lead back to the previous lock.
+ *
+ * any of these scenarios could lead to a deadlock. If
+ * All validations
+ */
+ int ret = check_deadlock(curr, hlock, lock, read);
+
+ if (!ret)
+ return 0;
+ /*
+ * Mark recursive read, as we jump over it when
+ * building dependencies (just like we jump over
+ * trylock entries):
+ */
+ if (ret == 2)
+ hlock->read = 2;
+ /*
+ * Add dependency only if this lock is not the head
+ * of the chain, and if it's not a secondary read-lock:
+ */
+ if (!chain_head && ret != 2)
+ if (!check_prevs_add(curr, hlock))
+ return 0;
+ __raw_spin_unlock(&hash_lock);
+ }
+ curr->lockdep_depth++;
+ check_chain_key(curr);
+ if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
+ debug_locks_off();
+ printk("BUG: MAX_LOCK_DEPTH too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return 0;
+ }
+ if (unlikely(curr->lockdep_depth > max_lockdep_depth))
+ max_lockdep_depth = curr->lockdep_depth;
+
+ return 1;
+}
+
+static int
+print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+ unsigned long ip)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n=====================================\n");
+ printk( "[ BUG: bad unlock balance detected! ]\n");
+ printk( "-------------------------------------\n");
+ printk("%s/%d is trying to release lock (",
+ curr->comm, curr->pid);
+ print_lockdep_cache(lock);
+ printk(") at:\n");
+ print_ip_sym(ip);
+ printk("but there are no more locks to release!\n");
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Common debugging checks for both nested and non-nested unlock:
+ */
+static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
+ unsigned long ip)
+{
+ if (unlikely(!debug_locks))
+ return 0;
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
+ if (curr->lockdep_depth <= 0)
+ return print_unlock_inbalance_bug(curr, lock, ip);
+
+ return 1;
+}
+
+/*
+ * Remove the lock to the list of currently held locks in a
+ * potentially non-nested (out of order) manner. This is a
+ * relatively rare operation, as all the unlock APIs default
+ * to nested mode (which uses lock_release()):
+ */
+static int
+lock_release_non_nested(struct task_struct *curr,
+ struct lockdep_map *lock, unsigned long ip)
+{
+ struct held_lock *hlock, *prev_hlock;
+ unsigned int depth;
+ int i;
+
+ /*
+ * Check whether the lock exists in the current stack
+ * of held locks:
+ */
+ depth = curr->lockdep_depth;
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ prev_hlock = NULL;
+ for (i = depth-1; i >= 0; i--) {
+ hlock = curr->held_locks + i;
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+ break;
+ if (hlock->instance == lock)
+ goto found_it;
+ prev_hlock = hlock;
+ }
+ return print_unlock_inbalance_bug(curr, lock, ip);
+
+found_it:
+ /*
+ * We have the right lock to unlock, 'hlock' points to it.
+ * Now we remove it from the stack, and add back the other
+ * entries (if any), recalculating the hash along the way:
+ */
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ for (i++; i < depth; i++) {
+ hlock = curr->held_locks + i;
+ if (!__lock_acquire(hlock->instance,
+ hlock->class->subclass, hlock->trylock,
+ hlock->read, hlock->check, hlock->hardirqs_off,
+ hlock->acquire_ip))
+ return 0;
+ }
+
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
+ return 0;
+ return 1;
+}
+
+/*
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). This is done for unlocks that nest
+ * perfectly. (i.e. the current top of the lock-stack is unlocked)
+ */
+static int lock_release_nested(struct task_struct *curr,
+ struct lockdep_map *lock, unsigned long ip)
+{
+ struct held_lock *hlock;
+ unsigned int depth;
+
+ /*
+ * Pop off the top of the lock stack:
+ */
+ depth = curr->lockdep_depth - 1;
+ hlock = curr->held_locks + depth;
+
+ /*
+ * Is the unlock non-nested:
+ */
+ if (hlock->instance != lock)
+ return lock_release_non_nested(curr, lock, ip);
+ curr->lockdep_depth--;
+
+ if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
+ return 0;
+
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ hlock->prev_chain_key = 0;
+ hlock->class = NULL;
+ hlock->acquire_ip = 0;
+ hlock->irq_context = 0;
+#endif
+ return 1;
+}
+
+/*
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). This is done for unlocks that nest
+ * perfectly. (i.e. the current top of the lock-stack is unlocked)
+ */
+static void
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (!check_unlock(curr, lock, ip))
+ return;
+
+ if (nested) {
+ if (!lock_release_nested(curr, lock, ip))
+ return;
+ } else {
+ if (!lock_release_non_nested(curr, lock, ip))
+ return;
+ }
+
+ check_chain_key(curr);
+}
+
+/*
+ * Check whether we follow the irq-flags state precisely:
+ */
+static void check_flags(unsigned long flags)
+{
+#if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS)
+ if (!debug_locks)
+ return;
+
+ if (irqs_disabled_flags(flags))
+ DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled);
+ else
+ DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled);
+
+ /*
+ * We dont accurately track softirq state in e.g.
+ * hardirq contexts (such as on 4KSTACKS), so only
+ * check if not in hardirq contexts:
+ */
+ if (!hardirq_count()) {
+ if (softirq_count())
+ DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
+ else
+ DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
+ }
+
+ if (!debug_locks)
+ print_irqtrace_events(current);
+#endif
+}
+
+/*
+ * We are not always called with irqs disabled - do that here,
+ * and also avoid lockdep recursion:
+ */
+void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ __lock_acquire(lock, subclass, trylock, read, check,
+ irqs_disabled_flags(flags), ip);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_acquire);
+
+void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ __lock_release(lock, nested, ip);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_release);
+
+/*
+ * Used by the testsuite, sanitize the validator state
+ * after a simulated failure:
+ */
+
+void lockdep_reset(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ current->curr_chain_key = 0;
+ current->lockdep_depth = 0;
+ current->lockdep_recursion = 0;
+ memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
+ nr_hardirq_chains = 0;
+ nr_softirq_chains = 0;
+ nr_process_chains = 0;
+ debug_locks = 1;
+ raw_local_irq_restore(flags);
+}
+
+static void zap_class(struct lock_class *class)
+{
+ int i;
+
+ /*
+ * Remove all dependencies this lock is
+ * involved in:
+ */
+ for (i = 0; i < nr_list_entries; i++) {
+ if (list_entries[i].class == class)
+ list_del_rcu(&list_entries[i].entry);
+ }
+ /*
+ * Unhash the class and remove it from the all_lock_classes list:
+ */
+ list_del_rcu(&class->hash_entry);
+ list_del_rcu(&class->lock_entry);
+
+}
+
+static inline int within(void *addr, void *start, unsigned long size)
+{
+ return addr >= start && addr < start + size;
+}
+
+void lockdep_free_key_range(void *start, unsigned long size)
+{
+ struct lock_class *class, *next;
+ struct list_head *head;
+ unsigned long flags;
+ int i;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&hash_lock);
+
+ /*
+ * Unhash all classes that were created by this module:
+ */
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ if (list_empty(head))
+ continue;
+ list_for_each_entry_safe(class, next, head, hash_entry)
+ if (within(class->key, start, size))
+ zap_class(class);
+ }
+
+ __raw_spin_unlock(&hash_lock);
+ raw_local_irq_restore(flags);
+}
+
+void lockdep_reset_lock(struct lockdep_map *lock)
+{
+ struct lock_class *class, *next;
+ struct list_head *head;
+ unsigned long flags;
+ int i, j;
+
+ raw_local_irq_save(flags);
+
+ /*
+ * Remove all classes this lock might have:
+ */
+ for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
+ /*
+ * If the class exists we look it up and zap it:
+ */
+ class = look_up_lock_class(lock, j);
+ if (class)
+ zap_class(class);
+ }
+ /*
+ * Debug check: in the end all mapped classes should
+ * be gone.
+ */
+ __raw_spin_lock(&hash_lock);
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ if (list_empty(head))
+ continue;
+ list_for_each_entry_safe(class, next, head, hash_entry) {
+ if (unlikely(class == lock->class_cache)) {
+ __raw_spin_unlock(&hash_lock);
+ DEBUG_LOCKS_WARN_ON(1);
+ goto out_restore;
+ }
+ }
+ }
+ __raw_spin_unlock(&hash_lock);
+
+out_restore:
+ raw_local_irq_restore(flags);
+}
+
+void __init lockdep_init(void)
+{
+ int i;
+
+ /*
+ * Some architectures have their own start_kernel()
+ * code which calls lockdep_init(), while we also
+ * call lockdep_init() from the start_kernel() itself,
+ * and we want to initialize the hashes only once:
+ */
+ if (lockdep_initialized)
+ return;
+
+ for (i = 0; i < CLASSHASH_SIZE; i++)
+ INIT_LIST_HEAD(classhash_table + i);
+
+ for (i = 0; i < CHAINHASH_SIZE; i++)
+ INIT_LIST_HEAD(chainhash_table + i);
+
+ lockdep_initialized = 1;
+}
+
+void __init lockdep_info(void)
+{
+ printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
+
+ printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
+ printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
+ printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
+ printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
+ printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
+ printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
+ printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
+
+ printk(" memory used by lock dependency info: %lu kB\n",
+ (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
+ sizeof(struct list_head) * CLASSHASH_SIZE +
+ sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
+ sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
+ sizeof(struct list_head) * CHAINHASH_SIZE) / 1024);
+
+ printk(" per task-struct memory footprint: %lu bytes\n",
+ sizeof(struct held_lock) * MAX_LOCK_DEPTH);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ if (lockdep_init_error)
+ printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n");
+#endif
+}
+
+static inline int in_range(const void *start, const void *addr, const void *end)
+{
+ return addr >= start && addr <= end;
+}
+
+static void
+print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
+ const void *mem_to, struct held_lock *hlock)
+{
+ if (!debug_locks_off())
+ return;
+ if (debug_locks_silent)
+ return;
+
+ printk("\n=========================\n");
+ printk( "[ BUG: held lock freed! ]\n");
+ printk( "-------------------------\n");
+ printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+ curr->comm, curr->pid, mem_from, mem_to-1);
+ print_lock(hlock);
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+}
+
+/*
+ * Called when kernel memory is freed (or unmapped), or if a lock
+ * is destroyed or reinitialized - this code checks whether there is
+ * any held lock in the memory range of <from> to <to>:
+ */
+void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
+{
+ const void *mem_to = mem_from + mem_len, *lock_from, *lock_to;
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ unsigned long flags;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ local_irq_save(flags);
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+
+ lock_from = (void *)hlock->instance;
+ lock_to = (void *)(hlock->instance + 1);
+
+ if (!in_range(mem_from, lock_from, mem_to) &&
+ !in_range(mem_from, lock_to, mem_to))
+ continue;
+
+ print_freed_lock_bug(curr, mem_from, mem_to, hlock);
+ break;
+ }
+ local_irq_restore(flags);
+}
+
+static void print_held_locks_bug(struct task_struct *curr)
+{
+ if (!debug_locks_off())
+ return;
+ if (debug_locks_silent)
+ return;
+
+ printk("\n=====================================\n");
+ printk( "[ BUG: lock held at task exit time! ]\n");
+ printk( "-------------------------------------\n");
+ printk("%s/%d is exiting with locks still held!\n",
+ curr->comm, curr->pid);
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+}
+
+void debug_check_no_locks_held(struct task_struct *task)
+{
+ if (unlikely(task->lockdep_depth > 0))
+ print_held_locks_bug(task);
+}
+
+void debug_show_all_locks(void)
+{
+ struct task_struct *g, *p;
+ int count = 10;
+ int unlock = 1;
+
+ printk("\nShowing all locks held in the system:\n");
+
+ /*
+ * Here we try to get the tasklist_lock as hard as possible,
+ * if not successful after 2 seconds we ignore it (but keep
+ * trying). This is to enable a debug printout even if a
+ * tasklist_lock-holding task deadlocks or crashes.
+ */
+retry:
+ if (!read_trylock(&tasklist_lock)) {
+ if (count == 10)
+ printk("hm, tasklist_lock locked, retrying... ");
+ if (count) {
+ count--;
+ printk(" #%d", 10-count);
+ mdelay(200);
+ goto retry;
+ }
+ printk(" ignoring it.\n");
+ unlock = 0;
+ }
+ if (count != 10)
+ printk(" locked it.\n");
+
+ do_each_thread(g, p) {
+ if (p->lockdep_depth)
+ lockdep_print_held_locks(p);
+ if (!unlock)
+ if (read_trylock(&tasklist_lock))
+ unlock = 1;
+ } while_each_thread(g, p);
+
+ printk("\n");
+ printk("=============================================\n\n");
+
+ if (unlock)
+ read_unlock(&tasklist_lock);
+}
+
+EXPORT_SYMBOL_GPL(debug_show_all_locks);
+
+void debug_show_held_locks(struct task_struct *task)
+{
+ lockdep_print_held_locks(task);
+}
+
+EXPORT_SYMBOL_GPL(debug_show_held_locks);
+
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
new file mode 100644
index 000000000000..eab043c83bb2
--- /dev/null
+++ b/kernel/lockdep_internals.h
@@ -0,0 +1,78 @@
+/*
+ * kernel/lockdep_internals.h
+ *
+ * Runtime locking correctness validator
+ *
+ * lockdep subsystem internal functions and variables.
+ */
+
+/*
+ * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
+ * we track.
+ *
+ * We use the per-lock dependency maps in two ways: we grow it by adding
+ * every to-be-taken lock to all currently held lock's own dependency
+ * table (if it's not there yet), and we check it for lock order
+ * conflicts and deadlocks.
+ */
+#define MAX_LOCKDEP_ENTRIES 8192UL
+
+#define MAX_LOCKDEP_KEYS_BITS 11
+#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
+
+#define MAX_LOCKDEP_CHAINS_BITS 13
+#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
+
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the hash_lock.
+ */
+#define MAX_STACK_TRACE_ENTRIES 262144UL
+
+extern struct list_head all_lock_classes;
+
+extern void
+get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
+
+extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
+
+extern unsigned long nr_lock_classes;
+extern unsigned long nr_list_entries;
+extern unsigned long nr_lock_chains;
+extern unsigned long nr_stack_trace_entries;
+
+extern unsigned int nr_hardirq_chains;
+extern unsigned int nr_softirq_chains;
+extern unsigned int nr_process_chains;
+extern unsigned int max_lockdep_depth;
+extern unsigned int max_recursion_depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * Various lockdep statistics:
+ */
+extern atomic_t chain_lookup_hits;
+extern atomic_t chain_lookup_misses;
+extern atomic_t hardirqs_on_events;
+extern atomic_t hardirqs_off_events;
+extern atomic_t redundant_hardirqs_on;
+extern atomic_t redundant_hardirqs_off;
+extern atomic_t softirqs_on_events;
+extern atomic_t softirqs_off_events;
+extern atomic_t redundant_softirqs_on;
+extern atomic_t redundant_softirqs_off;
+extern atomic_t nr_unused_locks;
+extern atomic_t nr_cyclic_checks;
+extern atomic_t nr_cyclic_check_recursions;
+extern atomic_t nr_find_usage_forwards_checks;
+extern atomic_t nr_find_usage_forwards_recursions;
+extern atomic_t nr_find_usage_backwards_checks;
+extern atomic_t nr_find_usage_backwards_recursions;
+# define debug_atomic_inc(ptr) atomic_inc(ptr)
+# define debug_atomic_dec(ptr) atomic_dec(ptr)
+# define debug_atomic_read(ptr) atomic_read(ptr)
+#else
+# define debug_atomic_inc(ptr) do { } while (0)
+# define debug_atomic_dec(ptr) do { } while (0)
+# define debug_atomic_read(ptr) 0
+#endif
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
new file mode 100644
index 000000000000..f6e72eaab3fa
--- /dev/null
+++ b/kernel/lockdep_proc.c
@@ -0,0 +1,345 @@
+/*
+ * kernel/lockdep_proc.c
+ *
+ * Runtime locking correctness validator
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Code for /proc/lockdep and /proc/lockdep_stats:
+ *
+ */
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/debug_locks.h>
+
+#include "lockdep_internals.h"
+
+static void *l_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct lock_class *class = v;
+
+ (*pos)++;
+
+ if (class->lock_entry.next != &all_lock_classes)
+ class = list_entry(class->lock_entry.next, struct lock_class,
+ lock_entry);
+ else
+ class = NULL;
+ m->private = class;
+
+ return class;
+}
+
+static void *l_start(struct seq_file *m, loff_t *pos)
+{
+ struct lock_class *class = m->private;
+
+ if (&class->lock_entry == all_lock_classes.next)
+ seq_printf(m, "all lock classes:\n");
+
+ return class;
+}
+
+static void l_stop(struct seq_file *m, void *v)
+{
+}
+
+static unsigned long count_forward_deps(struct lock_class *class)
+{
+ struct lock_list *entry;
+ unsigned long ret = 1;
+
+ /*
+ * Recurse this class's dependency list:
+ */
+ list_for_each_entry(entry, &class->locks_after, entry)
+ ret += count_forward_deps(entry->class);
+
+ return ret;
+}
+
+static unsigned long count_backward_deps(struct lock_class *class)
+{
+ struct lock_list *entry;
+ unsigned long ret = 1;
+
+ /*
+ * Recurse this class's dependency list:
+ */
+ list_for_each_entry(entry, &class->locks_before, entry)
+ ret += count_backward_deps(entry->class);
+
+ return ret;
+}
+
+static int l_show(struct seq_file *m, void *v)
+{
+ unsigned long nr_forward_deps, nr_backward_deps;
+ struct lock_class *class = m->private;
+ char str[128], c1, c2, c3, c4;
+ const char *name;
+
+ seq_printf(m, "%p", class->key);
+#ifdef CONFIG_DEBUG_LOCKDEP
+ seq_printf(m, " OPS:%8ld", class->ops);
+#endif
+ nr_forward_deps = count_forward_deps(class);
+ seq_printf(m, " FD:%5ld", nr_forward_deps);
+
+ nr_backward_deps = count_backward_deps(class);
+ seq_printf(m, " BD:%5ld", nr_backward_deps);
+
+ get_usage_chars(class, &c1, &c2, &c3, &c4);
+ seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
+
+ name = class->name;
+ if (!name) {
+ name = __get_key_name(class->key, str);
+ seq_printf(m, ": %s", name);
+ } else{
+ seq_printf(m, ": %s", name);
+ if (class->name_version > 1)
+ seq_printf(m, "#%d", class->name_version);
+ if (class->subclass)
+ seq_printf(m, "/%d", class->subclass);
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static struct seq_operations lockdep_ops = {
+ .start = l_start,
+ .next = l_next,
+ .stop = l_stop,
+ .show = l_show,
+};
+
+static int lockdep_open(struct inode *inode, struct file *file)
+{
+ int res = seq_open(file, &lockdep_ops);
+ if (!res) {
+ struct seq_file *m = file->private_data;
+
+ if (!list_empty(&all_lock_classes))
+ m->private = list_entry(all_lock_classes.next,
+ struct lock_class, lock_entry);
+ else
+ m->private = NULL;
+ }
+ return res;
+}
+
+static struct file_operations proc_lockdep_operations = {
+ .open = lockdep_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void lockdep_stats_debug_show(struct seq_file *m)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ unsigned int hi1 = debug_atomic_read(&hardirqs_on_events),
+ hi2 = debug_atomic_read(&hardirqs_off_events),
+ hr1 = debug_atomic_read(&redundant_hardirqs_on),
+ hr2 = debug_atomic_read(&redundant_hardirqs_off),
+ si1 = debug_atomic_read(&softirqs_on_events),
+ si2 = debug_atomic_read(&softirqs_off_events),
+ sr1 = debug_atomic_read(&redundant_softirqs_on),
+ sr2 = debug_atomic_read(&redundant_softirqs_off);
+
+ seq_printf(m, " chain lookup misses: %11u\n",
+ debug_atomic_read(&chain_lookup_misses));
+ seq_printf(m, " chain lookup hits: %11u\n",
+ debug_atomic_read(&chain_lookup_hits));
+ seq_printf(m, " cyclic checks: %11u\n",
+ debug_atomic_read(&nr_cyclic_checks));
+ seq_printf(m, " cyclic-check recursions: %11u\n",
+ debug_atomic_read(&nr_cyclic_check_recursions));
+ seq_printf(m, " find-mask forwards checks: %11u\n",
+ debug_atomic_read(&nr_find_usage_forwards_checks));
+ seq_printf(m, " find-mask forwards recursions: %11u\n",
+ debug_atomic_read(&nr_find_usage_forwards_recursions));
+ seq_printf(m, " find-mask backwards checks: %11u\n",
+ debug_atomic_read(&nr_find_usage_backwards_checks));
+ seq_printf(m, " find-mask backwards recursions:%11u\n",
+ debug_atomic_read(&nr_find_usage_backwards_recursions));
+
+ seq_printf(m, " hardirq on events: %11u\n", hi1);
+ seq_printf(m, " hardirq off events: %11u\n", hi2);
+ seq_printf(m, " redundant hardirq ons: %11u\n", hr1);
+ seq_printf(m, " redundant hardirq offs: %11u\n", hr2);
+ seq_printf(m, " softirq on events: %11u\n", si1);
+ seq_printf(m, " softirq off events: %11u\n", si2);
+ seq_printf(m, " redundant softirq ons: %11u\n", sr1);
+ seq_printf(m, " redundant softirq offs: %11u\n", sr2);
+#endif
+}
+
+static int lockdep_stats_show(struct seq_file *m, void *v)
+{
+ struct lock_class *class;
+ unsigned long nr_unused = 0, nr_uncategorized = 0,
+ nr_irq_safe = 0, nr_irq_unsafe = 0,
+ nr_softirq_safe = 0, nr_softirq_unsafe = 0,
+ nr_hardirq_safe = 0, nr_hardirq_unsafe = 0,
+ nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
+ nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
+ nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
+ sum_forward_deps = 0, factor = 0;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+
+ if (class->usage_mask == 0)
+ nr_unused++;
+ if (class->usage_mask == LOCKF_USED)
+ nr_uncategorized++;
+ if (class->usage_mask & LOCKF_USED_IN_IRQ)
+ nr_irq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_IRQS)
+ nr_irq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
+ nr_softirq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
+ nr_softirq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
+ nr_hardirq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
+ nr_hardirq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
+ nr_irq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_IRQS_READ)
+ nr_irq_read_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
+ nr_softirq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+ nr_softirq_read_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
+ nr_hardirq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+ nr_hardirq_read_unsafe++;
+
+ sum_forward_deps += count_forward_deps(class);
+ }
+#ifdef CONFIG_LOCKDEP_DEBUG
+ DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
+#endif
+ seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
+ nr_lock_classes, MAX_LOCKDEP_KEYS);
+ seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
+ nr_list_entries, MAX_LOCKDEP_ENTRIES);
+ seq_printf(m, " indirect dependencies: %11lu\n",
+ sum_forward_deps);
+
+ /*
+ * Total number of dependencies:
+ *
+ * All irq-safe locks may nest inside irq-unsafe locks,
+ * plus all the other known dependencies:
+ */
+ seq_printf(m, " all direct dependencies: %11lu\n",
+ nr_irq_unsafe * nr_irq_safe +
+ nr_hardirq_unsafe * nr_hardirq_safe +
+ nr_list_entries);
+
+ /*
+ * Estimated factor between direct and indirect
+ * dependencies:
+ */
+ if (nr_list_entries)
+ factor = sum_forward_deps / nr_list_entries;
+
+ seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
+ nr_lock_chains, MAX_LOCKDEP_CHAINS);
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ seq_printf(m, " in-hardirq chains: %11u\n",
+ nr_hardirq_chains);
+ seq_printf(m, " in-softirq chains: %11u\n",
+ nr_softirq_chains);
+#endif
+ seq_printf(m, " in-process chains: %11u\n",
+ nr_process_chains);
+ seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
+ nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
+ seq_printf(m, " combined max dependencies: %11u\n",
+ (nr_hardirq_chains + 1) *
+ (nr_softirq_chains + 1) *
+ (nr_process_chains + 1)
+ );
+ seq_printf(m, " hardirq-safe locks: %11lu\n",
+ nr_hardirq_safe);
+ seq_printf(m, " hardirq-unsafe locks: %11lu\n",
+ nr_hardirq_unsafe);
+ seq_printf(m, " softirq-safe locks: %11lu\n",
+ nr_softirq_safe);
+ seq_printf(m, " softirq-unsafe locks: %11lu\n",
+ nr_softirq_unsafe);
+ seq_printf(m, " irq-safe locks: %11lu\n",
+ nr_irq_safe);
+ seq_printf(m, " irq-unsafe locks: %11lu\n",
+ nr_irq_unsafe);
+
+ seq_printf(m, " hardirq-read-safe locks: %11lu\n",
+ nr_hardirq_read_safe);
+ seq_printf(m, " hardirq-read-unsafe locks: %11lu\n",
+ nr_hardirq_read_unsafe);
+ seq_printf(m, " softirq-read-safe locks: %11lu\n",
+ nr_softirq_read_safe);
+ seq_printf(m, " softirq-read-unsafe locks: %11lu\n",
+ nr_softirq_read_unsafe);
+ seq_printf(m, " irq-read-safe locks: %11lu\n",
+ nr_irq_read_safe);
+ seq_printf(m, " irq-read-unsafe locks: %11lu\n",
+ nr_irq_read_unsafe);
+
+ seq_printf(m, " uncategorized locks: %11lu\n",
+ nr_uncategorized);
+ seq_printf(m, " unused locks: %11lu\n",
+ nr_unused);
+ seq_printf(m, " max locking depth: %11u\n",
+ max_lockdep_depth);
+ seq_printf(m, " max recursion depth: %11u\n",
+ max_recursion_depth);
+ lockdep_stats_debug_show(m);
+ seq_printf(m, " debug_locks: %11u\n",
+ debug_locks);
+
+ return 0;
+}
+
+static int lockdep_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, lockdep_stats_show, NULL);
+}
+
+static struct file_operations proc_lockdep_stats_operations = {
+ .open = lockdep_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init lockdep_proc_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = create_proc_entry("lockdep", S_IRUSR, NULL);
+ if (entry)
+ entry->proc_fops = &proc_lockdep_operations;
+
+ entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
+ if (entry)
+ entry->proc_fops = &proc_lockdep_stats_operations;
+
+ return 0;
+}
+
+__initcall(lockdep_proc_init);
+
diff --git a/kernel/module.c b/kernel/module.c
index bbe04862e1b0..05625d5dc758 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,4 +1,4 @@
-/* Rewritten by Rusty Russell, on the backs of many others...
+/*
Copyright (C) 2002 Richard Henderson
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
@@ -16,7 +16,6 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/init.h>
@@ -40,9 +39,11 @@
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/mutex.h>
+#include <linux/unwind.h>
#include <asm/uaccess.h>
#include <asm/semaphore.h>
#include <asm/cacheflush.h>
+#include <linux/license.h>
#if 0
#define DEBUGP printk
@@ -120,9 +121,17 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
extern const struct kernel_symbol __stop___ksymtab_gpl[];
extern const struct kernel_symbol __start___ksymtab_gpl_future[];
extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
+extern const struct kernel_symbol __start___ksymtab_unused[];
+extern const struct kernel_symbol __stop___ksymtab_unused[];
+extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
+extern const struct kernel_symbol __start___ksymtab_gpl_future[];
+extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
extern const unsigned long __start___kcrctab[];
extern const unsigned long __start___kcrctab_gpl[];
extern const unsigned long __start___kcrctab_gpl_future[];
+extern const unsigned long __start___kcrctab_unused[];
+extern const unsigned long __start___kcrctab_unused_gpl[];
#ifndef CONFIG_MODVERSIONS
#define symversion(base, idx) NULL
@@ -142,6 +151,17 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
return NULL;
}
+static void printk_unused_warning(const char *name)
+{
+ printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
+ "however this module is using it.\n", name);
+ printk(KERN_WARNING "This symbol will go away in the future.\n");
+ printk(KERN_WARNING "Please evalute if this is the right api to use, "
+ "and if it really is, submit a report the linux kernel "
+ "mailinglist together with submitting your code for "
+ "inclusion.\n");
+}
+
/* Find a symbol, return value, crc and module which owns it */
static unsigned long __find_symbol(const char *name,
struct module **owner,
@@ -184,6 +204,25 @@ static unsigned long __find_symbol(const char *name,
return ks->value;
}
+ ks = lookup_symbol(name, __start___ksymtab_unused,
+ __stop___ksymtab_unused);
+ if (ks) {
+ printk_unused_warning(name);
+ *crc = symversion(__start___kcrctab_unused,
+ (ks - __start___ksymtab_unused));
+ return ks->value;
+ }
+
+ if (gplok)
+ ks = lookup_symbol(name, __start___ksymtab_unused_gpl,
+ __stop___ksymtab_unused_gpl);
+ if (ks) {
+ printk_unused_warning(name);
+ *crc = symversion(__start___kcrctab_unused_gpl,
+ (ks - __start___ksymtab_unused_gpl));
+ return ks->value;
+ }
+
/* Now try modules. */
list_for_each_entry(mod, &modules, list) {
*owner = mod;
@@ -202,6 +241,23 @@ static unsigned long __find_symbol(const char *name,
return ks->value;
}
}
+ ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms);
+ if (ks) {
+ printk_unused_warning(name);
+ *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms));
+ return ks->value;
+ }
+
+ if (gplok) {
+ ks = lookup_symbol(name, mod->unused_gpl_syms,
+ mod->unused_gpl_syms + mod->num_unused_gpl_syms);
+ if (ks) {
+ printk_unused_warning(name);
+ *crc = symversion(mod->unused_gpl_crcs,
+ (ks - mod->unused_gpl_syms));
+ return ks->value;
+ }
+ }
ks = lookup_symbol(name, mod->gpl_future_syms,
(mod->gpl_future_syms +
mod->num_gpl_future_syms));
@@ -877,6 +933,15 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
return sprintf(buf, "0x%lx\n", sattr->address);
}
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+ int section;
+
+ for (section = 0; section < sect_attrs->nsections; section++)
+ kfree(sect_attrs->attrs[section].name);
+ kfree(sect_attrs);
+}
+
static void add_sect_attrs(struct module *mod, unsigned int nsect,
char *secstrings, Elf_Shdr *sechdrs)
{
@@ -893,21 +958,26 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
+ nloaded * sizeof(sect_attrs->attrs[0]),
sizeof(sect_attrs->grp.attrs[0]));
size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
- if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
+ sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+ if (sect_attrs == NULL)
return;
/* Setup section attributes. */
sect_attrs->grp.name = "sections";
sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
+ sect_attrs->nsections = 0;
sattr = &sect_attrs->attrs[0];
gattr = &sect_attrs->grp.attrs[0];
for (i = 0; i < nsect; i++) {
if (! (sechdrs[i].sh_flags & SHF_ALLOC))
continue;
sattr->address = sechdrs[i].sh_addr;
- strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
- MODULE_SECT_NAME_LEN);
+ sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
+ GFP_KERNEL);
+ if (sattr->name == NULL)
+ goto out;
+ sect_attrs->nsections++;
sattr->mattr.show = module_sect_show;
sattr->mattr.store = NULL;
sattr->mattr.attr.name = sattr->name;
@@ -923,7 +993,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
mod->sect_attrs = sect_attrs;
return;
out:
- kfree(sect_attrs);
+ free_sect_attrs(sect_attrs);
}
static void remove_sect_attrs(struct module *mod)
@@ -933,13 +1003,13 @@ static void remove_sect_attrs(struct module *mod)
&mod->sect_attrs->grp);
/* We are positive that no one is using any sect attrs
* at this point. Deallocate immediately. */
- kfree(mod->sect_attrs);
+ free_sect_attrs(mod->sect_attrs);
mod->sect_attrs = NULL;
}
}
-
#else
+
static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
char *sectstrings, Elf_Shdr *sechdrs)
{
@@ -998,6 +1068,12 @@ static int mod_sysfs_setup(struct module *mod,
{
int err;
+ if (!module_subsys.kset.subsys) {
+ printk(KERN_ERR "%s: module_subsys not initialized\n",
+ mod->name);
+ err = -EINVAL;
+ goto out;
+ }
memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
if (err)
@@ -1051,6 +1127,8 @@ static void free_module(struct module *mod)
remove_sect_attrs(mod);
mod_kobject_remove(mod);
+ unwind_remove_table(mod->unwind_info, 0);
+
/* Arch-specific cleanup. */
module_arch_cleanup(mod);
@@ -1063,6 +1141,9 @@ static void free_module(struct module *mod)
if (mod->percpu)
percpu_modfree(mod->percpu);
+ /* Free lock-classes: */
+ lockdep_free_key_range(mod->module_core, mod->core_size);
+
/* Finally, free the core (containing the module structure) */
module_free(mod, mod->module_core);
}
@@ -1248,16 +1329,6 @@ static void layout_sections(struct module *mod,
}
}
-static inline int license_is_gpl_compatible(const char *license)
-{
- return (strcmp(license, "GPL") == 0
- || strcmp(license, "GPL v2") == 0
- || strcmp(license, "GPL and additional rights") == 0
- || strcmp(license, "Dual BSD/GPL") == 0
- || strcmp(license, "Dual MIT/GPL") == 0
- || strcmp(license, "Dual MPL/GPL") == 0);
-}
-
static void set_license(struct module *mod, const char *license)
{
if (!license)
@@ -1326,7 +1397,7 @@ int is_exported(const char *name, const struct module *mod)
if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
return 1;
else
- if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
+ if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
return 1;
else
return 0;
@@ -1409,10 +1480,27 @@ static struct module *load_module(void __user *umod,
Elf_Ehdr *hdr;
Elf_Shdr *sechdrs;
char *secstrings, *args, *modmagic, *strtab = NULL;
- unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
- exportindex, modindex, obsparmindex, infoindex, gplindex,
- crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
- gplfuturecrcindex;
+ unsigned int i;
+ unsigned int symindex = 0;
+ unsigned int strindex = 0;
+ unsigned int setupindex;
+ unsigned int exindex;
+ unsigned int exportindex;
+ unsigned int modindex;
+ unsigned int obsparmindex;
+ unsigned int infoindex;
+ unsigned int gplindex;
+ unsigned int crcindex;
+ unsigned int gplcrcindex;
+ unsigned int versindex;
+ unsigned int pcpuindex;
+ unsigned int gplfutureindex;
+ unsigned int gplfuturecrcindex;
+ unsigned int unwindex = 0;
+ unsigned int unusedindex;
+ unsigned int unusedcrcindex;
+ unsigned int unusedgplindex;
+ unsigned int unusedgplcrcindex;
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1493,15 +1581,22 @@ static struct module *load_module(void __user *umod,
exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
+ unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
+ unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
+ unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
+ unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
+#ifdef ARCH_UNWIND_SECTION_NAME
+ unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
+#endif
/* Don't keep modinfo section */
sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1510,6 +1605,8 @@ static struct module *load_module(void __user *umod,
sechdrs[symindex].sh_flags |= SHF_ALLOC;
sechdrs[strindex].sh_flags |= SHF_ALLOC;
#endif
+ if (unwindex)
+ sechdrs[unwindex].sh_flags |= SHF_ALLOC;
/* Check module struct version now, before we try to use module. */
if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -1639,14 +1736,27 @@ static struct module *load_module(void __user *umod,
mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
sizeof(*mod->gpl_future_syms);
+ mod->num_unused_syms = sechdrs[unusedindex].sh_size /
+ sizeof(*mod->unused_syms);
+ mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
+ sizeof(*mod->unused_gpl_syms);
mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
if (gplfuturecrcindex)
mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
+ mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
+ if (unusedcrcindex)
+ mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
+ mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
+ if (unusedgplcrcindex)
+ mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr;
+
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !crcindex) ||
(mod->num_gpl_syms && !gplcrcindex) ||
- (mod->num_gpl_future_syms && !gplfuturecrcindex)) {
+ (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
+ (mod->num_unused_syms && !unusedcrcindex) ||
+ (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
printk(KERN_WARNING "%s: No versions for exported symbols."
" Tainting kernel.\n", mod->name);
add_taint(TAINT_FORCED_MODULE);
@@ -1738,6 +1848,11 @@ static struct module *load_module(void __user *umod,
goto arch_cleanup;
add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
+ /* Size of section 0 is 0, so this works well if no unwind info. */
+ mod->unwind_info = unwind_add_table(mod,
+ (void *)sechdrs[unwindex].sh_addr,
+ sechdrs[unwindex].sh_size);
+
/* Get rid of temporary copy */
vfree(hdr);
@@ -1836,6 +1951,7 @@ sys_init_module(void __user *umod,
mod->state = MODULE_STATE_LIVE;
/* Drop initial reference. */
module_put(mod);
+ unwind_remove_table(mod->unwind_info, 1);
module_free(mod, mod->module_init);
mod->module_init = NULL;
mod->init_size = 0;
@@ -1923,10 +2039,8 @@ const char *module_address_lookup(unsigned long addr,
return NULL;
}
-struct module *module_get_kallsym(unsigned int symnum,
- unsigned long *value,
- char *type,
- char namebuf[128])
+struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *name, size_t namelen)
{
struct module *mod;
@@ -1935,9 +2049,8 @@ struct module *module_get_kallsym(unsigned int symnum,
if (symnum < mod->num_symtab) {
*value = mod->symtab[symnum].st_value;
*type = mod->symtab[symnum].st_info;
- strncpy(namebuf,
- mod->strtab + mod->symtab[symnum].st_name,
- 127);
+ strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
+ namelen);
mutex_unlock(&module_mutex);
return mod;
}
@@ -2066,6 +2179,29 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
return e;
}
+/*
+ * Is this a valid module address?
+ */
+int is_module_address(unsigned long addr)
+{
+ unsigned long flags;
+ struct module *mod;
+
+ spin_lock_irqsave(&modlist_lock, flags);
+
+ list_for_each_entry(mod, &modules, list) {
+ if (within(addr, mod->module_core, mod->core_size)) {
+ spin_unlock_irqrestore(&modlist_lock, flags);
+ return 1;
+ }
+ }
+
+ spin_unlock_irqrestore(&modlist_lock, flags);
+
+ return 0;
+}
+
+
/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */
struct module *__module_text_address(unsigned long addr)
{
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index f4913c376950..e3203c654dda 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,395 +16,48 @@
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/module.h>
+#include <linux/poison.h>
#include <linux/spinlock.h>
#include <linux/kallsyms.h>
#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
#include "mutex-debug.h"
/*
- * We need a global lock when we walk through the multi-process
- * lock tree. Only used in the deadlock-debugging case.
- */
-DEFINE_SPINLOCK(debug_mutex_lock);
-
-/*
- * All locks held by all tasks, in a single global list:
- */
-LIST_HEAD(debug_mutex_held_locks);
-
-/*
- * In the debug case we carry the caller's instruction pointer into
- * other functions, but we dont want the function argument overhead
- * in the nondebug case - hence these macros:
- */
-#define __IP_DECL__ , unsigned long ip
-#define __IP__ , ip
-#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
-
-/*
- * "mutex debugging enabled" flag. We turn it off when we detect
- * the first problem because we dont want to recurse back
- * into the tracing code when doing error printk or
- * executing a BUG():
- */
-int debug_mutex_on = 1;
-
-static void printk_task(struct task_struct *p)
-{
- if (p)
- printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
- else
- printk("<none>");
-}
-
-static void printk_ti(struct thread_info *ti)
-{
- if (ti)
- printk_task(ti->task);
- else
- printk("<none>");
-}
-
-static void printk_task_short(struct task_struct *p)
-{
- if (p)
- printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
- else
- printk("<none>");
-}
-
-static void printk_lock(struct mutex *lock, int print_owner)
-{
- printk(" [%p] {%s}\n", lock, lock->name);
-
- if (print_owner && lock->owner) {
- printk(".. held by: ");
- printk_ti(lock->owner);
- printk("\n");
- }
- if (lock->owner) {
- printk("... acquired at: ");
- print_symbol("%s\n", lock->acquire_ip);
- }
-}
-
-/*
- * printk locks held by a task:
- */
-static void show_task_locks(struct task_struct *p)
-{
- switch (p->state) {
- case TASK_RUNNING: printk("R"); break;
- case TASK_INTERRUPTIBLE: printk("S"); break;
- case TASK_UNINTERRUPTIBLE: printk("D"); break;
- case TASK_STOPPED: printk("T"); break;
- case EXIT_ZOMBIE: printk("Z"); break;
- case EXIT_DEAD: printk("X"); break;
- default: printk("?"); break;
- }
- printk_task(p);
- if (p->blocked_on) {
- struct mutex *lock = p->blocked_on->lock;
-
- printk(" blocked on mutex:");
- printk_lock(lock, 1);
- } else
- printk(" (not blocked on mutex)\n");
-}
-
-/*
- * printk all locks held in the system (if filter == NULL),
- * or all locks belonging to a single task (if filter != NULL):
- */
-void show_held_locks(struct task_struct *filter)
-{
- struct list_head *curr, *cursor = NULL;
- struct mutex *lock;
- struct thread_info *t;
- unsigned long flags;
- int count = 0;
-
- if (filter) {
- printk("------------------------------\n");
- printk("| showing all locks held by: | (");
- printk_task_short(filter);
- printk("):\n");
- printk("------------------------------\n");
- } else {
- printk("---------------------------\n");
- printk("| showing all locks held: |\n");
- printk("---------------------------\n");
- }
-
- /*
- * Play safe and acquire the global trace lock. We
- * cannot printk with that lock held so we iterate
- * very carefully:
- */
-next:
- debug_spin_lock_save(&debug_mutex_lock, flags);
- list_for_each(curr, &debug_mutex_held_locks) {
- if (cursor && curr != cursor)
- continue;
- lock = list_entry(curr, struct mutex, held_list);
- t = lock->owner;
- if (filter && (t != filter->thread_info))
- continue;
- count++;
- cursor = curr->next;
- debug_spin_lock_restore(&debug_mutex_lock, flags);
-
- printk("\n#%03d: ", count);
- printk_lock(lock, filter ? 0 : 1);
- goto next;
- }
- debug_spin_lock_restore(&debug_mutex_lock, flags);
- printk("\n");
-}
-
-void mutex_debug_show_all_locks(void)
-{
- struct task_struct *g, *p;
- int count = 10;
- int unlock = 1;
-
- printk("\nShowing all blocking locks in the system:\n");
-
- /*
- * Here we try to get the tasklist_lock as hard as possible,
- * if not successful after 2 seconds we ignore it (but keep
- * trying). This is to enable a debug printout even if a
- * tasklist_lock-holding task deadlocks or crashes.
- */
-retry:
- if (!read_trylock(&tasklist_lock)) {
- if (count == 10)
- printk("hm, tasklist_lock locked, retrying... ");
- if (count) {
- count--;
- printk(" #%d", 10-count);
- mdelay(200);
- goto retry;
- }
- printk(" ignoring it.\n");
- unlock = 0;
- }
- if (count != 10)
- printk(" locked it.\n");
-
- do_each_thread(g, p) {
- show_task_locks(p);
- if (!unlock)
- if (read_trylock(&tasklist_lock))
- unlock = 1;
- } while_each_thread(g, p);
-
- printk("\n");
- show_held_locks(NULL);
- printk("=============================================\n\n");
-
- if (unlock)
- read_unlock(&tasklist_lock);
-}
-
-static void report_deadlock(struct task_struct *task, struct mutex *lock,
- struct mutex *lockblk, unsigned long ip)
-{
- printk("\n%s/%d is trying to acquire this lock:\n",
- current->comm, current->pid);
- printk_lock(lock, 1);
- printk("... trying at: ");
- print_symbol("%s\n", ip);
- show_held_locks(current);
-
- if (lockblk) {
- printk("but %s/%d is deadlocking current task %s/%d!\n\n",
- task->comm, task->pid, current->comm, current->pid);
- printk("\n%s/%d is blocked on this lock:\n",
- task->comm, task->pid);
- printk_lock(lockblk, 1);
-
- show_held_locks(task);
-
- printk("\n%s/%d's [blocked] stackdump:\n\n",
- task->comm, task->pid);
- show_stack(task, NULL);
- }
-
- printk("\n%s/%d's [current] stackdump:\n\n",
- current->comm, current->pid);
- dump_stack();
- mutex_debug_show_all_locks();
- printk("[ turning off deadlock detection. Please report this. ]\n\n");
- local_irq_disable();
-}
-
-/*
- * Recursively check for mutex deadlocks:
- */
-static int check_deadlock(struct mutex *lock, int depth,
- struct thread_info *ti, unsigned long ip)
-{
- struct mutex *lockblk;
- struct task_struct *task;
-
- if (!debug_mutex_on)
- return 0;
-
- ti = lock->owner;
- if (!ti)
- return 0;
-
- task = ti->task;
- lockblk = NULL;
- if (task->blocked_on)
- lockblk = task->blocked_on->lock;
-
- /* Self-deadlock: */
- if (current == task) {
- DEBUG_OFF();
- if (depth)
- return 1;
- printk("\n==========================================\n");
- printk( "[ BUG: lock recursion deadlock detected! |\n");
- printk( "------------------------------------------\n");
- report_deadlock(task, lock, NULL, ip);
- return 0;
- }
-
- /* Ugh, something corrupted the lock data structure? */
- if (depth > 20) {
- DEBUG_OFF();
- printk("\n===========================================\n");
- printk( "[ BUG: infinite lock dependency detected!? |\n");
- printk( "-------------------------------------------\n");
- report_deadlock(task, lock, lockblk, ip);
- return 0;
- }
-
- /* Recursively check for dependencies: */
- if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) {
- printk("\n============================================\n");
- printk( "[ BUG: circular locking deadlock detected! ]\n");
- printk( "--------------------------------------------\n");
- report_deadlock(task, lock, lockblk, ip);
- return 0;
- }
- return 0;
-}
-
-/*
- * Called when a task exits, this function checks whether the
- * task is holding any locks, and reports the first one if so:
- */
-void mutex_debug_check_no_locks_held(struct task_struct *task)
-{
- struct list_head *curr, *next;
- struct thread_info *t;
- unsigned long flags;
- struct mutex *lock;
-
- if (!debug_mutex_on)
- return;
-
- debug_spin_lock_save(&debug_mutex_lock, flags);
- list_for_each_safe(curr, next, &debug_mutex_held_locks) {
- lock = list_entry(curr, struct mutex, held_list);
- t = lock->owner;
- if (t != task->thread_info)
- continue;
- list_del_init(curr);
- DEBUG_OFF();
- debug_spin_lock_restore(&debug_mutex_lock, flags);
-
- printk("BUG: %s/%d, lock held at task exit time!\n",
- task->comm, task->pid);
- printk_lock(lock, 1);
- if (lock->owner != task->thread_info)
- printk("exiting task is not even the owner??\n");
- return;
- }
- debug_spin_lock_restore(&debug_mutex_lock, flags);
-}
-
-/*
- * Called when kernel memory is freed (or unmapped), or if a mutex
- * is destroyed or reinitialized - this code checks whether there is
- * any held lock in the memory range of <from> to <to>:
- */
-void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
-{
- struct list_head *curr, *next;
- const void *to = from + len;
- unsigned long flags;
- struct mutex *lock;
- void *lock_addr;
-
- if (!debug_mutex_on)
- return;
-
- debug_spin_lock_save(&debug_mutex_lock, flags);
- list_for_each_safe(curr, next, &debug_mutex_held_locks) {
- lock = list_entry(curr, struct mutex, held_list);
- lock_addr = lock;
- if (lock_addr < from || lock_addr >= to)
- continue;
- list_del_init(curr);
- DEBUG_OFF();
- debug_spin_lock_restore(&debug_mutex_lock, flags);
-
- printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
- current->comm, current->pid, lock, from, to);
- dump_stack();
- printk_lock(lock, 1);
- if (lock->owner != current_thread_info())
- printk("freeing task is not even the owner??\n");
- return;
- }
- debug_spin_lock_restore(&debug_mutex_lock, flags);
-}
-
-/*
* Must be called with lock->wait_lock held.
*/
-void debug_mutex_set_owner(struct mutex *lock,
- struct thread_info *new_owner __IP_DECL__)
+void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
{
lock->owner = new_owner;
- DEBUG_WARN_ON(!list_empty(&lock->held_list));
- if (debug_mutex_on) {
- list_add_tail(&lock->held_list, &debug_mutex_held_locks);
- lock->acquire_ip = ip;
- }
}
-void debug_mutex_init_waiter(struct mutex_waiter *waiter)
+void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
{
- memset(waiter, 0x11, sizeof(*waiter));
+ memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
waiter->magic = waiter;
INIT_LIST_HEAD(&waiter->list);
}
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
{
- SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock));
- DEBUG_WARN_ON(list_empty(&lock->wait_list));
- DEBUG_WARN_ON(waiter->magic != waiter);
- DEBUG_WARN_ON(list_empty(&waiter->list));
+ SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+ DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
+ DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
+ DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
}
void debug_mutex_free_waiter(struct mutex_waiter *waiter)
{
- DEBUG_WARN_ON(!list_empty(&waiter->list));
- memset(waiter, 0x22, sizeof(*waiter));
+ DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list));
+ memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
}
void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti __IP_DECL__)
+ struct thread_info *ti)
{
- SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock));
- check_deadlock(lock, 0, ti, ip);
+ SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+
/* Mark the current thread as blocked on the lock: */
ti->task->blocked_on = waiter;
waiter->lock = lock;
@@ -413,9 +66,9 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct thread_info *ti)
{
- DEBUG_WARN_ON(list_empty(&waiter->list));
- DEBUG_WARN_ON(waiter->task != ti->task);
- DEBUG_WARN_ON(ti->task->blocked_on != waiter);
+ DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
+ DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
+ DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
ti->task->blocked_on = NULL;
list_del_init(&waiter->list);
@@ -424,24 +77,23 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
void debug_mutex_unlock(struct mutex *lock)
{
- DEBUG_WARN_ON(lock->magic != lock);
- DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
- DEBUG_WARN_ON(lock->owner != current_thread_info());
- if (debug_mutex_on) {
- DEBUG_WARN_ON(list_empty(&lock->held_list));
- list_del_init(&lock->held_list);
- }
+ DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+ DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+ DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
+ DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
}
-void debug_mutex_init(struct mutex *lock, const char *name)
+void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* Make sure we are not reinitializing a held lock:
*/
- mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key);
+#endif
lock->owner = NULL;
- INIT_LIST_HEAD(&lock->held_list);
- lock->name = name;
lock->magic = lock;
}
@@ -455,7 +107,7 @@ void debug_mutex_init(struct mutex *lock, const char *name)
*/
void fastcall mutex_destroy(struct mutex *lock)
{
- DEBUG_WARN_ON(mutex_is_locked(lock));
+ DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
lock->magic = NULL;
}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index fd384050acb1..babfbdfc534b 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -10,125 +10,44 @@
* More details are in kernel/mutex-debug.c.
*/
-extern spinlock_t debug_mutex_lock;
-extern struct list_head debug_mutex_held_locks;
-extern int debug_mutex_on;
-
-/*
- * In the debug case we carry the caller's instruction pointer into
- * other functions, but we dont want the function argument overhead
- * in the nondebug case - hence these macros:
- */
-#define __IP_DECL__ , unsigned long ip
-#define __IP__ , ip
-#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
-
/*
* This must be called with lock->wait_lock held.
*/
-extern void debug_mutex_set_owner(struct mutex *lock,
- struct thread_info *new_owner __IP_DECL__);
+extern void
+debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
static inline void debug_mutex_clear_owner(struct mutex *lock)
{
lock->owner = NULL;
}
-extern void debug_mutex_init_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_lock_common(struct mutex *lock,
+ struct mutex_waiter *waiter);
extern void debug_mutex_wake_waiter(struct mutex *lock,
struct mutex_waiter *waiter);
extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
extern void debug_mutex_add_waiter(struct mutex *lock,
struct mutex_waiter *waiter,
- struct thread_info *ti __IP_DECL__);
+ struct thread_info *ti);
extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct thread_info *ti);
extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name);
-
-#define debug_spin_lock(lock) \
- do { \
- local_irq_disable(); \
- if (debug_mutex_on) \
- spin_lock(lock); \
- } while (0)
+extern void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key);
-#define debug_spin_unlock(lock) \
- do { \
- if (debug_mutex_on) \
- spin_unlock(lock); \
- local_irq_enable(); \
- preempt_check_resched(); \
- } while (0)
-
-#define debug_spin_lock_save(lock, flags) \
+#define spin_lock_mutex(lock, flags) \
do { \
+ struct mutex *l = container_of(lock, struct mutex, wait_lock); \
+ \
+ DEBUG_LOCKS_WARN_ON(in_interrupt()); \
local_irq_save(flags); \
- if (debug_mutex_on) \
- spin_lock(lock); \
+ __raw_spin_lock(&(lock)->raw_lock); \
+ DEBUG_LOCKS_WARN_ON(l->magic != l); \
} while (0)
-#define debug_spin_lock_restore(lock, flags) \
+#define spin_unlock_mutex(lock, flags) \
do { \
- if (debug_mutex_on) \
- spin_unlock(lock); \
+ __raw_spin_unlock(&(lock)->raw_lock); \
local_irq_restore(flags); \
preempt_check_resched(); \
} while (0)
-
-#define spin_lock_mutex(lock) \
- do { \
- struct mutex *l = container_of(lock, struct mutex, wait_lock); \
- \
- DEBUG_WARN_ON(in_interrupt()); \
- debug_spin_lock(&debug_mutex_lock); \
- spin_lock(lock); \
- DEBUG_WARN_ON(l->magic != l); \
- } while (0)
-
-#define spin_unlock_mutex(lock) \
- do { \
- spin_unlock(lock); \
- debug_spin_unlock(&debug_mutex_lock); \
- } while (0)
-
-#define DEBUG_OFF() \
-do { \
- if (debug_mutex_on) { \
- debug_mutex_on = 0; \
- console_verbose(); \
- if (spin_is_locked(&debug_mutex_lock)) \
- spin_unlock(&debug_mutex_lock); \
- } \
-} while (0)
-
-#define DEBUG_BUG() \
-do { \
- if (debug_mutex_on) { \
- DEBUG_OFF(); \
- BUG(); \
- } \
-} while (0)
-
-#define DEBUG_WARN_ON(c) \
-do { \
- if (unlikely(c && debug_mutex_on)) { \
- DEBUG_OFF(); \
- WARN_ON(1); \
- } \
-} while (0)
-
-# define DEBUG_BUG_ON(c) \
-do { \
- if (unlikely(c)) \
- DEBUG_BUG(); \
-} while (0)
-
-#ifdef CONFIG_SMP
-# define SMP_DEBUG_WARN_ON(c) DEBUG_WARN_ON(c)
-# define SMP_DEBUG_BUG_ON(c) DEBUG_BUG_ON(c)
-#else
-# define SMP_DEBUG_WARN_ON(c) do { } while (0)
-# define SMP_DEBUG_BUG_ON(c) do { } while (0)
-#endif
-
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5449b210d9ed..8c71cf72a497 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -17,6 +17,7 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -38,13 +39,14 @@
*
* It is not allowed to initialize an already locked mutex.
*/
-void fastcall __mutex_init(struct mutex *lock, const char *name)
+void
+__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
atomic_set(&lock->count, 1);
spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
- debug_mutex_init(lock, name);
+ debug_mutex_init(lock, name, key);
}
EXPORT_SYMBOL(__mutex_init);
@@ -56,7 +58,7 @@ EXPORT_SYMBOL(__mutex_init);
* branch is predicted by the CPU as default-untaken.
*/
static void fastcall noinline __sched
-__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__);
+__mutex_lock_slowpath(atomic_t *lock_count);
/***
* mutex_lock - acquire the mutex
@@ -79,7 +81,7 @@ __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__);
*
* This function is similar to (but not equivalent to) down().
*/
-void fastcall __sched mutex_lock(struct mutex *lock)
+void inline fastcall __sched mutex_lock(struct mutex *lock)
{
might_sleep();
/*
@@ -92,7 +94,7 @@ void fastcall __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
static void fastcall noinline __sched
-__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__);
+__mutex_unlock_slowpath(atomic_t *lock_count);
/***
* mutex_unlock - release the mutex
@@ -120,17 +122,18 @@ EXPORT_SYMBOL(mutex_unlock);
* Lock a mutex (possibly interruptible), slowpath:
*/
static inline int __sched
-__mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
+__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
{
struct task_struct *task = current;
struct mutex_waiter waiter;
unsigned int old_val;
+ unsigned long flags;
- debug_mutex_init_waiter(&waiter);
+ spin_lock_mutex(&lock->wait_lock, flags);
- spin_lock_mutex(&lock->wait_lock);
-
- debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip);
+ debug_mutex_lock_common(lock, &waiter);
+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ debug_mutex_add_waiter(lock, &waiter, task->thread_info);
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
@@ -157,7 +160,8 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
if (unlikely(state == TASK_INTERRUPTIBLE &&
signal_pending(task))) {
mutex_remove_waiter(lock, &waiter, task->thread_info);
- spin_unlock_mutex(&lock->wait_lock);
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
return -EINTR;
@@ -165,48 +169,57 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
__set_task_state(task, state);
/* didnt get the lock, go to sleep: */
- spin_unlock_mutex(&lock->wait_lock);
+ spin_unlock_mutex(&lock->wait_lock, flags);
schedule();
- spin_lock_mutex(&lock->wait_lock);
+ spin_lock_mutex(&lock->wait_lock, flags);
}
/* got the lock - rejoice! */
mutex_remove_waiter(lock, &waiter, task->thread_info);
- debug_mutex_set_owner(lock, task->thread_info __IP__);
+ debug_mutex_set_owner(lock, task->thread_info);
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
atomic_set(&lock->count, 0);
- spin_unlock_mutex(&lock->wait_lock);
+ spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
- DEBUG_WARN_ON(list_empty(&lock->held_list));
- DEBUG_WARN_ON(lock->owner != task->thread_info);
-
return 0;
}
static void fastcall noinline __sched
-__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__)
+__mutex_lock_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__);
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched
+mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass);
}
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+#endif
+
/*
* Release the lock, slowpath:
*/
-static fastcall noinline void
-__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
+static fastcall inline void
+__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
+ unsigned long flags;
- DEBUG_WARN_ON(lock->owner != current_thread_info());
-
- spin_lock_mutex(&lock->wait_lock);
+ spin_lock_mutex(&lock->wait_lock, flags);
+ mutex_release(&lock->dep_map, nested, _RET_IP_);
+ debug_mutex_unlock(lock);
/*
* some architectures leave the lock unlocked in the fastpath failure
@@ -216,8 +229,6 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
if (__mutex_slowpath_needs_to_unlock())
atomic_set(&lock->count, 1);
- debug_mutex_unlock(lock);
-
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
struct mutex_waiter *waiter =
@@ -231,7 +242,16 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
debug_mutex_clear_owner(lock);
- spin_unlock_mutex(&lock->wait_lock);
+ spin_unlock_mutex(&lock->wait_lock, flags);
+}
+
+/*
+ * Release the lock, slowpath:
+ */
+static fastcall noinline void
+__mutex_unlock_slowpath(atomic_t *lock_count)
+{
+ __mutex_unlock_common_slowpath(lock_count, 1);
}
/*
@@ -239,7 +259,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
* mutex_lock_interruptible() and mutex_trylock().
*/
static int fastcall noinline __sched
-__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__);
+__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
/***
* mutex_lock_interruptible - acquire the mutex, interruptable
@@ -262,11 +282,11 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock_interruptible);
static int fastcall noinline __sched
-__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
+__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__);
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0);
}
/*
@@ -276,18 +296,21 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
+ unsigned long flags;
int prev;
- spin_lock_mutex(&lock->wait_lock);
+ spin_lock_mutex(&lock->wait_lock, flags);
prev = atomic_xchg(&lock->count, -1);
- if (likely(prev == 1))
- debug_mutex_set_owner(lock, current_thread_info() __RET_IP__);
+ if (likely(prev == 1)) {
+ debug_mutex_set_owner(lock, current_thread_info());
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ }
/* Set it back to 0 if there are no waiters: */
if (likely(list_empty(&lock->wait_list)))
atomic_set(&lock->count, 0);
- spin_unlock_mutex(&lock->wait_lock);
+ spin_unlock_mutex(&lock->wait_lock, flags);
return prev == 1;
}
@@ -306,7 +329,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
* This function must not be used in interrupt context. The
* mutex must be released by the same task that acquired it.
*/
-int fastcall mutex_trylock(struct mutex *lock)
+int fastcall __sched mutex_trylock(struct mutex *lock)
{
return __mutex_fastpath_trylock(&lock->count,
__mutex_trylock_slowpath);
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 00fe84e7b672..a075dafbb290 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -9,27 +9,22 @@
* !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
*/
-#define spin_lock_mutex(lock) spin_lock(lock)
-#define spin_unlock_mutex(lock) spin_unlock(lock)
+#define spin_lock_mutex(lock, flags) \
+ do { spin_lock(lock); (void)(flags); } while (0)
+#define spin_unlock_mutex(lock, flags) \
+ do { spin_unlock(lock); (void)(flags); } while (0)
#define mutex_remove_waiter(lock, waiter, ti) \
__list_del((waiter)->list.prev, (waiter)->list.next)
-#define DEBUG_WARN_ON(c) do { } while (0)
#define debug_mutex_set_owner(lock, new_owner) do { } while (0)
#define debug_mutex_clear_owner(lock) do { } while (0)
-#define debug_mutex_init_waiter(waiter) do { } while (0)
#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
#define debug_mutex_free_waiter(waiter) do { } while (0)
-#define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0)
+#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
#define debug_mutex_unlock(lock) do { } while (0)
-#define debug_mutex_init(lock, name) do { } while (0)
-
-/*
- * Return-address parameters/declarations. They are very useful for
- * debugging, but add overhead in the !DEBUG case - so we go the
- * trouble of using this not too elegant but zero-cost solution:
- */
-#define __IP_DECL__
-#define __IP__
-#define __RET_IP__
+#define debug_mutex_init(lock, name, key) do { } while (0)
+static inline void
+debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
+{
+}
diff --git a/kernel/panic.c b/kernel/panic.c
index cc2a4c9c36ac..525e365f7239 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -8,7 +8,6 @@
* This function is used through-out the kernel (including mm and fs)
* to indicate a major problem.
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/delay.h>
@@ -19,6 +18,7 @@
#include <linux/interrupt.h>
#include <linux/nmi.h>
#include <linux/kexec.h>
+#include <linux/debug_locks.h>
int panic_on_oops;
int tainted;
@@ -173,6 +173,7 @@ const char *print_tainted(void)
void add_taint(unsigned flag)
{
+ debug_locks = 0; /* can't trust the integrity of the kernel anymore */
tainted |= flag;
}
EXPORT_SYMBOL(add_taint);
@@ -257,6 +258,7 @@ int oops_may_print(void)
*/
void oops_enter(void)
{
+ debug_locks_off(); /* can't trust the integrity of the kernel anymore */
do_oops_enter_exit();
}
@@ -268,3 +270,15 @@ void oops_exit(void)
{
do_oops_enter_exit();
}
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+/*
+ * Called when gcc's -fstack-protector feature is used, and
+ * gcc detects corruption of the on-stack canary value
+ */
+void __stack_chk_fail(void)
+{
+ panic("stack-protector: Kernel stack is corrupted");
+}
+EXPORT_SYMBOL(__stack_chk_fail);
+#endif
diff --git a/kernel/params.c b/kernel/params.c
index af43ecdc8d9b..f406655d6653 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include <linux/config.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/string.h>
@@ -548,6 +547,7 @@ static void __init kernel_param_sysfs_setup(const char *name,
unsigned int name_skip)
{
struct module_kobject *mk;
+ int ret;
mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
BUG_ON(!mk);
@@ -555,7 +555,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
mk->mod = THIS_MODULE;
kobj_set_kset_s(mk, module_subsys);
kobject_set_name(&mk->kobj, name);
- kobject_register(&mk->kobj);
+ ret = kobject_register(&mk->kobj);
+ BUG_ON(ret < 0);
/* no need to keep the kobject if no parameter is exported */
if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
@@ -685,13 +686,20 @@ decl_subsys(module, &module_ktype, NULL);
*/
static int __init param_sysfs_init(void)
{
- subsystem_register(&module_subsys);
+ int ret;
+
+ ret = subsystem_register(&module_subsys);
+ if (ret < 0) {
+ printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n",
+ __FILE__, __LINE__, ret);
+ return ret;
+ }
param_sysfs_builtin();
return 0;
}
-__initcall(param_sysfs_init);
+subsys_initcall(param_sysfs_init);
EXPORT_SYMBOL(param_set_byte);
EXPORT_SYMBOL(param_get_byte);
diff --git a/kernel/pid.c b/kernel/pid.c
index eeb836b65ca4..8387e8c68193 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -218,14 +218,11 @@ struct pid * fastcall find_pid(int nr)
return NULL;
}
-int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
+int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
{
struct pid_link *link;
struct pid *pid;
- WARN_ON(!task->pid); /* to be removed soon */
- WARN_ON(!nr); /* to be removed soon */
-
link = &task->pids[type];
link->pid = pid = find_pid(nr);
hlist_add_head_rcu(&link->node, &pid->tasks[type]);
@@ -233,7 +230,7 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
return 0;
}
-void fastcall detach_pid(task_t *task, enum pid_type type)
+void fastcall detach_pid(struct task_struct *task, enum pid_type type)
{
struct pid_link *link;
struct pid *pid;
@@ -252,6 +249,15 @@ void fastcall detach_pid(task_t *task, enum pid_type type)
free_pid(pid);
}
+/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
+void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
+ enum pid_type type)
+{
+ new->pids[type].pid = old->pids[type].pid;
+ hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
+ old->pids[type].pid = NULL;
+}
+
struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
{
struct task_struct *result = NULL;
@@ -267,7 +273,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
/*
* Must be called under rcu_read_lock() or with tasklist_lock read-held.
*/
-task_t *find_task_by_pid_type(int type, int nr)
+struct task_struct *find_task_by_pid_type(int type, int nr)
{
return pid_task(find_pid(nr), type);
}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index d38d9ec3276c..479b16b44f79 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1393,25 +1393,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
}
}
-static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
-
-int posix_cpu_nsleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp, struct timespec __user *rmtp)
+static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct itimerspec *it)
{
- struct restart_block *restart_block =
- &current_thread_info()->restart_block;
struct k_itimer timer;
int error;
/*
- * Diagnose required errors first.
- */
- if (CPUCLOCK_PERTHREAD(which_clock) &&
- (CPUCLOCK_PID(which_clock) == 0 ||
- CPUCLOCK_PID(which_clock) == current->pid))
- return -EINVAL;
-
- /*
* Set up a temporary timer and then wait for it to go off.
*/
memset(&timer, 0, sizeof timer);
@@ -1422,11 +1410,12 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
timer.it_process = current;
if (!error) {
static struct itimerspec zero_it;
- struct itimerspec it = { .it_value = *rqtp,
- .it_interval = {} };
+
+ memset(it, 0, sizeof *it);
+ it->it_value = *rqtp;
spin_lock_irq(&timer.it_lock);
- error = posix_cpu_timer_set(&timer, flags, &it, NULL);
+ error = posix_cpu_timer_set(&timer, flags, it, NULL);
if (error) {
spin_unlock_irq(&timer.it_lock);
return error;
@@ -1454,49 +1443,89 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
* We were interrupted by a signal.
*/
sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
- posix_cpu_timer_set(&timer, 0, &zero_it, &it);
+ posix_cpu_timer_set(&timer, 0, &zero_it, it);
spin_unlock_irq(&timer.it_lock);
- if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
+ if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
/*
* It actually did fire already.
*/
return 0;
}
+ error = -ERESTART_RESTARTBLOCK;
+ }
+
+ return error;
+}
+
+int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct timespec __user *rmtp)
+{
+ struct restart_block *restart_block =
+ &current_thread_info()->restart_block;
+ struct itimerspec it;
+ int error;
+
+ /*
+ * Diagnose required errors first.
+ */
+ if (CPUCLOCK_PERTHREAD(which_clock) &&
+ (CPUCLOCK_PID(which_clock) == 0 ||
+ CPUCLOCK_PID(which_clock) == current->pid))
+ return -EINVAL;
+
+ error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
+
+ if (error == -ERESTART_RESTARTBLOCK) {
+
+ if (flags & TIMER_ABSTIME)
+ return -ERESTARTNOHAND;
/*
- * Report back to the user the time still remaining.
- */
- if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
- copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
return -EFAULT;
- restart_block->fn = posix_cpu_clock_nanosleep_restart;
- /* Caller already set restart_block->arg1 */
+ restart_block->fn = posix_cpu_nsleep_restart;
restart_block->arg0 = which_clock;
restart_block->arg1 = (unsigned long) rmtp;
restart_block->arg2 = rqtp->tv_sec;
restart_block->arg3 = rqtp->tv_nsec;
-
- error = -ERESTART_RESTARTBLOCK;
}
-
return error;
}
-static long
-posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
+long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
clockid_t which_clock = restart_block->arg0;
struct timespec __user *rmtp;
struct timespec t;
+ struct itimerspec it;
+ int error;
rmtp = (struct timespec __user *) restart_block->arg1;
t.tv_sec = restart_block->arg2;
t.tv_nsec = restart_block->arg3;
restart_block->fn = do_no_restart_syscall;
- return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp);
+ error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
+
+ if (error == -ERESTART_RESTARTBLOCK) {
+ /*
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ return -EFAULT;
+
+ restart_block->fn = posix_cpu_nsleep_restart;
+ restart_block->arg0 = which_clock;
+ restart_block->arg1 = (unsigned long) rmtp;
+ restart_block->arg2 = t.tv_sec;
+ restart_block->arg3 = t.tv_nsec;
+ }
+ return error;
+
}
@@ -1524,6 +1553,10 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags,
{
return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
}
+static long process_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+ return -EINVAL;
+}
static int thread_cpu_clock_getres(const clockid_t which_clock,
struct timespec *tp)
{
@@ -1544,6 +1577,10 @@ static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
{
return -EINVAL;
}
+static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+ return -EINVAL;
+}
static __init int init_posix_cpu_timers(void)
{
@@ -1553,6 +1590,7 @@ static __init int init_posix_cpu_timers(void)
.clock_set = do_posix_clock_nosettime,
.timer_create = process_cpu_timer_create,
.nsleep = process_cpu_nsleep,
+ .nsleep_restart = process_cpu_nsleep_restart,
};
struct k_clock thread = {
.clock_getres = thread_cpu_clock_getres,
@@ -1560,6 +1598,7 @@ static __init int init_posix_cpu_timers(void)
.clock_set = do_posix_clock_nosettime,
.timer_create = thread_cpu_timer_create,
.nsleep = thread_cpu_nsleep,
+ .nsleep_restart = thread_cpu_nsleep_restart,
};
register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ac6dc8744429..e5ebcc1ec3a0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -973,3 +973,24 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags,
return CLOCK_DISPATCH(which_clock, nsleep,
(which_clock, flags, &t, rmtp));
}
+
+/*
+ * nanosleep_restart for monotonic and realtime clocks
+ */
+static int common_nsleep_restart(struct restart_block *restart_block)
+{
+ return hrtimer_nanosleep_restart(restart_block);
+}
+
+/*
+ * This will restart clock_nanosleep. This is required only by
+ * compat_clock_nanosleep_restart for now.
+ */
+long
+clock_nanosleep_restart(struct restart_block *restart_block)
+{
+ clockid_t which_clock = restart_block->arg0;
+
+ return CLOCK_DISPATCH(which_clock, nsleep_restart,
+ (restart_block));
+}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ce0dfb8f4a4e..825068ca3479 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,9 +36,49 @@ config PM_DEBUG
code. This is helpful when debugging and reporting various PM bugs,
like suspend support.
+config DISABLE_CONSOLE_SUSPEND
+ bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
+ depends on PM && PM_DEBUG
+ default n
+ ---help---
+ This option turns off the console suspend mechanism that prevents
+ debug messages from reaching the console during the suspend/resume
+ operations. This may be helpful when debugging device drivers'
+ suspend/resume routines, but may itself lead to problems, for example
+ if netconsole is used.
+
+config PM_TRACE
+ bool "Suspend/resume event tracing"
+ depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
+ default n
+ ---help---
+ This enables some cheesy code to save the last PM event point in the
+ RTC across reboots, so that you can debug a machine that just hangs
+ during suspend (or more commonly, during resume).
+
+ To use this debugging feature you should attempt to suspend the machine,
+ then reboot it, then run
+
+ dmesg -s 1000000 | grep 'hash matches'
+
+ CAUTION: this option will cause your machine's real-time clock to be
+ set to an invalid time after a resume.
+
+config PM_SYSFS_DEPRECATED
+ bool "Driver model /sys/devices/.../power/state files (DEPRECATED)"
+ depends on PM && SYSFS
+ default n
+ help
+ The driver model started out with a sysfs file intended to provide
+ a userspace hook for device power management. This feature has never
+ worked very well, except for limited testing purposes, and so it will
+ be removed. It's not clear that a generic mechanism could really
+ handle the wide variability of device power states; any replacements
+ are likely to be bus or driver specific.
+
config SOFTWARE_SUSPEND
bool "Software Suspend"
- depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
+ depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP))
---help---
Enable the possibility of suspending the machine.
It doesn't need ACPI or APM.
@@ -60,6 +100,10 @@ config SOFTWARE_SUSPEND
For more information take a look at <file:Documentation/power/swsusp.txt>.
+ (For now, swsusp is incompatible with PAE aka HIGHMEM_64G on i386.
+ we need identity mapping for resume to work, and that is trivial
+ to get with 4MB pages, but less than trivial on PAE).
+
config PM_STD_PARTITION
string "Default resume partition"
depends on SOFTWARE_SUSPEND
@@ -82,18 +126,6 @@ config PM_STD_PARTITION
suspended image to. It will simply pick the first available swap
device.
-config SWSUSP_ENCRYPT
- bool "Encrypt suspend image"
- depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y)
- default ""
- ---help---
- To prevent data gathering from swap after resume you can encrypt
- the suspend image with a temporary key that is deleted on
- resume.
-
- Note that the temporary key is stored unencrypted on disk while the
- system is suspended.
-
config SUSPEND_SMP
bool
depends on HOTPLUG_CPU && X86 && PM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 8d0af3d37a4b..38725f526afc 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -7,6 +7,4 @@ obj-y := main.o process.o console.o
obj-$(CONFIG_PM_LEGACY) += pm.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
-obj-$(CONFIG_SUSPEND_SMP) += smp.o
-
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 81d4d982f3f0..d72234942798 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -18,6 +18,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pm.h>
+#include <linux/cpu.h>
#include "power.h"
@@ -72,7 +73,10 @@ static int prepare_processes(void)
int error;
pm_prepare_console();
- disable_nonboot_cpus();
+
+ error = disable_nonboot_cpus();
+ if (error)
+ goto enable_cpus;
if (freeze_processes()) {
error = -EBUSY;
@@ -84,6 +88,7 @@ static int prepare_processes(void)
return 0;
thaw:
thaw_processes();
+enable_cpus:
enable_nonboot_cpus();
pm_restore_console();
return error;
@@ -98,7 +103,7 @@ static void unprepare_processes(void)
}
/**
- * pm_suspend_disk - The granpappy of power management.
+ * pm_suspend_disk - The granpappy of hibernation power management.
*
* If we're going through the firmware, then get it over with quickly.
*
@@ -207,7 +212,7 @@ static int software_resume(void)
pr_debug("PM: Preparing devices for restore.\n");
- if ((error = device_suspend(PMSG_FREEZE))) {
+ if ((error = device_suspend(PMSG_PRETHAW))) {
printk("Some devices failed to suspend\n");
swsusp_free();
goto Thaw;
@@ -231,7 +236,7 @@ static int software_resume(void)
late_initcall(software_resume);
-static char * pm_disk_modes[] = {
+static const char * const pm_disk_modes[] = {
[PM_DISK_FIRMWARE] = "firmware",
[PM_DISK_PLATFORM] = "platform",
[PM_DISK_SHUTDOWN] = "shutdown",
diff --git a/kernel/power/main.c b/kernel/power/main.c
index cdf0f07af92f..873228c71dab 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -16,6 +16,8 @@
#include <linux/init.h>
#include <linux/pm.h>
#include <linux/console.h>
+#include <linux/cpu.h>
+#include <linux/resume-trace.h>
#include "power.h"
@@ -51,7 +53,7 @@ void pm_set_ops(struct pm_ops * ops)
static int suspend_prepare(suspend_state_t state)
{
- int error = 0;
+ int error;
unsigned int free_pages;
if (!pm_ops || !pm_ops->enter)
@@ -59,12 +61,9 @@ static int suspend_prepare(suspend_state_t state)
pm_prepare_console();
- disable_nonboot_cpus();
-
- if (num_online_cpus() != 1) {
- error = -EPERM;
+ error = disable_nonboot_cpus();
+ if (error)
goto Enable_cpu;
- }
if (freeze_processes()) {
error = -EAGAIN;
@@ -145,7 +144,7 @@ static void suspend_finish(suspend_state_t state)
-static char *pm_states[PM_SUSPEND_MAX] = {
+static const char * const pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
#ifdef CONFIG_SOFTWARE_SUSPEND
@@ -262,7 +261,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
{
suspend_state_t state = PM_SUSPEND_STANDBY;
- char ** s;
+ const char * const *s;
char *p;
int error;
int len;
@@ -283,10 +282,39 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
power_attr(state);
+#ifdef CONFIG_PM_TRACE
+int pm_trace_enabled;
+
+static ssize_t pm_trace_show(struct subsystem * subsys, char * buf)
+{
+ return sprintf(buf, "%d\n", pm_trace_enabled);
+}
+
+static ssize_t
+pm_trace_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+ int val;
+
+ if (sscanf(buf, "%d", &val) == 1) {
+ pm_trace_enabled = !!val;
+ return n;
+ }
+ return -EINVAL;
+}
+
+power_attr(pm_trace);
+
+static struct attribute * g[] = {
+ &state_attr.attr,
+ &pm_trace_attr.attr,
+ NULL,
+};
+#else
static struct attribute * g[] = {
&state_attr.attr,
NULL,
};
+#endif /* CONFIG_PM_TRACE */
static struct attribute_group attr_group = {
.attrs = g,
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 84063ac8fcfc..c50d15266c10 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type,
return dev;
}
-static void __pm_unregister(struct pm_dev *dev)
-{
- if (dev) {
- list_del(&dev->entry);
- kfree(dev);
- }
-}
-
-/**
- * pm_unregister_all - unregister all devices with matching callback
- * @callback: callback function pointer
- *
- * Unregister every device that would call the callback passed. This
- * is primarily meant as a helper function for loadable modules. It
- * enables a module to give up all its managed devices without keeping
- * its own private list.
- */
-
-void pm_unregister_all(pm_callback callback)
-{
- struct list_head *entry;
-
- if (!callback)
- return;
-
- mutex_lock(&pm_devs_lock);
- entry = pm_devs.next;
- while (entry != &pm_devs) {
- struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
- entry = entry->next;
- if (dev->callback == callback)
- __pm_unregister(dev);
- }
- mutex_unlock(&pm_devs_lock);
-}
-
/**
* pm_send - send request to a single device
* @dev: device to send to
@@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data)
}
EXPORT_SYMBOL(pm_register);
-EXPORT_SYMBOL(pm_unregister_all);
EXPORT_SYMBOL(pm_send_all);
EXPORT_SYMBOL(pm_active);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 98c41423f3b1..bfe999f7b272 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,8 +38,6 @@ extern struct subsystem power_subsys;
/* References to section boundaries */
extern const void __nosave_begin, __nosave_end;
-extern struct pbe *pagedir_nosave;
-
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
extern int in_suspend;
@@ -50,21 +48,62 @@ extern asmlinkage int swsusp_arch_resume(void);
extern unsigned int count_data_pages(void);
+/**
+ * Auxiliary structure used for reading the snapshot image data and
+ * metadata from and writing them to the list of page backup entries
+ * (PBEs) which is the main data structure of swsusp.
+ *
+ * Using struct snapshot_handle we can transfer the image, including its
+ * metadata, as a continuous sequence of bytes with the help of
+ * snapshot_read_next() and snapshot_write_next().
+ *
+ * The code that writes the image to a storage or transfers it to
+ * the user land is required to use snapshot_read_next() for this
+ * purpose and it should not make any assumptions regarding the internal
+ * structure of the image. Similarly, the code that reads the image from
+ * a storage or transfers it from the user land is required to use
+ * snapshot_write_next().
+ *
+ * This may allow us to change the internal structure of the image
+ * in the future with considerably less effort.
+ */
+
struct snapshot_handle {
- loff_t offset;
- unsigned int page;
- unsigned int page_offset;
- unsigned int prev;
- struct pbe *pbe, *last_pbe;
- void *buffer;
- unsigned int buf_offset;
+ loff_t offset; /* number of the last byte ready for reading
+ * or writing in the sequence
+ */
+ unsigned int cur; /* number of the block of PAGE_SIZE bytes the
+ * next operation will refer to (ie. current)
+ */
+ unsigned int cur_offset; /* offset with respect to the current
+ * block (for the next operation)
+ */
+ unsigned int prev; /* number of the block of PAGE_SIZE bytes that
+ * was the current one previously
+ */
+ void *buffer; /* address of the block to read from
+ * or write to
+ */
+ unsigned int buf_offset; /* location to read from or write to,
+ * given as a displacement from 'buffer'
+ */
+ int sync_read; /* Set to one to notify the caller of
+ * snapshot_write_next() that it may
+ * need to call wait_on_bio_chain()
+ */
};
+/* This macro returns the address from/to which the caller of
+ * snapshot_read_next()/snapshot_write_next() is allowed to
+ * read/write data after the function returns
+ */
#define data_of(handle) ((handle).buffer + (handle).buf_offset)
+extern unsigned int snapshot_additional_pages(struct zone *zone);
extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
-int snapshot_image_loaded(struct snapshot_handle *handle);
+extern int snapshot_image_loaded(struct snapshot_handle *handle);
+extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
#define SNAPSHOT_IOC_MAGIC '3'
#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
@@ -105,10 +144,6 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
-extern unsigned int count_special_pages(void);
-extern int save_special_mem(void);
-extern int restore_special_mem(void);
-
extern int swsusp_check(void);
extern int swsusp_shrink_memory(void);
extern void swsusp_free(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index b2a5f671d6cd..72e72d2c61e6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -66,13 +66,25 @@ static inline void freeze_process(struct task_struct *p)
}
}
+static void cancel_freezing(struct task_struct *p)
+{
+ unsigned long flags;
+
+ if (freezing(p)) {
+ pr_debug(" clean up: %s\n", p->comm);
+ do_not_freeze(p);
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ recalc_sigpending_tsk(p);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ }
+}
+
/* 0 = success, else # of processes that we failed to stop */
int freeze_processes(void)
{
int todo, nr_user, user_frozen;
unsigned long start_time;
struct task_struct *g, *p;
- unsigned long flags;
printk( "Stopping tasks: " );
start_time = jiffies;
@@ -85,6 +97,10 @@ int freeze_processes(void)
continue;
if (frozen(p))
continue;
+ if (p->state == TASK_TRACED && frozen(p->parent)) {
+ cancel_freezing(p);
+ continue;
+ }
if (p->mm && !(p->flags & PF_BORROWED_MM)) {
/* The task is a user-space one.
* Freeze it unless there's a vfork completion
@@ -126,13 +142,7 @@ int freeze_processes(void)
do_each_thread(g, p) {
if (freezeable(p) && !frozen(p))
printk(KERN_ERR " %s\n", p->comm);
- if (freezing(p)) {
- pr_debug(" clean up: %s\n", p->comm);
- p->flags &= ~PF_FREEZE;
- spin_lock_irqsave(&p->sighand->siglock, flags);
- recalc_sigpending_tsk(p);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
- }
+ cancel_freezing(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
return todo;
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
deleted file mode 100644
index 5957312b2d68..000000000000
--- a/kernel/power/smp.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * drivers/power/smp.c - Functions for stopping other CPUs.
- *
- * Copyright 2004 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
- *
- * This file is released under the GPLv2.
- */
-
-#undef DEBUG
-
-#include <linux/smp_lock.h>
-#include <linux/interrupt.h>
-#include <linux/suspend.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <asm/atomic.h>
-#include <asm/tlbflush.h>
-
-/* This is protected by pm_sem semaphore */
-static cpumask_t frozen_cpus;
-
-void disable_nonboot_cpus(void)
-{
- int cpu, error;
-
- error = 0;
- cpus_clear(frozen_cpus);
- printk("Freezing cpus ...\n");
- for_each_online_cpu(cpu) {
- if (cpu == 0)
- continue;
- error = cpu_down(cpu);
- if (!error) {
- cpu_set(cpu, frozen_cpus);
- printk("CPU%d is down\n", cpu);
- continue;
- }
- printk("Error taking cpu %d down: %d\n", cpu, error);
- }
- BUG_ON(raw_smp_processor_id() != 0);
- if (error)
- panic("cpus not sleeping");
-}
-
-void enable_nonboot_cpus(void)
-{
- int cpu, error;
-
- printk("Thawing cpus ...\n");
- for_each_cpu_mask(cpu, frozen_cpus) {
- error = cpu_up(cpu);
- if (!error) {
- printk("CPU%d is up\n", cpu);
- continue;
- }
- printk("Error taking cpu %d up: %d\n", cpu, error);
- panic("Not enough cpus");
- }
- cpus_clear(frozen_cpus);
-}
-
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3d9284100b22..1b84313cbab5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -34,95 +34,15 @@
#include "power.h"
-struct pbe *pagedir_nosave;
+/* List of PBEs used for creating and restoring the suspend image */
+struct pbe *restore_pblist;
+
static unsigned int nr_copy_pages;
static unsigned int nr_meta_pages;
-static unsigned long *buffer;
-
-struct arch_saveable_page {
- unsigned long start;
- unsigned long end;
- char *data;
- struct arch_saveable_page *next;
-};
-static struct arch_saveable_page *arch_pages;
-
-int swsusp_add_arch_pages(unsigned long start, unsigned long end)
-{
- struct arch_saveable_page *tmp;
-
- while (start < end) {
- tmp = kzalloc(sizeof(struct arch_saveable_page), GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
- tmp->start = start;
- tmp->end = ((start >> PAGE_SHIFT) + 1) << PAGE_SHIFT;
- if (tmp->end > end)
- tmp->end = end;
- tmp->next = arch_pages;
- start = tmp->end;
- arch_pages = tmp;
- }
- return 0;
-}
-
-static unsigned int count_arch_pages(void)
-{
- unsigned int count = 0;
- struct arch_saveable_page *tmp = arch_pages;
- while (tmp) {
- count++;
- tmp = tmp->next;
- }
- return count;
-}
-
-static int save_arch_mem(void)
-{
- char *kaddr;
- struct arch_saveable_page *tmp = arch_pages;
- int offset;
-
- pr_debug("swsusp: Saving arch specific memory");
- while (tmp) {
- tmp->data = (char *)__get_free_page(GFP_ATOMIC);
- if (!tmp->data)
- return -ENOMEM;
- offset = tmp->start - (tmp->start & PAGE_MASK);
- /* arch pages might haven't a 'struct page' */
- kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0);
- memcpy(tmp->data + offset, kaddr + offset,
- tmp->end - tmp->start);
- kunmap_atomic(kaddr, KM_USER0);
-
- tmp = tmp->next;
- }
- return 0;
-}
-
-static int restore_arch_mem(void)
-{
- char *kaddr;
- struct arch_saveable_page *tmp = arch_pages;
- int offset;
-
- while (tmp) {
- if (!tmp->data)
- continue;
- offset = tmp->start - (tmp->start & PAGE_MASK);
- kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0);
- memcpy(kaddr + offset, tmp->data + offset,
- tmp->end - tmp->start);
- kunmap_atomic(kaddr, KM_USER0);
- free_page((long)tmp->data);
- tmp->data = NULL;
- tmp = tmp->next;
- }
- return 0;
-}
+static void *buffer;
#ifdef CONFIG_HIGHMEM
-static unsigned int count_highmem_pages(void)
+unsigned int count_highmem_pages(void)
{
struct zone *zone;
unsigned long zone_pfn;
@@ -199,7 +119,7 @@ static int save_highmem_zone(struct zone *zone)
return 0;
}
-static int save_highmem(void)
+int save_highmem(void)
{
struct zone *zone;
int res = 0;
@@ -216,7 +136,7 @@ static int save_highmem(void)
return 0;
}
-static int restore_highmem(void)
+int restore_highmem(void)
{
printk("swsusp: Restoring Highmem\n");
while (highmem_copy) {
@@ -238,256 +158,637 @@ static inline int save_highmem(void) {return 0;}
static inline int restore_highmem(void) {return 0;}
#endif
-unsigned int count_special_pages(void)
+/**
+ * @safe_needed - on resume, for storing the PBE list and the image,
+ * we can only use memory pages that do not conflict with the pages
+ * used before suspend.
+ *
+ * The unsafe pages are marked with the PG_nosave_free flag
+ * and we count them using unsafe_pages
+ */
+
+#define PG_ANY 0
+#define PG_SAFE 1
+#define PG_UNSAFE_CLEAR 1
+#define PG_UNSAFE_KEEP 0
+
+static unsigned int allocated_unsafe_pages;
+
+static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
{
- return count_arch_pages() + count_highmem_pages();
+ void *res;
+
+ res = (void *)get_zeroed_page(gfp_mask);
+ if (safe_needed)
+ while (res && PageNosaveFree(virt_to_page(res))) {
+ /* The page is unsafe, mark it for swsusp_free() */
+ SetPageNosave(virt_to_page(res));
+ allocated_unsafe_pages++;
+ res = (void *)get_zeroed_page(gfp_mask);
+ }
+ if (res) {
+ SetPageNosave(virt_to_page(res));
+ SetPageNosaveFree(virt_to_page(res));
+ }
+ return res;
}
-int save_special_mem(void)
+unsigned long get_safe_page(gfp_t gfp_mask)
{
- int ret;
- ret = save_arch_mem();
- if (!ret)
- ret = save_highmem();
- return ret;
+ return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE);
+}
+
+/**
+ * free_image_page - free page represented by @addr, allocated with
+ * alloc_image_page (page flags set by it must be cleared)
+ */
+
+static inline void free_image_page(void *addr, int clear_nosave_free)
+{
+ ClearPageNosave(virt_to_page(addr));
+ if (clear_nosave_free)
+ ClearPageNosaveFree(virt_to_page(addr));
+ free_page((unsigned long)addr);
+}
+
+/* struct linked_page is used to build chains of pages */
+
+#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
+
+struct linked_page {
+ struct linked_page *next;
+ char data[LINKED_PAGE_DATA_SIZE];
+} __attribute__((packed));
+
+static inline void
+free_list_of_pages(struct linked_page *list, int clear_page_nosave)
+{
+ while (list) {
+ struct linked_page *lp = list->next;
+
+ free_image_page(list, clear_page_nosave);
+ list = lp;
+ }
+}
+
+/**
+ * struct chain_allocator is used for allocating small objects out of
+ * a linked list of pages called 'the chain'.
+ *
+ * The chain grows each time when there is no room for a new object in
+ * the current page. The allocated objects cannot be freed individually.
+ * It is only possible to free them all at once, by freeing the entire
+ * chain.
+ *
+ * NOTE: The chain allocator may be inefficient if the allocated objects
+ * are not much smaller than PAGE_SIZE.
+ */
+
+struct chain_allocator {
+ struct linked_page *chain; /* the chain */
+ unsigned int used_space; /* total size of objects allocated out
+ * of the current page
+ */
+ gfp_t gfp_mask; /* mask for allocating pages */
+ int safe_needed; /* if set, only "safe" pages are allocated */
+};
+
+static void
+chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
+{
+ ca->chain = NULL;
+ ca->used_space = LINKED_PAGE_DATA_SIZE;
+ ca->gfp_mask = gfp_mask;
+ ca->safe_needed = safe_needed;
}
-int restore_special_mem(void)
+static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
{
- int ret;
- ret = restore_arch_mem();
- if (!ret)
- ret = restore_highmem();
+ void *ret;
+
+ if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
+ struct linked_page *lp;
+
+ lp = alloc_image_page(ca->gfp_mask, ca->safe_needed);
+ if (!lp)
+ return NULL;
+
+ lp->next = ca->chain;
+ ca->chain = lp;
+ ca->used_space = 0;
+ }
+ ret = ca->chain->data + ca->used_space;
+ ca->used_space += size;
return ret;
}
-static int pfn_is_nosave(unsigned long pfn)
+static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
{
- unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
- unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
- return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+ free_list_of_pages(ca->chain, clear_page_nosave);
+ memset(ca, 0, sizeof(struct chain_allocator));
}
/**
- * saveable - Determine whether a page should be cloned or not.
- * @pfn: The page
+ * Data types related to memory bitmaps.
+ *
+ * Memory bitmap is a structure consiting of many linked lists of
+ * objects. The main list's elements are of type struct zone_bitmap
+ * and each of them corresonds to one zone. For each zone bitmap
+ * object there is a list of objects of type struct bm_block that
+ * represent each blocks of bit chunks in which information is
+ * stored.
+ *
+ * struct memory_bitmap contains a pointer to the main list of zone
+ * bitmap objects, a struct bm_position used for browsing the bitmap,
+ * and a pointer to the list of pages used for allocating all of the
+ * zone bitmap objects and bitmap block objects.
+ *
+ * NOTE: It has to be possible to lay out the bitmap in memory
+ * using only allocations of order 0. Additionally, the bitmap is
+ * designed to work with arbitrary number of zones (this is over the
+ * top for now, but let's avoid making unnecessary assumptions ;-).
*
- * We save a page if it's Reserved, and not in the range of pages
- * statically defined as 'unsaveable', or if it isn't reserved, and
- * isn't part of a free chunk of pages.
+ * struct zone_bitmap contains a pointer to a list of bitmap block
+ * objects and a pointer to the bitmap block object that has been
+ * most recently used for setting bits. Additionally, it contains the
+ * pfns that correspond to the start and end of the represented zone.
+ *
+ * struct bm_block contains a pointer to the memory page in which
+ * information is stored (in the form of a block of bit chunks
+ * of type unsigned long each). It also contains the pfns that
+ * correspond to the start and end of the represented memory area and
+ * the number of bit chunks in the block.
+ *
+ * NOTE: Memory bitmaps are used for two types of operations only:
+ * "set a bit" and "find the next bit set". Moreover, the searching
+ * is always carried out after all of the "set a bit" operations
+ * on given bitmap.
*/
-static int saveable(struct zone *zone, unsigned long *zone_pfn)
+#define BM_END_OF_MAP (~0UL)
+
+#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long))
+#define BM_BITS_PER_CHUNK (sizeof(long) << 3)
+#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
+
+struct bm_block {
+ struct bm_block *next; /* next element of the list */
+ unsigned long start_pfn; /* pfn represented by the first bit */
+ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
+ unsigned int size; /* number of bit chunks */
+ unsigned long *data; /* chunks of bits representing pages */
+};
+
+struct zone_bitmap {
+ struct zone_bitmap *next; /* next element of the list */
+ unsigned long start_pfn; /* minimal pfn in this zone */
+ unsigned long end_pfn; /* maximal pfn in this zone plus 1 */
+ struct bm_block *bm_blocks; /* list of bitmap blocks */
+ struct bm_block *cur_block; /* recently used bitmap block */
+};
+
+/* strcut bm_position is used for browsing memory bitmaps */
+
+struct bm_position {
+ struct zone_bitmap *zone_bm;
+ struct bm_block *block;
+ int chunk;
+ int bit;
+};
+
+struct memory_bitmap {
+ struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */
+ struct linked_page *p_list; /* list of pages used to store zone
+ * bitmap objects and bitmap block
+ * objects
+ */
+ struct bm_position cur; /* most recently used bit position */
+};
+
+/* Functions that operate on memory bitmaps */
+
+static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
{
- unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
- struct page *page;
+ bm->cur.chunk = 0;
+ bm->cur.bit = -1;
+}
- if (!pfn_valid(pfn))
- return 0;
+static void memory_bm_position_reset(struct memory_bitmap *bm)
+{
+ struct zone_bitmap *zone_bm;
- page = pfn_to_page(pfn);
- if (PageNosave(page))
- return 0;
- if (PageReserved(page) && pfn_is_nosave(pfn))
- return 0;
- if (PageNosaveFree(page))
- return 0;
+ zone_bm = bm->zone_bm_list;
+ bm->cur.zone_bm = zone_bm;
+ bm->cur.block = zone_bm->bm_blocks;
+ memory_bm_reset_chunk(bm);
+}
+
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
- return 1;
+/**
+ * create_bm_block_list - create a list of block bitmap objects
+ */
+
+static inline struct bm_block *
+create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca)
+{
+ struct bm_block *bblist = NULL;
+
+ while (nr_blocks-- > 0) {
+ struct bm_block *bb;
+
+ bb = chain_alloc(ca, sizeof(struct bm_block));
+ if (!bb)
+ return NULL;
+
+ bb->next = bblist;
+ bblist = bb;
+ }
+ return bblist;
}
-unsigned int count_data_pages(void)
+/**
+ * create_zone_bm_list - create a list of zone bitmap objects
+ */
+
+static inline struct zone_bitmap *
+create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca)
{
- struct zone *zone;
- unsigned long zone_pfn;
- unsigned int n = 0;
+ struct zone_bitmap *zbmlist = NULL;
- for_each_zone (zone) {
- if (is_highmem(zone))
- continue;
- mark_free_pages(zone);
- for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
- n += saveable(zone, &zone_pfn);
+ while (nr_zones-- > 0) {
+ struct zone_bitmap *zbm;
+
+ zbm = chain_alloc(ca, sizeof(struct zone_bitmap));
+ if (!zbm)
+ return NULL;
+
+ zbm->next = zbmlist;
+ zbmlist = zbm;
}
- return n;
+ return zbmlist;
}
-static void copy_data_pages(struct pbe *pblist)
+/**
+ * memory_bm_create - allocate memory for a memory bitmap
+ */
+
+static int
+memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
{
+ struct chain_allocator ca;
struct zone *zone;
- unsigned long zone_pfn;
- struct pbe *pbe, *p;
+ struct zone_bitmap *zone_bm;
+ struct bm_block *bb;
+ unsigned int nr;
+
+ chain_init(&ca, gfp_mask, safe_needed);
+
+ /* Compute the number of zones */
+ nr = 0;
+ for_each_zone (zone)
+ if (populated_zone(zone) && !is_highmem(zone))
+ nr++;
+
+ /* Allocate the list of zones bitmap objects */
+ zone_bm = create_zone_bm_list(nr, &ca);
+ bm->zone_bm_list = zone_bm;
+ if (!zone_bm) {
+ chain_free(&ca, PG_UNSAFE_CLEAR);
+ return -ENOMEM;
+ }
- pbe = pblist;
+ /* Initialize the zone bitmap objects */
for_each_zone (zone) {
- if (is_highmem(zone))
+ unsigned long pfn;
+
+ if (!populated_zone(zone) || is_highmem(zone))
continue;
- mark_free_pages(zone);
- /* This is necessary for swsusp_free() */
- for_each_pb_page (p, pblist)
- SetPageNosaveFree(virt_to_page(p));
- for_each_pbe (p, pblist)
- SetPageNosaveFree(virt_to_page(p->address));
- for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
- if (saveable(zone, &zone_pfn)) {
- struct page *page;
- page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
- BUG_ON(!pbe);
- pbe->orig_address = (unsigned long)page_address(page);
- /* copy_page is not usable for copying task structs. */
- memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
- pbe = pbe->next;
+
+ zone_bm->start_pfn = zone->zone_start_pfn;
+ zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ /* Allocate the list of bitmap block objects */
+ nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+ bb = create_bm_block_list(nr, &ca);
+ zone_bm->bm_blocks = bb;
+ zone_bm->cur_block = bb;
+ if (!bb)
+ goto Free;
+
+ nr = zone->spanned_pages;
+ pfn = zone->zone_start_pfn;
+ /* Initialize the bitmap block objects */
+ while (bb) {
+ unsigned long *ptr;
+
+ ptr = alloc_image_page(gfp_mask, safe_needed);
+ bb->data = ptr;
+ if (!ptr)
+ goto Free;
+
+ bb->start_pfn = pfn;
+ if (nr >= BM_BITS_PER_BLOCK) {
+ pfn += BM_BITS_PER_BLOCK;
+ bb->size = BM_CHUNKS_PER_BLOCK;
+ nr -= BM_BITS_PER_BLOCK;
+ } else {
+ /* This is executed only once in the loop */
+ pfn += nr;
+ bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
}
+ bb->end_pfn = pfn;
+ bb = bb->next;
}
+ zone_bm = zone_bm->next;
}
- BUG_ON(pbe);
-}
+ bm->p_list = ca.chain;
+ memory_bm_position_reset(bm);
+ return 0;
+Free:
+ bm->p_list = ca.chain;
+ memory_bm_free(bm, PG_UNSAFE_CLEAR);
+ return -ENOMEM;
+}
/**
- * free_pagedir - free pages allocated with alloc_pagedir()
- */
+ * memory_bm_free - free memory occupied by the memory bitmap @bm
+ */
-static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
{
- struct pbe *pbe;
-
- while (pblist) {
- pbe = (pblist + PB_PAGE_SKIP)->next;
- ClearPageNosave(virt_to_page(pblist));
- if (clear_nosave_free)
- ClearPageNosaveFree(virt_to_page(pblist));
- free_page((unsigned long)pblist);
- pblist = pbe;
+ struct zone_bitmap *zone_bm;
+
+ /* Free the list of bit blocks for each zone_bitmap object */
+ zone_bm = bm->zone_bm_list;
+ while (zone_bm) {
+ struct bm_block *bb;
+
+ bb = zone_bm->bm_blocks;
+ while (bb) {
+ if (bb->data)
+ free_image_page(bb->data, clear_nosave_free);
+ bb = bb->next;
+ }
+ zone_bm = zone_bm->next;
}
+ free_list_of_pages(bm->p_list, clear_nosave_free);
+ bm->zone_bm_list = NULL;
}
/**
- * fill_pb_page - Create a list of PBEs on a given memory page
+ * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds
+ * to given pfn. The cur_zone_bm member of @bm and the cur_block member
+ * of @bm->cur_zone_bm are updated.
+ *
+ * If the bit cannot be set, the function returns -EINVAL .
*/
-static inline void fill_pb_page(struct pbe *pbpage)
+static int
+memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
{
- struct pbe *p;
-
- p = pbpage;
- pbpage += PB_PAGE_SKIP;
- do
- p->next = p + 1;
- while (++p < pbpage);
+ struct zone_bitmap *zone_bm;
+ struct bm_block *bb;
+
+ /* Check if the pfn is from the current zone */
+ zone_bm = bm->cur.zone_bm;
+ if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
+ zone_bm = bm->zone_bm_list;
+ /* We don't assume that the zones are sorted by pfns */
+ while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
+ zone_bm = zone_bm->next;
+ if (unlikely(!zone_bm))
+ return -EINVAL;
+ }
+ bm->cur.zone_bm = zone_bm;
+ }
+ /* Check if the pfn corresponds to the current bitmap block */
+ bb = zone_bm->cur_block;
+ if (pfn < bb->start_pfn)
+ bb = zone_bm->bm_blocks;
+
+ while (pfn >= bb->end_pfn) {
+ bb = bb->next;
+ if (unlikely(!bb))
+ return -EINVAL;
+ }
+ zone_bm->cur_block = bb;
+ pfn -= bb->start_pfn;
+ set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK);
+ return 0;
}
-/**
- * create_pbe_list - Create a list of PBEs on top of a given chain
- * of memory pages allocated with alloc_pagedir()
- */
+/* Two auxiliary functions for memory_bm_next_pfn */
-static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
-{
- struct pbe *pbpage, *p;
- unsigned int num = PBES_PER_PAGE;
+/* Find the first set bit in the given chunk, if there is one */
- for_each_pb_page (pbpage, pblist) {
- if (num >= nr_pages)
- break;
+static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
+{
+ bit++;
+ while (bit < BM_BITS_PER_CHUNK) {
+ if (test_bit(bit, chunk_p))
+ return bit;
- fill_pb_page(pbpage);
- num += PBES_PER_PAGE;
- }
- if (pbpage) {
- for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
- p->next = p + 1;
- p->next = NULL;
+ bit++;
}
+ return -1;
}
-static unsigned int unsafe_pages;
+/* Find a chunk containing some bits set in given block of bits */
+
+static inline int next_chunk_in_block(int n, struct bm_block *bb)
+{
+ n++;
+ while (n < bb->size) {
+ if (bb->data[n])
+ return n;
+
+ n++;
+ }
+ return -1;
+}
/**
- * @safe_needed - on resume, for storing the PBE list and the image,
- * we can only use memory pages that do not conflict with the pages
- * used before suspend.
+ * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
+ * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
+ * returned.
*
- * The unsafe pages are marked with the PG_nosave_free flag
- * and we count them using unsafe_pages
+ * It is required to run memory_bm_position_reset() before the first call to
+ * this function.
*/
-static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
+static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
- void *res;
-
- res = (void *)get_zeroed_page(gfp_mask);
- if (safe_needed)
- while (res && PageNosaveFree(virt_to_page(res))) {
- /* The page is unsafe, mark it for swsusp_free() */
- SetPageNosave(virt_to_page(res));
- unsafe_pages++;
- res = (void *)get_zeroed_page(gfp_mask);
+ struct zone_bitmap *zone_bm;
+ struct bm_block *bb;
+ int chunk;
+ int bit;
+
+ do {
+ bb = bm->cur.block;
+ do {
+ chunk = bm->cur.chunk;
+ bit = bm->cur.bit;
+ do {
+ bit = next_bit_in_chunk(bit, bb->data + chunk);
+ if (bit >= 0)
+ goto Return_pfn;
+
+ chunk = next_chunk_in_block(chunk, bb);
+ bit = -1;
+ } while (chunk >= 0);
+ bb = bb->next;
+ bm->cur.block = bb;
+ memory_bm_reset_chunk(bm);
+ } while (bb);
+ zone_bm = bm->cur.zone_bm->next;
+ if (zone_bm) {
+ bm->cur.zone_bm = zone_bm;
+ bm->cur.block = zone_bm->bm_blocks;
+ memory_bm_reset_chunk(bm);
}
- if (res) {
- SetPageNosave(virt_to_page(res));
- SetPageNosaveFree(virt_to_page(res));
- }
+ } while (zone_bm);
+ memory_bm_position_reset(bm);
+ return BM_END_OF_MAP;
+
+Return_pfn:
+ bm->cur.chunk = chunk;
+ bm->cur.bit = bit;
+ return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
+}
+
+/**
+ * snapshot_additional_pages - estimate the number of additional pages
+ * be needed for setting up the suspend image data structures for given
+ * zone (usually the returned value is greater than the exact number)
+ */
+
+unsigned int snapshot_additional_pages(struct zone *zone)
+{
+ unsigned int res;
+
+ res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+ res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
return res;
}
-unsigned long get_safe_page(gfp_t gfp_mask)
+/**
+ * pfn_is_nosave - check if given pfn is in the 'nosave' section
+ */
+
+static inline int pfn_is_nosave(unsigned long pfn)
{
- return (unsigned long)alloc_image_page(gfp_mask, 1);
+ unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+ unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+ return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
}
/**
- * alloc_pagedir - Allocate the page directory.
- *
- * First, determine exactly how many pages we need and
- * allocate them.
- *
- * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
- * struct pbe elements (pbes) and the last element in the page points
- * to the next page.
+ * saveable - Determine whether a page should be cloned or not.
+ * @pfn: The page
*
- * On each page we set up a list of struct_pbe elements.
+ * We save a page if it isn't Nosave, and is not in the range of pages
+ * statically defined as 'unsaveable', and it
+ * isn't a part of a free chunk of pages.
*/
-static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
- int safe_needed)
+static struct page *saveable_page(unsigned long pfn)
{
- unsigned int num;
- struct pbe *pblist, *pbe;
+ struct page *page;
+
+ if (!pfn_valid(pfn))
+ return NULL;
- if (!nr_pages)
+ page = pfn_to_page(pfn);
+
+ if (PageNosave(page))
+ return NULL;
+ if (PageReserved(page) && pfn_is_nosave(pfn))
return NULL;
+ if (PageNosaveFree(page))
+ return NULL;
+
+ return page;
+}
+
+unsigned int count_data_pages(void)
+{
+ struct zone *zone;
+ unsigned long pfn, max_zone_pfn;
+ unsigned int n = 0;
+
+ for_each_zone (zone) {
+ if (is_highmem(zone))
+ continue;
+ mark_free_pages(zone);
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ n += !!saveable_page(pfn);
+ }
+ return n;
+}
+
+static inline void copy_data_page(long *dst, long *src)
+{
+ int n;
+
+ /* copy_page and memcpy are not usable for copying task structs. */
+ for (n = PAGE_SIZE / sizeof(long); n; n--)
+ *dst++ = *src++;
+}
+
+static void
+copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
+{
+ struct zone *zone;
+ unsigned long pfn;
+
+ for_each_zone (zone) {
+ unsigned long max_zone_pfn;
- pblist = alloc_image_page(gfp_mask, safe_needed);
- /* FIXME: rewrite this ugly loop */
- for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
- pbe = pbe->next, num += PBES_PER_PAGE) {
- pbe += PB_PAGE_SKIP;
- pbe->next = alloc_image_page(gfp_mask, safe_needed);
+ if (is_highmem(zone))
+ continue;
+
+ mark_free_pages(zone);
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (saveable_page(pfn))
+ memory_bm_set_bit(orig_bm, pfn);
}
- if (!pbe) { /* get_zeroed_page() failed */
- free_pagedir(pblist, 1);
- pblist = NULL;
- } else
- create_pbe_list(pblist, nr_pages);
- return pblist;
+ memory_bm_position_reset(orig_bm);
+ memory_bm_position_reset(copy_bm);
+ do {
+ pfn = memory_bm_next_pfn(orig_bm);
+ if (likely(pfn != BM_END_OF_MAP)) {
+ struct page *page;
+ void *src;
+
+ page = pfn_to_page(pfn);
+ src = page_address(page);
+ page = pfn_to_page(memory_bm_next_pfn(copy_bm));
+ copy_data_page(page_address(page), src);
+ }
+ } while (pfn != BM_END_OF_MAP);
}
/**
- * Free pages we allocated for suspend. Suspend pages are alocated
- * before atomic copy, so we need to free them after resume.
+ * swsusp_free - free pages allocated for the suspend.
+ *
+ * Suspend pages are alocated before the atomic copy is made, so we
+ * need to release them after the resume.
*/
void swsusp_free(void)
{
struct zone *zone;
- unsigned long zone_pfn;
+ unsigned long pfn, max_zone_pfn;
for_each_zone(zone) {
- for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
- if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
- struct page *page;
- page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (pfn_valid(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+
if (PageNosave(page) && PageNosaveFree(page)) {
ClearPageNosave(page);
ClearPageNosaveFree(page);
@@ -497,7 +798,7 @@ void swsusp_free(void)
}
nr_copy_pages = 0;
nr_meta_pages = 0;
- pagedir_nosave = NULL;
+ restore_pblist = NULL;
buffer = NULL;
}
@@ -512,46 +813,57 @@ void swsusp_free(void)
static int enough_free_mem(unsigned int nr_pages)
{
struct zone *zone;
- unsigned int n = 0;
+ unsigned int free = 0, meta = 0;
for_each_zone (zone)
- if (!is_highmem(zone))
- n += zone->free_pages;
- pr_debug("swsusp: available memory: %u pages\n", n);
- return n > (nr_pages + PAGES_FOR_IO +
- (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
-}
+ if (!is_highmem(zone)) {
+ free += zone->free_pages;
+ meta += snapshot_additional_pages(zone);
+ }
-static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
-{
- struct pbe *p;
+ pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n",
+ nr_pages, PAGES_FOR_IO, meta, free);
- for_each_pbe (p, pblist) {
- p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
- if (!p->address)
- return -ENOMEM;
- }
- return 0;
+ return free > nr_pages + PAGES_FOR_IO + meta;
}
-static struct pbe *swsusp_alloc(unsigned int nr_pages)
+static int
+swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
+ unsigned int nr_pages)
{
- struct pbe *pblist;
+ int error;
- if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) {
- printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
- return NULL;
- }
+ error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+ if (error)
+ goto Free;
- if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
- printk(KERN_ERR "suspend: Allocating image pages failed.\n");
- swsusp_free();
- return NULL;
+ error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+ if (error)
+ goto Free;
+
+ while (nr_pages-- > 0) {
+ struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+ if (!page)
+ goto Free;
+
+ SetPageNosave(page);
+ SetPageNosaveFree(page);
+ memory_bm_set_bit(copy_bm, page_to_pfn(page));
}
+ return 0;
- return pblist;
+Free:
+ swsusp_free();
+ return -ENOMEM;
}
+/* Memory bitmap used for marking saveable pages */
+static struct memory_bitmap orig_bm;
+/* Memory bitmap used for marking allocated pages that will contain the copies
+ * of saveable pages
+ */
+static struct memory_bitmap copy_bm;
+
asmlinkage int swsusp_save(void)
{
unsigned int nr_pages;
@@ -562,25 +874,19 @@ asmlinkage int swsusp_save(void)
nr_pages = count_data_pages();
printk("swsusp: Need to copy %u pages\n", nr_pages);
- pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
- nr_pages,
- (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
- PAGES_FOR_IO, nr_free_pages());
-
if (!enough_free_mem(nr_pages)) {
printk(KERN_ERR "swsusp: Not enough free memory\n");
return -ENOMEM;
}
- pagedir_nosave = swsusp_alloc(nr_pages);
- if (!pagedir_nosave)
+ if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages))
return -ENOMEM;
/* During allocating of suspend pagedir, new cold pages may appear.
* Kill them.
*/
drain_local_pages();
- copy_data_pages(pagedir_nosave);
+ copy_data_pages(&copy_bm, &orig_bm);
/*
* End of critical section. From now on, we can write to memory,
@@ -609,22 +915,20 @@ static void init_header(struct swsusp_info *info)
}
/**
- * pack_orig_addresses - the .orig_address fields of the PBEs from the
- * list starting at @pbe are stored in the array @buf[] (1 page)
+ * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
+ * are stored in the array @buf[] (1 page at a time)
*/
-static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
+static inline void
+pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
{
int j;
- for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
- buf[j] = pbe->orig_address;
- pbe = pbe->next;
+ for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
+ buf[j] = memory_bm_next_pfn(bm);
+ if (unlikely(buf[j] == BM_END_OF_MAP))
+ break;
}
- if (!pbe)
- for (; j < PAGE_SIZE / sizeof(long); j++)
- buf[j] = 0;
- return pbe;
}
/**
@@ -651,37 +955,39 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb
int snapshot_read_next(struct snapshot_handle *handle, size_t count)
{
- if (handle->page > nr_meta_pages + nr_copy_pages)
+ if (handle->cur > nr_meta_pages + nr_copy_pages)
return 0;
+
if (!buffer) {
/* This makes the buffer be freed by swsusp_free() */
- buffer = alloc_image_page(GFP_ATOMIC, 0);
+ buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
if (!buffer)
return -ENOMEM;
}
if (!handle->offset) {
init_header((struct swsusp_info *)buffer);
handle->buffer = buffer;
- handle->pbe = pagedir_nosave;
+ memory_bm_position_reset(&orig_bm);
+ memory_bm_position_reset(&copy_bm);
}
- if (handle->prev < handle->page) {
- if (handle->page <= nr_meta_pages) {
- handle->pbe = pack_orig_addresses(buffer, handle->pbe);
- if (!handle->pbe)
- handle->pbe = pagedir_nosave;
+ if (handle->prev < handle->cur) {
+ if (handle->cur <= nr_meta_pages) {
+ memset(buffer, 0, PAGE_SIZE);
+ pack_pfns(buffer, &orig_bm);
} else {
- handle->buffer = (void *)handle->pbe->address;
- handle->pbe = handle->pbe->next;
+ unsigned long pfn = memory_bm_next_pfn(&copy_bm);
+
+ handle->buffer = page_address(pfn_to_page(pfn));
}
- handle->prev = handle->page;
+ handle->prev = handle->cur;
}
- handle->buf_offset = handle->page_offset;
- if (handle->page_offset + count >= PAGE_SIZE) {
- count = PAGE_SIZE - handle->page_offset;
- handle->page_offset = 0;
- handle->page++;
+ handle->buf_offset = handle->cur_offset;
+ if (handle->cur_offset + count >= PAGE_SIZE) {
+ count = PAGE_SIZE - handle->cur_offset;
+ handle->cur_offset = 0;
+ handle->cur++;
} else {
- handle->page_offset += count;
+ handle->cur_offset += count;
}
handle->offset += count;
return count;
@@ -693,47 +999,50 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
* had been used before suspend
*/
-static int mark_unsafe_pages(struct pbe *pblist)
+static int mark_unsafe_pages(struct memory_bitmap *bm)
{
struct zone *zone;
- unsigned long zone_pfn;
- struct pbe *p;
-
- if (!pblist) /* a sanity check */
- return -EINVAL;
+ unsigned long pfn, max_zone_pfn;
/* Clear page flags */
for_each_zone (zone) {
- for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
- if (pfn_valid(zone_pfn + zone->zone_start_pfn))
- ClearPageNosaveFree(pfn_to_page(zone_pfn +
- zone->zone_start_pfn));
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (pfn_valid(pfn))
+ ClearPageNosaveFree(pfn_to_page(pfn));
}
- /* Mark orig addresses */
- for_each_pbe (p, pblist) {
- if (virt_addr_valid(p->orig_address))
- SetPageNosaveFree(virt_to_page(p->orig_address));
- else
- return -EFAULT;
- }
+ /* Mark pages that correspond to the "original" pfns as "unsafe" */
+ memory_bm_position_reset(bm);
+ do {
+ pfn = memory_bm_next_pfn(bm);
+ if (likely(pfn != BM_END_OF_MAP)) {
+ if (likely(pfn_valid(pfn)))
+ SetPageNosaveFree(pfn_to_page(pfn));
+ else
+ return -EFAULT;
+ }
+ } while (pfn != BM_END_OF_MAP);
- unsafe_pages = 0;
+ allocated_unsafe_pages = 0;
return 0;
}
-static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
+static void
+duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
{
- /* We assume both lists contain the same number of elements */
- while (src) {
- dst->orig_address = src->orig_address;
- dst = dst->next;
- src = src->next;
+ unsigned long pfn;
+
+ memory_bm_position_reset(src);
+ pfn = memory_bm_next_pfn(src);
+ while (pfn != BM_END_OF_MAP) {
+ memory_bm_set_bit(dst, pfn);
+ pfn = memory_bm_next_pfn(src);
}
}
-static int check_header(struct swsusp_info *info)
+static inline int check_header(struct swsusp_info *info)
{
char *reason = NULL;
@@ -760,19 +1069,14 @@ static int check_header(struct swsusp_info *info)
* load header - check the image header and copy data from it
*/
-static int load_header(struct snapshot_handle *handle,
- struct swsusp_info *info)
+static int
+load_header(struct swsusp_info *info)
{
int error;
- struct pbe *pblist;
+ restore_pblist = NULL;
error = check_header(info);
if (!error) {
- pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
- if (!pblist)
- return -ENOMEM;
- pagedir_nosave = pblist;
- handle->pbe = pblist;
nr_copy_pages = info->image_pages;
nr_meta_pages = info->pages - info->image_pages - 1;
}
@@ -780,113 +1084,137 @@ static int load_header(struct snapshot_handle *handle,
}
/**
- * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
- * the PBEs in the list starting at @pbe
+ * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
+ * the corresponding bit in the memory bitmap @bm
*/
-static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
- struct pbe *pbe)
+static inline void
+unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
{
int j;
- for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
- pbe->orig_address = buf[j];
- pbe = pbe->next;
+ for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
+ if (unlikely(buf[j] == BM_END_OF_MAP))
+ break;
+
+ memory_bm_set_bit(bm, buf[j]);
}
- return pbe;
}
/**
- * prepare_image - use metadata contained in the PBE list
- * pointed to by pagedir_nosave to mark the pages that will
- * be overwritten in the process of restoring the system
- * memory state from the image ("unsafe" pages) and allocate
- * memory for the image
+ * prepare_image - use the memory bitmap @bm to mark the pages that will
+ * be overwritten in the process of restoring the system memory state
+ * from the suspend image ("unsafe" pages) and allocate memory for the
+ * image.
*
- * The idea is to allocate the PBE list first and then
- * allocate as many pages as it's needed for the image data,
- * but not to assign these pages to the PBEs initially.
- * Instead, we just mark them as allocated and create a list
- * of "safe" which will be used later
+ * The idea is to allocate a new memory bitmap first and then allocate
+ * as many pages as needed for the image data, but not to assign these
+ * pages to specific tasks initially. Instead, we just mark them as
+ * allocated and create a list of "safe" pages that will be used later.
*/
-struct safe_page {
- struct safe_page *next;
- char padding[PAGE_SIZE - sizeof(void *)];
-};
+#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
-static struct safe_page *safe_pages;
+static struct linked_page *safe_pages_list;
-static int prepare_image(struct snapshot_handle *handle)
+static int
+prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
{
- int error = 0;
- unsigned int nr_pages = nr_copy_pages;
- struct pbe *p, *pblist = NULL;
+ unsigned int nr_pages;
+ struct linked_page *sp_list, *lp;
+ int error;
- p = pagedir_nosave;
- error = mark_unsafe_pages(p);
- if (!error) {
- pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
- if (pblist)
- copy_page_backup_list(pblist, p);
- free_pagedir(p, 0);
- if (!pblist)
+ error = mark_unsafe_pages(bm);
+ if (error)
+ goto Free;
+
+ error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
+ if (error)
+ goto Free;
+
+ duplicate_memory_bitmap(new_bm, bm);
+ memory_bm_free(bm, PG_UNSAFE_KEEP);
+ /* Reserve some safe pages for potential later use.
+ *
+ * NOTE: This way we make sure there will be enough safe pages for the
+ * chain_alloc() in get_buffer(). It is a bit wasteful, but
+ * nr_copy_pages cannot be greater than 50% of the memory anyway.
+ */
+ sp_list = NULL;
+ /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
+ nr_pages = nr_copy_pages - allocated_unsafe_pages;
+ nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
+ while (nr_pages > 0) {
+ lp = alloc_image_page(GFP_ATOMIC, PG_SAFE);
+ if (!lp) {
error = -ENOMEM;
+ goto Free;
+ }
+ lp->next = sp_list;
+ sp_list = lp;
+ nr_pages--;
}
- safe_pages = NULL;
- if (!error && nr_pages > unsafe_pages) {
- nr_pages -= unsafe_pages;
- while (nr_pages--) {
- struct safe_page *ptr;
-
- ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC);
- if (!ptr) {
- error = -ENOMEM;
- break;
- }
- if (!PageNosaveFree(virt_to_page(ptr))) {
- /* The page is "safe", add it to the list */
- ptr->next = safe_pages;
- safe_pages = ptr;
- }
- /* Mark the page as allocated */
- SetPageNosave(virt_to_page(ptr));
- SetPageNosaveFree(virt_to_page(ptr));
+ /* Preallocate memory for the image */
+ safe_pages_list = NULL;
+ nr_pages = nr_copy_pages - allocated_unsafe_pages;
+ while (nr_pages > 0) {
+ lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
+ if (!lp) {
+ error = -ENOMEM;
+ goto Free;
+ }
+ if (!PageNosaveFree(virt_to_page(lp))) {
+ /* The page is "safe", add it to the list */
+ lp->next = safe_pages_list;
+ safe_pages_list = lp;
}
+ /* Mark the page as allocated */
+ SetPageNosave(virt_to_page(lp));
+ SetPageNosaveFree(virt_to_page(lp));
+ nr_pages--;
}
- if (!error) {
- pagedir_nosave = pblist;
- } else {
- handle->pbe = NULL;
- swsusp_free();
+ /* Free the reserved safe pages so that chain_alloc() can use them */
+ while (sp_list) {
+ lp = sp_list->next;
+ free_image_page(sp_list, PG_UNSAFE_CLEAR);
+ sp_list = lp;
}
+ return 0;
+
+Free:
+ swsusp_free();
return error;
}
-static void *get_buffer(struct snapshot_handle *handle)
+/**
+ * get_buffer - compute the address that snapshot_write_next() should
+ * set for its caller to write to.
+ */
+
+static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
{
- struct pbe *pbe = handle->pbe, *last = handle->last_pbe;
- struct page *page = virt_to_page(pbe->orig_address);
+ struct pbe *pbe;
+ struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
- if (PageNosave(page) && PageNosaveFree(page)) {
- /*
- * We have allocated the "original" page frame and we can
- * use it directly to store the read page
+ if (PageNosave(page) && PageNosaveFree(page))
+ /* We have allocated the "original" page frame and we can
+ * use it directly to store the loaded page.
*/
- pbe->address = 0;
- if (last && last->next)
- last->next = NULL;
- return (void *)pbe->orig_address;
- }
- /*
- * The "original" page frame has not been allocated and we have to
- * use a "safe" page frame to store the read page
+ return page_address(page);
+
+ /* The "original" page frame has not been allocated and we have to
+ * use a "safe" page frame to store the loaded page.
*/
- pbe->address = (unsigned long)safe_pages;
- safe_pages = safe_pages->next;
- if (last)
- last->next = pbe;
- handle->last_pbe = pbe;
+ pbe = chain_alloc(ca, sizeof(struct pbe));
+ if (!pbe) {
+ swsusp_free();
+ return NULL;
+ }
+ pbe->orig_address = (unsigned long)page_address(page);
+ pbe->address = (unsigned long)safe_pages_list;
+ safe_pages_list = safe_pages_list->next;
+ pbe->next = restore_pblist;
+ restore_pblist = pbe;
return (void *)pbe->address;
}
@@ -914,46 +1242,60 @@ static void *get_buffer(struct snapshot_handle *handle)
int snapshot_write_next(struct snapshot_handle *handle, size_t count)
{
+ static struct chain_allocator ca;
int error = 0;
- if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
+ /* Check if we have already loaded the entire image */
+ if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
return 0;
+
if (!buffer) {
/* This makes the buffer be freed by swsusp_free() */
- buffer = alloc_image_page(GFP_ATOMIC, 0);
+ buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
if (!buffer)
return -ENOMEM;
}
if (!handle->offset)
handle->buffer = buffer;
- if (handle->prev < handle->page) {
- if (!handle->prev) {
- error = load_header(handle, (struct swsusp_info *)buffer);
+ handle->sync_read = 1;
+ if (handle->prev < handle->cur) {
+ if (handle->prev == 0) {
+ error = load_header(buffer);
+ if (error)
+ return error;
+
+ error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
if (error)
return error;
+
} else if (handle->prev <= nr_meta_pages) {
- handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
- if (!handle->pbe) {
- error = prepare_image(handle);
+ unpack_orig_pfns(buffer, &copy_bm);
+ if (handle->prev == nr_meta_pages) {
+ error = prepare_image(&orig_bm, &copy_bm);
if (error)
return error;
- handle->pbe = pagedir_nosave;
- handle->last_pbe = NULL;
- handle->buffer = get_buffer(handle);
+
+ chain_init(&ca, GFP_ATOMIC, PG_SAFE);
+ memory_bm_position_reset(&orig_bm);
+ restore_pblist = NULL;
+ handle->buffer = get_buffer(&orig_bm, &ca);
+ handle->sync_read = 0;
+ if (!handle->buffer)
+ return -ENOMEM;
}
} else {
- handle->pbe = handle->pbe->next;
- handle->buffer = get_buffer(handle);
+ handle->buffer = get_buffer(&orig_bm, &ca);
+ handle->sync_read = 0;
}
- handle->prev = handle->page;
+ handle->prev = handle->cur;
}
- handle->buf_offset = handle->page_offset;
- if (handle->page_offset + count >= PAGE_SIZE) {
- count = PAGE_SIZE - handle->page_offset;
- handle->page_offset = 0;
- handle->page++;
+ handle->buf_offset = handle->cur_offset;
+ if (handle->cur_offset + count >= PAGE_SIZE) {
+ count = PAGE_SIZE - handle->cur_offset;
+ handle->cur_offset = 0;
+ handle->cur++;
} else {
- handle->page_offset += count;
+ handle->cur_offset += count;
}
handle->offset += count;
return count;
@@ -961,6 +1303,13 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
int snapshot_image_loaded(struct snapshot_handle *handle)
{
- return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
- handle->page <= nr_meta_pages + nr_copy_pages);
+ return !(!nr_copy_pages ||
+ handle->cur <= nr_meta_pages + nr_copy_pages);
+}
+
+void snapshot_free_unused_memory(struct snapshot_handle *handle)
+{
+ /* Free only if we have loaded the image entirely */
+ if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
+ memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 044b8e0c1025..9b2ee5344dee 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -22,6 +22,7 @@
#include <linux/device.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
+#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pm.h>
@@ -49,18 +50,16 @@ static int mark_swapfiles(swp_entry_t start)
{
int error;
- rw_swap_page_sync(READ,
- swp_entry(root_swap, 0),
- virt_to_page((unsigned long)&swsusp_header));
+ rw_swap_page_sync(READ, swp_entry(root_swap, 0),
+ virt_to_page((unsigned long)&swsusp_header), NULL);
if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
swsusp_header.image = start;
- error = rw_swap_page_sync(WRITE,
- swp_entry(root_swap, 0),
- virt_to_page((unsigned long)
- &swsusp_header));
+ error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0),
+ virt_to_page((unsigned long)&swsusp_header),
+ NULL);
} else {
pr_debug("swsusp: Partition is not swap space.\n");
error = -ENODEV;
@@ -88,16 +87,37 @@ static int swsusp_swap_check(void) /* This is called before saving image */
* write_page - Write one page to given swap location.
* @buf: Address we're writing.
* @offset: Offset of the swap page we're writing to.
+ * @bio_chain: Link the next write BIO here
*/
-static int write_page(void *buf, unsigned long offset)
+static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
{
swp_entry_t entry;
int error = -ENOSPC;
if (offset) {
+ struct page *page = virt_to_page(buf);
+
+ if (bio_chain) {
+ /*
+ * Whether or not we successfully allocated a copy page,
+ * we take a ref on the page here. It gets undone in
+ * wait_on_bio_chain().
+ */
+ struct page *page_copy;
+ page_copy = alloc_page(GFP_ATOMIC);
+ if (page_copy == NULL) {
+ WARN_ON_ONCE(1);
+ bio_chain = NULL; /* Go synchronous */
+ get_page(page);
+ } else {
+ memcpy(page_address(page_copy),
+ page_address(page), PAGE_SIZE);
+ page = page_copy;
+ }
+ }
entry = swp_entry(root_swap, offset);
- error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
+ error = rw_swap_page_sync(WRITE, entry, page, bio_chain);
}
return error;
}
@@ -146,6 +166,26 @@ static void release_swap_writer(struct swap_map_handle *handle)
handle->bitmap = NULL;
}
+static void show_speed(struct timeval *start, struct timeval *stop,
+ unsigned nr_pages, char *msg)
+{
+ s64 elapsed_centisecs64;
+ int centisecs;
+ int k;
+ int kps;
+
+ elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+ do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+ centisecs = elapsed_centisecs64;
+ if (centisecs == 0)
+ centisecs = 1; /* avoid div-by-zero */
+ k = nr_pages * (PAGE_SIZE / 1024);
+ kps = (k * 100) / centisecs;
+ printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
+ centisecs / 100, centisecs % 100,
+ kps / 1000, (kps % 1000) / 10);
+}
+
static int get_swap_writer(struct swap_map_handle *handle)
{
handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -165,37 +205,70 @@ static int get_swap_writer(struct swap_map_handle *handle)
return 0;
}
-static int swap_write_page(struct swap_map_handle *handle, void *buf)
+static int wait_on_bio_chain(struct bio **bio_chain)
{
- int error;
+ struct bio *bio;
+ struct bio *next_bio;
+ int ret = 0;
+
+ if (bio_chain == NULL)
+ return 0;
+
+ bio = *bio_chain;
+ if (bio == NULL)
+ return 0;
+ while (bio) {
+ struct page *page;
+
+ next_bio = bio->bi_private;
+ page = bio->bi_io_vec[0].bv_page;
+ wait_on_page_locked(page);
+ if (!PageUptodate(page) || PageError(page))
+ ret = -EIO;
+ put_page(page);
+ bio_put(bio);
+ bio = next_bio;
+ }
+ *bio_chain = NULL;
+ return ret;
+}
+
+static int swap_write_page(struct swap_map_handle *handle, void *buf,
+ struct bio **bio_chain)
+{
+ int error = 0;
unsigned long offset;
if (!handle->cur)
return -EINVAL;
offset = alloc_swap_page(root_swap, handle->bitmap);
- error = write_page(buf, offset);
+ error = write_page(buf, offset, bio_chain);
if (error)
return error;
handle->cur->entries[handle->k++] = offset;
if (handle->k >= MAP_PAGE_ENTRIES) {
+ error = wait_on_bio_chain(bio_chain);
+ if (error)
+ goto out;
offset = alloc_swap_page(root_swap, handle->bitmap);
if (!offset)
return -ENOSPC;
handle->cur->next_swap = offset;
- error = write_page(handle->cur, handle->cur_swap);
+ error = write_page(handle->cur, handle->cur_swap, NULL);
if (error)
- return error;
+ goto out;
memset(handle->cur, 0, PAGE_SIZE);
handle->cur_swap = offset;
handle->k = 0;
}
- return 0;
+out:
+ return error;
}
static int flush_swap_writer(struct swap_map_handle *handle)
{
if (handle->cur && handle->cur_swap)
- return write_page(handle->cur, handle->cur_swap);
+ return write_page(handle->cur, handle->cur_swap, NULL);
else
return -EINVAL;
}
@@ -206,21 +279,29 @@ static int flush_swap_writer(struct swap_map_handle *handle)
static int save_image(struct swap_map_handle *handle,
struct snapshot_handle *snapshot,
- unsigned int nr_pages)
+ unsigned int nr_to_write)
{
unsigned int m;
int ret;
int error = 0;
+ int nr_pages;
+ int err2;
+ struct bio *bio;
+ struct timeval start;
+ struct timeval stop;
- printk("Saving image data pages (%u pages) ... ", nr_pages);
- m = nr_pages / 100;
+ printk("Saving image data pages (%u pages) ... ", nr_to_write);
+ m = nr_to_write / 100;
if (!m)
m = 1;
nr_pages = 0;
+ bio = NULL;
+ do_gettimeofday(&start);
do {
ret = snapshot_read_next(snapshot, PAGE_SIZE);
if (ret > 0) {
- error = swap_write_page(handle, data_of(*snapshot));
+ error = swap_write_page(handle, data_of(*snapshot),
+ &bio);
if (error)
break;
if (!(nr_pages % m))
@@ -228,8 +309,13 @@ static int save_image(struct swap_map_handle *handle,
nr_pages++;
}
} while (ret > 0);
+ err2 = wait_on_bio_chain(&bio);
+ do_gettimeofday(&stop);
+ if (!error)
+ error = err2;
if (!error)
printk("\b\b\b\bdone\n");
+ show_speed(&start, &stop, nr_to_write, "Wrote");
return error;
}
@@ -245,8 +331,7 @@ static int enough_swap(unsigned int nr_pages)
unsigned int free_swap = count_swap_pages(root_swap, 1);
pr_debug("swsusp: free swap pages: %u\n", free_swap);
- return free_swap > (nr_pages + PAGES_FOR_IO +
- (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
+ return free_swap > nr_pages + PAGES_FOR_IO;
}
/**
@@ -263,11 +348,11 @@ int swsusp_write(void)
struct swap_map_handle handle;
struct snapshot_handle snapshot;
struct swsusp_info *header;
- unsigned long start;
int error;
if ((error = swsusp_swap_check())) {
- printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
+ printk(KERN_ERR "swsusp: Cannot find swap device, try "
+ "swapon -a.\n");
return error;
}
memset(&snapshot, 0, sizeof(struct snapshot_handle));
@@ -281,16 +366,17 @@ int swsusp_write(void)
}
error = get_swap_writer(&handle);
if (!error) {
- start = handle.cur_swap;
- error = swap_write_page(&handle, header);
- }
- if (!error)
- error = save_image(&handle, &snapshot, header->pages - 1);
- if (!error) {
- flush_swap_writer(&handle);
- printk("S");
- error = mark_swapfiles(swp_entry(root_swap, start));
- printk("|\n");
+ unsigned long start = handle.cur_swap;
+ error = swap_write_page(&handle, header, NULL);
+ if (!error)
+ error = save_image(&handle, &snapshot,
+ header->pages - 1);
+ if (!error) {
+ flush_swap_writer(&handle);
+ printk("S");
+ error = mark_swapfiles(swp_entry(root_swap, start));
+ printk("|\n");
+ }
}
if (error)
free_all_swap_pages(root_swap, handle.bitmap);
@@ -298,25 +384,6 @@ int swsusp_write(void)
return error;
}
-/*
- * Using bio to read from swap.
- * This code requires a bit more work than just using buffer heads
- * but, it is the recommended way for 2.5/2.6.
- * The following are to signal the beginning and end of I/O. Bios
- * finish asynchronously, while we want them to happen synchronously.
- * A simple atomic_t, and a wait loop take care of this problem.
- */
-
-static atomic_t io_done = ATOMIC_INIT(0);
-
-static int end_io(struct bio *bio, unsigned int num, int err)
-{
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- panic("I/O error reading memory image");
- atomic_set(&io_done, 0);
- return 0;
-}
-
static struct block_device *resume_bdev;
/**
@@ -324,15 +391,15 @@ static struct block_device *resume_bdev;
* @rw: READ or WRITE.
* @off physical offset of page.
* @page: page we're reading or writing.
+ * @bio_chain: list of pending biod (for async reading)
*
* Straight from the textbook - allocate and initialize the bio.
- * If we're writing, make sure the page is marked as dirty.
- * Then submit it and wait.
+ * If we're reading, make sure the page is marked as dirty.
+ * Then submit it and, if @bio_chain == NULL, wait.
*/
-
-static int submit(int rw, pgoff_t page_off, void *page)
+static int submit(int rw, pgoff_t page_off, struct page *page,
+ struct bio **bio_chain)
{
- int error = 0;
struct bio *bio;
bio = bio_alloc(GFP_ATOMIC, 1);
@@ -340,33 +407,40 @@ static int submit(int rw, pgoff_t page_off, void *page)
return -ENOMEM;
bio->bi_sector = page_off * (PAGE_SIZE >> 9);
bio->bi_bdev = resume_bdev;
- bio->bi_end_io = end_io;
+ bio->bi_end_io = end_swap_bio_read;
- if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
- printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
- error = -EFAULT;
- goto Done;
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
+ bio_put(bio);
+ return -EFAULT;
}
- atomic_set(&io_done, 1);
- submit_bio(rw | (1 << BIO_RW_SYNC), bio);
- while (atomic_read(&io_done))
- yield();
- if (rw == READ)
- bio_set_pages_dirty(bio);
- Done:
- bio_put(bio);
- return error;
+ lock_page(page);
+ bio_get(bio);
+
+ if (bio_chain == NULL) {
+ submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+ wait_on_page_locked(page);
+ if (rw == READ)
+ bio_set_pages_dirty(bio);
+ bio_put(bio);
+ } else {
+ get_page(page);
+ bio->bi_private = *bio_chain;
+ *bio_chain = bio;
+ submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+ }
+ return 0;
}
-static int bio_read_page(pgoff_t page_off, void *page)
+static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
{
- return submit(READ, page_off, page);
+ return submit(READ, page_off, virt_to_page(addr), bio_chain);
}
-static int bio_write_page(pgoff_t page_off, void *page)
+static int bio_write_page(pgoff_t page_off, void *addr)
{
- return submit(WRITE, page_off, page);
+ return submit(WRITE, page_off, virt_to_page(addr), NULL);
}
/**
@@ -391,7 +465,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
if (!handle->cur)
return -ENOMEM;
- error = bio_read_page(swp_offset(start), handle->cur);
+ error = bio_read_page(swp_offset(start), handle->cur, NULL);
if (error) {
release_swap_reader(handle);
return error;
@@ -400,7 +474,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
return 0;
}
-static int swap_read_page(struct swap_map_handle *handle, void *buf)
+static int swap_read_page(struct swap_map_handle *handle, void *buf,
+ struct bio **bio_chain)
{
unsigned long offset;
int error;
@@ -410,16 +485,17 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = bio_read_page(offset, buf);
+ error = bio_read_page(offset, buf, bio_chain);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
+ error = wait_on_bio_chain(bio_chain);
handle->k = 0;
offset = handle->cur->next_swap;
if (!offset)
release_swap_reader(handle);
- else
- error = bio_read_page(offset, handle->cur);
+ else if (!error)
+ error = bio_read_page(offset, handle->cur, NULL);
}
return error;
}
@@ -432,33 +508,49 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
static int load_image(struct swap_map_handle *handle,
struct snapshot_handle *snapshot,
- unsigned int nr_pages)
+ unsigned int nr_to_read)
{
unsigned int m;
- int ret;
int error = 0;
+ struct timeval start;
+ struct timeval stop;
+ struct bio *bio;
+ int err2;
+ unsigned nr_pages;
- printk("Loading image data pages (%u pages) ... ", nr_pages);
- m = nr_pages / 100;
+ printk("Loading image data pages (%u pages) ... ", nr_to_read);
+ m = nr_to_read / 100;
if (!m)
m = 1;
nr_pages = 0;
- do {
- ret = snapshot_write_next(snapshot, PAGE_SIZE);
- if (ret > 0) {
- error = swap_read_page(handle, data_of(*snapshot));
- if (error)
- break;
- if (!(nr_pages % m))
- printk("\b\b\b\b%3d%%", nr_pages / m);
- nr_pages++;
- }
- } while (ret > 0);
+ bio = NULL;
+ do_gettimeofday(&start);
+ for ( ; ; ) {
+ error = snapshot_write_next(snapshot, PAGE_SIZE);
+ if (error <= 0)
+ break;
+ error = swap_read_page(handle, data_of(*snapshot), &bio);
+ if (error)
+ break;
+ if (snapshot->sync_read)
+ error = wait_on_bio_chain(&bio);
+ if (error)
+ break;
+ if (!(nr_pages % m))
+ printk("\b\b\b\b%3d%%", nr_pages / m);
+ nr_pages++;
+ }
+ err2 = wait_on_bio_chain(&bio);
+ do_gettimeofday(&stop);
+ if (!error)
+ error = err2;
if (!error) {
printk("\b\b\b\bdone\n");
+ snapshot_free_unused_memory(snapshot);
if (!snapshot_image_loaded(snapshot))
error = -ENODATA;
}
+ show_speed(&start, &stop, nr_to_read, "Read");
return error;
}
@@ -481,7 +573,7 @@ int swsusp_read(void)
header = (struct swsusp_info *)data_of(snapshot);
error = get_swap_reader(&handle, swsusp_header.image);
if (!error)
- error = swap_read_page(&handle, header);
+ error = swap_read_page(&handle, header, NULL);
if (!error)
error = load_image(&handle, &snapshot, header->pages - 1);
release_swap_reader(&handle);
@@ -507,7 +599,7 @@ int swsusp_check(void)
if (!IS_ERR(resume_bdev)) {
set_blocksize(resume_bdev, PAGE_SIZE);
memset(&swsusp_header, 0, sizeof(swsusp_header));
- if ((error = bio_read_page(0, &swsusp_header)))
+ if ((error = bio_read_page(0, &swsusp_header, NULL)))
return error;
if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f0ee4e7780d6..0b66659dc516 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -62,6 +62,16 @@ unsigned long image_size = 500 * 1024 * 1024;
int in_suspend __nosavedata = 0;
+#ifdef CONFIG_HIGHMEM
+unsigned int count_highmem_pages(void);
+int save_highmem(void);
+int restore_highmem(void);
+#else
+static inline int save_highmem(void) { return 0; }
+static inline int restore_highmem(void) { return 0; }
+static inline unsigned int count_highmem_pages(void) { return 0; }
+#endif
+
/**
* The following functions are used for tracing the allocated
* swap pages, so that they can be freed in case of an error.
@@ -182,15 +192,14 @@ int swsusp_shrink_memory(void)
printk("Shrinking memory... ");
do {
- size = 2 * count_special_pages();
- size += size / 50 + count_data_pages();
- size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
- PAGES_FOR_IO;
+ size = 2 * count_highmem_pages();
+ size += size / 50 + count_data_pages() + PAGES_FOR_IO;
tmp = size;
for_each_zone (zone)
if (!is_highmem(zone) && populated_zone(zone)) {
tmp -= zone->free_pages;
tmp += zone->lowmem_reserve[ZONE_NORMAL];
+ tmp += snapshot_additional_pages(zone);
}
if (tmp > 0) {
tmp = __shrink_memory(tmp);
@@ -226,7 +235,7 @@ int swsusp_suspend(void)
goto Enable_irqs;
}
- if ((error = save_special_mem())) {
+ if ((error = save_highmem())) {
printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
goto Restore_highmem;
}
@@ -237,7 +246,10 @@ int swsusp_suspend(void)
/* Restore control flow magically appears here */
restore_processor_state();
Restore_highmem:
- restore_special_mem();
+ restore_highmem();
+ /* NOTE: device_power_up() is just a resume() for devices
+ * that suspended with irqs off ... no overall powerup.
+ */
device_power_up();
Enable_irqs:
local_irq_enable();
@@ -247,8 +259,12 @@ Enable_irqs:
int swsusp_resume(void)
{
int error;
+
local_irq_disable();
- if (device_power_down(PMSG_FREEZE))
+ /* NOTE: device_power_down() is just a suspend() with irqs off;
+ * it has no special "power things down" semantics
+ */
+ if (device_power_down(PMSG_PRETHAW))
printk(KERN_ERR "Some devices failed to power down, very bad\n");
/* We'll ignore saved state, but this gets preempt count (etc) right */
save_processor_state();
@@ -263,7 +279,7 @@ int swsusp_resume(void)
*/
swsusp_free();
restore_processor_state();
- restore_special_mem();
+ restore_highmem();
touch_softlockup_watchdog();
device_power_up();
local_irq_enable();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3f1539fbe48a..72825c853cd7 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -19,6 +19,7 @@
#include <linux/swapops.h>
#include <linux/pm.h>
#include <linux/fs.h>
+#include <linux/cpu.h>
#include <asm/uaccess.h>
@@ -139,12 +140,15 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
if (data->frozen)
break;
down(&pm_sem);
- disable_nonboot_cpus();
- if (freeze_processes()) {
- thaw_processes();
- enable_nonboot_cpus();
- error = -EBUSY;
+ error = disable_nonboot_cpus();
+ if (!error) {
+ error = freeze_processes();
+ if (error) {
+ thaw_processes();
+ error = -EBUSY;
+ }
}
+ enable_nonboot_cpus();
up(&pm_sem);
if (!error)
data->frozen = 1;
@@ -189,9 +193,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
error = -EPERM;
break;
}
+ snapshot_free_unused_memory(&data->handle);
down(&pm_sem);
pm_prepare_console();
- error = device_suspend(PMSG_FREEZE);
+ error = device_suspend(PMSG_PRETHAW);
if (!error) {
error = swsusp_resume();
device_resume();
diff --git a/kernel/printk.c b/kernel/printk.c
index 19a955619294..771f5e861bcd 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -24,8 +24,8 @@
#include <linux/console.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/interrupt.h> /* For in_interrupt() */
-#include <linux/config.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/security.h>
@@ -52,7 +52,7 @@ int console_printk[4] = {
DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
};
-EXPORT_SYMBOL(console_printk);
+EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
/*
* Low lever drivers may need that to know if they can schedule in
@@ -327,7 +327,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
struct console *con;
for (con = console_drivers; con; con = con->next) {
- if ((con->flags & CON_ENABLED) && con->write)
+ if ((con->flags & CON_ENABLED) && con->write &&
+ (cpu_online(smp_processor_id()) ||
+ (con->flags & CON_ANYTIME)))
con->write(con, &LOG_BUF(start), end - start);
}
}
@@ -437,6 +439,7 @@ static int printk_time = 1;
#else
static int printk_time = 0;
#endif
+module_param(printk_time, int, S_IRUGO | S_IWUSR);
static int __init printk_time_setup(char *str)
{
@@ -453,6 +456,18 @@ __attribute__((weak)) unsigned long long printk_clock(void)
return sched_clock();
}
+/* Check if we have any console registered that can be called early in boot. */
+static int have_callable_console(void)
+{
+ struct console *con;
+
+ for (con = console_drivers; con; con = con->next)
+ if (con->flags & CON_ANYTIME)
+ return 1;
+
+ return 0;
+}
+
/**
* printk - print a kernel message
* @fmt: format string
@@ -503,7 +518,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
zap_locks();
/* This stops the holder of console_sem just where we want him */
- spin_lock_irqsave(&logbuf_lock, flags);
+ local_irq_save(flags);
+ lockdep_off();
+ spin_lock(&logbuf_lock);
printk_cpu = smp_processor_id();
/* Emit the output into the temporary buffer */
@@ -566,27 +583,31 @@ asmlinkage int vprintk(const char *fmt, va_list args)
log_level_unknown = 1;
}
- if (!cpu_online(smp_processor_id())) {
+ if (!down_trylock(&console_sem)) {
/*
- * Some console drivers may assume that per-cpu resources have
- * been allocated. So don't allow them to be called by this
- * CPU until it is officially up. We shouldn't be calling into
- * random console drivers on a CPU which doesn't exist yet..
+ * We own the drivers. We can drop the spinlock and
+ * let release_console_sem() print the text, maybe ...
*/
- printk_cpu = UINT_MAX;
- spin_unlock_irqrestore(&logbuf_lock, flags);
- goto out;
- }
- if (!down_trylock(&console_sem)) {
console_locked = 1;
+ printk_cpu = UINT_MAX;
+ spin_unlock(&logbuf_lock);
+
/*
- * We own the drivers. We can drop the spinlock and let
- * release_console_sem() print the text
+ * Console drivers may assume that per-cpu resources have
+ * been allocated. So unless they're explicitly marked as
+ * being able to cope (CON_ANYTIME) don't call them until
+ * this CPU is officially up.
*/
- printk_cpu = UINT_MAX;
- spin_unlock_irqrestore(&logbuf_lock, flags);
- console_may_schedule = 0;
- release_console_sem();
+ if (cpu_online(smp_processor_id()) || have_callable_console()) {
+ console_may_schedule = 0;
+ release_console_sem();
+ } else {
+ /* Release by hand to avoid flushing the buffer. */
+ console_locked = 0;
+ up(&console_sem);
+ }
+ lockdep_on();
+ local_irq_restore(flags);
} else {
/*
* Someone else owns the drivers. We drop the spinlock, which
@@ -594,9 +615,11 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* console drivers with the output which we just produced.
*/
printk_cpu = UINT_MAX;
- spin_unlock_irqrestore(&logbuf_lock, flags);
+ spin_unlock(&logbuf_lock);
+ lockdep_on();
+ local_irq_restore(flags);
}
-out:
+
preempt_enable();
return printed_len;
}
@@ -698,6 +721,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
return 0;
}
+#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
/**
* suspend_console - suspend the console subsystem
*
@@ -705,6 +729,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
*/
void suspend_console(void)
{
+ printk("Suspending console(s)\n");
acquire_console_sem();
console_suspended = 1;
}
@@ -714,6 +739,7 @@ void resume_console(void)
console_suspended = 0;
release_console_sem();
}
+#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */
/**
* acquire_console_sem - lock the console system for exclusive use.
@@ -750,7 +776,7 @@ int is_console_locked(void)
{
return console_locked;
}
-EXPORT_SYMBOL(is_console_locked);
+EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
/**
* release_console_sem - unlock the console system
@@ -776,6 +802,9 @@ void release_console_sem(void)
up(&secondary_console_sem);
return;
}
+
+ console_may_schedule = 0;
+
for ( ; ; ) {
spin_lock_irqsave(&logbuf_lock, flags);
wake_klogd |= log_start - log_end;
@@ -789,11 +818,17 @@ void release_console_sem(void)
local_irq_restore(flags);
}
console_locked = 0;
- console_may_schedule = 0;
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
- if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
- wake_up_interruptible(&log_wait);
+ if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) {
+ /*
+ * If we printk from within the lock dependency code,
+ * from within the scheduler code, then do not lock
+ * up due to self-recursion:
+ */
+ if (!lockdep_internal())
+ wake_up_interruptible(&log_wait);
+ }
}
EXPORT_SYMBOL(release_console_sem);
diff --git a/kernel/profile.c b/kernel/profile.c
index 68afe121e507..fb660c7d35ba 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -13,7 +13,6 @@
* to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/profile.h>
#include <linux/bootmem.h>
@@ -299,7 +298,7 @@ out:
}
#ifdef CONFIG_HOTPLUG_CPU
-static int profile_cpu_callback(struct notifier_block *info,
+static int __devinit profile_cpu_callback(struct notifier_block *info,
unsigned long action, void *__cpu)
{
int node, cpu = (unsigned long)__cpu;
@@ -310,13 +309,17 @@ static int profile_cpu_callback(struct notifier_block *info,
node = cpu_to_node(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ 0);
if (!page)
return NOTIFY_BAD;
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ 0);
if (!page)
goto out_free;
per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
@@ -492,12 +495,16 @@ static int __init create_hash_tables(void)
int node = cpu_to_node(cpu);
struct page *page;
- page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ 0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
- page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ 0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[0]
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 921c22ad16e4..4d50e06fd745 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,7 +28,7 @@
*
* Must be called with the tasklist lock write-held.
*/
-void __ptrace_link(task_t *child, task_t *new_parent)
+void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
BUG_ON(!list_empty(&child->ptrace_list));
if (child->parent == new_parent)
@@ -46,7 +46,7 @@ void __ptrace_link(task_t *child, task_t *new_parent)
* TASK_TRACED, resume it now.
* Requires that irqs be disabled.
*/
-void ptrace_untrace(task_t *child)
+void ptrace_untrace(struct task_struct *child)
{
spin_lock(&child->sighand->siglock);
if (child->state == TASK_TRACED) {
@@ -65,7 +65,7 @@ void ptrace_untrace(task_t *child)
*
* Must be called with the tasklist lock write-held.
*/
-void __ptrace_unlink(task_t *child)
+void __ptrace_unlink(struct task_struct *child)
{
BUG_ON(!child->ptrace);
@@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill)
static int may_attach(struct task_struct *task)
{
- if (!task->mm)
- return -EPERM;
+ /* May we inspect the given task?
+ * This check is used both for attaching with ptrace
+ * and for allowing access to sensitive information in /proc.
+ *
+ * ptrace_attach denies several cases that /proc allows
+ * because setting up the necessary parent/child relationship
+ * or halting the specified task is impossible.
+ */
+ int dumpable = 0;
+ /* Don't let security modules deny introspection */
+ if (task == current)
+ return 0;
if (((current->uid != task->euid) ||
(current->uid != task->suid) ||
(current->uid != task->uid) ||
@@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task)
(current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
return -EPERM;
smp_rmb();
- if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
+ if (task->mm)
+ dumpable = task->mm->dumpable;
+ if (!dumpable && !capable(CAP_SYS_PTRACE))
return -EPERM;
return security_ptrace(current, task);
@@ -176,6 +188,8 @@ repeat:
goto repeat;
}
+ if (!task->mm)
+ goto bad;
/* the same process cannot be attached many times */
if (task->ptrace & PT_PTRACED)
goto bad;
@@ -200,7 +214,7 @@ out:
return retval;
}
-void __ptrace_detach(struct task_struct *child, unsigned int data)
+static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
{
child->exit_code = data;
/* .. re-parent .. */
@@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
ptrace_disable(child);
write_lock_irq(&tasklist_lock);
+ /* protect against de_thread()->release_task() */
if (child->ptrace)
__ptrace_detach(child, data);
write_unlock_irq(&tasklist_lock);
@@ -226,60 +241,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
return 0;
}
-/*
- * Access another process' address space.
- * Source/target buffer must be kernel space,
- * Do not walk the page table directly, use get_user_pages
- */
-
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
-{
- struct mm_struct *mm;
- struct vm_area_struct *vma;
- struct page *page;
- void *old_buf = buf;
-
- mm = get_task_mm(tsk);
- if (!mm)
- return 0;
-
- down_read(&mm->mmap_sem);
- /* ignore errors, just check how much was sucessfully transfered */
- while (len) {
- int bytes, ret, offset;
- void *maddr;
-
- ret = get_user_pages(tsk, mm, addr, 1,
- write, 1, &page, &vma);
- if (ret <= 0)
- break;
-
- bytes = len;
- offset = addr & (PAGE_SIZE-1);
- if (bytes > PAGE_SIZE-offset)
- bytes = PAGE_SIZE-offset;
-
- maddr = kmap(page);
- if (write) {
- copy_to_user_page(vma, page, addr,
- maddr + offset, buf, bytes);
- set_page_dirty_lock(page);
- } else {
- copy_from_user_page(vma, page, addr,
- buf, maddr + offset, bytes);
- }
- kunmap(page);
- page_cache_release(page);
- len -= bytes;
- buf += bytes;
- addr += bytes;
- }
- up_read(&mm->mmap_sem);
- mmput(mm);
-
- return buf - old_buf;
-}
-
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
{
int copied = 0;
@@ -479,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
child = find_task_by_pid(pid);
if (child)
get_task_struct(child);
+
read_unlock(&tasklist_lock);
if (!child)
return ERR_PTR(-ESRCH);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 20e9710fc21c..523e46483b99 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -53,13 +53,13 @@
static struct rcu_ctrlblk rcu_ctrlblk = {
.cur = -300,
.completed = -300,
- .lock = SPIN_LOCK_UNLOCKED,
+ .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cur = -300,
.completed = -300,
- .lock = SPIN_LOCK_UNLOCKED,
+ .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
@@ -182,6 +182,15 @@ long rcu_batches_completed(void)
return rcu_ctrlblk.completed;
}
+/*
+ * Return the number of RCU batches processed thus far. Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed_bh(void)
+{
+ return rcu_bh_ctrlblk.completed;
+}
+
static void rcu_barrier_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -232,12 +241,16 @@ static void rcu_do_batch(struct rcu_data *rdp)
next = rdp->donelist = list->next;
list->func(list);
list = next;
- rdp->qlen--;
if (++count >= rdp->blimit)
break;
}
+
+ local_irq_disable();
+ rdp->qlen -= count;
+ local_irq_enable();
if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
rdp->blimit = blimit;
+
if (!rdp->donelist)
rdp->donetail = &rdp->donelist;
else
@@ -539,7 +552,7 @@ static void __devinit rcu_online_cpu(int cpu)
tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
}
-static int rcu_cpu_notify(struct notifier_block *self,
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -556,7 +569,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block rcu_nb = {
+static struct notifier_block __cpuinitdata rcu_nb = {
.notifier_call = rcu_cpu_notify,
};
@@ -619,6 +632,7 @@ module_param(qlowmark, int, 0);
module_param(rsinterval, int, 0);
#endif
EXPORT_SYMBOL_GPL(rcu_batches_completed);
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
EXPORT_SYMBOL_GPL(call_rcu);
EXPORT_SYMBOL_GPL(call_rcu_bh);
EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 8154e7589d12..4f2c4272d59c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -1,5 +1,5 @@
/*
- * Read-Copy Update /proc-based torture test facility
+ * Read-Copy Update module-based torture test facility
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */
static int verbose; /* Print more debug info. */
static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
+static char *torture_type = "rcu"; /* What to torture. */
module_param(nreaders, int, 0);
MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0);
MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
module_param(shuffle_interval, int, 0);
MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
-#define TORTURE_FLAG "rcutorture: "
+module_param(torture_type, charp, 0);
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
+
+#define TORTURE_FLAG "-torture:"
#define PRINTK_STRING(s) \
- do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+ do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
#define VERBOSE_PRINTK_STRING(s) \
- do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+ do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
#define VERBOSE_PRINTK_ERRSTRING(s) \
- do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0)
+ do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
static char printk_buf[4096];
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p)
spin_unlock_bh(&rcu_torture_lock);
}
-static void
-rcu_torture_cb(struct rcu_head *p)
-{
- int i;
- struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
-
- if (fullstop) {
- /* Test is ending, just drop callbacks on the floor. */
- /* The next initialization will pick up the pieces. */
- return;
- }
- i = rp->rtort_pipe_count;
- if (i > RCU_TORTURE_PIPE_LEN)
- i = RCU_TORTURE_PIPE_LEN;
- atomic_inc(&rcu_torture_wcount[i]);
- if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
- rp->rtort_mbtest = 0;
- rcu_torture_free(rp);
- } else
- call_rcu(p, rcu_torture_cb);
-}
-
struct rcu_random_state {
unsigned long rrs_state;
unsigned long rrs_count;
@@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp)
}
/*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_torture_ops {
+ void (*init)(void);
+ void (*cleanup)(void);
+ int (*readlock)(void);
+ void (*readunlock)(int idx);
+ int (*completed)(void);
+ void (*deferredfree)(struct rcu_torture *p);
+ int (*stats)(char *page);
+ char *name;
+};
+static struct rcu_torture_ops *cur_ops = NULL;
+
+/*
+ * Definitions for rcu torture testing.
+ */
+
+static int rcu_torture_read_lock(void) __acquires(RCU)
+{
+ rcu_read_lock();
+ return 0;
+}
+
+static void rcu_torture_read_unlock(int idx) __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int rcu_torture_completed(void)
+{
+ return rcu_batches_completed();
+}
+
+static void
+rcu_torture_cb(struct rcu_head *p)
+{
+ int i;
+ struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
+
+ if (fullstop) {
+ /* Test is ending, just drop callbacks on the floor. */
+ /* The next initialization will pick up the pieces. */
+ return;
+ }
+ i = rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ rp->rtort_mbtest = 0;
+ rcu_torture_free(rp);
+ } else
+ cur_ops->deferredfree(rp);
+}
+
+static void rcu_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_ops = {
+ .init = NULL,
+ .cleanup = NULL,
+ .readlock = rcu_torture_read_lock,
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_torture_completed,
+ .deferredfree = rcu_torture_deferred_free,
+ .stats = NULL,
+ .name = "rcu"
+};
+
+/*
+ * Definitions for rcu_bh torture testing.
+ */
+
+static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
+{
+ rcu_read_lock_bh();
+ return 0;
+}
+
+static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
+{
+ rcu_read_unlock_bh();
+}
+
+static int rcu_bh_torture_completed(void)
+{
+ return rcu_batches_completed_bh();
+}
+
+static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_bh_ops = {
+ .init = NULL,
+ .cleanup = NULL,
+ .readlock = rcu_bh_torture_read_lock,
+ .readunlock = rcu_bh_torture_read_unlock,
+ .completed = rcu_bh_torture_completed,
+ .deferredfree = rcu_bh_torture_deferred_free,
+ .stats = NULL,
+ .name = "rcu_bh"
+};
+
+static struct rcu_torture_ops *torture_ops[] =
+ { &rcu_ops, &rcu_bh_ops, NULL };
+
+/*
* RCU torture writer kthread. Repeatedly substitutes a new structure
* for that pointed to by rcu_torture_current, freeing the old structure
* after a series of grace periods (the "pipeline").
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg)
do {
schedule_timeout_uninterruptible(1);
- if (rcu_batches_completed() == oldbatch)
- continue;
if ((rp = rcu_torture_alloc()) == NULL)
continue;
rp->rtort_pipe_count = 0;
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
old_rp->rtort_pipe_count++;
- call_rcu(&old_rp->rtort_rcu, rcu_torture_cb);
+ cur_ops->deferredfree(old_rp);
}
rcu_torture_current_version++;
- oldbatch = rcu_batches_completed();
+ oldbatch = cur_ops->completed();
} while (!kthread_should_stop() && !fullstop);
VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
while (!kthread_should_stop())
@@ -246,6 +339,7 @@ static int
rcu_torture_reader(void *arg)
{
int completed;
+ int idx;
DEFINE_RCU_RANDOM(rand);
struct rcu_torture *p;
int pipe_count;
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg)
set_user_nice(current, 19);
do {
- rcu_read_lock();
- completed = rcu_batches_completed();
+ idx = cur_ops->readlock();
+ completed = cur_ops->completed();
p = rcu_dereference(rcu_torture_current);
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
- rcu_read_unlock();
+ cur_ops->readunlock(idx);
schedule_timeout_interruptible(HZ);
continue;
}
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg)
pipe_count = RCU_TORTURE_PIPE_LEN;
}
++__get_cpu_var(rcu_torture_count)[pipe_count];
- completed = rcu_batches_completed() - completed;
+ completed = cur_ops->completed() - completed;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
}
++__get_cpu_var(rcu_torture_batch)[completed];
preempt_enable();
- rcu_read_unlock();
+ cur_ops->readunlock(idx);
schedule();
} while (!kthread_should_stop() && !fullstop);
VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page)
if (pipesummary[i] != 0)
break;
}
- cnt += sprintf(&page[cnt], "rcutorture: ");
+ cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt],
"rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
"rtmbe: %d",
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page)
atomic_read(&n_rcu_torture_mberror));
if (atomic_read(&n_rcu_torture_mberror) != 0)
cnt += sprintf(&page[cnt], " !!!");
- cnt += sprintf(&page[cnt], "\nrcutorture: ");
+ cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
if (i > 1) {
cnt += sprintf(&page[cnt], "!!! ");
atomic_inc(&n_rcu_torture_error);
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page)
cnt += sprintf(&page[cnt], "Reader Pipe: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
- cnt += sprintf(&page[cnt], "\nrcutorture: ");
+ cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt], "Reader Batch: ");
- for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
- cnt += sprintf(&page[cnt], "\nrcutorture: ");
+ cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
cnt += sprintf(&page[cnt], " %d",
atomic_read(&rcu_torture_wcount[i]));
}
cnt += sprintf(&page[cnt], "\n");
+ if (cur_ops->stats != NULL)
+ cnt += cur_ops->stats(&page[cnt]);
return cnt;
}
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg)
static inline void
rcu_torture_print_module_parms(char *tag)
{
- printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d "
+ printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d "
"stat_interval=%d verbose=%d test_no_idle_hz=%d "
"shuffle_interval = %d\n",
- tag, nrealreaders, stat_interval, verbose, test_no_idle_hz,
- shuffle_interval);
+ torture_type, tag, nrealreaders, stat_interval, verbose,
+ test_no_idle_hz, shuffle_interval);
}
static void
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void)
rcu_barrier();
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
+
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
if (atomic_read(&n_rcu_torture_error))
rcu_torture_print_module_parms("End of test: FAILURE");
else
@@ -508,6 +607,20 @@ rcu_torture_init(void)
/* Process args and tell the world that the torturer is on the job. */
+ for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
+ cur_ops = torture_ops[i];
+ if (strcmp(torture_type, cur_ops->name) == 0) {
+ break;
+ }
+ }
+ if (cur_ops == NULL) {
+ printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
+ torture_type);
+ return (-EINVAL);
+ }
+ if (cur_ops->init != NULL)
+ cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+
if (nreaders >= 0)
nrealreaders = nreaders;
else
diff --git a/kernel/relay.c b/kernel/relay.c
index 33345e73485c..1d63ecddfa70 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -95,7 +95,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
* @buf: the buffer struct
* @size: total size of the buffer
*
- * Returns a pointer to the resulting buffer, NULL if unsuccessful. The
+ * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
* passed in size will get page aligned, if it isn't already.
*/
static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
@@ -132,10 +132,9 @@ depopulate:
/**
* relay_create_buf - allocate and initialize a channel buffer
- * @alloc_size: size of the buffer to allocate
- * @n_subbufs: number of sub-buffers in the channel
+ * @chan: the relay channel
*
- * Returns channel buffer if successful, NULL otherwise
+ * Returns channel buffer if successful, %NULL otherwise.
*/
struct rchan_buf *relay_create_buf(struct rchan *chan)
{
@@ -163,6 +162,7 @@ free_buf:
/**
* relay_destroy_channel - free the channel struct
+ * @kref: target kernel reference that contains the relay channel
*
* Should only be called from kref_put().
*/
@@ -194,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
/**
* relay_remove_buf - remove a channel buffer
+ * @kref: target kernel reference that contains the relay buffer
*
* Removes the file from the fileystem, which also frees the
* rchan_buf_struct and the channel buffer. Should only be called from
@@ -374,7 +375,7 @@ void relay_reset(struct rchan *chan)
}
EXPORT_SYMBOL_GPL(relay_reset);
-/**
+/*
* relay_open_buf - create a new relay channel buffer
*
* Internal - used by relay_open().
@@ -448,12 +449,12 @@ static inline void setup_callbacks(struct rchan *chan,
/**
* relay_open - create a new relay channel
* @base_filename: base name of files to create
- * @parent: dentry of parent directory, NULL for root directory
+ * @parent: dentry of parent directory, %NULL for root directory
* @subbuf_size: size of sub-buffers
* @n_subbufs: number of sub-buffers
* @cb: client callback functions
*
- * Returns channel pointer if successful, NULL otherwise.
+ * Returns channel pointer if successful, %NULL otherwise.
*
* Creates a channel buffer for each cpu using the sizes and
* attributes specified. The created channel buffer files
@@ -585,7 +586,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf);
* subbufs_consumed should be the number of sub-buffers newly consumed,
* not the total consumed.
*
- * NOTE: kernel clients don't need to call this function if the channel
+ * NOTE: Kernel clients don't need to call this function if the channel
* mode is 'overwrite'.
*/
void relay_subbufs_consumed(struct rchan *chan,
@@ -641,7 +642,7 @@ EXPORT_SYMBOL_GPL(relay_close);
* relay_flush - close the channel
* @chan: the channel
*
- * Flushes all channel buffers i.e. forces buffer switch.
+ * Flushes all channel buffers, i.e. forces buffer switch.
*/
void relay_flush(struct rchan *chan)
{
@@ -669,7 +670,7 @@ EXPORT_SYMBOL_GPL(relay_flush);
*/
static int relay_file_open(struct inode *inode, struct file *filp)
{
- struct rchan_buf *buf = inode->u.generic_ip;
+ struct rchan_buf *buf = inode->i_private;
kref_get(&buf->kref);
filp->private_data = buf;
@@ -729,7 +730,7 @@ static int relay_file_release(struct inode *inode, struct file *filp)
return 0;
}
-/**
+/*
* relay_file_read_consume - update the consumed count for the buffer
*/
static void relay_file_read_consume(struct rchan_buf *buf,
@@ -756,7 +757,7 @@ static void relay_file_read_consume(struct rchan_buf *buf,
}
}
-/**
+/*
* relay_file_read_avail - boolean, are there unconsumed bytes available?
*/
static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
@@ -793,6 +794,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
/**
* relay_file_read_subbuf_avail - return bytes available in sub-buffer
+ * @read_pos: file read position
+ * @buf: relay channel buffer
*/
static size_t relay_file_read_subbuf_avail(size_t read_pos,
struct rchan_buf *buf)
@@ -818,6 +821,8 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
/**
* relay_file_read_start_pos - find the first available byte to read
+ * @read_pos: file read position
+ * @buf: relay channel buffer
*
* If the read_pos is in the middle of padding, return the
* position of the first actually available byte, otherwise
@@ -844,6 +849,9 @@ static size_t relay_file_read_start_pos(size_t read_pos,
/**
* relay_file_read_end_pos - return the new read position
+ * @read_pos: file read position
+ * @buf: relay channel buffer
+ * @count: number of bytes to be read
*/
static size_t relay_file_read_end_pos(struct rchan_buf *buf,
size_t read_pos,
@@ -865,7 +873,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
return end_pos;
}
-/**
+/*
* subbuf_read_actor - read up to one subbuf's worth of data
*/
static int subbuf_read_actor(size_t read_start,
@@ -890,7 +898,7 @@ static int subbuf_read_actor(size_t read_start,
return ret;
}
-/**
+/*
* subbuf_send_actor - send up to one subbuf's worth of data
*/
static int subbuf_send_actor(size_t read_start,
@@ -933,7 +941,7 @@ typedef int (*subbuf_actor_t) (size_t read_start,
read_descriptor_t *desc,
read_actor_t actor);
-/**
+/*
* relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
*/
static inline ssize_t relay_file_read_subbufs(struct file *filp,
diff --git a/kernel/resource.c b/kernel/resource.c
index e3080fcc66a3..9db38a1a7520 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,7 +7,6 @@
* Arbitrary resource management.
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/errno.h>
@@ -23,20 +22,18 @@
struct resource ioport_resource = {
.name = "PCI IO",
- .start = 0x0000,
+ .start = 0,
.end = IO_SPACE_LIMIT,
.flags = IORESOURCE_IO,
};
-
EXPORT_SYMBOL(ioport_resource);
struct resource iomem_resource = {
.name = "PCI mem",
- .start = 0UL,
- .end = ~0UL,
+ .start = 0,
+ .end = -1,
.flags = IORESOURCE_MEM,
};
-
EXPORT_SYMBOL(iomem_resource);
static DEFINE_RWLOCK(resource_lock);
@@ -83,10 +80,10 @@ static int r_show(struct seq_file *m, void *v)
for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
if (p->parent == root)
break;
- seq_printf(m, "%*s%0*lx-%0*lx : %s\n",
+ seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
depth * 2, "",
- width, r->start,
- width, r->end,
+ width, (unsigned long long) r->start,
+ width, (unsigned long long) r->end,
r->name ? r->name : "<BAD>");
return 0;
}
@@ -151,8 +148,8 @@ __initcall(ioresources_init);
/* Return the conflict entry if you can't request it */
static struct resource * __request_resource(struct resource *root, struct resource *new)
{
- unsigned long start = new->start;
- unsigned long end = new->end;
+ resource_size_t start = new->start;
+ resource_size_t end = new->end;
struct resource *tmp, **p;
if (end < start)
@@ -232,15 +229,55 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Finds the lowest memory reosurce exists within [res->start.res->end)
+ * the caller must specify res->start, res->end, res->flags.
+ * If found, returns 0, res is overwritten, if not found, returns -1.
+ */
+int find_next_system_ram(struct resource *res)
+{
+ resource_size_t start, end;
+ struct resource *p;
+
+ BUG_ON(!res);
+
+ start = res->start;
+ end = res->end;
+ BUG_ON(start >= end);
+
+ read_lock(&resource_lock);
+ for (p = iomem_resource.child; p ; p = p->sibling) {
+ /* system ram is just marked as IORESOURCE_MEM */
+ if (p->flags != res->flags)
+ continue;
+ if (p->start > end) {
+ p = NULL;
+ break;
+ }
+ if ((p->end >= start) && (p->start < end))
+ break;
+ }
+ read_unlock(&resource_lock);
+ if (!p)
+ return -1;
+ /* copy data */
+ if (res->start < p->start)
+ res->start = p->start;
+ if (res->end > p->end)
+ res->end = p->end;
+ return 0;
+}
+#endif
+
/*
* Find empty slot in the resource tree given range and alignment.
*/
static int find_resource(struct resource *root, struct resource *new,
- unsigned long size,
- unsigned long min, unsigned long max,
- unsigned long align,
+ resource_size_t size, resource_size_t min,
+ resource_size_t max, resource_size_t align,
void (*alignf)(void *, struct resource *,
- unsigned long, unsigned long),
+ resource_size_t, resource_size_t),
void *alignf_data)
{
struct resource *this = root->child;
@@ -282,11 +319,10 @@ static int find_resource(struct resource *root, struct resource *new,
* Allocate empty slot in the resource tree given range and alignment.
*/
int allocate_resource(struct resource *root, struct resource *new,
- unsigned long size,
- unsigned long min, unsigned long max,
- unsigned long align,
+ resource_size_t size, resource_size_t min,
+ resource_size_t max, resource_size_t align,
void (*alignf)(void *, struct resource *,
- unsigned long, unsigned long),
+ resource_size_t, resource_size_t),
void *alignf_data)
{
int err;
@@ -308,12 +344,11 @@ EXPORT_SYMBOL(allocate_resource);
*
* Returns 0 on success, -EBUSY if the resource can't be inserted.
*
- * This function is equivalent of request_resource when no conflict
+ * This function is equivalent to request_resource when no conflict
* happens. If a conflict happens, and the conflicting resources
* entirely fit within the range of the new resource, then the new
- * resource is inserted and the conflicting resources become childs of
- * the new resource. Otherwise the new resource becomes the child of
- * the conflicting resource
+ * resource is inserted and the conflicting resources become children of
+ * the new resource.
*/
int insert_resource(struct resource *parent, struct resource *new)
{
@@ -321,20 +356,21 @@ int insert_resource(struct resource *parent, struct resource *new)
struct resource *first, *next;
write_lock(&resource_lock);
- begin:
- result = 0;
- first = __request_resource(parent, new);
- if (!first)
- goto out;
- result = -EBUSY;
- if (first == parent)
- goto out;
+ for (;; parent = first) {
+ result = 0;
+ first = __request_resource(parent, new);
+ if (!first)
+ goto out;
+
+ result = -EBUSY;
+ if (first == parent)
+ goto out;
- /* Resource fully contained by the clashing resource? Recurse into it */
- if (first->start <= new->start && first->end >= new->end) {
- parent = first;
- goto begin;
+ if ((first->start > new->start) || (first->end < new->end))
+ break;
+ if ((first->start == new->start) && (first->end == new->end))
+ break;
}
for (next = first; ; next = next->sibling) {
@@ -371,17 +407,15 @@ int insert_resource(struct resource *parent, struct resource *new)
return result;
}
-EXPORT_SYMBOL(insert_resource);
-
/*
* Given an existing resource, change its start and size to match the
* arguments. Returns -EBUSY if it can't fit. Existing children of
* the resource are assumed to be immutable.
*/
-int adjust_resource(struct resource *res, unsigned long start, unsigned long size)
+int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
{
struct resource *tmp, *parent = res->parent;
- unsigned long end = start + size - 1;
+ resource_size_t end = start + size - 1;
int result = -EBUSY;
write_lock(&resource_lock);
@@ -428,7 +462,9 @@ EXPORT_SYMBOL(adjust_resource);
*
* Release-region releases a matching busy region.
*/
-struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
+struct resource * __request_region(struct resource *parent,
+ resource_size_t start, resource_size_t n,
+ const char *name)
{
struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
@@ -464,7 +500,8 @@ struct resource * __request_region(struct resource *parent, unsigned long start,
EXPORT_SYMBOL(__request_region);
-int __check_region(struct resource *parent, unsigned long start, unsigned long n)
+int __check_region(struct resource *parent, resource_size_t start,
+ resource_size_t n)
{
struct resource * res;
@@ -479,10 +516,11 @@ int __check_region(struct resource *parent, unsigned long start, unsigned long n
EXPORT_SYMBOL(__check_region);
-void __release_region(struct resource *parent, unsigned long start, unsigned long n)
+void __release_region(struct resource *parent, resource_size_t start,
+ resource_size_t n)
{
struct resource **p;
- unsigned long end;
+ resource_size_t end;
p = &parent->child;
end = start + n - 1;
@@ -511,7 +549,9 @@ void __release_region(struct resource *parent, unsigned long start, unsigned lon
write_unlock(&resource_lock);
- printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end);
+ printk(KERN_WARNING "Trying to free nonexistent resource "
+ "<%016llx-%016llx>\n", (unsigned long long)start,
+ (unsigned long long)end);
}
EXPORT_SYMBOL(__release_region);
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 000000000000..0c1faa950af7
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,242 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This code is based on the rt.c implementation in the preempt-rt tree.
+ * Portions of said code are
+ *
+ * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ * Copyright (C) 2006 Esben Nielsen
+ * Copyright (C) 2006 Kihon Technologies Inc.,
+ * Steven Rostedt <rostedt@goodmis.org>
+ *
+ * See rt.c in preempt-rt for proper credits and further information
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/plist.h>
+#include <linux/fs.h>
+#include <linux/debug_locks.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+# define TRACE_WARN_ON(x) WARN_ON(x)
+# define TRACE_BUG_ON(x) BUG_ON(x)
+
+# define TRACE_OFF() \
+do { \
+ if (rt_trace_on) { \
+ rt_trace_on = 0; \
+ console_verbose(); \
+ if (spin_is_locked(&current->pi_lock)) \
+ spin_unlock(&current->pi_lock); \
+ } \
+} while (0)
+
+# define TRACE_OFF_NOLOCK() \
+do { \
+ if (rt_trace_on) { \
+ rt_trace_on = 0; \
+ console_verbose(); \
+ } \
+} while (0)
+
+# define TRACE_BUG_LOCKED() \
+do { \
+ TRACE_OFF(); \
+ BUG(); \
+} while (0)
+
+# define TRACE_WARN_ON_LOCKED(c) \
+do { \
+ if (unlikely(c)) { \
+ TRACE_OFF(); \
+ WARN_ON(1); \
+ } \
+} while (0)
+
+# define TRACE_BUG_ON_LOCKED(c) \
+do { \
+ if (unlikely(c)) \
+ TRACE_BUG_LOCKED(); \
+} while (0)
+
+#ifdef CONFIG_SMP
+# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
+#else
+# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
+#endif
+
+/*
+ * deadlock detection flag. We turn it off when we detect
+ * the first problem because we dont want to recurse back
+ * into the tracing code when doing error printk or
+ * executing a BUG():
+ */
+int rt_trace_on = 1;
+
+void deadlock_trace_off(void)
+{
+ rt_trace_on = 0;
+}
+
+static void printk_task(struct task_struct *p)
+{
+ if (p)
+ printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
+ else
+ printk("<none>");
+}
+
+static void printk_lock(struct rt_mutex *lock, int print_owner)
+{
+ if (lock->name)
+ printk(" [%p] {%s}\n",
+ lock, lock->name);
+ else
+ printk(" [%p] {%s:%d}\n",
+ lock, lock->file, lock->line);
+
+ if (print_owner && rt_mutex_owner(lock)) {
+ printk(".. ->owner: %p\n", lock->owner);
+ printk(".. held by: ");
+ printk_task(rt_mutex_owner(lock));
+ printk("\n");
+ }
+}
+
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+ WARN_ON(!plist_head_empty(&task->pi_waiters));
+ WARN_ON(task->pi_blocked_on);
+}
+
+/*
+ * We fill out the fields in the waiter to store the information about
+ * the deadlock. We print when we return. act_waiter can be NULL in
+ * case of a remove waiter operation.
+ */
+void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
+ struct rt_mutex *lock)
+{
+ struct task_struct *task;
+
+ if (!rt_trace_on || detect || !act_waiter)
+ return;
+
+ task = rt_mutex_owner(act_waiter->lock);
+ if (task && task != current) {
+ act_waiter->deadlock_task_pid = task->pid;
+ act_waiter->deadlock_lock = lock;
+ }
+}
+
+void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
+{
+ struct task_struct *task;
+
+ if (!waiter->deadlock_lock || !rt_trace_on)
+ return;
+
+ task = find_task_by_pid(waiter->deadlock_task_pid);
+ if (!task)
+ return;
+
+ TRACE_OFF_NOLOCK();
+
+ printk("\n============================================\n");
+ printk( "[ BUG: circular locking deadlock detected! ]\n");
+ printk( "--------------------------------------------\n");
+ printk("%s/%d is deadlocking current task %s/%d\n\n",
+ task->comm, task->pid, current->comm, current->pid);
+
+ printk("\n1) %s/%d is trying to acquire this lock:\n",
+ current->comm, current->pid);
+ printk_lock(waiter->lock, 1);
+
+ printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
+ printk_lock(waiter->deadlock_lock, 1);
+
+ debug_show_held_locks(current);
+ debug_show_held_locks(task);
+
+ printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
+ show_stack(task, NULL);
+ printk("\n%s/%d's [current] stackdump:\n\n",
+ current->comm, current->pid);
+ dump_stack();
+ debug_show_all_locks();
+
+ printk("[ turning off deadlock detection."
+ "Please report this trace. ]\n\n");
+ local_irq_disable();
+}
+
+void debug_rt_mutex_lock(struct rt_mutex *lock)
+{
+}
+
+void debug_rt_mutex_unlock(struct rt_mutex *lock)
+{
+ TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
+}
+
+void
+debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
+{
+}
+
+void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
+{
+ TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock));
+}
+
+void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ memset(waiter, 0x11, sizeof(*waiter));
+ plist_node_init(&waiter->list_entry, MAX_PRIO);
+ plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
+}
+
+void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+ TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
+ TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+ TRACE_WARN_ON(waiter->task);
+ memset(waiter, 0x22, sizeof(*waiter));
+}
+
+void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lock->name = name;
+}
+
+void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
+{
+}
+
+void rt_mutex_deadlock_account_unlock(struct task_struct *task)
+{
+}
+
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644
index 000000000000..14193d596d78
--- /dev/null
+++ b/kernel/rtmutex-debug.h
@@ -0,0 +1,33 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c. Debug version.
+ */
+
+extern void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
+extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
+extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void debug_rt_mutex_lock(struct rt_mutex *lock);
+extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+ struct task_struct *powner);
+extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
+ struct rt_mutex *lock);
+extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
+# define debug_rt_mutex_reset_waiter(w) \
+ do { (w)->deadlock_lock = NULL; } while (0)
+
+static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+ int detect)
+{
+ return (waiter != NULL);
+}
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
new file mode 100644
index 000000000000..948bd8f643e2
--- /dev/null
+++ b/kernel/rtmutex-tester.c
@@ -0,0 +1,441 @@
+/*
+ * RT-Mutex-tester: scriptable tester for rt mutexes
+ *
+ * started by Thomas Gleixner:
+ *
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ */
+#include <linux/config.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/sysdev.h>
+#include <linux/timer.h>
+
+#include "rtmutex.h"
+
+#define MAX_RT_TEST_THREADS 8
+#define MAX_RT_TEST_MUTEXES 8
+
+static spinlock_t rttest_lock;
+static atomic_t rttest_event;
+
+struct test_thread_data {
+ int opcode;
+ int opdata;
+ int mutexes[MAX_RT_TEST_MUTEXES];
+ int bkl;
+ int event;
+ struct sys_device sysdev;
+};
+
+static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
+static struct task_struct *threads[MAX_RT_TEST_THREADS];
+static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
+
+enum test_opcodes {
+ RTTEST_NOP = 0,
+ RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
+ RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
+ RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
+ RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
+ RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
+ RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
+ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
+ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
+ RTTEST_LOCKBKL, /* 9 Lock BKL */
+ RTTEST_UNLOCKBKL, /* 10 Unlock BKL */
+ RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
+ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
+ RTTEST_RESET = 99, /* 99 Reset all pending operations */
+};
+
+static int handle_op(struct test_thread_data *td, int lockwakeup)
+{
+ int i, id, ret = -EINVAL;
+
+ switch(td->opcode) {
+
+ case RTTEST_NOP:
+ return 0;
+
+ case RTTEST_LOCKCONT:
+ td->mutexes[td->opdata] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ return 0;
+
+ case RTTEST_RESET:
+ for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
+ if (td->mutexes[i] == 4) {
+ rt_mutex_unlock(&mutexes[i]);
+ td->mutexes[i] = 0;
+ }
+ }
+
+ if (!lockwakeup && td->bkl == 4) {
+ unlock_kernel();
+ td->bkl = 0;
+ }
+ return 0;
+
+ case RTTEST_RESETEVENT:
+ atomic_set(&rttest_event, 0);
+ return 0;
+
+ default:
+ if (lockwakeup)
+ return ret;
+ }
+
+ switch(td->opcode) {
+
+ case RTTEST_LOCK:
+ case RTTEST_LOCKNOWAIT:
+ id = td->opdata;
+ if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+ return ret;
+
+ td->mutexes[id] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ rt_mutex_lock(&mutexes[id]);
+ td->event = atomic_add_return(1, &rttest_event);
+ td->mutexes[id] = 4;
+ return 0;
+
+ case RTTEST_LOCKINT:
+ case RTTEST_LOCKINTNOWAIT:
+ id = td->opdata;
+ if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+ return ret;
+
+ td->mutexes[id] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
+ td->event = atomic_add_return(1, &rttest_event);
+ td->mutexes[id] = ret ? 0 : 4;
+ return ret ? -EINTR : 0;
+
+ case RTTEST_UNLOCK:
+ id = td->opdata;
+ if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
+ return ret;
+
+ td->event = atomic_add_return(1, &rttest_event);
+ rt_mutex_unlock(&mutexes[id]);
+ td->event = atomic_add_return(1, &rttest_event);
+ td->mutexes[id] = 0;
+ return 0;
+
+ case RTTEST_LOCKBKL:
+ if (td->bkl)
+ return 0;
+ td->bkl = 1;
+ lock_kernel();
+ td->bkl = 4;
+ return 0;
+
+ case RTTEST_UNLOCKBKL:
+ if (td->bkl != 4)
+ break;
+ unlock_kernel();
+ td->bkl = 0;
+ return 0;
+
+ default:
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Schedule replacement for rtsem_down(). Only called for threads with
+ * PF_MUTEX_TESTER set.
+ *
+ * This allows us to have finegrained control over the event flow.
+ *
+ */
+void schedule_rt_mutex_test(struct rt_mutex *mutex)
+{
+ int tid, op, dat;
+ struct test_thread_data *td;
+
+ /* We have to lookup the task */
+ for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
+ if (threads[tid] == current)
+ break;
+ }
+
+ BUG_ON(tid == MAX_RT_TEST_THREADS);
+
+ td = &thread_data[tid];
+
+ op = td->opcode;
+ dat = td->opdata;
+
+ switch (op) {
+ case RTTEST_LOCK:
+ case RTTEST_LOCKINT:
+ case RTTEST_LOCKNOWAIT:
+ case RTTEST_LOCKINTNOWAIT:
+ if (mutex != &mutexes[dat])
+ break;
+
+ if (td->mutexes[dat] != 1)
+ break;
+
+ td->mutexes[dat] = 2;
+ td->event = atomic_add_return(1, &rttest_event);
+ break;
+
+ case RTTEST_LOCKBKL:
+ default:
+ break;
+ }
+
+ schedule();
+
+
+ switch (op) {
+ case RTTEST_LOCK:
+ case RTTEST_LOCKINT:
+ if (mutex != &mutexes[dat])
+ return;
+
+ if (td->mutexes[dat] != 2)
+ return;
+
+ td->mutexes[dat] = 3;
+ td->event = atomic_add_return(1, &rttest_event);
+ break;
+
+ case RTTEST_LOCKNOWAIT:
+ case RTTEST_LOCKINTNOWAIT:
+ if (mutex != &mutexes[dat])
+ return;
+
+ if (td->mutexes[dat] != 2)
+ return;
+
+ td->mutexes[dat] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ return;
+
+ case RTTEST_LOCKBKL:
+ return;
+ default:
+ return;
+ }
+
+ td->opcode = 0;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (td->opcode > 0) {
+ int ret;
+
+ set_current_state(TASK_RUNNING);
+ ret = handle_op(td, 1);
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (td->opcode == RTTEST_LOCKCONT)
+ break;
+ td->opcode = ret;
+ }
+
+ /* Wait for the next command to be executed */
+ schedule();
+ }
+
+ /* Restore previous command and data */
+ td->opcode = op;
+ td->opdata = dat;
+}
+
+static int test_func(void *data)
+{
+ struct test_thread_data *td = data;
+ int ret;
+
+ current->flags |= PF_MUTEX_TESTER;
+ allow_signal(SIGHUP);
+
+ for(;;) {
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (td->opcode > 0) {
+ set_current_state(TASK_RUNNING);
+ ret = handle_op(td, 0);
+ set_current_state(TASK_INTERRUPTIBLE);
+ td->opcode = ret;
+ }
+
+ /* Wait for the next command to be executed */
+ schedule();
+ try_to_freeze();
+
+ if (signal_pending(current))
+ flush_signals(current);
+
+ if(kthread_should_stop())
+ break;
+ }
+ return 0;
+}
+
+/**
+ * sysfs_test_command - interface for test commands
+ * @dev: thread reference
+ * @buf: command for actual step
+ * @count: length of buffer
+ *
+ * command syntax:
+ *
+ * opcode:data
+ */
+static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
+ size_t count)
+{
+ struct sched_param schedpar;
+ struct test_thread_data *td;
+ char cmdbuf[32];
+ int op, dat, tid, ret;
+
+ td = container_of(dev, struct test_thread_data, sysdev);
+ tid = td->sysdev.id;
+
+ /* strings from sysfs write are not 0 terminated! */
+ if (count >= sizeof(cmdbuf))
+ return -EINVAL;
+
+ /* strip of \n: */
+ if (buf[count-1] == '\n')
+ count--;
+ if (count < 1)
+ return -EINVAL;
+
+ memcpy(cmdbuf, buf, count);
+ cmdbuf[count] = 0;
+
+ if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
+ return -EINVAL;
+
+ switch (op) {
+ case RTTEST_SCHEDOT:
+ schedpar.sched_priority = 0;
+ ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
+ if (ret)
+ return ret;
+ set_user_nice(current, 0);
+ break;
+
+ case RTTEST_SCHEDRT:
+ schedpar.sched_priority = dat;
+ ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
+ if (ret)
+ return ret;
+ break;
+
+ case RTTEST_SIGNAL:
+ send_sig(SIGHUP, threads[tid], 0);
+ break;
+
+ default:
+ if (td->opcode > 0)
+ return -EBUSY;
+ td->opdata = dat;
+ td->opcode = op;
+ wake_up_process(threads[tid]);
+ }
+
+ return count;
+}
+
+/**
+ * sysfs_test_status - sysfs interface for rt tester
+ * @dev: thread to query
+ * @buf: char buffer to be filled with thread status info
+ */
+static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
+{
+ struct test_thread_data *td;
+ struct task_struct *tsk;
+ char *curr = buf;
+ int i;
+
+ td = container_of(dev, struct test_thread_data, sysdev);
+ tsk = threads[td->sysdev.id];
+
+ spin_lock(&rttest_lock);
+
+ curr += sprintf(curr,
+ "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
+ td->opcode, td->event, tsk->state,
+ (MAX_RT_PRIO - 1) - tsk->prio,
+ (MAX_RT_PRIO - 1) - tsk->normal_prio,
+ tsk->pi_blocked_on, td->bkl);
+
+ for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
+ curr += sprintf(curr, "%d", td->mutexes[i]);
+
+ spin_unlock(&rttest_lock);
+
+ curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
+ mutexes[td->sysdev.id].owner);
+
+ return curr - buf;
+}
+
+static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
+static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
+
+static struct sysdev_class rttest_sysclass = {
+ set_kset_name("rttest"),
+};
+
+static int init_test_thread(int id)
+{
+ thread_data[id].sysdev.cls = &rttest_sysclass;
+ thread_data[id].sysdev.id = id;
+
+ threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
+ if (IS_ERR(threads[id]))
+ return PTR_ERR(threads[id]);
+
+ return sysdev_register(&thread_data[id].sysdev);
+}
+
+static int init_rttest(void)
+{
+ int ret, i;
+
+ spin_lock_init(&rttest_lock);
+
+ for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
+ rt_mutex_init(&mutexes[i]);
+
+ ret = sysdev_class_register(&rttest_sysclass);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
+ ret = init_test_thread(i);
+ if (ret)
+ break;
+ ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
+ if (ret)
+ break;
+ ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
+ if (ret)
+ break;
+ }
+
+ printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
+
+ return ret;
+}
+
+device_initcall(init_rttest);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
new file mode 100644
index 000000000000..4ab17da46fd8
--- /dev/null
+++ b/kernel/rtmutex.c
@@ -0,0 +1,990 @@
+/*
+ * RT-Mutexes: simple blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner.
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
+ * Copyright (C) 2006 Esben Nielsen
+ *
+ * See Documentation/rt-mutex-design.txt for details.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+/*
+ * lock->owner state tracking:
+ *
+ * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
+ * are used to keep track of the "owner is pending" and "lock has
+ * waiters" state.
+ *
+ * owner bit1 bit0
+ * NULL 0 0 lock is free (fast acquire possible)
+ * NULL 0 1 invalid state
+ * NULL 1 0 Transitional State*
+ * NULL 1 1 invalid state
+ * taskpointer 0 0 lock is held (fast release possible)
+ * taskpointer 0 1 task is pending owner
+ * taskpointer 1 0 lock is held and has waiters
+ * taskpointer 1 1 task is pending owner and lock has more waiters
+ *
+ * Pending ownership is assigned to the top (highest priority)
+ * waiter of the lock, when the lock is released. The thread is woken
+ * up and can now take the lock. Until the lock is taken (bit 0
+ * cleared) a competing higher priority thread can steal the lock
+ * which puts the woken up thread back on the waiters list.
+ *
+ * The fast atomic compare exchange based acquire and release is only
+ * possible when bit 0 and 1 of lock->owner are 0.
+ *
+ * (*) There's a small time where the owner can be NULL and the
+ * "lock has waiters" bit is set. This can happen when grabbing the lock.
+ * To prevent a cmpxchg of the owner releasing the lock, we need to set this
+ * bit before looking at the lock, hence the reason this is a transitional
+ * state.
+ */
+
+static void
+rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+ unsigned long mask)
+{
+ unsigned long val = (unsigned long)owner | mask;
+
+ if (rt_mutex_has_waiters(lock))
+ val |= RT_MUTEX_HAS_WAITERS;
+
+ lock->owner = (struct task_struct *)val;
+}
+
+static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ if (!rt_mutex_has_waiters(lock))
+ clear_rt_mutex_waiters(lock);
+}
+
+/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ do {
+ owner = *p;
+ } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n) (0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
+
+/*
+ * Calculate task priority from the waiter list priority
+ *
+ * Return task->normal_prio when the waiter list is empty or when
+ * the waiter is not allowed to do priority boosting
+ */
+int rt_mutex_getprio(struct task_struct *task)
+{
+ if (likely(!task_has_pi_waiters(task)))
+ return task->normal_prio;
+
+ return min(task_top_pi_waiter(task)->pi_list_entry.prio,
+ task->normal_prio);
+}
+
+/*
+ * Adjust the priority of a task, after its pi_waiters got modified.
+ *
+ * This can be both boosting and unboosting. task->pi_lock must be held.
+ */
+static void __rt_mutex_adjust_prio(struct task_struct *task)
+{
+ int prio = rt_mutex_getprio(task);
+
+ if (task->prio != prio)
+ rt_mutex_setprio(task, prio);
+}
+
+/*
+ * Adjust task priority (undo boosting). Called from the exit path of
+ * rt_mutex_slowunlock() and rt_mutex_slowlock().
+ *
+ * (Note: We do this outside of the protection of lock->wait_lock to
+ * allow the lock to be taken while or before we readjust the priority
+ * of task. We do not use the spin_xx_mutex() variants here as we are
+ * outside of the debug path.)
+ */
+static void rt_mutex_adjust_prio(struct task_struct *task)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task->pi_lock, flags);
+ __rt_mutex_adjust_prio(task);
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+}
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+/*
+ * Adjust the priority chain. Also used for deadlock detection.
+ * Decreases task's usage by one - may thus free the task.
+ * Returns 0 or -EDEADLK.
+ */
+static int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
+{
+ struct rt_mutex *lock;
+ struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
+ int detect_deadlock, ret = 0, depth = 0;
+ unsigned long flags;
+
+ detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
+ deadlock_detect);
+
+ /*
+ * The (de)boosting is a step by step approach with a lot of
+ * pitfalls. We want this to be preemptible and we want hold a
+ * maximum of two locks per step. So we have to check
+ * carefully whether things change under us.
+ */
+ again:
+ if (++depth > max_lock_depth) {
+ static int prev_max;
+
+ /*
+ * Print this only once. If the admin changes the limit,
+ * print a new message when reaching the limit again.
+ */
+ if (prev_max != max_lock_depth) {
+ prev_max = max_lock_depth;
+ printk(KERN_WARNING "Maximum lock depth %d reached "
+ "task: %s (%d)\n", max_lock_depth,
+ top_task->comm, top_task->pid);
+ }
+ put_task_struct(task);
+
+ return deadlock_detect ? -EDEADLK : 0;
+ }
+ retry:
+ /*
+ * Task can not go away as we did a get_task() before !
+ */
+ spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ /*
+ * Check whether the end of the boosting chain has been
+ * reached or the state of the chain has changed while we
+ * dropped the locks.
+ */
+ if (!waiter || !waiter->task)
+ goto out_unlock_pi;
+
+ if (top_waiter && (!task_has_pi_waiters(task) ||
+ top_waiter != task_top_pi_waiter(task)))
+ goto out_unlock_pi;
+
+ /*
+ * When deadlock detection is off then we check, if further
+ * priority adjustment is necessary.
+ */
+ if (!detect_deadlock && waiter->list_entry.prio == task->prio)
+ goto out_unlock_pi;
+
+ lock = waiter->lock;
+ if (!spin_trylock(&lock->wait_lock)) {
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+ cpu_relax();
+ goto retry;
+ }
+
+ /* Deadlock detection */
+ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
+ debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
+ spin_unlock(&lock->wait_lock);
+ ret = deadlock_detect ? -EDEADLK : 0;
+ goto out_unlock_pi;
+ }
+
+ top_waiter = rt_mutex_top_waiter(lock);
+
+ /* Requeue the waiter */
+ plist_del(&waiter->list_entry, &lock->wait_list);
+ waiter->list_entry.prio = task->prio;
+ plist_add(&waiter->list_entry, &lock->wait_list);
+
+ /* Release the task */
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+ put_task_struct(task);
+
+ /* Grab the next task */
+ task = rt_mutex_owner(lock);
+ get_task_struct(task);
+ spin_lock_irqsave(&task->pi_lock, flags);
+
+ if (waiter == rt_mutex_top_waiter(lock)) {
+ /* Boost the owner */
+ plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
+ waiter->pi_list_entry.prio = waiter->list_entry.prio;
+ plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+ __rt_mutex_adjust_prio(task);
+
+ } else if (top_waiter == waiter) {
+ /* Deboost the owner */
+ plist_del(&waiter->pi_list_entry, &task->pi_waiters);
+ waiter = rt_mutex_top_waiter(lock);
+ waiter->pi_list_entry.prio = waiter->list_entry.prio;
+ plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+ __rt_mutex_adjust_prio(task);
+ }
+
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ top_waiter = rt_mutex_top_waiter(lock);
+ spin_unlock(&lock->wait_lock);
+
+ if (!detect_deadlock && waiter != top_waiter)
+ goto out_put_task;
+
+ goto again;
+
+ out_unlock_pi:
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+ out_put_task:
+ put_task_struct(task);
+
+ return ret;
+}
+
+/*
+ * Optimization: check if we can steal the lock from the
+ * assigned pending owner [which might not have taken the
+ * lock yet]:
+ */
+static inline int try_to_steal_lock(struct rt_mutex *lock)
+{
+ struct task_struct *pendowner = rt_mutex_owner(lock);
+ struct rt_mutex_waiter *next;
+ unsigned long flags;
+
+ if (!rt_mutex_owner_pending(lock))
+ return 0;
+
+ if (pendowner == current)
+ return 1;
+
+ spin_lock_irqsave(&pendowner->pi_lock, flags);
+ if (current->prio >= pendowner->prio) {
+ spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+ return 0;
+ }
+
+ /*
+ * Check if a waiter is enqueued on the pending owners
+ * pi_waiters list. Remove it and readjust pending owners
+ * priority.
+ */
+ if (likely(!rt_mutex_has_waiters(lock))) {
+ spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+ return 1;
+ }
+
+ /* No chain handling, pending owner is not blocked on anything: */
+ next = rt_mutex_top_waiter(lock);
+ plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
+ __rt_mutex_adjust_prio(pendowner);
+ spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+
+ /*
+ * We are going to steal the lock and a waiter was
+ * enqueued on the pending owners pi_waiters queue. So
+ * we have to enqueue this waiter into
+ * current->pi_waiters list. This covers the case,
+ * where current is boosted because it holds another
+ * lock and gets unboosted because the booster is
+ * interrupted, so we would delay a waiter with higher
+ * priority as current->normal_prio.
+ *
+ * Note: in the rare case of a SCHED_OTHER task changing
+ * its priority and thus stealing the lock, next->task
+ * might be current:
+ */
+ if (likely(next->task != current)) {
+ spin_lock_irqsave(&current->pi_lock, flags);
+ plist_add(&next->pi_list_entry, &current->pi_waiters);
+ __rt_mutex_adjust_prio(current);
+ spin_unlock_irqrestore(&current->pi_lock, flags);
+ }
+ return 1;
+}
+
+/*
+ * Try to take an rt-mutex
+ *
+ * This fails
+ * - when the lock has a real owner
+ * - when a different pending owner exists and has higher priority than current
+ *
+ * Must be called with lock->wait_lock held.
+ */
+static int try_to_take_rt_mutex(struct rt_mutex *lock)
+{
+ /*
+ * We have to be careful here if the atomic speedups are
+ * enabled, such that, when
+ * - no other waiter is on the lock
+ * - the lock has been released since we did the cmpxchg
+ * the lock can be released or taken while we are doing the
+ * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
+ *
+ * The atomic acquire/release aware variant of
+ * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
+ * the WAITERS bit, the atomic release / acquire can not
+ * happen anymore and lock->wait_lock protects us from the
+ * non-atomic case.
+ *
+ * Note, that this might set lock->owner =
+ * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
+ * any more. This is fixed up when we take the ownership.
+ * This is the transitional state explained at the top of this file.
+ */
+ mark_rt_mutex_waiters(lock);
+
+ if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+ return 0;
+
+ /* We got the lock. */
+ debug_rt_mutex_lock(lock);
+
+ rt_mutex_set_owner(lock, current, 0);
+
+ rt_mutex_deadlock_account_lock(lock, current);
+
+ return 1;
+}
+
+/*
+ * Task blocks on lock.
+ *
+ * Prepare waiter and propagate pi chain
+ *
+ * This must be called with lock->wait_lock held.
+ */
+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ int detect_deadlock)
+{
+ struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex_waiter *top_waiter = waiter;
+ unsigned long flags;
+ int chain_walk = 0, res;
+
+ spin_lock_irqsave(&current->pi_lock, flags);
+ __rt_mutex_adjust_prio(current);
+ waiter->task = current;
+ waiter->lock = lock;
+ plist_node_init(&waiter->list_entry, current->prio);
+ plist_node_init(&waiter->pi_list_entry, current->prio);
+
+ /* Get the top priority waiter on the lock */
+ if (rt_mutex_has_waiters(lock))
+ top_waiter = rt_mutex_top_waiter(lock);
+ plist_add(&waiter->list_entry, &lock->wait_list);
+
+ current->pi_blocked_on = waiter;
+
+ spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ if (waiter == rt_mutex_top_waiter(lock)) {
+ spin_lock_irqsave(&owner->pi_lock, flags);
+ plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+ plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+
+ __rt_mutex_adjust_prio(owner);
+ if (owner->pi_blocked_on)
+ chain_walk = 1;
+ spin_unlock_irqrestore(&owner->pi_lock, flags);
+ }
+ else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
+ chain_walk = 1;
+
+ if (!chain_walk)
+ return 0;
+
+ /*
+ * The owner can't disappear while holding a lock,
+ * so the owner struct is protected by wait_lock.
+ * Gets dropped in rt_mutex_adjust_prio_chain()!
+ */
+ get_task_struct(owner);
+
+ spin_unlock(&lock->wait_lock);
+
+ res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
+ current);
+
+ spin_lock(&lock->wait_lock);
+
+ return res;
+}
+
+/*
+ * Wake up the next waiter on the lock.
+ *
+ * Remove the top waiter from the current tasks waiter list and from
+ * the lock waiter list. Set it as pending owner. Then wake it up.
+ *
+ * Called with lock->wait_lock held.
+ */
+static void wakeup_next_waiter(struct rt_mutex *lock)
+{
+ struct rt_mutex_waiter *waiter;
+ struct task_struct *pendowner;
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->pi_lock, flags);
+
+ waiter = rt_mutex_top_waiter(lock);
+ plist_del(&waiter->list_entry, &lock->wait_list);
+
+ /*
+ * Remove it from current->pi_waiters. We do not adjust a
+ * possible priority boost right now. We execute wakeup in the
+ * boosted mode and go back to normal after releasing
+ * lock->wait_lock.
+ */
+ plist_del(&waiter->pi_list_entry, &current->pi_waiters);
+ pendowner = waiter->task;
+ waiter->task = NULL;
+
+ rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
+
+ spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ /*
+ * Clear the pi_blocked_on variable and enqueue a possible
+ * waiter into the pi_waiters list of the pending owner. This
+ * prevents that in case the pending owner gets unboosted a
+ * waiter with higher priority than pending-owner->normal_prio
+ * is blocked on the unboosted (pending) owner.
+ */
+ spin_lock_irqsave(&pendowner->pi_lock, flags);
+
+ WARN_ON(!pendowner->pi_blocked_on);
+ WARN_ON(pendowner->pi_blocked_on != waiter);
+ WARN_ON(pendowner->pi_blocked_on->lock != lock);
+
+ pendowner->pi_blocked_on = NULL;
+
+ if (rt_mutex_has_waiters(lock)) {
+ struct rt_mutex_waiter *next;
+
+ next = rt_mutex_top_waiter(lock);
+ plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
+ }
+ spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+
+ wake_up_process(pendowner);
+}
+
+/*
+ * Remove a waiter from a lock
+ *
+ * Must be called with lock->wait_lock held
+ */
+static void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ int first = (waiter == rt_mutex_top_waiter(lock));
+ struct task_struct *owner = rt_mutex_owner(lock);
+ unsigned long flags;
+ int chain_walk = 0;
+
+ spin_lock_irqsave(&current->pi_lock, flags);
+ plist_del(&waiter->list_entry, &lock->wait_list);
+ waiter->task = NULL;
+ current->pi_blocked_on = NULL;
+ spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ if (first && owner != current) {
+
+ spin_lock_irqsave(&owner->pi_lock, flags);
+
+ plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
+
+ if (rt_mutex_has_waiters(lock)) {
+ struct rt_mutex_waiter *next;
+
+ next = rt_mutex_top_waiter(lock);
+ plist_add(&next->pi_list_entry, &owner->pi_waiters);
+ }
+ __rt_mutex_adjust_prio(owner);
+
+ if (owner->pi_blocked_on)
+ chain_walk = 1;
+
+ spin_unlock_irqrestore(&owner->pi_lock, flags);
+ }
+
+ WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+
+ if (!chain_walk)
+ return;
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
+
+ spin_unlock(&lock->wait_lock);
+
+ rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
+
+ spin_lock(&lock->wait_lock);
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void rt_mutex_adjust_pi(struct task_struct *task)
+{
+ struct rt_mutex_waiter *waiter;
+ unsigned long flags;
+
+ spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ if (!waiter || waiter->list_entry.prio == task->prio) {
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
+ rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock)
+{
+ struct rt_mutex_waiter waiter;
+ int ret = 0;
+
+ debug_rt_mutex_init_waiter(&waiter);
+ waiter.task = NULL;
+
+ spin_lock(&lock->wait_lock);
+
+ /* Try to acquire the lock again: */
+ if (try_to_take_rt_mutex(lock)) {
+ spin_unlock(&lock->wait_lock);
+ return 0;
+ }
+
+ set_current_state(state);
+
+ /* Setup the timer, when timeout != NULL */
+ if (unlikely(timeout))
+ hrtimer_start(&timeout->timer, timeout->timer.expires,
+ HRTIMER_ABS);
+
+ for (;;) {
+ /* Try to acquire the lock: */
+ if (try_to_take_rt_mutex(lock))
+ break;
+
+ /*
+ * TASK_INTERRUPTIBLE checks for signals and
+ * timeout. Ignored otherwise.
+ */
+ if (unlikely(state == TASK_INTERRUPTIBLE)) {
+ /* Signal pending? */
+ if (signal_pending(current))
+ ret = -EINTR;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ if (ret)
+ break;
+ }
+
+ /*
+ * waiter.task is NULL the first time we come here and
+ * when we have been woken up by the previous owner
+ * but the lock got stolen by a higher prio task.
+ */
+ if (!waiter.task) {
+ ret = task_blocks_on_rt_mutex(lock, &waiter,
+ detect_deadlock);
+ /*
+ * If we got woken up by the owner then start loop
+ * all over without going into schedule to try
+ * to get the lock now:
+ */
+ if (unlikely(!waiter.task))
+ continue;
+
+ if (unlikely(ret))
+ break;
+ }
+
+ spin_unlock(&lock->wait_lock);
+
+ debug_rt_mutex_print_deadlock(&waiter);
+
+ if (waiter.task)
+ schedule_rt_mutex(lock);
+
+ spin_lock(&lock->wait_lock);
+ set_current_state(state);
+ }
+
+ set_current_state(TASK_RUNNING);
+
+ if (unlikely(waiter.task))
+ remove_waiter(lock, &waiter);
+
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ spin_unlock(&lock->wait_lock);
+
+ /* Remove pending timer: */
+ if (unlikely(timeout))
+ hrtimer_cancel(&timeout->timer);
+
+ /*
+ * Readjust priority, when we did not get the lock. We might
+ * have been the pending owner and boosted. Since we did not
+ * take the lock, the PI boost has to go.
+ */
+ if (unlikely(ret))
+ rt_mutex_adjust_prio(current);
+
+ debug_rt_mutex_free_waiter(&waiter);
+
+ return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static inline int
+rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ int ret = 0;
+
+ spin_lock(&lock->wait_lock);
+
+ if (likely(rt_mutex_owner(lock) != current)) {
+
+ ret = try_to_take_rt_mutex(lock);
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters
+ * bit unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+ }
+
+ spin_unlock(&lock->wait_lock);
+
+ return ret;
+}
+
+/*
+ * Slow path to release a rt-mutex:
+ */
+static void __sched
+rt_mutex_slowunlock(struct rt_mutex *lock)
+{
+ spin_lock(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ rt_mutex_deadlock_account_unlock(current);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ spin_unlock(&lock->wait_lock);
+ return;
+ }
+
+ wakeup_next_waiter(lock);
+
+ spin_unlock(&lock->wait_lock);
+
+ /* Undo pi boosting if necessary: */
+ rt_mutex_adjust_prio(current);
+}
+
+/*
+ * debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static inline int
+rt_mutex_fastlock(struct rt_mutex *lock, int state,
+ int detect_deadlock,
+ int (*slowfn)(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock))
+{
+ if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 0;
+ } else
+ return slowfn(lock, state, NULL, detect_deadlock);
+}
+
+static inline int
+rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout, int detect_deadlock,
+ int (*slowfn)(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock))
+{
+ if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 0;
+ } else
+ return slowfn(lock, state, timeout, detect_deadlock);
+}
+
+static inline int
+rt_mutex_fasttrylock(struct rt_mutex *lock,
+ int (*slowfn)(struct rt_mutex *lock))
+{
+ if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 1;
+ }
+ return slowfn(lock);
+}
+
+static inline void
+rt_mutex_fastunlock(struct rt_mutex *lock,
+ void (*slowfn)(struct rt_mutex *lock))
+{
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+ rt_mutex_deadlock_account_unlock(current);
+ else
+ slowfn(lock);
+}
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ might_sleep();
+
+ rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ * @detect_deadlock: deadlock detection on/off
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
+ int detect_deadlock)
+{
+ might_sleep();
+
+ return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
+ detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible
+ * the timeout structure is provided
+ * by the caller
+ *
+ * @lock: the rt_mutex to be locked
+ * @timeout: timeout structure or NULL (no timeout)
+ * @detect_deadlock: deadlock detection on/off
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ * -ETIMEOUT when the timeout expired
+ * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ */
+int
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
+ int detect_deadlock)
+{
+ might_sleep();
+
+ return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+ detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns 1 on success and 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/***
+ * rt_mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void rt_mutex_destroy(struct rt_mutex *lock)
+{
+ WARN_ON(rt_mutex_is_locked(lock));
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ lock->magic = NULL;
+#endif
+}
+
+EXPORT_SYMBOL_GPL(rt_mutex_destroy);
+
+/**
+ * __rt_mutex_init - initialize the rt lock
+ *
+ * @lock: the rt lock to be initialized
+ *
+ * Initialize the rt lock to unlocked state.
+ *
+ * Initializing of a locked rt lock is not allowed
+ */
+void __rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+ lock->owner = NULL;
+ spin_lock_init(&lock->wait_lock);
+ plist_head_init(&lock->wait_list, &lock->wait_lock);
+
+ debug_rt_mutex_init(lock, name);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ * proxy owner
+ *
+ * @lock: the rt_mutex to be locked
+ * @proxy_owner:the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ * Special API call for PI-futex support
+ */
+void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner)
+{
+ __rt_mutex_init(lock, NULL);
+ debug_rt_mutex_proxy_lock(lock, proxy_owner);
+ rt_mutex_set_owner(lock, proxy_owner, 0);
+ rt_mutex_deadlock_account_lock(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * No locking. Caller has to do serializing itself
+ * Special API call for PI-futex support
+ */
+void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+ struct task_struct *proxy_owner)
+{
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_set_owner(lock, NULL, 0);
+ rt_mutex_deadlock_account_unlock(proxy_owner);
+}
+
+/**
+ * rt_mutex_next_owner - return the next owner of the lock
+ *
+ * @lock: the rt lock query
+ *
+ * Returns the next owner of the lock or NULL
+ *
+ * Caller has to serialize against other accessors to the lock
+ * itself.
+ *
+ * Special API call for PI-futex support
+ */
+struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
+{
+ if (!rt_mutex_has_waiters(lock))
+ return NULL;
+
+ return rt_mutex_top_waiter(lock)->task;
+}
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644
index 000000000000..a1a1dd06421d
--- /dev/null
+++ b/kernel/rtmutex.h
@@ -0,0 +1,26 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c.
+ * Non-debug version.
+ */
+
+#define rt_mutex_deadlock_check(l) (0)
+#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
+#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
+#define debug_rt_mutex_init_waiter(w) do { } while (0)
+#define debug_rt_mutex_free_waiter(w) do { } while (0)
+#define debug_rt_mutex_lock(l) do { } while (0)
+#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
+#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
+#define debug_rt_mutex_unlock(l) do { } while (0)
+#define debug_rt_mutex_init(m, n) do { } while (0)
+#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
+#define debug_rt_mutex_print_deadlock(w) do { } while (0)
+#define debug_rt_mutex_detect_deadlock(w,d) (d)
+#define debug_rt_mutex_reset_waiter(w) do { } while (0)
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644
index 000000000000..9c75856e791e
--- /dev/null
+++ b/kernel/rtmutex_common.h
@@ -0,0 +1,123 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the private data structure and API definitions.
+ */
+
+#ifndef __KERNEL_RTMUTEX_COMMON_H
+#define __KERNEL_RTMUTEX_COMMON_H
+
+#include <linux/rtmutex.h>
+
+/*
+ * The rtmutex in kernel tester is independent of rtmutex debugging. We
+ * call schedule_rt_mutex_test() instead of schedule() for the tasks which
+ * belong to the tester. That way we can delay the wakeup path of those
+ * threads to provoke lock stealing and testing of complex boosting scenarios.
+ */
+#ifdef CONFIG_RT_MUTEX_TESTER
+
+extern void schedule_rt_mutex_test(struct rt_mutex *lock);
+
+#define schedule_rt_mutex(_lock) \
+ do { \
+ if (!(current->flags & PF_MUTEX_TESTER)) \
+ schedule(); \
+ else \
+ schedule_rt_mutex_test(_lock); \
+ } while (0)
+
+#else
+# define schedule_rt_mutex(_lock) schedule()
+#endif
+
+/*
+ * This is the control structure for tasks blocked on a rt_mutex,
+ * which is allocated on the kernel stack on of the blocked task.
+ *
+ * @list_entry: pi node to enqueue into the mutex waiters list
+ * @pi_list_entry: pi node to enqueue into the mutex owner waiters list
+ * @task: task reference to the blocked task
+ */
+struct rt_mutex_waiter {
+ struct plist_node list_entry;
+ struct plist_node pi_list_entry;
+ struct task_struct *task;
+ struct rt_mutex *lock;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ unsigned long ip;
+ pid_t deadlock_task_pid;
+ struct rt_mutex *deadlock_lock;
+#endif
+};
+
+/*
+ * Various helpers to access the waiters-plist:
+ */
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return !plist_head_empty(&lock->wait_list);
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+ struct rt_mutex_waiter *w;
+
+ w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
+ list_entry);
+ BUG_ON(w->lock != lock);
+
+ return w;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+ return !plist_head_empty(&p->pi_waiters);
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+ return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
+ pi_list_entry);
+}
+
+/*
+ * lock->owner state tracking:
+ */
+#define RT_MUTEX_OWNER_PENDING 1UL
+#define RT_MUTEX_HAS_WAITERS 2UL
+#define RT_MUTEX_OWNER_MASKALL 3UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+{
+ return (struct task_struct *)
+ ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+}
+
+static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
+{
+ return (struct task_struct *)
+ ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
+{
+ return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
+}
+
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
+extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+#endif
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
new file mode 100644
index 000000000000..291ded556aa0
--- /dev/null
+++ b/kernel/rwsem.c
@@ -0,0 +1,147 @@
+/* kernel/rwsem.c: R/W semaphores, public implementation
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ * Derived from asm-i386/semaphore.h
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rwsem.h>
+
+#include <asm/system.h>
+#include <asm/atomic.h>
+
+/*
+ * lock for reading
+ */
+void down_read(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read);
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int down_read_trylock(struct rw_semaphore *sem)
+{
+ int ret = __down_read_trylock(sem);
+
+ if (ret == 1)
+ rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+
+EXPORT_SYMBOL(down_read_trylock);
+
+/*
+ * lock for writing
+ */
+void down_write(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
+ __down_write(sem);
+}
+
+EXPORT_SYMBOL(down_write);
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+int down_write_trylock(struct rw_semaphore *sem)
+{
+ int ret = __down_write_trylock(sem);
+
+ if (ret == 1)
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+ return ret;
+}
+
+EXPORT_SYMBOL(down_write_trylock);
+
+/*
+ * release a read lock
+ */
+void up_read(struct rw_semaphore *sem)
+{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read);
+
+/*
+ * release a write lock
+ */
+void up_write(struct rw_semaphore *sem)
+{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
+ __up_write(sem);
+}
+
+EXPORT_SYMBOL(up_write);
+
+/*
+ * downgrade write lock to read lock
+ */
+void downgrade_write(struct rw_semaphore *sem)
+{
+ /*
+ * lockdep: a downgraded write will live on as a write
+ * dependency.
+ */
+ __downgrade_write(sem);
+}
+
+EXPORT_SYMBOL(downgrade_write);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void down_read_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read_nested);
+
+void down_read_non_owner(struct rw_semaphore *sem)
+{
+ might_sleep();
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read_non_owner);
+
+void down_write_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ __down_write_nested(sem, subclass);
+}
+
+EXPORT_SYMBOL(down_write_nested);
+
+void up_read_non_owner(struct rw_semaphore *sem)
+{
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read_non_owner);
+
+#endif
+
+
diff --git a/kernel/sched.c b/kernel/sched.c
index 5dbc42694477..74f169ac0773 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -30,6 +30,7 @@
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
+#include <linux/debug_locks.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
@@ -50,6 +51,7 @@
#include <linux/times.h>
#include <linux/acct.h>
#include <linux/kprobes.h>
+#include <linux/delayacct.h>
#include <asm/tlb.h>
#include <asm/unistd.h>
@@ -168,29 +170,28 @@
*/
#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
+ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
-static unsigned int task_timeslice(task_t *p)
+static unsigned int static_prio_timeslice(int static_prio)
{
- if (p->static_prio < NICE_TO_PRIO(0))
- return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
+ if (static_prio < NICE_TO_PRIO(0))
+ return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
else
- return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
+ return SCALE_PRIO(DEF_TIMESLICE, static_prio);
+}
+
+static inline unsigned int task_timeslice(struct task_struct *p)
+{
+ return static_prio_timeslice(p->static_prio);
}
-#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
- < (long long) (sd)->cache_hot_time)
/*
* These are the runqueue data structures:
*/
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
-
-typedef struct runqueue runqueue_t;
-
struct prio_array {
unsigned int nr_active;
- unsigned long bitmap[BITMAP_SIZE];
+ DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
struct list_head queue[MAX_PRIO];
};
@@ -201,7 +202,7 @@ struct prio_array {
* (such as the load balancing or the thread migration code), lock
* acquire operations must be ordered by ascending &runqueue.
*/
-struct runqueue {
+struct rq {
spinlock_t lock;
/*
@@ -209,6 +210,7 @@ struct runqueue {
* remote CPUs use both these fields when doing load calculation.
*/
unsigned long nr_running;
+ unsigned long raw_weighted_load;
#ifdef CONFIG_SMP
unsigned long cpu_load[3];
#endif
@@ -224,9 +226,9 @@ struct runqueue {
unsigned long expired_timestamp;
unsigned long long timestamp_last_tick;
- task_t *curr, *idle;
+ struct task_struct *curr, *idle;
struct mm_struct *prev_mm;
- prio_array_t *active, *expired, arrays[2];
+ struct prio_array *active, *expired, arrays[2];
int best_expired_prio;
atomic_t nr_iowait;
@@ -236,10 +238,10 @@ struct runqueue {
/* For active balancing */
int active_balance;
int push_cpu;
+ int cpu; /* cpu of this runqueue */
- task_t *migration_thread;
+ struct task_struct *migration_thread;
struct list_head migration_queue;
- int cpu;
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -261,9 +263,19 @@ struct runqueue {
unsigned long ttwu_cnt;
unsigned long ttwu_local;
#endif
+ struct lock_class_key rq_lock_key;
};
-static DEFINE_PER_CPU(struct runqueue, runqueues);
+static DEFINE_PER_CPU(struct rq, runqueues);
+
+static inline int cpu_of(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ return rq->cpu;
+#else
+ return 0;
+#endif
+}
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -272,8 +284,8 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
*/
-#define for_each_domain(cpu, domain) \
-for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
+#define for_each_domain(cpu, __sd) \
+ for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
@@ -288,26 +300,33 @@ for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
#endif
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
-static inline int task_running(runqueue_t *rq, task_t *p)
+static inline int task_running(struct rq *rq, struct task_struct *p)
{
return rq->curr == p;
}
-static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
-static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
#endif
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+
spin_unlock_irq(&rq->lock);
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(runqueue_t *rq, task_t *p)
+static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
return p->oncpu;
@@ -316,7 +335,7 @@ static inline int task_running(runqueue_t *rq, task_t *p)
#endif
}
-static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
/*
@@ -333,7 +352,7 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
#endif
}
-static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
@@ -351,14 +370,33 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called interrupts disabled.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+repeat_lock_task:
+ rq = task_rq(p);
+ spin_lock(&rq->lock);
+ if (unlikely(rq != task_rq(p))) {
+ spin_unlock(&rq->lock);
+ goto repeat_lock_task;
+ }
+ return rq;
+}
+
+/*
* task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely lookup the task_rq without
* explicitly disabling preemption.
*/
-static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
__acquires(rq->lock)
{
- struct runqueue *rq;
+ struct rq *rq;
repeat_lock_task:
local_irq_save(*flags);
@@ -371,7 +409,13 @@ repeat_lock_task:
return rq;
}
-static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
+static inline void __task_rq_unlock(struct rq *rq)
+ __releases(rq->lock)
+{
+ spin_unlock(&rq->lock);
+}
+
+static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
__releases(rq->lock)
{
spin_unlock_irqrestore(&rq->lock, *flags);
@@ -391,7 +435,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
seq_printf(seq, "timestamp %lu\n", jiffies);
for_each_online_cpu(cpu) {
- runqueue_t *rq = cpu_rq(cpu);
+ struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcnt = 0;
@@ -468,9 +512,36 @@ struct file_operations proc_schedstat_operations = {
.release = single_release,
};
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
+{
+ if (rq) {
+ rq->rq_sched_info.run_delay += delta_jiffies;
+ rq->rq_sched_info.pcnt++;
+ }
+}
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
+{
+ if (rq)
+ rq->rq_sched_info.cpu_time += delta_jiffies;
+}
# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
#else /* !CONFIG_SCHEDSTATS */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
+{}
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
+{}
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
#endif
@@ -478,10 +549,10 @@ struct file_operations proc_schedstat_operations = {
/*
* rq_lock - lock a given runqueue and disable interrupts.
*/
-static inline runqueue_t *this_rq_lock(void)
+static inline struct rq *this_rq_lock(void)
__acquires(rq->lock)
{
- runqueue_t *rq;
+ struct rq *rq;
local_irq_disable();
rq = this_rq();
@@ -490,7 +561,7 @@ static inline runqueue_t *this_rq_lock(void)
return rq;
}
-#ifdef CONFIG_SCHEDSTATS
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
/*
* Called when a process is dequeued from the active array and given
* the cpu. We should note that with the exception of interactive
@@ -506,7 +577,7 @@ static inline runqueue_t *this_rq_lock(void)
* long it was from the *first* time it was queued to the time that it
* finally hit a cpu.
*/
-static inline void sched_info_dequeued(task_t *t)
+static inline void sched_info_dequeued(struct task_struct *t)
{
t->sched_info.last_queued = 0;
}
@@ -516,23 +587,18 @@ static inline void sched_info_dequeued(task_t *t)
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is.
*/
-static void sched_info_arrive(task_t *t)
+static void sched_info_arrive(struct task_struct *t)
{
- unsigned long now = jiffies, diff = 0;
- struct runqueue *rq = task_rq(t);
+ unsigned long now = jiffies, delta_jiffies = 0;
if (t->sched_info.last_queued)
- diff = now - t->sched_info.last_queued;
+ delta_jiffies = now - t->sched_info.last_queued;
sched_info_dequeued(t);
- t->sched_info.run_delay += diff;
+ t->sched_info.run_delay += delta_jiffies;
t->sched_info.last_arrival = now;
t->sched_info.pcnt++;
- if (!rq)
- return;
-
- rq->rq_sched_info.run_delay += diff;
- rq->rq_sched_info.pcnt++;
+ rq_sched_info_arrive(task_rq(t), delta_jiffies);
}
/*
@@ -550,25 +616,23 @@ static void sched_info_arrive(task_t *t)
* the timestamp if it is already not set. It's assumed that
* sched_info_dequeued() will clear that stamp when appropriate.
*/
-static inline void sched_info_queued(task_t *t)
+static inline void sched_info_queued(struct task_struct *t)
{
- if (!t->sched_info.last_queued)
- t->sched_info.last_queued = jiffies;
+ if (unlikely(sched_info_on()))
+ if (!t->sched_info.last_queued)
+ t->sched_info.last_queued = jiffies;
}
/*
* Called when a process ceases being the active-running process, either
* voluntarily or involuntarily. Now we can calculate how long we ran.
*/
-static inline void sched_info_depart(task_t *t)
+static inline void sched_info_depart(struct task_struct *t)
{
- struct runqueue *rq = task_rq(t);
- unsigned long diff = jiffies - t->sched_info.last_arrival;
-
- t->sched_info.cpu_time += diff;
+ unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
- if (rq)
- rq->rq_sched_info.cpu_time += diff;
+ t->sched_info.cpu_time += delta_jiffies;
+ rq_sched_info_depart(task_rq(t), delta_jiffies);
}
/*
@@ -576,9 +640,10 @@ static inline void sched_info_depart(task_t *t)
* their time slice. (This may also be called when switching to or from
* the idle task.) We are only called when prev != next.
*/
-static inline void sched_info_switch(task_t *prev, task_t *next)
+static inline void
+__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
- struct runqueue *rq = task_rq(prev);
+ struct rq *rq = task_rq(prev);
/*
* prev now departs the cpu. It's not interesting to record
@@ -591,15 +656,21 @@ static inline void sched_info_switch(task_t *prev, task_t *next)
if (next != rq->idle)
sched_info_arrive(next);
}
+static inline void
+sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+ if (unlikely(sched_info_on()))
+ __sched_info_switch(prev, next);
+}
#else
#define sched_info_queued(t) do { } while (0)
#define sched_info_switch(t, next) do { } while (0)
-#endif /* CONFIG_SCHEDSTATS */
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
/*
* Adding/removing a task to/from a priority array:
*/
-static void dequeue_task(struct task_struct *p, prio_array_t *array)
+static void dequeue_task(struct task_struct *p, struct prio_array *array)
{
array->nr_active--;
list_del(&p->run_list);
@@ -607,7 +678,7 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array)
__clear_bit(p->prio, array->bitmap);
}
-static void enqueue_task(struct task_struct *p, prio_array_t *array)
+static void enqueue_task(struct task_struct *p, struct prio_array *array)
{
sched_info_queued(p);
list_add_tail(&p->run_list, array->queue + p->prio);
@@ -620,12 +691,13 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array)
* Put task to the end of the run list without the overhead of dequeue
* followed by enqueue.
*/
-static void requeue_task(struct task_struct *p, prio_array_t *array)
+static void requeue_task(struct task_struct *p, struct prio_array *array)
{
list_move_tail(&p->run_list, array->queue + p->prio);
}
-static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
+static inline void
+enqueue_task_head(struct task_struct *p, struct prio_array *array)
{
list_add(&p->run_list, array->queue + p->prio);
__set_bit(p->prio, array->bitmap);
@@ -634,7 +706,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
}
/*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
* priority but is modified by bonuses/penalties.
*
* We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -647,13 +719,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
*
* Both properties are important to certain workloads.
*/
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(struct task_struct *p)
{
int bonus, prio;
- if (rt_task(p))
- return p->prio;
-
bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
prio = p->static_prio - bonus;
@@ -665,57 +735,165 @@ static int effective_prio(task_t *p)
}
/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
+ */
+
+/*
+ * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
+ * If static_prio_timeslice() is ever changed to break this assumption then
+ * this code will need modification
+ */
+#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
+#define LOAD_WEIGHT(lp) \
+ (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
+#define PRIO_TO_LOAD_WEIGHT(prio) \
+ LOAD_WEIGHT(static_prio_timeslice(prio))
+#define RTPRIO_TO_LOAD_WEIGHT(rp) \
+ (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
+
+static void set_load_weight(struct task_struct *p)
+{
+ if (has_rt_policy(p)) {
+#ifdef CONFIG_SMP
+ if (p == task_rq(p)->migration_thread)
+ /*
+ * The migration thread does the actual balancing.
+ * Giving its load any weight will skew balancing
+ * adversely.
+ */
+ p->load_weight = 0;
+ else
+#endif
+ p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+ } else
+ p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+}
+
+static inline void
+inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
+{
+ rq->raw_weighted_load += p->load_weight;
+}
+
+static inline void
+dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
+{
+ rq->raw_weighted_load -= p->load_weight;
+}
+
+static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
+{
+ rq->nr_running++;
+ inc_raw_weighted_load(rq, p);
+}
+
+static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
+{
+ rq->nr_running--;
+ dec_raw_weighted_load(rq, p);
+}
+
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(struct task_struct *p)
+{
+ int prio;
+
+ if (has_rt_policy(p))
+ prio = MAX_RT_PRIO-1 - p->rt_priority;
+ else
+ prio = __normal_prio(p);
+ return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
+
+/*
* __activate_task - move a task to the runqueue.
*/
-static void __activate_task(task_t *p, runqueue_t *rq)
+static void __activate_task(struct task_struct *p, struct rq *rq)
{
- prio_array_t *target = rq->active;
+ struct prio_array *target = rq->active;
if (batch_task(p))
target = rq->expired;
enqueue_task(p, target);
- rq->nr_running++;
+ inc_nr_running(p, rq);
}
/*
* __activate_idle_task - move idle task to the _front_ of runqueue.
*/
-static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
{
enqueue_task_head(p, rq->active);
- rq->nr_running++;
+ inc_nr_running(p, rq);
}
-static int recalc_task_prio(task_t *p, unsigned long long now)
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
+static int recalc_task_prio(struct task_struct *p, unsigned long long now)
{
/* Caller must always ensure 'now >= p->timestamp' */
- unsigned long long __sleep_time = now - p->timestamp;
- unsigned long sleep_time;
+ unsigned long sleep_time = now - p->timestamp;
if (batch_task(p))
sleep_time = 0;
- else {
- if (__sleep_time > NS_MAX_SLEEP_AVG)
- sleep_time = NS_MAX_SLEEP_AVG;
- else
- sleep_time = (unsigned long)__sleep_time;
- }
if (likely(sleep_time > 0)) {
/*
- * User tasks that sleep a long time are categorised as
- * idle. They will only have their sleep_avg increased to a
- * level that makes them just interactive priority to stay
- * active yet prevent them suddenly becoming cpu hogs and
- * starving other processes.
+ * This ceiling is set to the lowest priority that would allow
+ * a task to be reinserted into the active array on timeslice
+ * completion.
*/
- if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
- unsigned long ceiling;
+ unsigned long ceiling = INTERACTIVE_SLEEP(p);
- ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
- DEF_TIMESLICE);
- if (p->sleep_avg < ceiling)
- p->sleep_avg = ceiling;
+ if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
+ /*
+ * Prevents user tasks from achieving best priority
+ * with one single large enough sleep.
+ */
+ p->sleep_avg = ceiling;
+ /*
+ * Using INTERACTIVE_SLEEP() as a ceiling places a
+ * nice(0) task 1ms sleep away from promotion, and
+ * gives it 700ms to round-robin with no chance of
+ * being demoted. This is more than generous, so
+ * mark this sleep as non-interactive to prevent the
+ * on-runqueue bonus logic from intervening should
+ * this task not receive cpu immediately.
+ */
+ p->sleep_type = SLEEP_NONINTERACTIVE;
} else {
/*
* Tasks waking from uninterruptible sleep are
@@ -723,12 +901,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
* are likely to be waiting on I/O
*/
if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
- if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
+ if (p->sleep_avg >= ceiling)
sleep_time = 0;
else if (p->sleep_avg + sleep_time >=
- INTERACTIVE_SLEEP(p)) {
- p->sleep_avg = INTERACTIVE_SLEEP(p);
- sleep_time = 0;
+ ceiling) {
+ p->sleep_avg = ceiling;
+ sleep_time = 0;
}
}
@@ -742,9 +920,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
*/
p->sleep_avg += sleep_time;
- if (p->sleep_avg > NS_MAX_SLEEP_AVG)
- p->sleep_avg = NS_MAX_SLEEP_AVG;
}
+ if (p->sleep_avg > NS_MAX_SLEEP_AVG)
+ p->sleep_avg = NS_MAX_SLEEP_AVG;
}
return effective_prio(p);
@@ -756,7 +934,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
* Update all the scheduling statistics stuff. (sleep average
* calculation, priority modifiers, etc.)
*/
-static void activate_task(task_t *p, runqueue_t *rq, int local)
+static void activate_task(struct task_struct *p, struct rq *rq, int local)
{
unsigned long long now;
@@ -764,7 +942,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
#ifdef CONFIG_SMP
if (!local) {
/* Compensate for drifting sched_clock */
- runqueue_t *this_rq = this_rq();
+ struct rq *this_rq = this_rq();
now = (now - this_rq->timestamp_last_tick)
+ rq->timestamp_last_tick;
}
@@ -803,9 +981,9 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
/*
* deactivate_task - remove a task from the runqueue.
*/
-static void deactivate_task(struct task_struct *p, runqueue_t *rq)
+static void deactivate_task(struct task_struct *p, struct rq *rq)
{
- rq->nr_running--;
+ dec_nr_running(p, rq);
dequeue_task(p, p->array);
p->array = NULL;
}
@@ -818,7 +996,12 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
* the target CPU.
*/
#ifdef CONFIG_SMP
-static void resched_task(task_t *p)
+
+#ifndef tsk_is_polling
+#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
+#endif
+
+static void resched_task(struct task_struct *p)
{
int cpu;
@@ -833,13 +1016,13 @@ static void resched_task(task_t *p)
if (cpu == smp_processor_id())
return;
- /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */
+ /* NEED_RESCHED must be visible before we test polling */
smp_mb();
- if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG))
+ if (!tsk_is_polling(p))
smp_send_reschedule(cpu);
}
#else
-static inline void resched_task(task_t *p)
+static inline void resched_task(struct task_struct *p)
{
assert_spin_locked(&task_rq(p)->lock);
set_tsk_need_resched(p);
@@ -850,28 +1033,35 @@ static inline void resched_task(task_t *p)
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
*/
-inline int task_curr(const task_t *p)
+inline int task_curr(const struct task_struct *p)
{
return cpu_curr(task_cpu(p)) == p;
}
+/* Used instead of source_load when we know the type == 0 */
+unsigned long weighted_cpuload(const int cpu)
+{
+ return cpu_rq(cpu)->raw_weighted_load;
+}
+
#ifdef CONFIG_SMP
-typedef struct {
+struct migration_req {
struct list_head list;
- task_t *task;
+ struct task_struct *task;
int dest_cpu;
struct completion done;
-} migration_req_t;
+};
/*
* The task's runqueue lock must be held.
* Returns true if you have to wait for migration thread.
*/
-static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
+static int
+migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
{
- runqueue_t *rq = task_rq(p);
+ struct rq *rq = task_rq(p);
/*
* If the task is not on a runqueue (and not running), then
@@ -886,6 +1076,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
req->task = p;
req->dest_cpu = dest_cpu;
list_add(&req->list, &rq->migration_queue);
+
return 1;
}
@@ -898,10 +1089,10 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
-void wait_task_inactive(task_t *p)
+void wait_task_inactive(struct task_struct *p)
{
unsigned long flags;
- runqueue_t *rq;
+ struct rq *rq;
int preempted;
repeat:
@@ -932,7 +1123,7 @@ repeat:
* to another CPU then no harm is done and the purpose has been
* achieved as well.
*/
-void kick_process(task_t *p)
+void kick_process(struct task_struct *p)
{
int cpu;
@@ -944,32 +1135,45 @@ void kick_process(task_t *p)
}
/*
- * Return a low guess at the load of a migration-source cpu.
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
*
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
static inline unsigned long source_load(int cpu, int type)
{
- runqueue_t *rq = cpu_rq(cpu);
- unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ struct rq *rq = cpu_rq(cpu);
+
if (type == 0)
- return load_now;
+ return rq->raw_weighted_load;
- return min(rq->cpu_load[type-1], load_now);
+ return min(rq->cpu_load[type-1], rq->raw_weighted_load);
}
/*
- * Return a high guess at the load of a migration-target cpu
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
*/
static inline unsigned long target_load(int cpu, int type)
{
- runqueue_t *rq = cpu_rq(cpu);
- unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ struct rq *rq = cpu_rq(cpu);
+
if (type == 0)
- return load_now;
+ return rq->raw_weighted_load;
+
+ return max(rq->cpu_load[type-1], rq->raw_weighted_load);
+}
- return max(rq->cpu_load[type-1], load_now);
+/*
+ * Return the average load per task on the cpu's run queue
+ */
+static inline unsigned long cpu_avg_load_per_task(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long n = rq->nr_running;
+
+ return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
}
/*
@@ -1042,7 +1246,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
cpus_and(tmp, group->cpumask, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
- load = source_load(i, 0);
+ load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
@@ -1069,9 +1273,15 @@ static int sched_balance_self(int cpu, int flag)
struct task_struct *t = current;
struct sched_domain *tmp, *sd = NULL;
- for_each_domain(cpu, tmp)
+ for_each_domain(cpu, tmp) {
+ /*
+ * If power savings logic is enabled for a domain, stop there.
+ */
+ if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+ break;
if (tmp->flags & flag)
sd = tmp;
+ }
while (sd) {
cpumask_t span;
@@ -1116,7 +1326,7 @@ nextlevel:
* Returns the CPU we should wake onto.
*/
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-static int wake_idle(int cpu, task_t *p)
+static int wake_idle(int cpu, struct task_struct *p)
{
cpumask_t tmp;
struct sched_domain *sd;
@@ -1139,7 +1349,7 @@ static int wake_idle(int cpu, task_t *p)
return cpu;
}
#else
-static inline int wake_idle(int cpu, task_t *p)
+static inline int wake_idle(int cpu, struct task_struct *p)
{
return cpu;
}
@@ -1159,15 +1369,15 @@ static inline int wake_idle(int cpu, task_t *p)
*
* returns failure only if the task is already active.
*/
-static int try_to_wake_up(task_t *p, unsigned int state, int sync)
+static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
int cpu, this_cpu, success = 0;
unsigned long flags;
long old_state;
- runqueue_t *rq;
+ struct rq *rq;
#ifdef CONFIG_SMP
- unsigned long load, this_load;
struct sched_domain *sd, *this_sd = NULL;
+ unsigned long load, this_load;
int new_cpu;
#endif
@@ -1221,17 +1431,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
if (this_sd->flags & SD_WAKE_AFFINE) {
unsigned long tl = this_load;
+ unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current CPU:
*/
if (sync)
- tl -= SCHED_LOAD_SCALE;
+ tl -= current->load_weight;
if ((tl <= load &&
- tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
- 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+ tl + target_load(cpu, idx) <= tl_per_task) ||
+ 100*(tl + p->load_weight) <= imbalance*load) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
@@ -1315,15 +1527,14 @@ out:
return success;
}
-int fastcall wake_up_process(task_t *p)
+int fastcall wake_up_process(struct task_struct *p)
{
return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
}
-
EXPORT_SYMBOL(wake_up_process);
-int fastcall wake_up_state(task_t *p, unsigned int state)
+int fastcall wake_up_state(struct task_struct *p, unsigned int state)
{
return try_to_wake_up(p, state, 0);
}
@@ -1332,7 +1543,7 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*/
-void fastcall sched_fork(task_t *p, int clone_flags)
+void fastcall sched_fork(struct task_struct *p, int clone_flags)
{
int cpu = get_cpu();
@@ -1348,10 +1559,17 @@ void fastcall sched_fork(task_t *p, int clone_flags)
* event cannot wake it up and insert it on the runqueue either.
*/
p->state = TASK_RUNNING;
+
+ /*
+ * Make sure we do not leak PI boosting priority to the child:
+ */
+ p->prio = current->normal_prio;
+
INIT_LIST_HEAD(&p->run_list);
p->array = NULL;
-#ifdef CONFIG_SCHEDSTATS
- memset(&p->sched_info, 0, sizeof(p->sched_info));
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+ if (unlikely(sched_info_on()))
+ memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
p->oncpu = 0;
@@ -1394,11 +1612,11 @@ void fastcall sched_fork(task_t *p, int clone_flags)
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
-void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
+void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
+ struct rq *rq, *this_rq;
unsigned long flags;
int this_cpu, cpu;
- runqueue_t *rq, *this_rq;
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_RUNNING);
@@ -1427,10 +1645,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
__activate_task(p, rq);
else {
p->prio = current->prio;
+ p->normal_prio = current->normal_prio;
list_add_tail(&p->run_list, &current->run_list);
p->array = current->array;
p->array->nr_active++;
- rq->nr_running++;
+ inc_nr_running(p, rq);
}
set_need_resched();
} else
@@ -1477,10 +1696,10 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
* artificially, because any timeslice recovered here
* was given away by the parent in the first place.)
*/
-void fastcall sched_exit(task_t *p)
+void fastcall sched_exit(struct task_struct *p)
{
unsigned long flags;
- runqueue_t *rq;
+ struct rq *rq;
/*
* If the child was a (relative-) CPU hog then decrease
@@ -1511,7 +1730,7 @@ void fastcall sched_exit(task_t *p)
* prepare_task_switch sets up locking and calls architecture specific
* hooks.
*/
-static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
{
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
@@ -1532,31 +1751,31 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
* with the lock held can cause deadlocks; see schedule() for
* details.)
*/
-static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
+static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
__releases(rq->lock)
{
struct mm_struct *mm = rq->prev_mm;
- unsigned long prev_task_flags;
+ long prev_state;
rq->prev_mm = NULL;
/*
* A task struct has one reference for the use as "current".
- * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
- * calls schedule one last time. The schedule call will never return,
- * and the scheduled task must drop that reference.
- * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+ * schedule one last time. The schedule call will never return, and
+ * the scheduled task must drop that reference.
+ * The test for TASK_DEAD must occur while the runqueue locks are
* still held, otherwise prev could be scheduled on another cpu, die
* there before we look at prev->state, and then the reference would
* be dropped twice.
* Manfred Spraul <manfred@colorfullife.com>
*/
- prev_task_flags = prev->flags;
+ prev_state = prev->state;
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
if (mm)
mmdrop(mm);
- if (unlikely(prev_task_flags & PF_DEAD)) {
+ if (unlikely(prev_state == TASK_DEAD)) {
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
@@ -1570,10 +1789,11 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
*/
-asmlinkage void schedule_tail(task_t *prev)
+asmlinkage void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- runqueue_t *rq = this_rq();
+ struct rq *rq = this_rq();
+
finish_task_switch(rq, prev);
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
/* In this case, finish_task_switch does not reenable preemption */
@@ -1587,8 +1807,9 @@ asmlinkage void schedule_tail(task_t *prev)
* context_switch - switch to the new MM and the new
* thread's register state.
*/
-static inline
-task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
+static inline struct task_struct *
+context_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
{
struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;
@@ -1605,6 +1826,15 @@ task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
WARN_ON(rq->prev_mm);
rq->prev_mm = oldmm;
}
+ /*
+ * Since the runqueue lock will be released by the next
+ * task (which is an invalid locking op but in the case
+ * of the scheduler it's an obvious special-case), so we
+ * do an early lockdep release here:
+ */
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+#endif
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
@@ -1648,7 +1878,8 @@ unsigned long nr_uninterruptible(void)
unsigned long long nr_context_switches(void)
{
- unsigned long long i, sum = 0;
+ int i;
+ unsigned long long sum = 0;
for_each_possible_cpu(i)
sum += cpu_rq(i)->nr_switches;
@@ -1684,15 +1915,21 @@ unsigned long nr_active(void)
#ifdef CONFIG_SMP
/*
+ * Is this task likely cache-hot:
+ */
+static inline int
+task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
+{
+ return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
+}
+
+/*
* double_rq_lock - safely lock two runqueues
*
- * We must take them in cpu order to match code in
- * dependent_sleeper and wake_dependent_sleeper.
- *
* Note this does not disable interrupts like task_rq_lock,
* you need to do so manually before calling.
*/
-static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
__acquires(rq1->lock)
__acquires(rq2->lock)
{
@@ -1700,7 +1937,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
spin_lock(&rq1->lock);
__acquire(rq2->lock); /* Fake it out ;) */
} else {
- if (rq1->cpu < rq2->cpu) {
+ if (rq1 < rq2) {
spin_lock(&rq1->lock);
spin_lock(&rq2->lock);
} else {
@@ -1716,7 +1953,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
* Note this does not restore interrupts like task_rq_unlock,
* you need to do so manually after calling.
*/
-static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq1->lock)
__releases(rq2->lock)
{
@@ -1730,13 +1967,13 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
*/
-static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
+static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
__releases(this_rq->lock)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
if (unlikely(!spin_trylock(&busiest->lock))) {
- if (busiest->cpu < this_rq->cpu) {
+ if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
spin_lock(&busiest->lock);
spin_lock(&this_rq->lock);
@@ -1751,11 +1988,11 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
* allow dest_cpu, which will force the cpu onto dest_cpu. Then
* the cpu_allowed mask is restored.
*/
-static void sched_migrate_task(task_t *p, int dest_cpu)
+static void sched_migrate_task(struct task_struct *p, int dest_cpu)
{
- migration_req_t req;
- runqueue_t *rq;
+ struct migration_req req;
unsigned long flags;
+ struct rq *rq;
rq = task_rq_lock(p, &flags);
if (!cpu_isset(dest_cpu, p->cpus_allowed)
@@ -1766,11 +2003,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
if (migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
struct task_struct *mt = rq->migration_thread;
+
get_task_struct(mt);
task_rq_unlock(rq, &flags);
wake_up_process(mt);
put_task_struct(mt);
wait_for_completion(&req.done);
+
return;
}
out:
@@ -1794,14 +2033,14 @@ void sched_exec(void)
* pull_task - move a task from a remote runqueue to the local runqueue.
* Both runqueues must be locked.
*/
-static
-void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
- runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
+static void pull_task(struct rq *src_rq, struct prio_array *src_array,
+ struct task_struct *p, struct rq *this_rq,
+ struct prio_array *this_array, int this_cpu)
{
dequeue_task(p, src_array);
- src_rq->nr_running--;
+ dec_nr_running(p, src_rq);
set_task_cpu(p, this_cpu);
- this_rq->nr_running++;
+ inc_nr_running(p, this_rq);
enqueue_task(p, this_array);
p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
+ this_rq->timestamp_last_tick;
@@ -1817,7 +2056,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
static
-int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
+int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
struct sched_domain *sd, enum idle_type idle,
int *all_pinned)
{
@@ -1848,26 +2087,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
return 1;
}
+#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
+
/*
- * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
- * as part of a balancing operation within "domain". Returns the number of
- * tasks moved.
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
*
* Called with both runqueues locked.
*/
-static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
- unsigned long max_nr_move, struct sched_domain *sd,
- enum idle_type idle, int *all_pinned)
+static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_nr_move, unsigned long max_load_move,
+ struct sched_domain *sd, enum idle_type idle,
+ int *all_pinned)
{
- prio_array_t *array, *dst_array;
+ int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
+ best_prio_seen, skip_for_load;
+ struct prio_array *array, *dst_array;
struct list_head *head, *curr;
- int idx, pulled = 0, pinned = 0;
- task_t *tmp;
+ struct task_struct *tmp;
+ long rem_load_move;
- if (max_nr_move == 0)
+ if (max_nr_move == 0 || max_load_move == 0)
goto out;
+ rem_load_move = max_load_move;
pinned = 1;
+ this_best_prio = rq_best_prio(this_rq);
+ best_prio = rq_best_prio(busiest);
+ /*
+ * Enable handling of the case where there is more than one task
+ * with the best priority. If the current running task is one
+ * of those with prio==best_prio we know it won't be moved
+ * and therefore it's safe to override the skip (based on load) of
+ * any task we find with that prio.
+ */
+ best_prio_seen = best_prio == busiest->curr->prio;
/*
* We first consider expired tasks. Those will likely not be
@@ -1903,11 +2158,22 @@ skip_bitmap:
head = array->queue + idx;
curr = head->prev;
skip_queue:
- tmp = list_entry(curr, task_t, run_list);
+ tmp = list_entry(curr, struct task_struct, run_list);
curr = curr->prev;
- if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+ /*
+ * To help distribute high priority tasks accross CPUs we don't
+ * skip a task if it will be the highest priority task (i.e. smallest
+ * prio value) on its new queue regardless of its load weight
+ */
+ skip_for_load = tmp->load_weight > rem_load_move;
+ if (skip_for_load && idx < this_best_prio)
+ skip_for_load = !best_prio_seen && idx == best_prio;
+ if (skip_for_load ||
+ !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+
+ best_prio_seen |= idx == best_prio;
if (curr != head)
goto skip_queue;
idx++;
@@ -1921,9 +2187,15 @@ skip_queue:
pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
pulled++;
+ rem_load_move -= tmp->load_weight;
- /* We only want to steal up to the prescribed number of tasks. */
- if (pulled < max_nr_move) {
+ /*
+ * We only want to steal up to the prescribed number of tasks
+ * and the prescribed amount of weighted load.
+ */
+ if (pulled < max_nr_move && rem_load_move > 0) {
+ if (idx < this_best_prio)
+ this_best_prio = idx;
if (curr != head)
goto skip_queue;
idx++;
@@ -1944,19 +2216,30 @@ out:
/*
* find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the number of tasks which should be
- * moved to restore balance via the imbalance parameter.
+ * domain. It calculates and returns the amount of weighted load which
+ * should be moved to restore balance via the imbalance parameter.
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum idle_type idle, int *sd_idle)
+ unsigned long *imbalance, enum idle_type idle, int *sd_idle,
+ cpumask_t *cpus)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
unsigned long max_pull;
+ unsigned long busiest_load_per_task, busiest_nr_running;
+ unsigned long this_load_per_task, this_nr_running;
int load_idx;
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ int power_savings_balance = 1;
+ unsigned long leader_nr_running = 0, min_load_per_task = 0;
+ unsigned long min_nr_running = ULONG_MAX;
+ struct sched_group *group_min = NULL, *group_leader = NULL;
+#endif
max_load = this_load = total_load = total_pwr = 0;
+ busiest_load_per_task = busiest_nr_running = 0;
+ this_load_per_task = this_nr_running = 0;
if (idle == NOT_IDLE)
load_idx = sd->busy_idx;
else if (idle == NEWLY_IDLE)
@@ -1965,16 +2248,24 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
load_idx = sd->idle_idx;
do {
- unsigned long load;
+ unsigned long load, group_capacity;
int local_group;
int i;
+ unsigned long sum_nr_running, sum_weighted_load;
local_group = cpu_isset(this_cpu, group->cpumask);
/* Tally up the load of all CPUs in the group */
- avg_load = 0;
+ sum_weighted_load = sum_nr_running = avg_load = 0;
for_each_cpu_mask(i, group->cpumask) {
+ struct rq *rq;
+
+ if (!cpu_isset(i, *cpus))
+ continue;
+
+ rq = cpu_rq(i);
+
if (*sd_idle && !idle_cpu(i))
*sd_idle = 0;
@@ -1985,6 +2276,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
load = source_load(i, load_idx);
avg_load += load;
+ sum_nr_running += rq->nr_running;
+ sum_weighted_load += rq->raw_weighted_load;
}
total_load += avg_load;
@@ -1993,17 +2286,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
/* Adjust by relative CPU power of the group */
avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+
if (local_group) {
this_load = avg_load;
this = group;
- } else if (avg_load > max_load) {
+ this_nr_running = sum_nr_running;
+ this_load_per_task = sum_weighted_load;
+ } else if (avg_load > max_load &&
+ sum_nr_running > group_capacity) {
max_load = avg_load;
busiest = group;
+ busiest_nr_running = sum_nr_running;
+ busiest_load_per_task = sum_weighted_load;
+ }
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ /*
+ * Busy processors will not participate in power savings
+ * balance.
+ */
+ if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ goto group_next;
+
+ /*
+ * If the local group is idle or completely loaded
+ * no need to do power savings balance at this domain
+ */
+ if (local_group && (this_nr_running >= group_capacity ||
+ !this_nr_running))
+ power_savings_balance = 0;
+
+ /*
+ * If a group is already running at full capacity or idle,
+ * don't include that group in power savings calculations
+ */
+ if (!power_savings_balance || sum_nr_running >= group_capacity
+ || !sum_nr_running)
+ goto group_next;
+
+ /*
+ * Calculate the group which has the least non-idle load.
+ * This is the group from where we need to pick up the load
+ * for saving power
+ */
+ if ((sum_nr_running < min_nr_running) ||
+ (sum_nr_running == min_nr_running &&
+ first_cpu(group->cpumask) <
+ first_cpu(group_min->cpumask))) {
+ group_min = group;
+ min_nr_running = sum_nr_running;
+ min_load_per_task = sum_weighted_load /
+ sum_nr_running;
+ }
+
+ /*
+ * Calculate the group which is almost near its
+ * capacity but still has some space to pick up some load
+ * from other group and save more power
+ */
+ if (sum_nr_running <= group_capacity - 1) {
+ if (sum_nr_running > leader_nr_running ||
+ (sum_nr_running == leader_nr_running &&
+ first_cpu(group->cpumask) >
+ first_cpu(group_leader->cpumask))) {
+ group_leader = group;
+ leader_nr_running = sum_nr_running;
+ }
}
+group_next:
+#endif
group = group->next;
} while (group != sd->groups);
- if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
+ if (!busiest || this_load >= max_load || busiest_nr_running == 0)
goto out_balanced;
avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -2012,6 +2368,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
100*max_load <= sd->imbalance_pct*this_load)
goto out_balanced;
+ busiest_load_per_task /= busiest_nr_running;
/*
* We're trying to get all the cpus to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
@@ -2023,21 +2380,49 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/
+ if (max_load <= busiest_load_per_task)
+ goto out_balanced;
+
+ /*
+ * In the presence of smp nice balancing, certain scenarios can have
+ * max load less than avg load(as we skip the groups at or below
+ * its cpu_power, while calculating max_load..)
+ */
+ if (max_load < avg_load) {
+ *imbalance = 0;
+ goto small_imbalance;
+ }
/* Don't want to pull so many tasks that a group would go idle */
- max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+ max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
*imbalance = min(max_pull * busiest->cpu_power,
(avg_load - this_load) * this->cpu_power)
/ SCHED_LOAD_SCALE;
- if (*imbalance < SCHED_LOAD_SCALE) {
- unsigned long pwr_now = 0, pwr_move = 0;
- unsigned long tmp;
+ /*
+ * if *imbalance is less than the average load per runnable task
+ * there is no gaurantee that any tasks will be moved so we'll have
+ * a think about bumping its value to force at least one task to be
+ * moved
+ */
+ if (*imbalance < busiest_load_per_task) {
+ unsigned long tmp, pwr_now, pwr_move;
+ unsigned int imbn;
+
+small_imbalance:
+ pwr_move = pwr_now = 0;
+ imbn = 2;
+ if (this_nr_running) {
+ this_load_per_task /= this_nr_running;
+ if (busiest_load_per_task > this_load_per_task)
+ imbn = 1;
+ } else
+ this_load_per_task = SCHED_LOAD_SCALE;
- if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
- *imbalance = 1;
+ if (max_load - this_load >= busiest_load_per_task * imbn) {
+ *imbalance = busiest_load_per_task;
return busiest;
}
@@ -2047,39 +2432,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* moving them.
*/
- pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
- pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+ pwr_now += busiest->cpu_power *
+ min(busiest_load_per_task, max_load);
+ pwr_now += this->cpu_power *
+ min(this_load_per_task, this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+ tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
if (max_load > tmp)
- pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
- max_load - tmp);
+ pwr_move += busiest->cpu_power *
+ min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
if (max_load*busiest->cpu_power <
- SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+ busiest_load_per_task*SCHED_LOAD_SCALE)
tmp = max_load*busiest->cpu_power/this->cpu_power;
else
- tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
- pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+ tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
+ pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
if (pwr_move <= pwr_now)
goto out_balanced;
- *imbalance = 1;
- return busiest;
+ *imbalance = busiest_load_per_task;
}
- /* Get rid of the scaling factor, rounding down as we divide */
- *imbalance = *imbalance / SCHED_LOAD_SCALE;
return busiest;
out_balanced:
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ goto ret;
+ if (this == group_leader && group_leader != group_min) {
+ *imbalance = min_load_per_task;
+ return group_min;
+ }
+ret:
+#endif
*imbalance = 0;
return NULL;
}
@@ -2087,19 +2480,27 @@ out_balanced:
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
-static runqueue_t *find_busiest_queue(struct sched_group *group,
- enum idle_type idle)
+static struct rq *
+find_busiest_queue(struct sched_group *group, enum idle_type idle,
+ unsigned long imbalance, cpumask_t *cpus)
{
- unsigned long load, max_load = 0;
- runqueue_t *busiest = NULL;
+ struct rq *busiest = NULL, *rq;
+ unsigned long max_load = 0;
int i;
for_each_cpu_mask(i, group->cpumask) {
- load = source_load(i, 0);
- if (load > max_load) {
- max_load = load;
- busiest = cpu_rq(i);
+ if (!cpu_isset(i, *cpus))
+ continue;
+
+ rq = cpu_rq(i);
+
+ if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
+ continue;
+
+ if (rq->raw_weighted_load > max_load) {
+ max_load = rq->raw_weighted_load;
+ busiest = rq;
}
}
@@ -2112,34 +2513,41 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
*/
#define MAX_PINNED_INTERVAL 512
+static inline unsigned long minus_1_or_zero(unsigned long n)
+{
+ return n > 0 ? n - 1 : 0;
+}
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*
* Called with this_rq unlocked.
*/
-static int load_balance(int this_cpu, runqueue_t *this_rq,
+static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum idle_type idle)
{
+ int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
- runqueue_t *busiest;
unsigned long imbalance;
- int nr_moved, all_pinned = 0;
- int active_balance = 0;
- int sd_idle = 0;
+ struct rq *busiest;
+ cpumask_t cpus = CPU_MASK_ALL;
- if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+ if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+ !sched_smt_power_savings)
sd_idle = 1;
schedstat_inc(sd, lb_cnt[idle]);
- group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
+redo:
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+ &cpus);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}
- busiest = find_busiest_queue(group, idle);
+ busiest = find_busiest_queue(group, idle, imbalance, &cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
@@ -2159,12 +2567,17 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
*/
double_rq_lock(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle, &all_pinned);
+ minus_1_or_zero(busiest->nr_running),
+ imbalance, sd, idle, &all_pinned);
double_rq_unlock(this_rq, busiest);
/* All tasks on this runqueue were pinned by CPU affinity */
- if (unlikely(all_pinned))
+ if (unlikely(all_pinned)) {
+ cpu_clear(cpu_of(busiest), cpus);
+ if (!cpus_empty(cpus))
+ goto redo;
goto out_balanced;
+ }
}
if (!nr_moved) {
@@ -2216,7 +2629,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
sd->balance_interval *= 2;
}
- if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !sched_smt_power_savings)
return -1;
return nr_moved;
@@ -2231,7 +2645,8 @@ out_one_pinned:
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !sched_smt_power_savings)
return -1;
return 0;
}
@@ -2243,26 +2658,30 @@ out_one_pinned:
* Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
* this_rq is locked.
*/
-static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
- struct sched_domain *sd)
+static int
+load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
{
struct sched_group *group;
- runqueue_t *busiest = NULL;
+ struct rq *busiest = NULL;
unsigned long imbalance;
int nr_moved = 0;
int sd_idle = 0;
+ cpumask_t cpus = CPU_MASK_ALL;
- if (sd->flags & SD_SHARE_CPUPOWER)
+ if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
sd_idle = 1;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
+redo:
+ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
+ &sd_idle, &cpus);
if (!group) {
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
goto out_balanced;
}
- busiest = find_busiest_queue(group, NEWLY_IDLE);
+ busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
+ &cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
goto out_balanced;
@@ -2277,8 +2696,15 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
/* Attempt to move tasks */
double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ minus_1_or_zero(busiest->nr_running),
imbalance, sd, NEWLY_IDLE, NULL);
spin_unlock(&busiest->lock);
+
+ if (!nr_moved) {
+ cpu_clear(cpu_of(busiest), cpus);
+ if (!cpus_empty(cpus))
+ goto redo;
+ }
}
if (!nr_moved) {
@@ -2292,9 +2718,11 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
out_balanced:
schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !sched_smt_power_savings)
return -1;
sd->nr_balance_failed = 0;
+
return 0;
}
@@ -2302,16 +2730,15 @@ out_balanced:
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-static void idle_balance(int this_cpu, runqueue_t *this_rq)
+static void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
for_each_domain(this_cpu, sd) {
if (sd->flags & SD_BALANCE_NEWIDLE) {
- if (load_balance_newidle(this_cpu, this_rq, sd)) {
- /* We've pulled tasks over so stop searching */
+ /* If we've pulled tasks over stop searching: */
+ if (load_balance_newidle(this_cpu, this_rq, sd))
break;
- }
}
}
}
@@ -2324,14 +2751,14 @@ static void idle_balance(int this_cpu, runqueue_t *this_rq)
*
* Called with busiest_rq locked.
*/
-static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
+static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
{
- struct sched_domain *sd;
- runqueue_t *target_rq;
int target_cpu = busiest_rq->push_cpu;
+ struct sched_domain *sd;
+ struct rq *target_rq;
+ /* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
- /* no task to move */
return;
target_rq = cpu_rq(target_cpu);
@@ -2347,21 +2774,22 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
double_lock_balance(busiest_rq, target_rq);
/* Search for an sd spanning us and the target CPU. */
- for_each_domain(target_cpu, sd)
+ for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
- cpu_isset(busiest_cpu, sd->span))
+ cpu_isset(busiest_cpu, sd->span))
break;
+ }
- if (unlikely(sd == NULL))
- goto out;
-
- schedstat_inc(sd, alb_cnt);
+ if (likely(sd)) {
+ schedstat_inc(sd, alb_cnt);
- if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
- schedstat_inc(sd, alb_pushed);
- else
- schedstat_inc(sd, alb_failed);
-out:
+ if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
+ RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE,
+ NULL))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
spin_unlock(&target_rq->lock);
}
@@ -2374,23 +2802,27 @@ out:
* Balancing parameters are set up in arch_init_sched_domains.
*/
-/* Don't have all balancing operations going off at once */
-#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
+/* Don't have all balancing operations going off at once: */
+static inline unsigned long cpu_offset(int cpu)
+{
+ return jiffies + cpu * HZ / NR_CPUS;
+}
-static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
- enum idle_type idle)
+static void
+rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
{
- unsigned long old_load, this_load;
- unsigned long j = jiffies + CPU_OFFSET(this_cpu);
+ unsigned long this_load, interval, j = cpu_offset(this_cpu);
struct sched_domain *sd;
- int i;
+ int i, scale;
+
+ this_load = this_rq->raw_weighted_load;
+
+ /* Update our load: */
+ for (i = 0, scale = 1; i < 3; i++, scale <<= 1) {
+ unsigned long old_load, new_load;
- this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
- /* Update our load */
- for (i = 0; i < 3; i++) {
- unsigned long new_load = this_load;
- int scale = 1 << i;
old_load = this_rq->cpu_load[i];
+ new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
* prevents us from getting stuck on 9 if the load is 10, for
@@ -2402,8 +2834,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
}
for_each_domain(this_cpu, sd) {
- unsigned long interval;
-
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
@@ -2433,17 +2863,18 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
/*
* on UP we do not need to balance between CPUs:
*/
-static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
+static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
{
}
-static inline void idle_balance(int cpu, runqueue_t *rq)
+static inline void idle_balance(int cpu, struct rq *rq)
{
}
#endif
-static inline int wake_priority_sleeper(runqueue_t *rq)
+static inline int wake_priority_sleeper(struct rq *rq)
{
int ret = 0;
+
#ifdef CONFIG_SCHED_SMT
spin_lock(&rq->lock);
/*
@@ -2467,25 +2898,26 @@ EXPORT_PER_CPU_SYMBOL(kstat);
* This is called on clock ticks and on context switches.
* Bank in p->sched_time the ns elapsed since the last tick or switch.
*/
-static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
- unsigned long long now)
+static inline void
+update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
{
- unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
- p->sched_time += now - last;
+ p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick);
}
/*
* Return current->sched_time plus any more ns on the sched_clock
* that have not yet been banked.
*/
-unsigned long long current_sched_time(const task_t *tsk)
+unsigned long long current_sched_time(const struct task_struct *p)
{
unsigned long long ns;
unsigned long flags;
+
local_irq_save(flags);
- ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
- ns = tsk->sched_time + (sched_clock() - ns);
+ ns = max(p->timestamp, task_rq(p)->timestamp_last_tick);
+ ns = p->sched_time + sched_clock() - ns;
local_irq_restore(flags);
+
return ns;
}
@@ -2499,11 +2931,16 @@ unsigned long long current_sched_time(const task_t *tsk)
* increasing number of running tasks. We also ignore the interactivity
* if a better static_prio task has expired:
*/
-#define EXPIRED_STARVING(rq) \
- ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
- (jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
- ((rq)->curr->static_prio > (rq)->best_expired_prio))
+static inline int expired_starving(struct rq *rq)
+{
+ if (rq->curr->static_prio > rq->best_expired_prio)
+ return 1;
+ if (!STARVATION_LIMIT || !rq->expired_timestamp)
+ return 0;
+ if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
+ return 1;
+ return 0;
+}
/*
* Account user cpu time to a process.
@@ -2536,7 +2973,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
cputime_t cputime)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- runqueue_t *rq = this_rq();
+ struct rq *rq = this_rq();
cputime64_t tmp;
p->stime = cputime_add(p->stime, cputime);
@@ -2566,7 +3003,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
cputime64_t tmp = cputime_to_cputime64(steal);
- runqueue_t *rq = this_rq();
+ struct rq *rq = this_rq();
if (p == rq->idle) {
p->stime = cputime_add(p->stime, steal);
@@ -2587,10 +3024,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
*/
void scheduler_tick(void)
{
- int cpu = smp_processor_id();
- runqueue_t *rq = this_rq();
- task_t *p = current;
unsigned long long now = sched_clock();
+ struct task_struct *p = current;
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
update_cpu_clock(p, rq, now);
@@ -2640,7 +3077,7 @@ void scheduler_tick(void)
if (!rq->expired_timestamp)
rq->expired_timestamp = jiffies;
- if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
+ if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
enqueue_task(p, rq->expired);
if (p->static_prio < rq->best_expired_prio)
rq->best_expired_prio = p->static_prio;
@@ -2679,55 +3116,42 @@ out:
}
#ifdef CONFIG_SCHED_SMT
-static inline void wakeup_busy_runqueue(runqueue_t *rq)
+static inline void wakeup_busy_runqueue(struct rq *rq)
{
/* If an SMT runqueue is sleeping due to priority reasons wake it up */
if (rq->curr == rq->idle && rq->nr_running)
resched_task(rq->idle);
}
-static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+/*
+ * Called with interrupt disabled and this_rq's runqueue locked.
+ */
+static void wake_sleeping_dependent(int this_cpu)
{
struct sched_domain *tmp, *sd = NULL;
- cpumask_t sibling_map;
int i;
- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_SHARE_CPUPOWER)
+ for_each_domain(this_cpu, tmp) {
+ if (tmp->flags & SD_SHARE_CPUPOWER) {
sd = tmp;
+ break;
+ }
+ }
if (!sd)
return;
- /*
- * Unlock the current runqueue because we have to lock in
- * CPU order to avoid deadlocks. Caller knows that we might
- * unlock. We keep IRQs disabled.
- */
- spin_unlock(&this_rq->lock);
-
- sibling_map = sd->span;
-
- for_each_cpu_mask(i, sibling_map)
- spin_lock(&cpu_rq(i)->lock);
- /*
- * We clear this CPU from the mask. This both simplifies the
- * inner loop and keps this_rq locked when we exit:
- */
- cpu_clear(this_cpu, sibling_map);
+ for_each_cpu_mask(i, sd->span) {
+ struct rq *smt_rq = cpu_rq(i);
- for_each_cpu_mask(i, sibling_map) {
- runqueue_t *smt_rq = cpu_rq(i);
+ if (i == this_cpu)
+ continue;
+ if (unlikely(!spin_trylock(&smt_rq->lock)))
+ continue;
wakeup_busy_runqueue(smt_rq);
+ spin_unlock(&smt_rq->lock);
}
-
- for_each_cpu_mask(i, sibling_map)
- spin_unlock(&cpu_rq(i)->lock);
- /*
- * We exit with this_cpu's rq still held and IRQs
- * still disabled:
- */
}
/*
@@ -2735,57 +3159,53 @@ static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
* utilize, if another task runs on a sibling. This models the
* slowdown effect of other tasks running on siblings:
*/
-static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
+static inline unsigned long
+smt_slice(struct task_struct *p, struct sched_domain *sd)
{
return p->time_slice * (100 - sd->per_cpu_gain) / 100;
}
-static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+/*
+ * To minimise lock contention and not have to drop this_rq's runlock we only
+ * trylock the sibling runqueues and bypass those runqueues if we fail to
+ * acquire their lock. As we only trylock the normal locking order does not
+ * need to be obeyed.
+ */
+static int
+dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
{
struct sched_domain *tmp, *sd = NULL;
- cpumask_t sibling_map;
- prio_array_t *array;
int ret = 0, i;
- task_t *p;
- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_SHARE_CPUPOWER)
+ /* kernel/rt threads do not participate in dependent sleeping */
+ if (!p->mm || rt_task(p))
+ return 0;
+
+ for_each_domain(this_cpu, tmp) {
+ if (tmp->flags & SD_SHARE_CPUPOWER) {
sd = tmp;
+ break;
+ }
+ }
if (!sd)
return 0;
- /*
- * The same locking rules and details apply as for
- * wake_sleeping_dependent():
- */
- spin_unlock(&this_rq->lock);
- sibling_map = sd->span;
- for_each_cpu_mask(i, sibling_map)
- spin_lock(&cpu_rq(i)->lock);
- cpu_clear(this_cpu, sibling_map);
+ for_each_cpu_mask(i, sd->span) {
+ struct task_struct *smt_curr;
+ struct rq *smt_rq;
- /*
- * Establish next task to be run - it might have gone away because
- * we released the runqueue lock above:
- */
- if (!this_rq->nr_running)
- goto out_unlock;
- array = this_rq->active;
- if (!array->nr_active)
- array = this_rq->expired;
- BUG_ON(!array->nr_active);
+ if (i == this_cpu)
+ continue;
- p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
- task_t, run_list);
+ smt_rq = cpu_rq(i);
+ if (unlikely(!spin_trylock(&smt_rq->lock)))
+ continue;
- for_each_cpu_mask(i, sibling_map) {
- runqueue_t *smt_rq = cpu_rq(i);
- task_t *smt_curr = smt_rq->curr;
+ smt_curr = smt_rq->curr;
- /* Kernel threads do not participate in dependent sleeping */
- if (!p->mm || !smt_curr->mm || rt_task(p))
- goto check_smt_task;
+ if (!smt_curr->mm)
+ goto unlock;
/*
* If a user task with lower static priority than the
@@ -2803,49 +3223,23 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
if ((jiffies % DEF_TIMESLICE) >
(sd->per_cpu_gain * DEF_TIMESLICE / 100))
ret = 1;
- } else
+ } else {
if (smt_curr->static_prio < p->static_prio &&
!TASK_PREEMPTS_CURR(p, smt_rq) &&
smt_slice(smt_curr, sd) > task_timeslice(p))
ret = 1;
-
-check_smt_task:
- if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
- rt_task(smt_curr))
- continue;
- if (!p->mm) {
- wakeup_busy_runqueue(smt_rq);
- continue;
- }
-
- /*
- * Reschedule a lower priority task on the SMT sibling for
- * it to be put to sleep, or wake it up if it has been put to
- * sleep for priority reasons to see if it should run now.
- */
- if (rt_task(p)) {
- if ((jiffies % DEF_TIMESLICE) >
- (sd->per_cpu_gain * DEF_TIMESLICE / 100))
- resched_task(smt_curr);
- } else {
- if (TASK_PREEMPTS_CURR(p, smt_rq) &&
- smt_slice(p, sd) > task_timeslice(smt_curr))
- resched_task(smt_curr);
- else
- wakeup_busy_runqueue(smt_rq);
}
+unlock:
+ spin_unlock(&smt_rq->lock);
}
-out_unlock:
- for_each_cpu_mask(i, sibling_map)
- spin_unlock(&cpu_rq(i)->lock);
return ret;
}
#else
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static inline void wake_sleeping_dependent(int this_cpu)
{
}
-
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static inline int
+dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
{
return 0;
}
@@ -2858,12 +3252,13 @@ void fastcall add_preempt_count(int val)
/*
* Underflow?
*/
- BUG_ON((preempt_count() < 0));
+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+ return;
preempt_count() += val;
/*
* Spinlock count overflowing soon?
*/
- BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
}
EXPORT_SYMBOL(add_preempt_count);
@@ -2872,11 +3267,15 @@ void fastcall sub_preempt_count(int val)
/*
* Underflow?
*/
- BUG_ON(val > preempt_count());
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ return;
/*
* Is the spinlock portion underflowing?
*/
- BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+ !(preempt_count() & PREEMPT_MASK)))
+ return;
+
preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);
@@ -2894,14 +3293,14 @@ static inline int interactive_sleep(enum sleep_type sleep_type)
*/
asmlinkage void __sched schedule(void)
{
- long *switch_count;
- task_t *prev, *next;
- runqueue_t *rq;
- prio_array_t *array;
+ struct task_struct *prev, *next;
+ struct prio_array *array;
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
int cpu, idx, new_prio;
+ long *switch_count;
+ struct rq *rq;
/*
* Test if we are atomic. Since do_exit() needs to call into
@@ -2949,9 +3348,6 @@ need_resched_nonpreemptible:
spin_lock_irq(&rq->lock);
- if (unlikely(prev->flags & PF_DEAD))
- prev->state = EXIT_DEAD;
-
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
switch_count = &prev->nvcsw;
@@ -2967,32 +3363,13 @@ need_resched_nonpreemptible:
cpu = smp_processor_id();
if (unlikely(!rq->nr_running)) {
-go_idle:
idle_balance(cpu, rq);
if (!rq->nr_running) {
next = rq->idle;
rq->expired_timestamp = 0;
- wake_sleeping_dependent(cpu, rq);
- /*
- * wake_sleeping_dependent() might have released
- * the runqueue, so break out if we got new
- * tasks meanwhile:
- */
- if (!rq->nr_running)
- goto switch_tasks;
- }
- } else {
- if (dependent_sleeper(cpu, rq)) {
- next = rq->idle;
+ wake_sleeping_dependent(cpu);
goto switch_tasks;
}
- /*
- * dependent_sleeper() releases and reacquires the runqueue
- * lock, hence go into the idle loop if the rq went
- * empty meanwhile:
- */
- if (unlikely(!rq->nr_running))
- goto go_idle;
}
array = rq->active;
@@ -3010,7 +3387,7 @@ go_idle:
idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
- next = list_entry(queue->next, task_t, run_list);
+ next = list_entry(queue->next, struct task_struct, run_list);
if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
unsigned long long delta = now - next->timestamp;
@@ -3030,6 +3407,8 @@ go_idle:
}
}
next->sleep_type = SLEEP_NORMAL;
+ if (dependent_sleeper(cpu, rq, next))
+ next = rq->idle;
switch_tasks:
if (next == rq->idle)
schedstat_inc(rq, sched_goidle);
@@ -3071,12 +3450,11 @@ switch_tasks:
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
}
-
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_PREEMPT
/*
- * this is is the entry point to schedule() from in-kernel preemption
+ * this is the entry point to schedule() from in-kernel preemption
* off of preempt_enable. Kernel preemptions off return from interrupt
* occur there and call schedule directly.
*/
@@ -3116,11 +3494,10 @@ need_resched:
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
}
-
EXPORT_SYMBOL(preempt_schedule);
/*
- * this is is the entry point to schedule() from kernel preemption
+ * this is the entry point to schedule() from kernel preemption
* off of irq context.
* Note, that this is called and return with irqs disabled. This will
* protect us against recursive calling from irq.
@@ -3132,7 +3509,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
struct task_struct *task = current;
int saved_lock_depth;
#endif
- /* Catch callers which need to be fixed*/
+ /* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
need_resched:
@@ -3165,10 +3542,8 @@ need_resched:
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
void *key)
{
- task_t *p = curr->private;
- return try_to_wake_up(p, mode, sync);
+ return try_to_wake_up(curr->private, mode, sync);
}
-
EXPORT_SYMBOL(default_wake_function);
/*
@@ -3186,13 +3561,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
struct list_head *tmp, *next;
list_for_each_safe(tmp, next, &q->task_list) {
- wait_queue_t *curr;
- unsigned flags;
- curr = list_entry(tmp, wait_queue_t, task_list);
- flags = curr->flags;
+ wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+ unsigned flags = curr->flags;
+
if (curr->func(curr, mode, sync, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) &&
- !--nr_exclusive)
+ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
@@ -3213,7 +3586,6 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
__wake_up_common(q, mode, nr_exclusive, 0, key);
spin_unlock_irqrestore(&q->lock, flags);
}
-
EXPORT_SYMBOL(__wake_up);
/*
@@ -3282,6 +3654,7 @@ EXPORT_SYMBOL(complete_all);
void fastcall __sched wait_for_completion(struct completion *x)
{
might_sleep();
+
spin_lock_irq(&x->wait.lock);
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
@@ -3426,7 +3799,6 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
schedule();
SLEEP_ON_TAIL
}
-
EXPORT_SYMBOL(interruptible_sleep_on);
long fastcall __sched
@@ -3442,7 +3814,6 @@ interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
return timeout;
}
-
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
void fastcall __sched sleep_on(wait_queue_head_t *q)
@@ -3455,7 +3826,6 @@ void fastcall __sched sleep_on(wait_queue_head_t *q)
schedule();
SLEEP_ON_TAIL
}
-
EXPORT_SYMBOL(sleep_on);
long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
@@ -3473,12 +3843,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
EXPORT_SYMBOL(sleep_on_timeout);
-void set_user_nice(task_t *p, long nice)
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+ struct prio_array *array;
+ unsigned long flags;
+ struct rq *rq;
+ int oldprio;
+
+ BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+ rq = task_rq_lock(p, &flags);
+
+ oldprio = p->prio;
+ array = p->array;
+ if (array)
+ dequeue_task(p, array);
+ p->prio = prio;
+
+ if (array) {
+ /*
+ * If changing to an RT priority then queue it
+ * in the active array!
+ */
+ if (rt_task(p))
+ array = rq->active;
+ enqueue_task(p, array);
+ /*
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if we are not currently running on
+ * this runqueue and our priority is higher than the current's
+ */
+ if (task_running(rq, p)) {
+ if (p->prio > oldprio)
+ resched_task(rq->curr);
+ } else if (TASK_PREEMPTS_CURR(p, rq))
+ resched_task(rq->curr);
+ }
+ task_rq_unlock(rq, &flags);
+}
+
+#endif
+
+void set_user_nice(struct task_struct *p, long nice)
{
+ struct prio_array *array;
+ int old_prio, delta;
unsigned long flags;
- prio_array_t *array;
- runqueue_t *rq;
- int old_prio, new_prio, delta;
+ struct rq *rq;
if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
return;
@@ -3493,22 +3916,25 @@ void set_user_nice(task_t *p, long nice)
* it wont have any effect on scheduling until the task is
* not SCHED_NORMAL/SCHED_BATCH:
*/
- if (rt_task(p)) {
+ if (has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
array = p->array;
- if (array)
+ if (array) {
dequeue_task(p, array);
+ dec_raw_weighted_load(rq, p);
+ }
- old_prio = p->prio;
- new_prio = NICE_TO_PRIO(nice);
- delta = new_prio - old_prio;
p->static_prio = NICE_TO_PRIO(nice);
- p->prio += delta;
+ set_load_weight(p);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ delta = p->prio - old_prio;
if (array) {
enqueue_task(p, array);
+ inc_raw_weighted_load(rq, p);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -3519,7 +3945,6 @@ void set_user_nice(task_t *p, long nice)
out_unlock:
task_rq_unlock(rq, &flags);
}
-
EXPORT_SYMBOL(set_user_nice);
/*
@@ -3527,10 +3952,11 @@ EXPORT_SYMBOL(set_user_nice);
* @p: task
* @nice: nice value
*/
-int can_nice(const task_t *p, const int nice)
+int can_nice(const struct task_struct *p, const int nice)
{
/* convert nice value [19,-20] to rlimit style value [1,40] */
int nice_rlim = 20 - nice;
+
return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
capable(CAP_SYS_NICE));
}
@@ -3546,8 +3972,7 @@ int can_nice(const task_t *p, const int nice)
*/
asmlinkage long sys_nice(int increment)
{
- int retval;
- long nice;
+ long nice, retval;
/*
* Setpriority might change our priority at the same moment.
@@ -3586,7 +4011,7 @@ asmlinkage long sys_nice(int increment)
* RT tasks are offset by -200. Normal tasks are centered
* around 0, value goes from -16 to +15.
*/
-int task_prio(const task_t *p)
+int task_prio(const struct task_struct *p)
{
return p->prio - MAX_RT_PRIO;
}
@@ -3595,7 +4020,7 @@ int task_prio(const task_t *p)
* task_nice - return the nice value of a given task.
* @p: the task in question.
*/
-int task_nice(const task_t *p)
+int task_nice(const struct task_struct *p)
{
return TASK_NICE(p);
}
@@ -3614,7 +4039,7 @@ int idle_cpu(int cpu)
* idle_task - return the idle task for a given cpu.
* @cpu: the processor in question.
*/
-task_t *idle_task(int cpu)
+struct task_struct *idle_task(int cpu)
{
return cpu_rq(cpu)->idle;
}
@@ -3623,7 +4048,7 @@ task_t *idle_task(int cpu)
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
*/
-static inline task_t *find_process_by_pid(pid_t pid)
+static inline struct task_struct *find_process_by_pid(pid_t pid)
{
return pid ? find_task_by_pid(pid) : current;
}
@@ -3632,18 +4057,18 @@ static inline task_t *find_process_by_pid(pid_t pid)
static void __setscheduler(struct task_struct *p, int policy, int prio)
{
BUG_ON(p->array);
+
p->policy = policy;
p->rt_priority = prio;
- if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
- p->prio = MAX_RT_PRIO-1 - p->rt_priority;
- } else {
- p->prio = p->static_prio;
- /*
- * SCHED_BATCH tasks are treated as perpetual CPU hogs:
- */
- if (policy == SCHED_BATCH)
- p->sleep_avg = 0;
- }
+ p->normal_prio = normal_prio(p);
+ /* we are holding p->pi_lock already */
+ p->prio = rt_mutex_getprio(p);
+ /*
+ * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+ */
+ if (policy == SCHED_BATCH)
+ p->sleep_avg = 0;
+ set_load_weight(p);
}
/**
@@ -3652,16 +4077,19 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
* @p: the task in question.
* @policy: new policy.
* @param: structure containing the new RT priority.
+ *
+ * NOTE: the task may be already dead
*/
int sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param)
{
- int retval;
- int oldprio, oldpolicy = -1;
- prio_array_t *array;
+ int retval, oldprio, oldpolicy = -1;
+ struct prio_array *array;
unsigned long flags;
- runqueue_t *rq;
+ struct rq *rq;
+ /* may grab non-irq protected spin_locks */
+ BUG_ON(in_interrupt());
recheck:
/* double check policy once rq lock held */
if (policy < 0)
@@ -3678,28 +4106,32 @@ recheck:
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
- if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
- != (param->sched_priority == 0))
+ if (is_rt_policy(policy) != (param->sched_priority != 0))
return -EINVAL;
/*
* Allow unprivileged RT tasks to decrease priority:
*/
if (!capable(CAP_SYS_NICE)) {
- /*
- * can't change policy, except between SCHED_NORMAL
- * and SCHED_BATCH:
- */
- if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
- (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
- !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
- return -EPERM;
- /* can't increase priority */
- if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
- param->sched_priority > p->rt_priority &&
- param->sched_priority >
- p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
- return -EPERM;
+ if (is_rt_policy(policy)) {
+ unsigned long rlim_rtprio;
+ unsigned long flags;
+
+ if (!lock_task_sighand(p, &flags))
+ return -ESRCH;
+ rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+ unlock_task_sighand(p, &flags);
+
+ /* can't set/change the rt policy */
+ if (policy != p->policy && !rlim_rtprio)
+ return -EPERM;
+
+ /* can't increase priority */
+ if (param->sched_priority > p->rt_priority &&
+ param->sched_priority > rlim_rtprio)
+ return -EPERM;
+ }
+
/* can't change other user's priorities */
if ((current->euid != p->euid) &&
(current->euid != p->uid))
@@ -3710,14 +4142,20 @@ recheck:
if (retval)
return retval;
/*
+ * make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ */
+ spin_lock_irqsave(&p->pi_lock, flags);
+ /*
* To be able to change p->policy safely, the apropriate
* runqueue lock must be held.
*/
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
array = p->array;
@@ -3738,7 +4176,11 @@ recheck:
} else if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ rt_mutex_adjust_pi(p);
+
return 0;
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -3746,22 +4188,22 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
- int retval;
struct sched_param lparam;
struct task_struct *p;
+ int retval;
if (!param || pid < 0)
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
- read_lock_irq(&tasklist_lock);
+
+ rcu_read_lock();
+ retval = -ESRCH;
p = find_process_by_pid(pid);
- if (!p) {
- read_unlock_irq(&tasklist_lock);
- return -ESRCH;
- }
- retval = sched_setscheduler(p, policy, &lparam);
- read_unlock_irq(&tasklist_lock);
+ if (p != NULL)
+ retval = sched_setscheduler(p, policy, &lparam);
+ rcu_read_unlock();
+
return retval;
}
@@ -3797,8 +4239,8 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
*/
asmlinkage long sys_sched_getscheduler(pid_t pid)
{
+ struct task_struct *p;
int retval = -EINVAL;
- task_t *p;
if (pid < 0)
goto out_nounlock;
@@ -3825,8 +4267,8 @@ out_nounlock:
asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
{
struct sched_param lp;
+ struct task_struct *p;
int retval = -EINVAL;
- task_t *p;
if (!param || pid < 0)
goto out_nounlock;
@@ -3859,9 +4301,9 @@ out_unlock:
long sched_setaffinity(pid_t pid, cpumask_t new_mask)
{
- task_t *p;
- int retval;
cpumask_t cpus_allowed;
+ struct task_struct *p;
+ int retval;
lock_cpu_hotplug();
read_lock(&tasklist_lock);
@@ -3947,8 +4389,8 @@ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
long sched_getaffinity(pid_t pid, cpumask_t *mask)
{
+ struct task_struct *p;
int retval;
- task_t *p;
lock_cpu_hotplug();
read_lock(&tasklist_lock);
@@ -4007,9 +4449,8 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
*/
asmlinkage long sys_sched_yield(void)
{
- runqueue_t *rq = this_rq_lock();
- prio_array_t *array = current->array;
- prio_array_t *target = rq->expired;
+ struct rq *rq = this_rq_lock();
+ struct prio_array *array = current->array, *target = rq->expired;
schedstat_inc(rq, yld_cnt);
/*
@@ -4043,6 +4484,7 @@ asmlinkage long sys_sched_yield(void)
* no need to preempt or enable interrupts:
*/
__release(rq->lock);
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
_raw_spin_unlock(&rq->lock);
preempt_enable_no_resched();
@@ -4051,7 +4493,16 @@ asmlinkage long sys_sched_yield(void)
return 0;
}
-static inline void __cond_resched(void)
+static inline int __resched_legal(int expected_preempt_count)
+{
+ if (unlikely(preempt_count() != expected_preempt_count))
+ return 0;
+ if (unlikely(system_state != SYSTEM_RUNNING))
+ return 0;
+ return 1;
+}
+
+static void __cond_resched(void)
{
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
__might_sleep(__FILE__, __LINE__);
@@ -4061,10 +4512,6 @@ static inline void __cond_resched(void)
* PREEMPT_ACTIVE, which could trigger a second
* cond_resched() call.
*/
- if (unlikely(preempt_count()))
- return;
- if (unlikely(system_state != SYSTEM_RUNNING))
- return;
do {
add_preempt_count(PREEMPT_ACTIVE);
schedule();
@@ -4074,13 +4521,12 @@ static inline void __cond_resched(void)
int __sched cond_resched(void)
{
- if (need_resched()) {
+ if (need_resched() && __resched_legal(0)) {
__cond_resched();
return 1;
}
return 0;
}
-
EXPORT_SYMBOL(cond_resched);
/*
@@ -4101,7 +4547,8 @@ int cond_resched_lock(spinlock_t *lock)
ret = 1;
spin_lock(lock);
}
- if (need_resched()) {
+ if (need_resched() && __resched_legal(1)) {
+ spin_release(&lock->dep_map, 1, _THIS_IP_);
_raw_spin_unlock(lock);
preempt_enable_no_resched();
__cond_resched();
@@ -4110,25 +4557,24 @@ int cond_resched_lock(spinlock_t *lock)
}
return ret;
}
-
EXPORT_SYMBOL(cond_resched_lock);
int __sched cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
- if (need_resched()) {
- __local_bh_enable();
+ if (need_resched() && __resched_legal(0)) {
+ raw_local_irq_disable();
+ _local_bh_enable();
+ raw_local_irq_enable();
__cond_resched();
local_bh_disable();
return 1;
}
return 0;
}
-
EXPORT_SYMBOL(cond_resched_softirq);
-
/**
* yield - yield the current processor to other threads.
*
@@ -4140,7 +4586,6 @@ void __sched yield(void)
set_current_state(TASK_RUNNING);
sys_sched_yield();
}
-
EXPORT_SYMBOL(yield);
/*
@@ -4152,23 +4597,26 @@ EXPORT_SYMBOL(yield);
*/
void __sched io_schedule(void)
{
- struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
+ struct rq *rq = &__raw_get_cpu_var(runqueues);
+ delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
schedule();
atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
}
-
EXPORT_SYMBOL(io_schedule);
long __sched io_schedule_timeout(long timeout)
{
- struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
+ struct rq *rq = &__raw_get_cpu_var(runqueues);
long ret;
+ delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
ret = schedule_timeout(timeout);
atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
return ret;
}
@@ -4230,9 +4678,9 @@ asmlinkage long sys_sched_get_priority_min(int policy)
asmlinkage
long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
{
+ struct task_struct *p;
int retval = -EINVAL;
struct timespec t;
- task_t *p;
if (pid < 0)
goto out_nounlock;
@@ -4247,7 +4695,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
if (retval)
goto out_unlock;
- jiffies_to_timespec(p->policy & SCHED_FIFO ?
+ jiffies_to_timespec(p->policy == SCHED_FIFO ?
0 : task_timeslice(p), &t);
read_unlock(&tasklist_lock);
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -4260,35 +4708,36 @@ out_unlock:
static inline struct task_struct *eldest_child(struct task_struct *p)
{
- if (list_empty(&p->children)) return NULL;
+ if (list_empty(&p->children))
+ return NULL;
return list_entry(p->children.next,struct task_struct,sibling);
}
static inline struct task_struct *older_sibling(struct task_struct *p)
{
- if (p->sibling.prev==&p->parent->children) return NULL;
+ if (p->sibling.prev==&p->parent->children)
+ return NULL;
return list_entry(p->sibling.prev,struct task_struct,sibling);
}
static inline struct task_struct *younger_sibling(struct task_struct *p)
{
- if (p->sibling.next==&p->parent->children) return NULL;
+ if (p->sibling.next==&p->parent->children)
+ return NULL;
return list_entry(p->sibling.next,struct task_struct,sibling);
}
-static void show_task(task_t *p)
+static const char stat_nam[] = "RSDTtZX";
+
+static void show_task(struct task_struct *p)
{
- task_t *relative;
- unsigned state;
+ struct task_struct *relative;
unsigned long free = 0;
- static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
+ unsigned state;
- printk("%-13.13s ", p->comm);
state = p->state ? __ffs(p->state) + 1 : 0;
- if (state < ARRAY_SIZE(stat_nam))
- printk(stat_nam[state]);
- else
- printk("?");
+ printk("%-13.13s %c", p->comm,
+ state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if (BITS_PER_LONG == 32)
if (state == TASK_RUNNING)
printk(" running ");
@@ -4332,7 +4781,7 @@ static void show_task(task_t *p)
void show_state(void)
{
- task_t *g, *p;
+ struct task_struct *g, *p;
#if (BITS_PER_LONG == 32)
printk("\n"
@@ -4354,7 +4803,7 @@ void show_state(void)
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
- mutex_debug_show_all_locks();
+ debug_show_all_locks();
}
/**
@@ -4365,15 +4814,15 @@ void show_state(void)
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void __devinit init_idle(task_t *idle, int cpu)
+void __devinit init_idle(struct task_struct *idle, int cpu)
{
- runqueue_t *rq = cpu_rq(cpu);
+ struct rq *rq = cpu_rq(cpu);
unsigned long flags;
idle->timestamp = sched_clock();
idle->sleep_avg = 0;
idle->array = NULL;
- idle->prio = MAX_PRIO;
+ idle->prio = idle->normal_prio = MAX_PRIO;
idle->state = TASK_RUNNING;
idle->cpus_allowed = cpumask_of_cpu(cpu);
set_task_cpu(idle, cpu);
@@ -4406,7 +4855,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
/*
* This is how migration works:
*
- * 1) we queue a migration_req_t structure in the source CPU's
+ * 1) we queue a struct migration_req structure in the source CPU's
* runqueue and wake up that CPU's migration thread.
* 2) we down() the locked semaphore => thread blocks.
* 3) migration thread wakes up (implicitly it forces the migrated
@@ -4428,12 +4877,12 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
-int set_cpus_allowed(task_t *p, cpumask_t new_mask)
+int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
{
+ struct migration_req req;
unsigned long flags;
+ struct rq *rq;
int ret = 0;
- migration_req_t req;
- runqueue_t *rq;
rq = task_rq_lock(p, &flags);
if (!cpus_intersects(new_mask, cpu_online_map)) {
@@ -4456,9 +4905,9 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask)
}
out:
task_rq_unlock(rq, &flags);
+
return ret;
}
-
EXPORT_SYMBOL_GPL(set_cpus_allowed);
/*
@@ -4469,13 +4918,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
*
* So we race with normal scheduler movements, but that's OK, as long
* as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
*/
-static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
- runqueue_t *rq_dest, *rq_src;
+ struct rq *rq_dest, *rq_src;
+ int ret = 0;
if (unlikely(cpu_is_offline(dest_cpu)))
- return;
+ return ret;
rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);
@@ -4499,13 +4951,14 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
p->timestamp = p->timestamp - rq_src->timestamp_last_tick
+ rq_dest->timestamp_last_tick;
deactivate_task(p, rq_src);
- activate_task(p, rq_dest, 0);
+ __activate_task(p, rq_dest);
if (TASK_PREEMPTS_CURR(p, rq_dest))
resched_task(rq_dest->curr);
}
-
+ ret = 1;
out:
double_rq_unlock(rq_src, rq_dest);
+ return ret;
}
/*
@@ -4515,16 +4968,16 @@ out:
*/
static int migration_thread(void *data)
{
- runqueue_t *rq;
int cpu = (long)data;
+ struct rq *rq;
rq = cpu_rq(cpu);
BUG_ON(rq->migration_thread != current);
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
+ struct migration_req *req;
struct list_head *head;
- migration_req_t *req;
try_to_freeze();
@@ -4548,7 +5001,7 @@ static int migration_thread(void *data)
set_current_state(TASK_INTERRUPTIBLE);
continue;
}
- req = list_entry(head->next, migration_req_t, list);
+ req = list_entry(head->next, struct migration_req, list);
list_del_init(head->next);
spin_unlock(&rq->lock);
@@ -4573,36 +5026,42 @@ wait_to_die:
#ifdef CONFIG_HOTPLUG_CPU
/* Figure out where task on dead CPU should go, use force if neccessary. */
-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
+static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
- int dest_cpu;
+ unsigned long flags;
cpumask_t mask;
+ struct rq *rq;
+ int dest_cpu;
+restart:
/* On same node? */
mask = node_to_cpumask(cpu_to_node(dead_cpu));
- cpus_and(mask, mask, tsk->cpus_allowed);
+ cpus_and(mask, mask, p->cpus_allowed);
dest_cpu = any_online_cpu(mask);
/* On any allowed CPU? */
if (dest_cpu == NR_CPUS)
- dest_cpu = any_online_cpu(tsk->cpus_allowed);
+ dest_cpu = any_online_cpu(p->cpus_allowed);
/* No more Mr. Nice Guy. */
if (dest_cpu == NR_CPUS) {
- cpus_setall(tsk->cpus_allowed);
- dest_cpu = any_online_cpu(tsk->cpus_allowed);
+ rq = task_rq_lock(p, &flags);
+ cpus_setall(p->cpus_allowed);
+ dest_cpu = any_online_cpu(p->cpus_allowed);
+ task_rq_unlock(rq, &flags);
/*
* Don't tell them about moving exiting tasks or
* kernel threads (both mm NULL), since they never
* leave kernel.
*/
- if (tsk->mm && printk_ratelimit())
+ if (p->mm && printk_ratelimit())
printk(KERN_INFO "process %d (%s) no "
"longer affine to cpu%d\n",
- tsk->pid, tsk->comm, dead_cpu);
+ p->pid, p->comm, dead_cpu);
}
- __migrate_task(tsk, dead_cpu, dest_cpu);
+ if (!__migrate_task(p, dead_cpu, dest_cpu))
+ goto restart;
}
/*
@@ -4612,9 +5071,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
* their home CPUs. So we just add the counter to another CPU's counter,
* to keep the global sum constant after CPU-down:
*/
-static void migrate_nr_uninterruptible(runqueue_t *rq_src)
+static void migrate_nr_uninterruptible(struct rq *rq_src)
{
- runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
+ struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
unsigned long flags;
local_irq_save(flags);
@@ -4628,48 +5087,51 @@ static void migrate_nr_uninterruptible(runqueue_t *rq_src)
/* Run through task list and migrate tasks from the dead cpu. */
static void migrate_live_tasks(int src_cpu)
{
- struct task_struct *tsk, *t;
+ struct task_struct *p, *t;
write_lock_irq(&tasklist_lock);
- do_each_thread(t, tsk) {
- if (tsk == current)
+ do_each_thread(t, p) {
+ if (p == current)
continue;
- if (task_cpu(tsk) == src_cpu)
- move_task_off_dead_cpu(src_cpu, tsk);
- } while_each_thread(t, tsk);
+ if (task_cpu(p) == src_cpu)
+ move_task_off_dead_cpu(src_cpu, p);
+ } while_each_thread(t, p);
write_unlock_irq(&tasklist_lock);
}
/* Schedules idle task to be the next runnable task on current CPU.
* It does so by boosting its priority to highest possible and adding it to
- * the _front_ of runqueue. Used by CPU offline code.
+ * the _front_ of the runqueue. Used by CPU offline code.
*/
void sched_idle_next(void)
{
- int cpu = smp_processor_id();
- runqueue_t *rq = this_rq();
+ int this_cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(this_cpu);
struct task_struct *p = rq->idle;
unsigned long flags;
/* cpu has to be offline */
- BUG_ON(cpu_online(cpu));
+ BUG_ON(cpu_online(this_cpu));
- /* Strictly not necessary since rest of the CPUs are stopped by now
- * and interrupts disabled on current cpu.
+ /*
+ * Strictly not necessary since rest of the CPUs are stopped by now
+ * and interrupts disabled on the current cpu.
*/
spin_lock_irqsave(&rq->lock, flags);
__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
- /* Add idle task to _front_ of it's priority queue */
+
+ /* Add idle task to the _front_ of its priority queue: */
__activate_idle_task(p, rq);
spin_unlock_irqrestore(&rq->lock, flags);
}
-/* Ensures that the idle task is using init_mm right before its cpu goes
+/*
+ * Ensures that the idle task is using init_mm right before its cpu goes
* offline.
*/
void idle_task_exit(void)
@@ -4683,17 +5145,17 @@ void idle_task_exit(void)
mmdrop(mm);
}
-static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
+static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
{
- struct runqueue *rq = cpu_rq(dead_cpu);
+ struct rq *rq = cpu_rq(dead_cpu);
/* Must be exiting, otherwise would be on tasklist. */
- BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
+ BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
/* Cannot have done final schedule yet: would have vanished. */
- BUG_ON(tsk->flags & PF_DEAD);
+ BUG_ON(p->state == TASK_DEAD);
- get_task_struct(tsk);
+ get_task_struct(p);
/*
* Drop lock around migration; if someone else moves it,
@@ -4701,25 +5163,25 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
* fine.
*/
spin_unlock_irq(&rq->lock);
- move_task_off_dead_cpu(dead_cpu, tsk);
+ move_task_off_dead_cpu(dead_cpu, p);
spin_lock_irq(&rq->lock);
- put_task_struct(tsk);
+ put_task_struct(p);
}
/* release_task() removes task from tasklist, so we won't find dead tasks. */
static void migrate_dead_tasks(unsigned int dead_cpu)
{
- unsigned arr, i;
- struct runqueue *rq = cpu_rq(dead_cpu);
+ struct rq *rq = cpu_rq(dead_cpu);
+ unsigned int arr, i;
for (arr = 0; arr < 2; arr++) {
for (i = 0; i < MAX_PRIO; i++) {
struct list_head *list = &rq->arrays[arr].queue[i];
+
while (!list_empty(list))
- migrate_dead(dead_cpu,
- list_entry(list->next, task_t,
- run_list));
+ migrate_dead(dead_cpu, list_entry(list->next,
+ struct task_struct, run_list));
}
}
}
@@ -4729,13 +5191,13 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
*/
-static int migration_call(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+static int __cpuinit
+migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
- int cpu = (long)hcpu;
struct task_struct *p;
- struct runqueue *rq;
+ int cpu = (long)hcpu;
unsigned long flags;
+ struct rq *rq;
switch (action) {
case CPU_UP_PREPARE:
@@ -4750,18 +5212,23 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
task_rq_unlock(rq, &flags);
cpu_rq(cpu)->migration_thread = p;
break;
+
case CPU_ONLINE:
/* Strictly unneccessary, as first user will wake it. */
wake_up_process(cpu_rq(cpu)->migration_thread);
break;
+
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ if (!cpu_rq(cpu)->migration_thread)
+ break;
/* Unbind it from offline cpu so it can run. Fall thru. */
kthread_bind(cpu_rq(cpu)->migration_thread,
any_online_cpu(cpu_online_map));
kthread_stop(cpu_rq(cpu)->migration_thread);
cpu_rq(cpu)->migration_thread = NULL;
break;
+
case CPU_DEAD:
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
@@ -4782,9 +5249,10 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
* the requestors. */
spin_lock_irq(&rq->lock);
while (!list_empty(&rq->migration_queue)) {
- migration_req_t *req;
+ struct migration_req *req;
+
req = list_entry(rq->migration_queue.next,
- migration_req_t, list);
+ struct migration_req, list);
list_del_init(&req->list);
complete(&req->done);
}
@@ -4798,7 +5266,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
/* Register at highest priority so that task migration (migrate_all_tasks)
* happens before everything else.
*/
-static struct notifier_block migration_notifier = {
+static struct notifier_block __cpuinitdata migration_notifier = {
.notifier_call = migration_call,
.priority = 10
};
@@ -4806,10 +5274,14 @@ static struct notifier_block migration_notifier = {
int __init migration_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
- /* Start one for boot CPU. */
- migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+ int err;
+
+ /* Start one for the boot CPU: */
+ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+ BUG_ON(err == NOTIFY_BAD);
migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier);
+
return 0;
}
#endif
@@ -4905,7 +5377,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
} while (sd);
}
#else
-#define sched_domain_debug(sd, cpu) {}
+# define sched_domain_debug(sd, cpu) do { } while (0)
#endif
static int sd_degenerate(struct sched_domain *sd)
@@ -4931,8 +5403,8 @@ static int sd_degenerate(struct sched_domain *sd)
return 1;
}
-static int sd_parent_degenerate(struct sched_domain *sd,
- struct sched_domain *parent)
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
unsigned long cflags = sd->flags, pflags = parent->flags;
@@ -4965,7 +5437,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
*/
static void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
- runqueue_t *rq = cpu_rq(cpu);
+ struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
/* Remove the sched domains which do not contribute to scheduling. */
@@ -5227,8 +5699,8 @@ static void touch_cache(void *__cache, unsigned long __size)
/*
* Measure the cache-cost of one task migration. Returns in units of nsec.
*/
-static unsigned long long measure_one(void *cache, unsigned long size,
- int source, int target)
+static unsigned long long
+measure_one(void *cache, unsigned long size, int source, int target)
{
cpumask_t mask, saved_mask;
unsigned long long t0, t1, t2, t3, cost;
@@ -5380,7 +5852,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
cache = vmalloc(max_size);
if (!cache) {
printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
- return 1000000; // return 1 msec on very small boxen
+ return 1000000; /* return 1 msec on very small boxen */
}
while (size <= max_size) {
@@ -5578,9 +6050,9 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
*/
static cpumask_t sched_domain_node_span(int node)
{
- int i;
- cpumask_t span, nodemask;
DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+ cpumask_t span, nodemask;
+ int i;
cpus_clear(span);
bitmap_zero(used_nodes, MAX_NUMNODES);
@@ -5591,6 +6063,7 @@ static cpumask_t sched_domain_node_span(int node)
for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
int next_node = find_next_best_node(node, used_nodes);
+
nodemask = node_to_cpumask(next_node);
cpus_or(span, span, nodemask);
}
@@ -5599,22 +6072,27 @@ static cpumask_t sched_domain_node_span(int node)
}
#endif
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+
/*
- * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
- * can switch it on easily if needed.
+ * SMT sched-domains:
*/
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
+
static int cpu_to_cpu_group(int cpu)
{
return cpu;
}
#endif
+/*
+ * multi-core sched-domains:
+ */
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group sched_group_core[NR_CPUS];
+static struct sched_group *sched_group_core_bycpu[NR_CPUS];
#endif
#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5630,10 +6108,11 @@ static int cpu_to_core_group(int cpu)
#endif
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
+static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
+
static int cpu_to_phys_group(int cpu)
{
-#if defined(CONFIG_SCHED_MC)
+#ifdef CONFIG_SCHED_MC
cpumask_t mask = cpu_coregroup_map(cpu);
return first_cpu(mask);
#elif defined(CONFIG_SCHED_SMT)
@@ -5687,13 +6166,74 @@ next_sg:
}
#endif
+/* Free memory allocated for various sched_group structures */
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+ int cpu;
+#ifdef CONFIG_NUMA
+ int i;
+
+ for_each_cpu_mask(cpu, *cpu_map) {
+ struct sched_group *sched_group_allnodes
+ = sched_group_allnodes_bycpu[cpu];
+ struct sched_group **sched_group_nodes
+ = sched_group_nodes_bycpu[cpu];
+
+ if (sched_group_allnodes) {
+ kfree(sched_group_allnodes);
+ sched_group_allnodes_bycpu[cpu] = NULL;
+ }
+
+ if (!sched_group_nodes)
+ continue;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cpumask_t nodemask = node_to_cpumask(i);
+ struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask))
+ continue;
+
+ if (sg == NULL)
+ continue;
+ sg = sg->next;
+next_sg:
+ oldsg = sg;
+ sg = sg->next;
+ kfree(oldsg);
+ if (oldsg != sched_group_nodes[i])
+ goto next_sg;
+ }
+ kfree(sched_group_nodes);
+ sched_group_nodes_bycpu[cpu] = NULL;
+ }
+#endif
+ for_each_cpu_mask(cpu, *cpu_map) {
+ if (sched_group_phys_bycpu[cpu]) {
+ kfree(sched_group_phys_bycpu[cpu]);
+ sched_group_phys_bycpu[cpu] = NULL;
+ }
+#ifdef CONFIG_SCHED_MC
+ if (sched_group_core_bycpu[cpu]) {
+ kfree(sched_group_core_bycpu[cpu]);
+ sched_group_core_bycpu[cpu] = NULL;
+ }
+#endif
+ }
+}
+
/*
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
*/
-void build_sched_domains(const cpumask_t *cpu_map)
+static int build_sched_domains(const cpumask_t *cpu_map)
{
int i;
+ struct sched_group *sched_group_phys = NULL;
+#ifdef CONFIG_SCHED_MC
+ struct sched_group *sched_group_core = NULL;
+#endif
#ifdef CONFIG_NUMA
struct sched_group **sched_group_nodes = NULL;
struct sched_group *sched_group_allnodes = NULL;
@@ -5701,11 +6241,11 @@ void build_sched_domains(const cpumask_t *cpu_map)
/*
* Allocate the per-node list of sched groups
*/
- sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
- GFP_ATOMIC);
+ sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+ GFP_KERNEL);
if (!sched_group_nodes) {
printk(KERN_WARNING "Can not alloc sched group node list\n");
- return;
+ return -ENOMEM;
}
sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
#endif
@@ -5731,7 +6271,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
if (!sched_group_allnodes) {
printk(KERN_WARNING
"Can not alloc allnodes sched group\n");
- break;
+ goto error;
}
sched_group_allnodes_bycpu[i]
= sched_group_allnodes;
@@ -5752,6 +6292,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
cpus_and(sd->span, sd->span, *cpu_map);
#endif
+ if (!sched_group_phys) {
+ sched_group_phys
+ = kmalloc(sizeof(struct sched_group) * NR_CPUS,
+ GFP_KERNEL);
+ if (!sched_group_phys) {
+ printk (KERN_WARNING "Can not alloc phys sched"
+ "group\n");
+ goto error;
+ }
+ sched_group_phys_bycpu[i] = sched_group_phys;
+ }
+
p = sd;
sd = &per_cpu(phys_domains, i);
group = cpu_to_phys_group(i);
@@ -5761,6 +6313,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
sd->groups = &sched_group_phys[group];
#ifdef CONFIG_SCHED_MC
+ if (!sched_group_core) {
+ sched_group_core
+ = kmalloc(sizeof(struct sched_group) * NR_CPUS,
+ GFP_KERNEL);
+ if (!sched_group_core) {
+ printk (KERN_WARNING "Can not alloc core sched"
+ "group\n");
+ goto error;
+ }
+ sched_group_core_bycpu[i] = sched_group_core;
+ }
+
p = sd;
sd = &per_cpu(core_domains, i);
group = cpu_to_core_group(i);
@@ -5844,24 +6408,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
domainspan = sched_domain_node_span(i);
cpus_and(domainspan, domainspan, *cpu_map);
- sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+ if (!sg) {
+ printk(KERN_WARNING "Can not alloc domain group for "
+ "node %d\n", i);
+ goto error;
+ }
sched_group_nodes[i] = sg;
for_each_cpu_mask(j, nodemask) {
struct sched_domain *sd;
sd = &per_cpu(node_domains, j);
sd->groups = sg;
- if (sd->groups == NULL) {
- /* Turn off balancing if we have no groups */
- sd->flags = 0;
- }
- }
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", i);
- continue;
}
sg->cpu_power = 0;
sg->cpumask = nodemask;
+ sg->next = sg;
cpus_or(covered, covered, nodemask);
prev = sg;
@@ -5880,54 +6441,90 @@ void build_sched_domains(const cpumask_t *cpu_map)
if (cpus_empty(tmp))
continue;
- sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ sg = kmalloc_node(sizeof(struct sched_group),
+ GFP_KERNEL, i);
if (!sg) {
printk(KERN_WARNING
"Can not alloc domain group for node %d\n", j);
- break;
+ goto error;
}
sg->cpu_power = 0;
sg->cpumask = tmp;
+ sg->next = prev->next;
cpus_or(covered, covered, tmp);
prev->next = sg;
prev = sg;
}
- prev->next = sched_group_nodes[i];
}
#endif
/* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
for_each_cpu_mask(i, *cpu_map) {
- int power;
struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
- power = SCHED_LOAD_SCALE;
- sd->groups->cpu_power = power;
+ sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ }
#endif
#ifdef CONFIG_SCHED_MC
+ for_each_cpu_mask(i, *cpu_map) {
+ int power;
+ struct sched_domain *sd;
sd = &per_cpu(core_domains, i);
- power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+ if (sched_smt_power_savings)
+ power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+ else
+ power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
* SCHED_LOAD_SCALE / 10;
sd->groups->cpu_power = power;
+ }
+#endif
+ for_each_cpu_mask(i, *cpu_map) {
+ struct sched_domain *sd;
+#ifdef CONFIG_SCHED_MC
sd = &per_cpu(phys_domains, i);
+ if (i != first_cpu(sd->groups->cpumask))
+ continue;
- /*
- * This has to be < 2 * SCHED_LOAD_SCALE
- * Lets keep it SCHED_LOAD_SCALE, so that
- * while calculating NUMA group's cpu_power
- * we can simply do
- * numa_group->cpu_power += phys_group->cpu_power;
- *
- * See "only add power once for each physical pkg"
- * comment below
- */
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ sd->groups->cpu_power = 0;
+ if (sched_mc_power_savings || sched_smt_power_savings) {
+ int j;
+
+ for_each_cpu_mask(j, sd->groups->cpumask) {
+ struct sched_domain *sd1;
+ sd1 = &per_cpu(core_domains, j);
+ /*
+ * for each core we will add once
+ * to the group in physical domain
+ */
+ if (j != first_cpu(sd1->groups->cpumask))
+ continue;
+
+ if (sched_smt_power_savings)
+ sd->groups->cpu_power += sd1->groups->cpu_power;
+ else
+ sd->groups->cpu_power += SCHED_LOAD_SCALE;
+ }
+ } else
+ /*
+ * This has to be < 2 * SCHED_LOAD_SCALE
+ * Lets keep it SCHED_LOAD_SCALE, so that
+ * while calculating NUMA group's cpu_power
+ * we can simply do
+ * numa_group->cpu_power += phys_group->cpu_power;
+ *
+ * See "only add power once for each physical pkg"
+ * comment below
+ */
+ sd->groups->cpu_power = SCHED_LOAD_SCALE;
#else
+ int power;
sd = &per_cpu(phys_domains, i);
- power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
- (cpus_weight(sd->groups->cpumask)-1) / 10;
+ if (sched_smt_power_savings)
+ power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+ else
+ power = SCHED_LOAD_SCALE;
sd->groups->cpu_power = power;
#endif
}
@@ -5936,7 +6533,12 @@ void build_sched_domains(const cpumask_t *cpu_map)
for (i = 0; i < MAX_NUMNODES; i++)
init_numa_sched_groups_power(sched_group_nodes[i]);
- init_numa_sched_groups_power(sched_group_allnodes);
+ if (sched_group_allnodes) {
+ int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
+ struct sched_group *sg = &sched_group_allnodes[group];
+
+ init_numa_sched_groups_power(sg);
+ }
#endif
/* Attach the domains */
@@ -5955,13 +6557,20 @@ void build_sched_domains(const cpumask_t *cpu_map)
* Tune cache-hot values:
*/
calibrate_migration_costs(cpu_map);
+
+ return 0;
+
+error:
+ free_sched_groups(cpu_map);
+ return -ENOMEM;
}
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
*/
-static void arch_init_sched_domains(const cpumask_t *cpu_map)
+static int arch_init_sched_domains(const cpumask_t *cpu_map)
{
cpumask_t cpu_default_map;
+ int err;
/*
* Setup mask for cpus without special case scheduling requirements.
@@ -5970,51 +6579,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map)
*/
cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
- build_sched_domains(&cpu_default_map);
+ err = build_sched_domains(&cpu_default_map);
+
+ return err;
}
static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
-#ifdef CONFIG_NUMA
- int i;
- int cpu;
-
- for_each_cpu_mask(cpu, *cpu_map) {
- struct sched_group *sched_group_allnodes
- = sched_group_allnodes_bycpu[cpu];
- struct sched_group **sched_group_nodes
- = sched_group_nodes_bycpu[cpu];
-
- if (sched_group_allnodes) {
- kfree(sched_group_allnodes);
- sched_group_allnodes_bycpu[cpu] = NULL;
- }
-
- if (!sched_group_nodes)
- continue;
-
- for (i = 0; i < MAX_NUMNODES; i++) {
- cpumask_t nodemask = node_to_cpumask(i);
- struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
- cpus_and(nodemask, nodemask, *cpu_map);
- if (cpus_empty(nodemask))
- continue;
-
- if (sg == NULL)
- continue;
- sg = sg->next;
-next_sg:
- oldsg = sg;
- sg = sg->next;
- kfree(oldsg);
- if (oldsg != sched_group_nodes[i])
- goto next_sg;
- }
- kfree(sched_group_nodes);
- sched_group_nodes_bycpu[cpu] = NULL;
- }
-#endif
+ free_sched_groups(cpu_map);
}
/*
@@ -6039,9 +6611,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
* correct sched domains
* Call with hotplug lock held
*/
-void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
{
cpumask_t change_map;
+ int err = 0;
cpus_and(*partition1, *partition1, cpu_online_map);
cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6050,10 +6623,89 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
/* Detach sched domains from all of the affected cpus */
detach_destroy_domains(&change_map);
if (!cpus_empty(*partition1))
- build_sched_domains(partition1);
- if (!cpus_empty(*partition2))
- build_sched_domains(partition2);
+ err = build_sched_domains(partition1);
+ if (!err && !cpus_empty(*partition2))
+ err = build_sched_domains(partition2);
+
+ return err;
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+int arch_reinit_sched_domains(void)
+{
+ int err;
+
+ lock_cpu_hotplug();
+ detach_destroy_domains(&cpu_online_map);
+ err = arch_init_sched_domains(&cpu_online_map);
+ unlock_cpu_hotplug();
+
+ return err;
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+ int ret;
+
+ if (buf[0] != '0' && buf[0] != '1')
+ return -EINVAL;
+
+ if (smt)
+ sched_smt_power_savings = (buf[0] == '1');
+ else
+ sched_mc_power_savings = (buf[0] == '1');
+
+ ret = arch_reinit_sched_domains();
+
+ return ret ? ret : count;
+}
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+ int err = 0;
+
+#ifdef CONFIG_SCHED_SMT
+ if (smt_capable())
+ err = sysfs_create_file(&cls->kset.kobj,
+ &attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+ if (!err && mc_capable())
+ err = sysfs_create_file(&cls->kset.kobj,
+ &attr_sched_mc_power_savings.attr);
+#endif
+ return err;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+{
+ return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
+ const char *buf, size_t count)
+{
+ return sched_power_savings_store(buf, count, 0);
+}
+SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+ sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+{
+ return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
+ const char *buf, size_t count)
+{
+ return sched_power_savings_store(buf, count, 1);
}
+SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+ sched_smt_power_savings_store);
+#endif
+
#ifdef CONFIG_HOTPLUG_CPU
/*
@@ -6108,6 +6760,7 @@ int in_sched_functions(unsigned long addr)
{
/* Linker adds these: start and end of __sched functions */
extern char __sched_text_start[], __sched_text_end[];
+
return in_lock_functions(addr) ||
(addr >= (unsigned long)__sched_text_start
&& addr < (unsigned long)__sched_text_end);
@@ -6115,14 +6768,15 @@ int in_sched_functions(unsigned long addr)
void __init sched_init(void)
{
- runqueue_t *rq;
int i, j, k;
for_each_possible_cpu(i) {
- prio_array_t *array;
+ struct prio_array *array;
+ struct rq *rq;
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
+ lockdep_set_class(&rq->lock, &rq->rq_lock_key);
rq->nr_running = 0;
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
@@ -6134,9 +6788,9 @@ void __init sched_init(void)
rq->cpu_load[j] = 0;
rq->active_balance = 0;
rq->push_cpu = 0;
+ rq->cpu = i;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
- rq->cpu = i;
#endif
atomic_set(&rq->nr_iowait, 0);
@@ -6151,6 +6805,12 @@ void __init sched_init(void)
}
}
+ set_load_weight(&init_task);
+
+#ifdef CONFIG_RT_MUTEXES
+ plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
+#endif
+
/*
* The boot idle thread does lazy MMU switching as well:
*/
@@ -6169,7 +6829,7 @@ void __init sched_init(void)
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
void __might_sleep(char *file, int line)
{
-#if defined(in_atomic)
+#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
if ((in_atomic() || irqs_disabled()) &&
@@ -6191,17 +6851,18 @@ EXPORT_SYMBOL(__might_sleep);
#ifdef CONFIG_MAGIC_SYSRQ
void normalize_rt_tasks(void)
{
+ struct prio_array *array;
struct task_struct *p;
- prio_array_t *array;
unsigned long flags;
- runqueue_t *rq;
+ struct rq *rq;
read_lock_irq(&tasklist_lock);
- for_each_process (p) {
+ for_each_process(p) {
if (!rt_task(p))
continue;
- rq = task_rq_lock(p, &flags);
+ spin_lock_irqsave(&p->pi_lock, flags);
+ rq = __task_rq_lock(p);
array = p->array;
if (array)
@@ -6212,7 +6873,8 @@ void normalize_rt_tasks(void)
resched_task(rq->curr);
}
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
}
read_unlock_irq(&tasklist_lock);
}
@@ -6236,7 +6898,7 @@ void normalize_rt_tasks(void)
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
*/
-task_t *curr_task(int cpu)
+struct task_struct *curr_task(int cpu)
{
return cpu_curr(cpu);
}
@@ -6256,7 +6918,7 @@ task_t *curr_task(int cpu)
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
*/
-void set_curr_task(int cpu, task_t *p)
+void set_curr_task(int cpu, struct task_struct *p)
{
cpu_curr(cpu) = p;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 1b3c921737e2..fb5da6d19f14 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -10,7 +10,6 @@
* to allow signals to be sent reliably.
*/
-#include <linux/config.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/smp_lock.h>
@@ -418,9 +417,8 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
siginfo_t *info)
{
- int sig = 0;
+ int sig = next_signal(pending, mask);
- sig = next_signal(pending, mask);
if (sig) {
if (current->notifier) {
if (sigismember(current->notifier_mask, sig)) {
@@ -433,9 +431,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
if (!collect_signal(sig, pending, info))
sig = 0;
-
}
- recalc_sigpending();
return sig;
}
@@ -452,6 +448,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
if (!signr)
signr = __dequeue_signal(&tsk->signal->shared_pending,
mask, info);
+ recalc_sigpending_tsk(tsk);
if (signr && unlikely(sig_kernel_stop(signr))) {
/*
* Set a marker that we have dequeued a stop signal. Our
@@ -584,7 +581,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
&& !capable(CAP_KILL))
return error;
- error = security_task_kill(t, info, sig);
+ error = security_task_kill(t, info, sig, 0);
if (!error)
audit_signal_info(sig, t); /* Let audit system see the signal */
return error;
@@ -792,22 +789,31 @@ out:
/*
* Force a signal that the process can't ignore: if necessary
* we unblock the signal and change any SIG_IGN to SIG_DFL.
+ *
+ * Note: If we unblock the signal, we always reset it to SIG_DFL,
+ * since we do not want to have a signal handler that was blocked
+ * be invoked when user space had explicitly blocked it.
+ *
+ * We don't want to have recursive SIGSEGV's etc, for example.
*/
-
int
force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
unsigned long int flags;
- int ret;
+ int ret, blocked, ignored;
+ struct k_sigaction *action;
spin_lock_irqsave(&t->sighand->siglock, flags);
- if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) {
- t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
- }
- if (sigismember(&t->blocked, sig)) {
- sigdelset(&t->blocked, sig);
+ action = &t->sighand->action[sig-1];
+ ignored = action->sa.sa_handler == SIG_IGN;
+ blocked = sigismember(&t->blocked, sig);
+ if (blocked || ignored) {
+ action->sa.sa_handler = SIG_DFL;
+ if (blocked) {
+ sigdelset(&t->blocked, sig);
+ recalc_sigpending_tsk(t);
+ }
}
- recalc_sigpending_tsk(t);
ret = specific_send_sig_info(sig, info, t);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
@@ -1107,7 +1113,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
/* like kill_proc_info(), but doesn't use uid/euid of "current" */
int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
- uid_t uid, uid_t euid)
+ uid_t uid, uid_t euid, u32 secid)
{
int ret = -EINVAL;
struct task_struct *p;
@@ -1127,6 +1133,9 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
ret = -EPERM;
goto out_unlock;
}
+ ret = security_task_kill(p, info, sig, secid);
+ if (ret)
+ goto out_unlock;
if (sig && p->sighand) {
unsigned long flags;
spin_lock_irqsave(&p->sighand->siglock, flags);
@@ -1531,6 +1540,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
spin_unlock_irqrestore(&sighand->siglock, flags);
}
+static inline int may_ptrace_stop(void)
+{
+ if (!likely(current->ptrace & PT_PTRACED))
+ return 0;
+
+ if (unlikely(current->parent == current->real_parent &&
+ (current->ptrace & PT_ATTACHED)))
+ return 0;
+
+ if (unlikely(current->signal == current->parent->signal) &&
+ unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))
+ return 0;
+
+ /*
+ * Are we in the middle of do_coredump?
+ * If so and our tracer is also part of the coredump stopping
+ * is a deadlock situation, and pointless because our tracer
+ * is dead so don't allow us to stop.
+ * If SIGKILL was already sent before the caller unlocked
+ * ->siglock we must see ->core_waiters != 0. Otherwise it
+ * is safe to enter schedule().
+ */
+ if (unlikely(current->mm->core_waiters) &&
+ unlikely(current->mm == current->parent->mm))
+ return 0;
+
+ return 1;
+}
+
/*
* This must be called with current->sighand->siglock held.
*
@@ -1559,11 +1597,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
spin_unlock_irq(&current->sighand->siglock);
try_to_freeze();
read_lock(&tasklist_lock);
- if (likely(current->ptrace & PT_PTRACED) &&
- likely(current->parent != current->real_parent ||
- !(current->ptrace & PT_ATTACHED)) &&
- (likely(current->parent->signal != current->signal) ||
- !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
+ if (may_ptrace_stop()) {
do_notify_parent_cldstop(current, CLD_TRAPPED);
read_unlock(&tasklist_lock);
schedule();
@@ -2541,6 +2575,11 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
}
#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ return NULL;
+}
+
void __init signals_init(void)
{
sigqueue_cachep =
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 336f92d64e2e..bf25015dce16 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -62,6 +62,137 @@ static inline void wakeup_softirqd(void)
}
/*
+ * This one is for softirq.c-internal use,
+ * where hardirqs are disabled legitimately:
+ */
+#ifdef CONFIG_TRACE_IRQFLAGS
+static void __local_bh_disable(unsigned long ip)
+{
+ unsigned long flags;
+
+ WARN_ON_ONCE(in_irq());
+
+ raw_local_irq_save(flags);
+ add_preempt_count(SOFTIRQ_OFFSET);
+ /*
+ * Were softirqs turned off above:
+ */
+ if (softirq_count() == SOFTIRQ_OFFSET)
+ trace_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+}
+#else /* !CONFIG_TRACE_IRQFLAGS */
+static inline void __local_bh_disable(unsigned long ip)
+{
+ add_preempt_count(SOFTIRQ_OFFSET);
+ barrier();
+}
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
+void local_bh_disable(void)
+{
+ __local_bh_disable((unsigned long)__builtin_return_address(0));
+}
+
+EXPORT_SYMBOL(local_bh_disable);
+
+void __local_bh_enable(void)
+{
+ WARN_ON_ONCE(in_irq());
+
+ /*
+ * softirqs should never be enabled by __local_bh_enable(),
+ * it always nests inside local_bh_enable() sections:
+ */
+ WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
+
+ sub_preempt_count(SOFTIRQ_OFFSET);
+}
+EXPORT_SYMBOL_GPL(__local_bh_enable);
+
+/*
+ * Special-case - softirqs can safely be enabled in
+ * cond_resched_softirq(), or by __do_softirq(),
+ * without processing still-pending softirqs:
+ */
+void _local_bh_enable(void)
+{
+ WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (softirq_count() == SOFTIRQ_OFFSET)
+ trace_softirqs_on((unsigned long)__builtin_return_address(0));
+ sub_preempt_count(SOFTIRQ_OFFSET);
+}
+
+EXPORT_SYMBOL(_local_bh_enable);
+
+void local_bh_enable(void)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+ unsigned long flags;
+
+ WARN_ON_ONCE(in_irq());
+#endif
+ WARN_ON_ONCE(irqs_disabled());
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_save(flags);
+#endif
+ /*
+ * Are softirqs going to be turned on now:
+ */
+ if (softirq_count() == SOFTIRQ_OFFSET)
+ trace_softirqs_on((unsigned long)__builtin_return_address(0));
+ /*
+ * Keep preemption disabled until we are done with
+ * softirq processing:
+ */
+ sub_preempt_count(SOFTIRQ_OFFSET - 1);
+
+ if (unlikely(!in_interrupt() && local_softirq_pending()))
+ do_softirq();
+
+ dec_preempt_count();
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_restore(flags);
+#endif
+ preempt_check_resched();
+}
+EXPORT_SYMBOL(local_bh_enable);
+
+void local_bh_enable_ip(unsigned long ip)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+ unsigned long flags;
+
+ WARN_ON_ONCE(in_irq());
+
+ local_irq_save(flags);
+#endif
+ /*
+ * Are softirqs going to be turned on now:
+ */
+ if (softirq_count() == SOFTIRQ_OFFSET)
+ trace_softirqs_on(ip);
+ /*
+ * Keep preemption disabled until we are done with
+ * softirq processing:
+ */
+ sub_preempt_count(SOFTIRQ_OFFSET - 1);
+
+ if (unlikely(!in_interrupt() && local_softirq_pending()))
+ do_softirq();
+
+ dec_preempt_count();
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_restore(flags);
+#endif
+ preempt_check_resched();
+}
+EXPORT_SYMBOL(local_bh_enable_ip);
+
+/*
* We restart softirq processing MAX_SOFTIRQ_RESTART times,
* and we fall back to softirqd after that.
*
@@ -80,8 +211,11 @@ asmlinkage void __do_softirq(void)
int cpu;
pending = local_softirq_pending();
+ account_system_vtime(current);
+
+ __local_bh_disable((unsigned long)__builtin_return_address(0));
+ trace_softirq_enter();
- local_bh_disable();
cpu = smp_processor_id();
restart:
/* Reset the pending bitmask before enabling irqs */
@@ -109,7 +243,10 @@ restart:
if (pending)
wakeup_softirqd();
- __local_bh_enable();
+ trace_softirq_exit();
+
+ account_system_vtime(current);
+ _local_bh_enable();
}
#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -136,23 +273,6 @@ EXPORT_SYMBOL(do_softirq);
#endif
-void local_bh_enable(void)
-{
- WARN_ON(irqs_disabled());
- /*
- * Keep preemption disabled until we are done with
- * softirq processing:
- */
- sub_preempt_count(SOFTIRQ_OFFSET - 1);
-
- if (unlikely(!in_interrupt() && local_softirq_pending()))
- do_softirq();
-
- dec_preempt_count();
- preempt_check_resched();
-}
-EXPORT_SYMBOL(local_bh_enable);
-
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
# define invoke_softirq() __do_softirq()
#else
@@ -165,6 +285,7 @@ EXPORT_SYMBOL(local_bh_enable);
void irq_exit(void)
{
account_system_vtime(current);
+ trace_hardirq_exit();
sub_preempt_count(IRQ_EXIT_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
@@ -208,8 +329,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
softirq_vec[nr].action = action;
}
-EXPORT_SYMBOL(open_softirq);
-
/* Tasklets */
struct tasklet_head
{
@@ -446,7 +565,7 @@ static void takeover_tasklets(unsigned int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int cpu_callback(struct notifier_block *nfb,
+static int __cpuinit cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
@@ -470,6 +589,8 @@ static int cpu_callback(struct notifier_block *nfb,
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ if (!per_cpu(ksoftirqd, hotcpu))
+ break;
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(ksoftirqd, hotcpu),
any_online_cpu(cpu_online_map));
@@ -484,14 +605,16 @@ static int cpu_callback(struct notifier_block *nfb,
return NOTIFY_OK;
}
-static struct notifier_block cpu_nfb = {
+static struct notifier_block __cpuinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
__init int spawn_ksoftirqd(void)
{
void *cpu = (void *)(long)smp_processor_id();
- cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+
+ BUG_ON(err == NOTIFY_BAD);
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 14c7faf02909..50afeb813305 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
void touch_softlockup_watchdog(void)
{
- per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies;
+ __raw_get_cpu_var(touch_timestamp) = jiffies;
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
/*
* Create/destroy watchdog threads as CPUs come and go:
*/
-static int
+static int __cpuinit
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int hotcpu = (unsigned long)hcpu;
@@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ if (!per_cpu(watchdog_task, hotcpu))
+ break;
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(watchdog_task, hotcpu),
any_online_cpu(cpu_online_map));
@@ -140,15 +142,16 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
return NOTIFY_OK;
}
-static struct notifier_block cpu_nfb = {
+static struct notifier_block __cpuinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
__init void spawn_softlockup_task(void)
{
void *cpu = (void *)(long)smp_processor_id();
+ int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
- cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ BUG_ON(err == NOTIFY_BAD);
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index d1b810782bc4..d48143eafbfd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -7,31 +7,27 @@
*
* This file contains the spinlock/rwlock implementations for the
* SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
+ *
+ * Note that some architectures have special knowledge about the
+ * stack frames of these functions in their profile_pc. If you
+ * change anything significant here that could change the stack
+ * frame contact the architecture maintainers.
*/
-#include <linux/config.h>
#include <linux/linkage.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
#include <linux/module.h>
-/*
- * Generic declaration of the raw read_trylock() function,
- * architectures are supposed to optimize this:
- */
-int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
-{
- __raw_read_lock(lock);
- return 1;
-}
-EXPORT_SYMBOL(generic__raw_read_trylock);
-
int __lockfunc _spin_trylock(spinlock_t *lock)
{
preempt_disable();
- if (_raw_spin_trylock(lock))
+ if (_raw_spin_trylock(lock)) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
return 1;
+ }
preempt_enable();
return 0;
@@ -41,8 +37,10 @@ EXPORT_SYMBOL(_spin_trylock);
int __lockfunc _read_trylock(rwlock_t *lock)
{
preempt_disable();
- if (_raw_read_trylock(lock))
+ if (_raw_read_trylock(lock)) {
+ rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
return 1;
+ }
preempt_enable();
return 0;
@@ -52,19 +50,28 @@ EXPORT_SYMBOL(_read_trylock);
int __lockfunc _write_trylock(rwlock_t *lock)
{
preempt_disable();
- if (_raw_write_trylock(lock))
+ if (_raw_write_trylock(lock)) {
+ rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
return 1;
+ }
preempt_enable();
return 0;
}
EXPORT_SYMBOL(_write_trylock);
-#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP)
+/*
+ * If lockdep is enabled then we use the non-preemption spin-ops
+ * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
+ * not re-enabled during lock-acquire (which the preempt-spin-ops do):
+ */
+#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \
+ defined(CONFIG_DEBUG_LOCK_ALLOC)
void __lockfunc _read_lock(rwlock_t *lock)
{
preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
_raw_read_lock(lock);
}
EXPORT_SYMBOL(_read_lock);
@@ -75,7 +82,17 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
local_irq_save(flags);
preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ /*
+ * On lockdep we dont want the hand-coded irq-enable of
+ * _raw_spin_lock_flags() code, because lockdep assumes
+ * that interrupts are not re-enabled during lock-acquire:
+ */
+#ifdef CONFIG_PROVE_LOCKING
+ _raw_spin_lock(lock);
+#else
_raw_spin_lock_flags(lock, &flags);
+#endif
return flags;
}
EXPORT_SYMBOL(_spin_lock_irqsave);
@@ -84,6 +101,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock)
{
local_irq_disable();
preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_spin_lock(lock);
}
EXPORT_SYMBOL(_spin_lock_irq);
@@ -92,6 +110,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock)
{
local_bh_disable();
preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_spin_lock(lock);
}
EXPORT_SYMBOL(_spin_lock_bh);
@@ -102,6 +121,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
local_irq_save(flags);
preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
_raw_read_lock(lock);
return flags;
}
@@ -111,6 +131,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock)
{
local_irq_disable();
preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
_raw_read_lock(lock);
}
EXPORT_SYMBOL(_read_lock_irq);
@@ -119,6 +140,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock)
{
local_bh_disable();
preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
_raw_read_lock(lock);
}
EXPORT_SYMBOL(_read_lock_bh);
@@ -129,6 +151,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
local_irq_save(flags);
preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_write_lock(lock);
return flags;
}
@@ -138,6 +161,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock)
{
local_irq_disable();
preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_write_lock(lock);
}
EXPORT_SYMBOL(_write_lock_irq);
@@ -146,6 +170,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock)
{
local_bh_disable();
preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_write_lock(lock);
}
EXPORT_SYMBOL(_write_lock_bh);
@@ -153,6 +178,7 @@ EXPORT_SYMBOL(_write_lock_bh);
void __lockfunc _spin_lock(spinlock_t *lock)
{
preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_spin_lock(lock);
}
@@ -161,6 +187,7 @@ EXPORT_SYMBOL(_spin_lock);
void __lockfunc _write_lock(rwlock_t *lock)
{
preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_write_lock(lock);
}
@@ -256,8 +283,22 @@ BUILD_LOCK_OPS(write, rwlock);
#endif /* CONFIG_PREEMPT */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ preempt_disable();
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ _raw_spin_lock(lock);
+}
+
+EXPORT_SYMBOL(_spin_lock_nested);
+
+#endif
+
void __lockfunc _spin_unlock(spinlock_t *lock)
{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
_raw_spin_unlock(lock);
preempt_enable();
}
@@ -265,6 +306,7 @@ EXPORT_SYMBOL(_spin_unlock);
void __lockfunc _write_unlock(rwlock_t *lock)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_write_unlock(lock);
preempt_enable();
}
@@ -272,6 +314,7 @@ EXPORT_SYMBOL(_write_unlock);
void __lockfunc _read_unlock(rwlock_t *lock)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_read_unlock(lock);
preempt_enable();
}
@@ -279,6 +322,7 @@ EXPORT_SYMBOL(_read_unlock);
void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
_raw_spin_unlock(lock);
local_irq_restore(flags);
preempt_enable();
@@ -287,6 +331,7 @@ EXPORT_SYMBOL(_spin_unlock_irqrestore);
void __lockfunc _spin_unlock_irq(spinlock_t *lock)
{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
_raw_spin_unlock(lock);
local_irq_enable();
preempt_enable();
@@ -295,14 +340,16 @@ EXPORT_SYMBOL(_spin_unlock_irq);
void __lockfunc _spin_unlock_bh(spinlock_t *lock)
{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
_raw_spin_unlock(lock);
preempt_enable_no_resched();
- local_bh_enable();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
}
EXPORT_SYMBOL(_spin_unlock_bh);
void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_read_unlock(lock);
local_irq_restore(flags);
preempt_enable();
@@ -311,6 +358,7 @@ EXPORT_SYMBOL(_read_unlock_irqrestore);
void __lockfunc _read_unlock_irq(rwlock_t *lock)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_read_unlock(lock);
local_irq_enable();
preempt_enable();
@@ -319,14 +367,16 @@ EXPORT_SYMBOL(_read_unlock_irq);
void __lockfunc _read_unlock_bh(rwlock_t *lock)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_read_unlock(lock);
preempt_enable_no_resched();
- local_bh_enable();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
}
EXPORT_SYMBOL(_read_unlock_bh);
void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_write_unlock(lock);
local_irq_restore(flags);
preempt_enable();
@@ -335,6 +385,7 @@ EXPORT_SYMBOL(_write_unlock_irqrestore);
void __lockfunc _write_unlock_irq(rwlock_t *lock)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_write_unlock(lock);
local_irq_enable();
preempt_enable();
@@ -343,9 +394,10 @@ EXPORT_SYMBOL(_write_unlock_irq);
void __lockfunc _write_unlock_bh(rwlock_t *lock)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_write_unlock(lock);
preempt_enable_no_resched();
- local_bh_enable();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
}
EXPORT_SYMBOL(_write_unlock_bh);
@@ -353,11 +405,13 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
{
local_bh_disable();
preempt_disable();
- if (_raw_spin_trylock(lock))
+ if (_raw_spin_trylock(lock)) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
return 1;
+ }
preempt_enable_no_resched();
- local_bh_enable();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
return 0;
}
EXPORT_SYMBOL(_spin_trylock_bh);
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
new file mode 100644
index 000000000000..b71816e47a30
--- /dev/null
+++ b/kernel/stacktrace.c
@@ -0,0 +1,24 @@
+/*
+ * kernel/stacktrace.c
+ *
+ * Stack trace management functions
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/sched.h>
+#include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
+
+void print_stack_trace(struct stack_trace *trace, int spaces)
+{
+ int i, j;
+
+ for (i = 0; i < trace->nr_entries; i++) {
+ unsigned long ip = trace->entries[i];
+
+ for (j = 0; j < spaces + 1; j++)
+ printk(" ");
+ print_ip_sym(ip);
+ }
+}
+
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index dcfb5d731466..12458040e665 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,3 +1,6 @@
+/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
+ * GPL v2 and any later version.
+ */
#include <linux/stop_machine.h>
#include <linux/kthread.h>
#include <linux/sched.h>
@@ -111,7 +114,6 @@ static int stop_machine(void)
/* If some failed, kill them all. */
if (ret < 0) {
stopmachine_set_state(STOPMACHINE_EXIT);
- up(&stopmachine_mutex);
return ret;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 90930b28d2ca..b88806c66244 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -4,7 +4,6 @@
* Copyright (C) 1991, 1992 Linus Torvalds
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/utsname.h>
@@ -29,6 +28,7 @@
#include <linux/tty.h>
#include <linux/signal.h>
#include <linux/cn_proc.h>
+#include <linux/getcpu.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -137,14 +137,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
unsigned long val, void *v)
{
int ret = NOTIFY_DONE;
- struct notifier_block *nb;
+ struct notifier_block *nb, *next_nb;
nb = rcu_dereference(*nl);
while (nb) {
+ next_nb = rcu_dereference(nb->next);
ret = nb->notifier_call(nb, val, v);
if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
break;
- nb = rcu_dereference(nb->next);
+ nb = next_nb;
}
return ret;
}
@@ -588,7 +589,7 @@ void emergency_restart(void)
}
EXPORT_SYMBOL_GPL(emergency_restart);
-void kernel_restart_prepare(char *cmd)
+static void kernel_restart_prepare(char *cmd)
{
blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
system_state = SYSTEM_RESTART;
@@ -611,7 +612,6 @@ void kernel_restart(char *cmd)
} else {
printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
}
- printk(".\n");
machine_restart(cmd);
}
EXPORT_SYMBOL_GPL(kernel_restart);
@@ -622,7 +622,7 @@ EXPORT_SYMBOL_GPL(kernel_restart);
* Move into place and start executing a preloaded standalone
* executable. If nothing was preloaded return an error.
*/
-void kernel_kexec(void)
+static void kernel_kexec(void)
{
#ifdef CONFIG_KEXEC
struct kimage *image;
@@ -636,7 +636,6 @@ void kernel_kexec(void)
machine_kexec(image);
#endif
}
-EXPORT_SYMBOL_GPL(kernel_kexec);
void kernel_shutdown_prepare(enum system_states state)
{
@@ -1984,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
error = current->mm->dumpable;
break;
case PR_SET_DUMPABLE:
- if (arg2 < 0 || arg2 > 2) {
+ if (arg2 < 0 || arg2 > 1) {
error = -EINVAL;
break;
}
@@ -2063,3 +2062,33 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
}
return error;
}
+
+asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
+ struct getcpu_cache __user *cache)
+{
+ int err = 0;
+ int cpu = raw_smp_processor_id();
+ if (cpup)
+ err |= put_user(cpu, cpup);
+ if (nodep)
+ err |= put_user(cpu_to_node(cpu), nodep);
+ if (cache) {
+ /*
+ * The cache is not needed for this implementation,
+ * but make sure user programs pass something
+ * valid. vsyscall implementations can instead make
+ * good use of the cache. Only use t0 and t1 because
+ * these are available in both 32bit and 64bit ABI (no
+ * need for a compat_getcpu). 32bit has enough
+ * padding
+ */
+ unsigned long t0, t1;
+ get_user(t0, &cache->blob[0]);
+ get_user(t1, &cache->blob[1]);
+ t0++;
+ t1++;
+ put_user(t0, &cache->blob[0]);
+ put_user(t1, &cache->blob[1]);
+ }
+ return err ? -EFAULT : 0;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index eb8bd214e7d7..c57c4532e296 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -18,7 +18,6 @@
* Removed it and replaced it with older style, 03/23/00, Bill Wendling
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/swap.h>
@@ -53,6 +52,10 @@
extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_X86
+#include <asm/nmi.h>
+#endif
+
#if defined(CONFIG_SYSCTL)
/* External variables not in a header file. */
@@ -73,12 +76,7 @@ extern int printk_ratelimit_burst;
extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
-
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
-int unknown_nmi_panic;
-extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
- void __user *, size_t *, loff_t *);
-#endif
+extern int compat_log;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
@@ -132,8 +130,15 @@ extern int acct_parm[];
extern int no_unaligned_warning;
#endif
-static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
- ctl_table *, void **);
+#ifdef CONFIG_RT_MUTEXES
+extern int max_lock_depth;
+#endif
+
+#ifdef CONFIG_SYSCTL_SYSCALL
+static int parse_table(int __user *, int, void __user *, size_t __user *,
+ void __user *, size_t, ctl_table *, void **);
+#endif
+
static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -143,7 +148,6 @@ static struct ctl_table_header root_table_header =
static ctl_table kern_table[];
static ctl_table vm_table[];
-static ctl_table proc_table[];
static ctl_table fs_table[];
static ctl_table debug_table[];
static ctl_table dev_table[];
@@ -161,7 +165,7 @@ int sysctl_legacy_va_layout;
/* /proc declarations: */
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
@@ -203,12 +207,6 @@ static ctl_table root_table[] = {
},
#endif
{
- .ctl_name = CTL_PROC,
- .procname = "proc",
- .mode = 0555,
- .child = proc_table,
- },
- {
.ctl_name = CTL_FS,
.procname = "fs",
.mode = 0555,
@@ -631,11 +629,27 @@ static ctl_table kern_table[] = {
.data = &unknown_nmi_panic,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = &proc_unknown_nmi_panic,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = KERN_NMI_WATCHDOG,
+ .procname = "nmi_watchdog",
+ .data = &nmi_watchdog_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_nmi_enabled,
},
#endif
#if defined(CONFIG_X86)
{
+ .ctl_name = KERN_PANIC_ON_NMI,
+ .procname = "panic_on_unrecovered_nmi",
+ .data = &panic_on_unrecovered_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
.ctl_name = KERN_BOOTLOADER_TYPE,
.procname = "bootloader_type",
.data = &bootloader_type,
@@ -684,6 +698,27 @@ static ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+#ifdef CONFIG_COMPAT
+ {
+ .ctl_name = KERN_COMPAT_LOG,
+ .procname = "compat-log",
+ .data = &compat_log,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_RT_MUTEXES
+ {
+ .ctl_name = KERN_MAX_LOCK_DEPTH,
+ .procname = "max_lock_depth",
+ .data = &max_lock_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+
{ .ctl_name = 0 }
};
@@ -915,19 +950,40 @@ static ctl_table vm_table[] = {
.extra1 = &zero,
},
{
- .ctl_name = VM_ZONE_RECLAIM_INTERVAL,
- .procname = "zone_reclaim_interval",
- .data = &zone_reclaim_interval,
- .maxlen = sizeof(zone_reclaim_interval),
+ .ctl_name = VM_MIN_UNMAPPED,
+ .procname = "min_unmapped_ratio",
+ .data = &sysctl_min_unmapped_ratio,
+ .maxlen = sizeof(sysctl_min_unmapped_ratio),
.mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
+ .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .ctl_name = VM_MIN_SLAB,
+ .procname = "min_slab_ratio",
+ .data = &sysctl_min_slab_ratio,
+ .maxlen = sizeof(sysctl_min_slab_ratio),
+ .mode = 0644,
+ .proc_handler = &sysctl_min_slab_ratio_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+#endif
+#ifdef CONFIG_X86_32
+ {
+ .ctl_name = VM_VDSO_ENABLED,
+ .procname = "vdso_enabled",
+ .data = &vdso_enabled,
+ .maxlen = sizeof(vdso_enabled),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
},
#endif
- { .ctl_name = 0 }
-};
-
-static ctl_table proc_table[] = {
{ .ctl_name = 0 }
};
@@ -1110,12 +1166,13 @@ static void start_unregistering(struct ctl_table_header *p)
void __init sysctl_init(void)
{
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
register_proc_table(root_table, proc_sys_root, &root_table_header);
init_irq_proc();
#endif
}
+#ifdef CONFIG_SYSCTL_SYSCALL
int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
void __user *newval, size_t newlen)
{
@@ -1169,6 +1226,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
unlock_kernel();
return error;
}
+#endif /* CONFIG_SYSCTL_SYSCALL */
/*
* ctl_perm does NOT grant the superuser all rights automatically, because
@@ -1195,6 +1253,7 @@ static inline int ctl_perm(ctl_table *table, int op)
return test_perm(table->mode, op);
}
+#ifdef CONFIG_SYSCTL_SYSCALL
static int parse_table(int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
void __user *newval, size_t newlen,
@@ -1284,6 +1343,7 @@ int do_sysctl_strategy (ctl_table *table,
}
return 0;
}
+#endif /* CONFIG_SYSCTL_SYSCALL */
/**
* register_sysctl_table - register a sysctl hierarchy
@@ -1371,7 +1431,7 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
else
list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
spin_unlock(&sysctl_lock);
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
register_proc_table(table, proc_sys_root, tmp);
#endif
return tmp;
@@ -1389,18 +1449,31 @@ void unregister_sysctl_table(struct ctl_table_header * header)
might_sleep();
spin_lock(&sysctl_lock);
start_unregistering(header);
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
unregister_proc_table(header->ctl_table, proc_sys_root);
#endif
spin_unlock(&sysctl_lock);
kfree(header);
}
+#else /* !CONFIG_SYSCTL */
+struct ctl_table_header * register_sysctl_table(ctl_table * table,
+ int insert_at_head)
+{
+ return NULL;
+}
+
+void unregister_sysctl_table(struct ctl_table_header * table)
+{
+}
+
+#endif /* CONFIG_SYSCTL */
+
/*
* /proc/sys support
*/
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
/* Scan the sysctl entries in table and add them all into /proc */
static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
@@ -1839,7 +1912,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
return -EPERM;
}
- op = (current->pid == 1) ? OP_SET : OP_AND;
+ op = is_init(current) ? OP_SET : OP_AND;
return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
do_proc_dointvec_bset_conv,&op);
}
@@ -2262,6 +2335,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
#endif /* CONFIG_PROC_FS */
+#ifdef CONFIG_SYSCTL_SYSCALL
/*
* General sysctl support routines
*/
@@ -2404,11 +2478,19 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
return 1;
}
-#else /* CONFIG_SYSCTL */
+#else /* CONFIG_SYSCTL_SYSCALL */
asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
{
+ static int msg_count;
+
+ if (msg_count < 5) {
+ msg_count++;
+ printk(KERN_INFO
+ "warning: process `%s' used the removed sysctl "
+ "system call\n", current->comm);
+ }
return -ENOSYS;
}
@@ -2440,73 +2522,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
return -ENOSYS;
}
-int proc_dostring(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
- struct file *filp,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
-struct ctl_table_header * register_sysctl_table(ctl_table * table,
- int insert_at_head)
-{
- return NULL;
-}
-
-void unregister_sysctl_table(struct ctl_table_header * table)
-{
-}
-
-#endif /* CONFIG_SYSCTL */
+#endif /* CONFIG_SYSCTL_SYSCALL */
/*
* No sense putting this after each symbol definition, twice,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 000000000000..2ed4040d0dc5
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,564 @@
+/*
+ * taskstats.c - Export per-task statistics to userland
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ * (C) Balbir Singh, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <net/genetlink.h>
+#include <asm/atomic.h>
+
+/*
+ * Maximum length of a cpumask that can be specified in
+ * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
+ */
+#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
+
+static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static int family_registered;
+kmem_cache_t *taskstats_cache;
+
+static struct genl_family family = {
+ .id = GENL_ID_GENERATE,
+ .name = TASKSTATS_GENL_NAME,
+ .version = TASKSTATS_GENL_VERSION,
+ .maxattr = TASKSTATS_CMD_ATTR_MAX,
+};
+
+static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
+__read_mostly = {
+ [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
+ [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+ [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
+ [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
+
+struct listener {
+ struct list_head list;
+ pid_t pid;
+ char valid;
+};
+
+struct listener_list {
+ struct rw_semaphore sem;
+ struct list_head list;
+};
+static DEFINE_PER_CPU(struct listener_list, listener_array);
+
+enum actions {
+ REGISTER,
+ DEREGISTER,
+ CPU_DONT_CARE
+};
+
+static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
+ void **replyp, size_t size)
+{
+ struct sk_buff *skb;
+ void *reply;
+
+ /*
+ * If new attributes are added, please revisit this allocation
+ */
+ skb = nlmsg_new(size, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ if (!info) {
+ int seq = get_cpu_var(taskstats_seqnum)++;
+ put_cpu_var(taskstats_seqnum);
+
+ reply = genlmsg_put(skb, 0, seq,
+ family.id, 0, 0,
+ cmd, family.version);
+ } else
+ reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
+ family.id, 0, 0,
+ cmd, family.version);
+ if (reply == NULL) {
+ nlmsg_free(skb);
+ return -EINVAL;
+ }
+
+ *skbp = skb;
+ *replyp = reply;
+ return 0;
+}
+
+/*
+ * Send taskstats data in @skb to listener with nl_pid @pid
+ */
+static int send_reply(struct sk_buff *skb, pid_t pid)
+{
+ struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+ void *reply = genlmsg_data(genlhdr);
+ int rc;
+
+ rc = genlmsg_end(skb, reply);
+ if (rc < 0) {
+ nlmsg_free(skb);
+ return rc;
+ }
+
+ return genlmsg_unicast(skb, pid);
+}
+
+/*
+ * Send taskstats data in @skb to listeners registered for @cpu's exit data
+ */
+static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+{
+ struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+ struct listener_list *listeners;
+ struct listener *s, *tmp;
+ struct sk_buff *skb_next, *skb_cur = skb;
+ void *reply = genlmsg_data(genlhdr);
+ int rc, delcount = 0;
+
+ rc = genlmsg_end(skb, reply);
+ if (rc < 0) {
+ nlmsg_free(skb);
+ return;
+ }
+
+ rc = 0;
+ listeners = &per_cpu(listener_array, cpu);
+ down_read(&listeners->sem);
+ list_for_each_entry(s, &listeners->list, list) {
+ skb_next = NULL;
+ if (!list_is_last(&s->list, &listeners->list)) {
+ skb_next = skb_clone(skb_cur, GFP_KERNEL);
+ if (!skb_next)
+ break;
+ }
+ rc = genlmsg_unicast(skb_cur, s->pid);
+ if (rc == -ECONNREFUSED) {
+ s->valid = 0;
+ delcount++;
+ }
+ skb_cur = skb_next;
+ }
+ up_read(&listeners->sem);
+
+ if (skb_cur)
+ nlmsg_free(skb_cur);
+
+ if (!delcount)
+ return;
+
+ /* Delete invalidated entries */
+ down_write(&listeners->sem);
+ list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+ if (!s->valid) {
+ list_del(&s->list);
+ kfree(s);
+ }
+ }
+ up_write(&listeners->sem);
+}
+
+static int fill_pid(pid_t pid, struct task_struct *pidtsk,
+ struct taskstats *stats)
+{
+ int rc = 0;
+ struct task_struct *tsk = pidtsk;
+
+ if (!pidtsk) {
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid(pid);
+ if (!tsk) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+ }
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ } else
+ get_task_struct(tsk);
+
+ /*
+ * Each accounting subsystem adds calls to its functions to
+ * fill in relevant parts of struct taskstsats as follows
+ *
+ * per-task-foo(stats, tsk);
+ */
+
+ delayacct_add_tsk(stats, tsk);
+ stats->version = TASKSTATS_VERSION;
+
+ /* Define err: label here if needed */
+ put_task_struct(tsk);
+ return rc;
+
+}
+
+static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
+ struct taskstats *stats)
+{
+ struct task_struct *tsk, *first;
+ unsigned long flags;
+
+ /*
+ * Add additional stats from live tasks except zombie thread group
+ * leaders who are already counted with the dead tasks
+ */
+ first = tgidtsk;
+ if (!first) {
+ read_lock(&tasklist_lock);
+ first = find_task_by_pid(tgid);
+ if (!first) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+ }
+ get_task_struct(first);
+ read_unlock(&tasklist_lock);
+ } else
+ get_task_struct(first);
+
+ /* Start with stats from dead tasks */
+ spin_lock_irqsave(&first->signal->stats_lock, flags);
+ if (first->signal->stats)
+ memcpy(stats, first->signal->stats, sizeof(*stats));
+ spin_unlock_irqrestore(&first->signal->stats_lock, flags);
+
+ tsk = first;
+ read_lock(&tasklist_lock);
+ do {
+ if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
+ continue;
+ /*
+ * Accounting subsystem can call its functions here to
+ * fill in relevant parts of struct taskstsats as follows
+ *
+ * per-task-foo(stats, tsk);
+ */
+ delayacct_add_tsk(stats, tsk);
+
+ } while_each_thread(first, tsk);
+ read_unlock(&tasklist_lock);
+ stats->version = TASKSTATS_VERSION;
+
+ /*
+ * Accounting subsytems can also add calls here to modify
+ * fields of taskstats.
+ */
+
+ return 0;
+}
+
+
+static void fill_tgid_exit(struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+ if (!tsk->signal->stats)
+ goto ret;
+
+ /*
+ * Each accounting subsystem calls its functions here to
+ * accumalate its per-task stats for tsk, into the per-tgid structure
+ *
+ * per-task-foo(tsk->signal->stats, tsk);
+ */
+ delayacct_add_tsk(tsk->signal->stats, tsk);
+ret:
+ spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
+ return;
+}
+
+static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
+{
+ struct listener_list *listeners;
+ struct listener *s, *tmp;
+ unsigned int cpu;
+ cpumask_t mask = *maskp;
+
+ if (!cpus_subset(mask, cpu_possible_map))
+ return -EINVAL;
+
+ if (isadd == REGISTER) {
+ for_each_cpu_mask(cpu, mask) {
+ s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!s)
+ goto cleanup;
+ s->pid = pid;
+ INIT_LIST_HEAD(&s->list);
+ s->valid = 1;
+
+ listeners = &per_cpu(listener_array, cpu);
+ down_write(&listeners->sem);
+ list_add(&s->list, &listeners->list);
+ up_write(&listeners->sem);
+ }
+ return 0;
+ }
+
+ /* Deregister or cleanup */
+cleanup:
+ for_each_cpu_mask(cpu, mask) {
+ listeners = &per_cpu(listener_array, cpu);
+ down_write(&listeners->sem);
+ list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+ if (s->pid == pid) {
+ list_del(&s->list);
+ kfree(s);
+ break;
+ }
+ }
+ up_write(&listeners->sem);
+ }
+ return 0;
+}
+
+static int parse(struct nlattr *na, cpumask_t *mask)
+{
+ char *data;
+ int len;
+ int ret;
+
+ if (na == NULL)
+ return 1;
+ len = nla_len(na);
+ if (len > TASKSTATS_CPUMASK_MAXLEN)
+ return -E2BIG;
+ if (len < 1)
+ return -EINVAL;
+ data = kmalloc(len, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ nla_strlcpy(data, na, len);
+ ret = cpulist_parse(data, *mask);
+ kfree(data);
+ return ret;
+}
+
+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ int rc = 0;
+ struct sk_buff *rep_skb;
+ struct taskstats stats;
+ void *reply;
+ size_t size;
+ struct nlattr *na;
+ cpumask_t mask;
+
+ rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
+ if (rc < 0)
+ return rc;
+ if (rc == 0)
+ return add_del_listener(info->snd_pid, &mask, REGISTER);
+
+ rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
+ if (rc < 0)
+ return rc;
+ if (rc == 0)
+ return add_del_listener(info->snd_pid, &mask, DEREGISTER);
+
+ /*
+ * Size includes space for nested attributes
+ */
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+ memset(&stats, 0, sizeof(stats));
+ rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+ if (rc < 0)
+ return rc;
+
+ if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
+ u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
+ rc = fill_pid(pid, NULL, &stats);
+ if (rc < 0)
+ goto err;
+
+ na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+ NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
+ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+ stats);
+ } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
+ u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
+ rc = fill_tgid(tgid, NULL, &stats);
+ if (rc < 0)
+ goto err;
+
+ na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+ NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+ stats);
+ } else {
+ rc = -EINVAL;
+ goto err;
+ }
+
+ nla_nest_end(rep_skb, na);
+
+ return send_reply(rep_skb, info->snd_pid);
+
+nla_put_failure:
+ return genlmsg_cancel(rep_skb, reply);
+err:
+ nlmsg_free(rep_skb);
+ return rc;
+}
+
+void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
+{
+ struct listener_list *listeners;
+ struct taskstats *tmp;
+ /*
+ * This is the cpu on which the task is exiting currently and will
+ * be the one for which the exit event is sent, even if the cpu
+ * on which this function is running changes later.
+ */
+ *mycpu = raw_smp_processor_id();
+
+ *ptidstats = NULL;
+ tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+ if (!tmp)
+ return;
+
+ listeners = &per_cpu(listener_array, *mycpu);
+ down_read(&listeners->sem);
+ if (!list_empty(&listeners->list)) {
+ *ptidstats = tmp;
+ tmp = NULL;
+ }
+ up_read(&listeners->sem);
+ kfree(tmp);
+}
+
+/* Send pid data out on exit */
+void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
+ int group_dead, unsigned int mycpu)
+{
+ int rc;
+ struct sk_buff *rep_skb;
+ void *reply;
+ size_t size;
+ int is_thread_group;
+ struct nlattr *na;
+ unsigned long flags;
+
+ if (!family_registered || !tidstats)
+ return;
+
+ spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+ is_thread_group = tsk->signal->stats ? 1 : 0;
+ spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
+
+ rc = 0;
+ /*
+ * Size includes space for nested attributes
+ */
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+ if (is_thread_group)
+ size = 2 * size; /* PID + STATS + TGID + STATS */
+
+ rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+ if (rc < 0)
+ goto ret;
+
+ rc = fill_pid(tsk->pid, tsk, tidstats);
+ if (rc < 0)
+ goto err_skb;
+
+ na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+ NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
+ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+ *tidstats);
+ nla_nest_end(rep_skb, na);
+
+ if (!is_thread_group)
+ goto send;
+
+ /*
+ * tsk has/had a thread group so fill the tsk->signal->stats structure
+ * Doesn't matter if tsk is the leader or the last group member leaving
+ */
+
+ fill_tgid_exit(tsk);
+ if (!group_dead)
+ goto send;
+
+ na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+ NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
+ /* No locking needed for tsk->signal->stats since group is dead */
+ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+ *tsk->signal->stats);
+ nla_nest_end(rep_skb, na);
+
+send:
+ send_cpu_listeners(rep_skb, mycpu);
+ return;
+
+nla_put_failure:
+ genlmsg_cancel(rep_skb, reply);
+ goto ret;
+err_skb:
+ nlmsg_free(rep_skb);
+ret:
+ return;
+}
+
+static struct genl_ops taskstats_ops = {
+ .cmd = TASKSTATS_CMD_GET,
+ .doit = taskstats_user_cmd,
+ .policy = taskstats_cmd_get_policy,
+};
+
+/* Needed early in initialization */
+void __init taskstats_init_early(void)
+{
+ unsigned int i;
+
+ taskstats_cache = kmem_cache_create("taskstats_cache",
+ sizeof(struct taskstats),
+ 0, SLAB_PANIC, NULL, NULL);
+ for_each_possible_cpu(i) {
+ INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
+ init_rwsem(&(per_cpu(listener_array, i).sem));
+ }
+}
+
+static int __init taskstats_init(void)
+{
+ int rc;
+
+ rc = genl_register_family(&family);
+ if (rc)
+ return rc;
+
+ rc = genl_register_ops(&family, &taskstats_ops);
+ if (rc < 0)
+ goto err;
+
+ family_registered = 1;
+ return 0;
+err:
+ genl_unregister_family(&family);
+ return rc;
+}
+
+/*
+ * late initcall ensures initialization of statistics collection
+ * mechanisms precedes initialization of the taskstats interface
+ */
+late_initcall(taskstats_init);
diff --git a/kernel/time.c b/kernel/time.c
index b00ddc71cedb..5bd489747643 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday);
#else
+#ifndef CONFIG_GENERIC_TIME
/*
* Simulate gettimeofday using do_gettimeofday which only allows a timeval
* and therefore only yields usec accuracy
@@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv)
}
EXPORT_SYMBOL_GPL(getnstimeofday);
#endif
+#endif
/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
* Assumes input in normal date format, i.e. 1980-12-31 23:59:59
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
new file mode 100644
index 000000000000..e1dfd8e86cce
--- /dev/null
+++ b/kernel/time/Makefile
@@ -0,0 +1 @@
+obj-y += clocksource.o jiffies.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 000000000000..74eca5939bd9
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,349 @@
+/*
+ * linux/kernel/time/clocksource.c
+ *
+ * This file contains the functions which manage clocksource drivers.
+ *
+ * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * TODO WishList:
+ * o Allow clocksource drivers to be unregistered
+ * o get rid of clocksource_jiffies extern
+ */
+
+#include <linux/clocksource.h>
+#include <linux/sysdev.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+/* XXX - Would like a better way for initializing curr_clocksource */
+extern struct clocksource clocksource_jiffies;
+
+/*[Clocksource internal variables]---------
+ * curr_clocksource:
+ * currently selected clocksource. Initialized to clocksource_jiffies.
+ * next_clocksource:
+ * pending next selected clocksource.
+ * clocksource_list:
+ * linked list with the registered clocksources
+ * clocksource_lock:
+ * protects manipulations to curr_clocksource and next_clocksource
+ * and the clocksource_list
+ * override_name:
+ * Name of the user-specified clocksource.
+ */
+static struct clocksource *curr_clocksource = &clocksource_jiffies;
+static struct clocksource *next_clocksource;
+static LIST_HEAD(clocksource_list);
+static DEFINE_SPINLOCK(clocksource_lock);
+static char override_name[32];
+static int finished_booting;
+
+/* clocksource_done_booting - Called near the end of bootup
+ *
+ * Hack to avoid lots of clocksource churn at boot time
+ */
+static int __init clocksource_done_booting(void)
+{
+ finished_booting = 1;
+ return 0;
+}
+
+late_initcall(clocksource_done_booting);
+
+/**
+ * clocksource_get_next - Returns the selected clocksource
+ *
+ */
+struct clocksource *clocksource_get_next(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+ if (next_clocksource && finished_booting) {
+ curr_clocksource = next_clocksource;
+ next_clocksource = NULL;
+ }
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+
+ return curr_clocksource;
+}
+
+/**
+ * select_clocksource - Finds the best registered clocksource.
+ *
+ * Private function. Must hold clocksource_lock when called.
+ *
+ * Looks through the list of registered clocksources, returning
+ * the one with the highest rating value. If there is a clocksource
+ * name that matches the override string, it returns that clocksource.
+ */
+static struct clocksource *select_clocksource(void)
+{
+ struct clocksource *best = NULL;
+ struct list_head *tmp;
+
+ list_for_each(tmp, &clocksource_list) {
+ struct clocksource *src;
+
+ src = list_entry(tmp, struct clocksource, list);
+ if (!best)
+ best = src;
+
+ /* check for override: */
+ if (strlen(src->name) == strlen(override_name) &&
+ !strcmp(src->name, override_name)) {
+ best = src;
+ break;
+ }
+ /* pick the highest rating: */
+ if (src->rating > best->rating)
+ best = src;
+ }
+
+ return best;
+}
+
+/**
+ * is_registered_source - Checks if clocksource is registered
+ * @c: pointer to a clocksource
+ *
+ * Private helper function. Must hold clocksource_lock when called.
+ *
+ * Returns one if the clocksource is already registered, zero otherwise.
+ */
+static int is_registered_source(struct clocksource *c)
+{
+ int len = strlen(c->name);
+ struct list_head *tmp;
+
+ list_for_each(tmp, &clocksource_list) {
+ struct clocksource *src;
+
+ src = list_entry(tmp, struct clocksource, list);
+ if (strlen(src->name) == len && !strcmp(src->name, c->name))
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * clocksource_register - Used to install new clocksources
+ * @t: clocksource to be registered
+ *
+ * Returns -EBUSY if registration fails, zero otherwise.
+ */
+int clocksource_register(struct clocksource *c)
+{
+ int ret = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+ /* check if clocksource is already registered */
+ if (is_registered_source(c)) {
+ printk("register_clocksource: Cannot register %s. "
+ "Already registered!", c->name);
+ ret = -EBUSY;
+ } else {
+ /* register it */
+ list_add(&c->list, &clocksource_list);
+ /* scan the registered clocksources, and pick the best one */
+ next_clocksource = select_clocksource();
+ }
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(clocksource_register);
+
+/**
+ * clocksource_reselect - Rescan list for next clocksource
+ *
+ * A quick helper function to be used if a clocksource changes its
+ * rating. Forces the clocksource list to be re-scanned for the best
+ * clocksource.
+ */
+void clocksource_reselect(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+ next_clocksource = select_clocksource();
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+}
+EXPORT_SYMBOL(clocksource_reselect);
+
+/**
+ * sysfs_show_current_clocksources - sysfs interface for current clocksource
+ * @dev: unused
+ * @buf: char buffer to be filled with clocksource list
+ *
+ * Provides sysfs interface for listing current clocksource.
+ */
+static ssize_t
+sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
+{
+ char *curr = buf;
+
+ spin_lock_irq(&clocksource_lock);
+ curr += sprintf(curr, "%s ", curr_clocksource->name);
+ spin_unlock_irq(&clocksource_lock);
+
+ curr += sprintf(curr, "\n");
+
+ return curr - buf;
+}
+
+/**
+ * sysfs_override_clocksource - interface for manually overriding clocksource
+ * @dev: unused
+ * @buf: name of override clocksource
+ * @count: length of buffer
+ *
+ * Takes input from sysfs interface for manually overriding the default
+ * clocksource selction.
+ */
+static ssize_t sysfs_override_clocksource(struct sys_device *dev,
+ const char *buf, size_t count)
+{
+ size_t ret = count;
+ /* strings from sysfs write are not 0 terminated! */
+ if (count >= sizeof(override_name))
+ return -EINVAL;
+
+ /* strip of \n: */
+ if (buf[count-1] == '\n')
+ count--;
+ if (count < 1)
+ return -EINVAL;
+
+ spin_lock_irq(&clocksource_lock);
+
+ /* copy the name given: */
+ memcpy(override_name, buf, count);
+ override_name[count] = 0;
+
+ /* try to select it: */
+ next_clocksource = select_clocksource();
+
+ spin_unlock_irq(&clocksource_lock);
+
+ return ret;
+}
+
+/**
+ * sysfs_show_available_clocksources - sysfs interface for listing clocksource
+ * @dev: unused
+ * @buf: char buffer to be filled with clocksource list
+ *
+ * Provides sysfs interface for listing registered clocksources
+ */
+static ssize_t
+sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
+{
+ struct list_head *tmp;
+ char *curr = buf;
+
+ spin_lock_irq(&clocksource_lock);
+ list_for_each(tmp, &clocksource_list) {
+ struct clocksource *src;
+
+ src = list_entry(tmp, struct clocksource, list);
+ curr += sprintf(curr, "%s ", src->name);
+ }
+ spin_unlock_irq(&clocksource_lock);
+
+ curr += sprintf(curr, "\n");
+
+ return curr - buf;
+}
+
+/*
+ * Sysfs setup bits:
+ */
+static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
+ sysfs_override_clocksource);
+
+static SYSDEV_ATTR(available_clocksource, 0600,
+ sysfs_show_available_clocksources, NULL);
+
+static struct sysdev_class clocksource_sysclass = {
+ set_kset_name("clocksource"),
+};
+
+static struct sys_device device_clocksource = {
+ .id = 0,
+ .cls = &clocksource_sysclass,
+};
+
+static int __init init_clocksource_sysfs(void)
+{
+ int error = sysdev_class_register(&clocksource_sysclass);
+
+ if (!error)
+ error = sysdev_register(&device_clocksource);
+ if (!error)
+ error = sysdev_create_file(
+ &device_clocksource,
+ &attr_current_clocksource);
+ if (!error)
+ error = sysdev_create_file(
+ &device_clocksource,
+ &attr_available_clocksource);
+ return error;
+}
+
+device_initcall(init_clocksource_sysfs);
+
+/**
+ * boot_override_clocksource - boot clock override
+ * @str: override name
+ *
+ * Takes a clocksource= boot argument and uses it
+ * as the clocksource override name.
+ */
+static int __init boot_override_clocksource(char* str)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&clocksource_lock, flags);
+ if (str)
+ strlcpy(override_name, str, sizeof(override_name));
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+ return 1;
+}
+
+__setup("clocksource=", boot_override_clocksource);
+
+/**
+ * boot_override_clock - Compatibility layer for deprecated boot option
+ * @str: override name
+ *
+ * DEPRECATED! Takes a clock= boot argument and uses it
+ * as the clocksource override name
+ */
+static int __init boot_override_clock(char* str)
+{
+ if (!strcmp(str, "pmtmr")) {
+ printk("Warning: clock=pmtmr is deprecated. "
+ "Use clocksource=acpi_pm.\n");
+ return boot_override_clocksource("acpi_pm");
+ }
+ printk("Warning! clock= boot option is deprecated. "
+ "Use clocksource=xyz\n");
+ return boot_override_clocksource(str);
+}
+
+__setup("clock=", boot_override_clock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
new file mode 100644
index 000000000000..126bb30c4afe
--- /dev/null
+++ b/kernel/time/jiffies.c
@@ -0,0 +1,73 @@
+/***********************************************************************
+* linux/kernel/time/jiffies.c
+*
+* This file contains the jiffies based clocksource.
+*
+* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+************************************************************************/
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/init.h>
+
+/* The Jiffies based clocksource is the lowest common
+ * denominator clock source which should function on
+ * all systems. It has the same coarse resolution as
+ * the timer interrupt frequency HZ and it suffers
+ * inaccuracies caused by missed or lost timer
+ * interrupts and the inability for the timer
+ * interrupt hardware to accuratly tick at the
+ * requested HZ value. It is also not reccomended
+ * for "tick-less" systems.
+ */
+#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
+
+/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
+ * conversion, the .shift value could be zero. However
+ * this would make NTP adjustments impossible as they are
+ * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
+ * shift both the nominator and denominator the same
+ * amount, and give ntp adjustments in units of 1/2^8
+ *
+ * The value 8 is somewhat carefully chosen, as anything
+ * larger can result in overflows. NSEC_PER_JIFFY grows as
+ * HZ shrinks, so values greater then 8 overflow 32bits when
+ * HZ=100.
+ */
+#define JIFFIES_SHIFT 8
+
+static cycle_t jiffies_read(void)
+{
+ return (cycle_t) jiffies;
+}
+
+struct clocksource clocksource_jiffies = {
+ .name = "jiffies",
+ .rating = 0, /* lowest rating*/
+ .read = jiffies_read,
+ .mask = 0xffffffff, /*32bits*/
+ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
+ .shift = JIFFIES_SHIFT,
+ .is_continuous = 0, /* tick based, not free running */
+};
+
+static int __init init_jiffies_clocksource(void)
+{
+ return clocksource_register(&clocksource_jiffies);
+}
+
+module_init(init_jiffies_clocksource);
diff --git a/kernel/timer.c b/kernel/timer.c
index f35b3939e937..4f55622b0d38 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t;
tvec_base_t boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
-static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases };
+static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
static inline void set_running_timer(tvec_base_t *base,
struct timer_list *timer)
@@ -136,7 +136,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
list_add_tail(&timer->entry, vec);
}
-/***
+/**
* init_timer - initialize a timer.
* @timer: the timer to be initialized
*
@@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
void fastcall init_timer(struct timer_list *timer)
{
timer->entry.next = NULL;
- timer->base = per_cpu(tvec_bases, raw_smp_processor_id());
+ timer->base = __raw_get_cpu_var(tvec_bases);
}
EXPORT_SYMBOL(init_timer);
@@ -175,6 +175,7 @@ static inline void detach_timer(struct timer_list *timer,
*/
static tvec_base_t *lock_timer_base(struct timer_list *timer,
unsigned long *flags)
+ __acquires(timer->base->lock)
{
tvec_base_t *base;
@@ -235,7 +236,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
EXPORT_SYMBOL(__mod_timer);
-/***
+/**
* add_timer_on - start a timer on a particular CPU
* @timer: the timer to be added
* @cpu: the CPU to start it on
@@ -255,9 +256,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
}
-/***
+/**
* mod_timer - modify a timer's timeout
* @timer: the timer to be modified
+ * @expires: new timeout in jiffies
*
* mod_timer is a more efficient way to update the expire field of an
* active timer (if the timer is inactive it will be activated)
@@ -291,7 +293,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
EXPORT_SYMBOL(mod_timer);
-/***
+/**
* del_timer - deactive a timer.
* @timer: the timer to be deactivated
*
@@ -323,7 +325,10 @@ int del_timer(struct timer_list *timer)
EXPORT_SYMBOL(del_timer);
#ifdef CONFIG_SMP
-/*
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: timer do del
+ *
* This function tries to deactivate a timer. Upon successful (ret >= 0)
* exit the timer is not queued and the handler is not running on any CPU.
*
@@ -351,7 +356,7 @@ out:
return ret;
}
-/***
+/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
*
@@ -374,6 +379,7 @@ int del_timer_sync(struct timer_list *timer)
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
return ret;
+ cpu_relax();
}
}
@@ -400,15 +406,15 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
return index;
}
-/***
+#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+
+/**
* __run_timers - run all expired timers (if any) on this CPU.
* @base: the timer vector to be processed.
*
* This function cascades all vectors and executes all expired timer
* vectors.
*/
-#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
-
static inline void __run_timers(tvec_base_t *base)
{
struct timer_list *timer;
@@ -597,7 +603,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
long time_precision = 1; /* clock precision (us) */
long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
-static long time_phase; /* phase offset (scaled us) */
long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
/* frequency offset (scaled ppm)*/
static long time_adj; /* tick adjust (scaled 1 / HZ) */
@@ -747,27 +752,14 @@ static long adjtime_adjustment(void)
}
/* in the NTP reference this is called "hardclock()" */
-static void update_wall_time_one_tick(void)
+static void update_ntp_one_tick(void)
{
- long time_adjust_step, delta_nsec;
+ long time_adjust_step;
time_adjust_step = adjtime_adjustment();
if (time_adjust_step)
/* Reduce by this step the amount of time left */
time_adjust -= time_adjust_step;
- delta_nsec = tick_nsec + time_adjust_step * 1000;
- /*
- * Advance the phase, once it gets to one microsecond, then
- * advance the tick more.
- */
- time_phase += time_adj;
- if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
- long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
- time_phase -= ltemp << (SHIFT_SCALE - 10);
- delta_nsec += ltemp;
- }
- xtime.tv_nsec += delta_nsec;
- time_interpolator_update(delta_nsec);
/* Changes by adjtime() do not take effect till next tick. */
if (time_next_adjust != 0) {
@@ -780,36 +772,404 @@ static void update_wall_time_one_tick(void)
* Return how long ticks are at the moment, that is, how much time
* update_wall_time_one_tick will add to xtime next time we call it
* (assuming no calls to do_adjtimex in the meantime).
- * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10
- * bits to the right of the binary point.
+ * The return value is in fixed-point nanoseconds shifted by the
+ * specified number of bits to the right of the binary point.
* This function has no side-effects.
*/
u64 current_tick_length(void)
{
long delta_nsec;
+ u64 ret;
+ /* calculate the finest interval NTP will allow.
+ * ie: nanosecond value shifted by (SHIFT_SCALE - 10)
+ */
delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
- return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj;
+ ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
+ ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
+
+ return ret;
}
-/*
- * Using a loop looks inefficient, but "ticks" is
- * usually just one (we shouldn't be losing ticks,
- * we're doing this this way mainly for interrupt
- * latency reasons, not because we think we'll
- * have lots of lost timer ticks
+/* XXX - all of this timekeeping code should be later moved to time.c */
+#include <linux/clocksource.h>
+static struct clocksource *clock; /* pointer to current clocksource */
+
+#ifdef CONFIG_GENERIC_TIME
+/**
+ * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
+ *
+ * private function, must hold xtime_lock lock when being
+ * called. Returns the number of nanoseconds since the
+ * last call to update_wall_time() (adjusted by NTP scaling)
+ */
+static inline s64 __get_nsec_offset(void)
+{
+ cycle_t cycle_now, cycle_delta;
+ s64 ns_offset;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* convert to nanoseconds: */
+ ns_offset = cyc2ns(clock, cycle_delta);
+
+ return ns_offset;
+}
+
+/**
+ * __get_realtime_clock_ts - Returns the time of day in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec. Used by
+ * do_gettimeofday() and get_realtime_clock_ts().
+ */
+static inline void __get_realtime_clock_ts(struct timespec *ts)
+{
+ unsigned long seq;
+ s64 nsecs;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ *ts = xtime;
+ nsecs = __get_nsec_offset();
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ timespec_add_ns(ts, nsecs);
+}
+
+/**
+ * getnstimeofday - Returns the time of day in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec.
+ */
+void getnstimeofday(struct timespec *ts)
+{
+ __get_realtime_clock_ts(ts);
+}
+
+EXPORT_SYMBOL(getnstimeofday);
+
+/**
+ * do_gettimeofday - Returns the time of day in a timeval
+ * @tv: pointer to the timeval to be set
+ *
+ * NOTE: Users should be converted to using get_realtime_clock_ts()
*/
-static void update_wall_time(unsigned long ticks)
+void do_gettimeofday(struct timeval *tv)
{
+ struct timespec now;
+
+ __get_realtime_clock_ts(&now);
+ tv->tv_sec = now.tv_sec;
+ tv->tv_usec = now.tv_nsec/1000;
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+/**
+ * do_settimeofday - Sets the time of day
+ * @tv: pointer to the timespec variable containing the new time
+ *
+ * Sets the time of day to the new time and update NTP and notify hrtimers
+ */
+int do_settimeofday(struct timespec *tv)
+{
+ unsigned long flags;
+ time_t wtm_sec, sec = tv->tv_sec;
+ long wtm_nsec, nsec = tv->tv_nsec;
+
+ if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+ return -EINVAL;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ nsec -= __get_nsec_offset();
+
+ wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
+ wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+
+ set_normalized_timespec(&xtime, sec, nsec);
+ set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+
+ clock->error = 0;
+ ntp_clear();
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ /* signal hrtimers about time change */
+ clock_was_set();
+
+ return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+/**
+ * change_clocksource - Swaps clocksources if a new one is available
+ *
+ * Accumulates current time interval and initializes new clocksource
+ */
+static int change_clocksource(void)
+{
+ struct clocksource *new;
+ cycle_t now;
+ u64 nsec;
+ new = clocksource_get_next();
+ if (clock != new) {
+ now = clocksource_read(new);
+ nsec = __get_nsec_offset();
+ timespec_add_ns(&xtime, nsec);
+
+ clock = new;
+ clock->cycle_last = now;
+ printk(KERN_INFO "Time: %s clocksource has been installed.\n",
+ clock->name);
+ return 1;
+ } else if (clock->update_callback) {
+ return clock->update_callback();
+ }
+ return 0;
+}
+#else
+#define change_clocksource() (0)
+#endif
+
+/**
+ * timeofday_is_continuous - check to see if timekeeping is free running
+ */
+int timekeeping_is_continuous(void)
+{
+ unsigned long seq;
+ int ret;
+
do {
- ticks--;
- update_wall_time_one_tick();
- if (xtime.tv_nsec >= 1000000000) {
- xtime.tv_nsec -= 1000000000;
+ seq = read_seqbegin(&xtime_lock);
+
+ ret = clock->is_continuous;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ return ret;
+}
+
+/*
+ * timekeeping_init - Initializes the clocksource and common timekeeping values
+ */
+void __init timekeeping_init(void)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+ clock = clocksource_get_next();
+ clocksource_calculate_interval(clock, tick_nsec);
+ clock->cycle_last = clocksource_read(clock);
+ ntp_clear();
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+
+
+static int timekeeping_suspended;
+/**
+ * timekeeping_resume - Resumes the generic timekeeping subsystem.
+ * @dev: unused
+ *
+ * This is for the generic clocksource timekeeping.
+ * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
+ * still managed by arch specific suspend/resume code.
+ */
+static int timekeeping_resume(struct sys_device *dev)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+ /* restart the last cycle value */
+ clock->cycle_last = clocksource_read(clock);
+ clock->error = 0;
+ timekeeping_suspended = 0;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+ return 0;
+}
+
+static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+ timekeeping_suspended = 1;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+ return 0;
+}
+
+/* sysfs resume/suspend bits for timekeeping */
+static struct sysdev_class timekeeping_sysclass = {
+ .resume = timekeeping_resume,
+ .suspend = timekeeping_suspend,
+ set_kset_name("timekeeping"),
+};
+
+static struct sys_device device_timer = {
+ .id = 0,
+ .cls = &timekeeping_sysclass,
+};
+
+static int __init timekeeping_init_device(void)
+{
+ int error = sysdev_class_register(&timekeeping_sysclass);
+ if (!error)
+ error = sysdev_register(&device_timer);
+ return error;
+}
+
+device_initcall(timekeeping_init_device);
+
+/*
+ * If the error is already larger, we look ahead even further
+ * to compensate for late or lost adjustments.
+ */
+static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset)
+{
+ s64 tick_error, i;
+ u32 look_ahead, adj;
+ s32 error2, mult;
+
+ /*
+ * Use the current error value to determine how much to look ahead.
+ * The larger the error the slower we adjust for it to avoid problems
+ * with losing too many ticks, otherwise we would overadjust and
+ * produce an even larger error. The smaller the adjustment the
+ * faster we try to adjust for it, as lost ticks can do less harm
+ * here. This is tuned so that an error of about 1 msec is adusted
+ * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
+ */
+ error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = abs(error2);
+ for (look_ahead = 0; error2 > 0; look_ahead++)
+ error2 >>= 2;
+
+ /*
+ * Now calculate the error in (1 << look_ahead) ticks, but first
+ * remove the single look ahead already included in the error.
+ */
+ tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
+ tick_error -= clock->xtime_interval >> 1;
+ error = ((error - tick_error) >> look_ahead) + tick_error;
+
+ /* Finally calculate the adjustment shift value. */
+ i = *interval;
+ mult = 1;
+ if (error < 0) {
+ error = -error;
+ *interval = -*interval;
+ *offset = -*offset;
+ mult = -1;
+ }
+ for (adj = 0; error > i; adj++)
+ error >>= 1;
+
+ *interval <<= adj;
+ *offset <<= adj;
+ return mult << adj;
+}
+
+/*
+ * Adjust the multiplier to reduce the error value,
+ * this is optimized for the most common adjustments of -1,0,1,
+ * for other values we can do a bit more work.
+ */
+static void clocksource_adjust(struct clocksource *clock, s64 offset)
+{
+ s64 error, interval = clock->cycle_interval;
+ int adj;
+
+ error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
+ if (error > interval) {
+ error >>= 2;
+ if (likely(error <= interval))
+ adj = 1;
+ else
+ adj = clocksource_bigadjust(error, &interval, &offset);
+ } else if (error < -interval) {
+ error >>= 2;
+ if (likely(error >= -interval)) {
+ adj = -1;
+ interval = -interval;
+ offset = -offset;
+ } else
+ adj = clocksource_bigadjust(error, &interval, &offset);
+ } else
+ return;
+
+ clock->mult += adj;
+ clock->xtime_interval += interval;
+ clock->xtime_nsec -= offset;
+ clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
+}
+
+/**
+ * update_wall_time - Uses the current clocksource to increment the wall time
+ *
+ * Called from the timer interrupt, must hold a write on xtime_lock.
+ */
+static void update_wall_time(void)
+{
+ cycle_t offset;
+
+ /* Make sure we're fully resumed: */
+ if (unlikely(timekeeping_suspended))
+ return;
+
+#ifdef CONFIG_GENERIC_TIME
+ offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+#else
+ offset = clock->cycle_interval;
+#endif
+ clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
+
+ /* normally this loop will run just once, however in the
+ * case of lost or late ticks, it will accumulate correctly.
+ */
+ while (offset >= clock->cycle_interval) {
+ /* accumulate one interval */
+ clock->xtime_nsec += clock->xtime_interval;
+ clock->cycle_last += clock->cycle_interval;
+ offset -= clock->cycle_interval;
+
+ if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
+ clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
xtime.tv_sec++;
second_overflow();
}
- } while (ticks);
+
+ /* interpolator bits */
+ time_interpolator_update(clock->xtime_interval
+ >> clock->shift);
+ /* increment the NTP state machine */
+ update_ntp_one_tick();
+
+ /* accumulate error between NTP and clock interval */
+ clock->error += current_tick_length();
+ clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
+ }
+
+ /* correct the clock when NTP error is too big */
+ clocksource_adjust(clock, offset);
+
+ /* store full nanoseconds into xtime */
+ xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
+ clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
+
+ /* check to see if there is a new clocksource to use */
+ if (change_clocksource()) {
+ clock->error = 0;
+ clock->xtime_nsec = 0;
+ clocksource_calculate_interval(clock, tick_nsec);
+ }
}
/*
@@ -862,10 +1222,8 @@ static inline void calc_load(unsigned long ticks)
unsigned long active_tasks; /* fixed-point */
static int count = LOAD_FREQ;
- count -= ticks;
- if (count < 0) {
- count += LOAD_FREQ;
- active_tasks = count_active_tasks();
+ active_tasks = count_active_tasks();
+ for (count -= ticks; count < 0; count += LOAD_FREQ) {
CALC_LOAD(avenrun[0], EXP_1, active_tasks);
CALC_LOAD(avenrun[1], EXP_5, active_tasks);
CALC_LOAD(avenrun[2], EXP_15, active_tasks);
@@ -880,7 +1238,7 @@ unsigned long wall_jiffies = INITIAL_JIFFIES;
* playing with xtime and avenrun.
*/
#ifndef ARCH_HAVE_XTIME_LOCK
-seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
EXPORT_SYMBOL(xtime_lock);
#endif
@@ -910,15 +1268,10 @@ void run_local_timers(void)
* Called by the timer interrupt. xtime_lock must already be taken
* by the timer IRQ!
*/
-static inline void update_times(void)
+static inline void update_times(unsigned long ticks)
{
- unsigned long ticks;
-
- ticks = jiffies - wall_jiffies;
- if (ticks) {
- wall_jiffies += ticks;
- update_wall_time(ticks);
- }
+ wall_jiffies += ticks;
+ update_wall_time();
calc_load(ticks);
}
@@ -928,12 +1281,10 @@ static inline void update_times(void)
* jiffies is defined in the linker script...
*/
-void do_timer(struct pt_regs *regs)
+void do_timer(unsigned long ticks)
{
- jiffies_64++;
- /* prevent loading jiffies before storing new jiffies_64 value. */
- barrier();
- update_times();
+ jiffies_64 += ticks;
+ update_times(ticks);
}
#ifdef __ARCH_WANT_SYS_ALARM
@@ -971,46 +1322,19 @@ asmlinkage long sys_getpid(void)
}
/*
- * Accessing ->group_leader->real_parent is not SMP-safe, it could
- * change from under us. However, rather than getting any lock
- * we can use an optimistic algorithm: get the parent
- * pid, and go back and check that the parent is still
- * the same. If it has changed (which is extremely unlikely
- * indeed), we just try again..
- *
- * NOTE! This depends on the fact that even if we _do_
- * get an old value of "parent", we can happily dereference
- * the pointer (it was and remains a dereferencable kernel pointer
- * no matter what): we just can't necessarily trust the result
- * until we know that the parent pointer is valid.
- *
- * NOTE2: ->group_leader never changes from under us.
+ * Accessing ->real_parent is not SMP-safe, it could
+ * change from under us. However, we can use a stale
+ * value of ->real_parent under rcu_read_lock(), see
+ * release_task()->call_rcu(delayed_put_task_struct).
*/
asmlinkage long sys_getppid(void)
{
int pid;
- struct task_struct *me = current;
- struct task_struct *parent;
- parent = me->group_leader->real_parent;
- for (;;) {
- pid = parent->tgid;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-{
- struct task_struct *old = parent;
+ rcu_read_lock();
+ pid = rcu_dereference(current->real_parent)->tgid;
+ rcu_read_unlock();
- /*
- * Make sure we read the pid before re-reading the
- * parent pointer:
- */
- smp_rmb();
- parent = me->group_leader->real_parent;
- if (old != parent)
- continue;
-}
-#endif
- break;
- }
return pid;
}
@@ -1042,7 +1366,7 @@ asmlinkage long sys_getegid(void)
static void process_timeout(unsigned long __data)
{
- wake_up_process((task_t *)__data);
+ wake_up_process((struct task_struct *)__data);
}
/**
@@ -1144,8 +1468,9 @@ asmlinkage long sys_gettid(void)
return current->pid;
}
-/*
+/**
* sys_sysinfo - fill in sysinfo struct
+ * @info: pointer to buffer to fill
*/
asmlinkage long sys_sysinfo(struct sysinfo __user *info)
{
@@ -1233,6 +1558,13 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
return 0;
}
+/*
+ * lockdep: we want to track each per-CPU base as a separate lock-class,
+ * but timer-bases are kmalloc()-ed, so we need to attach separate
+ * keys to them:
+ */
+static struct lock_class_key base_lock_keys[NR_CPUS];
+
static int __devinit init_timers_cpu(int cpu)
{
int j;
@@ -1268,6 +1600,8 @@ static int __devinit init_timers_cpu(int cpu)
}
spin_lock_init(&base->lock);
+ lockdep_set_class(&base->lock, base_lock_keys + cpu);
+
for (j = 0; j < TVN_SIZE; j++) {
INIT_LIST_HEAD(base->tv5.vec + j);
INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1326,7 +1660,7 @@ static void __devinit migrate_timers(int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int timer_cpu_notify(struct notifier_block *self,
+static int __cpuinit timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -1346,15 +1680,17 @@ static int timer_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block timers_nb = {
+static struct notifier_block __cpuinitdata timers_nb = {
.notifier_call = timer_cpu_notify,
};
void __init init_timers(void)
{
- timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
+ int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
+
+ BUG_ON(err == NOTIFY_BAD);
register_cpu_notifier(&timers_nb);
open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
}
diff --git a/kernel/unwind.c b/kernel/unwind.c
new file mode 100644
index 000000000000..2e2368607aab
--- /dev/null
+++ b/kernel/unwind.c
@@ -0,0 +1,941 @@
+/*
+ * Copyright (C) 2002-2006 Novell, Inc.
+ * Jan Beulich <jbeulich@novell.com>
+ * This code is released under version 2 of the GNU GPL.
+ *
+ * A simple API for unwinding kernel stacks. This is used for
+ * debugging and error reporting purposes. The kernel doesn't need
+ * full-blown stack unwinding with all the bells and whistles, so there
+ * is not much point in implementing the full Dwarf2 unwind API.
+ */
+
+#include <linux/unwind.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/stop_machine.h>
+#include <asm/sections.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+
+extern char __start_unwind[], __end_unwind[];
+
+#define MAX_STACK_DEPTH 8
+
+#define EXTRA_INFO(f) { \
+ BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \
+ % FIELD_SIZEOF(struct unwind_frame_info, f)) \
+ + offsetof(struct unwind_frame_info, f) \
+ / FIELD_SIZEOF(struct unwind_frame_info, f), \
+ FIELD_SIZEOF(struct unwind_frame_info, f) \
+ }
+#define PTREGS_INFO(f) EXTRA_INFO(regs.f)
+
+static const struct {
+ unsigned offs:BITS_PER_LONG / 2;
+ unsigned width:BITS_PER_LONG / 2;
+} reg_info[] = {
+ UNW_REGISTER_INFO
+};
+
+#undef PTREGS_INFO
+#undef EXTRA_INFO
+
+#ifndef REG_INVALID
+#define REG_INVALID(r) (reg_info[r].width == 0)
+#endif
+
+#define DW_CFA_nop 0x00
+#define DW_CFA_set_loc 0x01
+#define DW_CFA_advance_loc1 0x02
+#define DW_CFA_advance_loc2 0x03
+#define DW_CFA_advance_loc4 0x04
+#define DW_CFA_offset_extended 0x05
+#define DW_CFA_restore_extended 0x06
+#define DW_CFA_undefined 0x07
+#define DW_CFA_same_value 0x08
+#define DW_CFA_register 0x09
+#define DW_CFA_remember_state 0x0a
+#define DW_CFA_restore_state 0x0b
+#define DW_CFA_def_cfa 0x0c
+#define DW_CFA_def_cfa_register 0x0d
+#define DW_CFA_def_cfa_offset 0x0e
+#define DW_CFA_def_cfa_expression 0x0f
+#define DW_CFA_expression 0x10
+#define DW_CFA_offset_extended_sf 0x11
+#define DW_CFA_def_cfa_sf 0x12
+#define DW_CFA_def_cfa_offset_sf 0x13
+#define DW_CFA_val_offset 0x14
+#define DW_CFA_val_offset_sf 0x15
+#define DW_CFA_val_expression 0x16
+#define DW_CFA_lo_user 0x1c
+#define DW_CFA_GNU_window_save 0x2d
+#define DW_CFA_GNU_args_size 0x2e
+#define DW_CFA_GNU_negative_offset_extended 0x2f
+#define DW_CFA_hi_user 0x3f
+
+#define DW_EH_PE_FORM 0x07
+#define DW_EH_PE_native 0x00
+#define DW_EH_PE_leb128 0x01
+#define DW_EH_PE_data2 0x02
+#define DW_EH_PE_data4 0x03
+#define DW_EH_PE_data8 0x04
+#define DW_EH_PE_signed 0x08
+#define DW_EH_PE_ADJUST 0x70
+#define DW_EH_PE_abs 0x00
+#define DW_EH_PE_pcrel 0x10
+#define DW_EH_PE_textrel 0x20
+#define DW_EH_PE_datarel 0x30
+#define DW_EH_PE_funcrel 0x40
+#define DW_EH_PE_aligned 0x50
+#define DW_EH_PE_indirect 0x80
+#define DW_EH_PE_omit 0xff
+
+typedef unsigned long uleb128_t;
+typedef signed long sleb128_t;
+
+static struct unwind_table {
+ struct {
+ unsigned long pc;
+ unsigned long range;
+ } core, init;
+ const void *address;
+ unsigned long size;
+ struct unwind_table *link;
+ const char *name;
+} root_table;
+
+struct unwind_item {
+ enum item_location {
+ Nowhere,
+ Memory,
+ Register,
+ Value
+ } where;
+ uleb128_t value;
+};
+
+struct unwind_state {
+ uleb128_t loc, org;
+ const u8 *cieStart, *cieEnd;
+ uleb128_t codeAlign;
+ sleb128_t dataAlign;
+ struct cfa {
+ uleb128_t reg, offs;
+ } cfa;
+ struct unwind_item regs[ARRAY_SIZE(reg_info)];
+ unsigned stackDepth:8;
+ unsigned version:8;
+ const u8 *label;
+ const u8 *stack[MAX_STACK_DEPTH];
+};
+
+static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
+
+static struct unwind_table *find_table(unsigned long pc)
+{
+ struct unwind_table *table;
+
+ for (table = &root_table; table; table = table->link)
+ if ((pc >= table->core.pc
+ && pc < table->core.pc + table->core.range)
+ || (pc >= table->init.pc
+ && pc < table->init.pc + table->init.range))
+ break;
+
+ return table;
+}
+
+static void init_unwind_table(struct unwind_table *table,
+ const char *name,
+ const void *core_start,
+ unsigned long core_size,
+ const void *init_start,
+ unsigned long init_size,
+ const void *table_start,
+ unsigned long table_size)
+{
+ table->core.pc = (unsigned long)core_start;
+ table->core.range = core_size;
+ table->init.pc = (unsigned long)init_start;
+ table->init.range = init_size;
+ table->address = table_start;
+ table->size = table_size;
+ table->link = NULL;
+ table->name = name;
+}
+
+void __init unwind_init(void)
+{
+ init_unwind_table(&root_table, "kernel",
+ _text, _end - _text,
+ NULL, 0,
+ __start_unwind, __end_unwind - __start_unwind);
+}
+
+#ifdef CONFIG_MODULES
+
+static struct unwind_table *last_table;
+
+/* Must be called with module_mutex held. */
+void *unwind_add_table(struct module *module,
+ const void *table_start,
+ unsigned long table_size)
+{
+ struct unwind_table *table;
+
+ if (table_size <= 0)
+ return NULL;
+
+ table = kmalloc(sizeof(*table), GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ init_unwind_table(table, module->name,
+ module->module_core, module->core_size,
+ module->module_init, module->init_size,
+ table_start, table_size);
+
+ if (last_table)
+ last_table->link = table;
+ else
+ root_table.link = table;
+ last_table = table;
+
+ return table;
+}
+
+struct unlink_table_info
+{
+ struct unwind_table *table;
+ int init_only;
+};
+
+static int unlink_table(void *arg)
+{
+ struct unlink_table_info *info = arg;
+ struct unwind_table *table = info->table, *prev;
+
+ for (prev = &root_table; prev->link && prev->link != table; prev = prev->link)
+ ;
+
+ if (prev->link) {
+ if (info->init_only) {
+ table->init.pc = 0;
+ table->init.range = 0;
+ info->table = NULL;
+ } else {
+ prev->link = table->link;
+ if (!prev->link)
+ last_table = prev;
+ }
+ } else
+ info->table = NULL;
+
+ return 0;
+}
+
+/* Must be called with module_mutex held. */
+void unwind_remove_table(void *handle, int init_only)
+{
+ struct unwind_table *table = handle;
+ struct unlink_table_info info;
+
+ if (!table || table == &root_table)
+ return;
+
+ if (init_only && table == last_table) {
+ table->init.pc = 0;
+ table->init.range = 0;
+ return;
+ }
+
+ info.table = table;
+ info.init_only = init_only;
+ stop_machine_run(unlink_table, &info, NR_CPUS);
+
+ if (info.table)
+ kfree(table);
+}
+
+#endif /* CONFIG_MODULES */
+
+static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
+{
+ const u8 *cur = *pcur;
+ uleb128_t value;
+ unsigned shift;
+
+ for (shift = 0, value = 0; cur < end; shift += 7) {
+ if (shift + 7 > 8 * sizeof(value)
+ && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
+ cur = end + 1;
+ break;
+ }
+ value |= (uleb128_t)(*cur & 0x7f) << shift;
+ if (!(*cur++ & 0x80))
+ break;
+ }
+ *pcur = cur;
+
+ return value;
+}
+
+static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
+{
+ const u8 *cur = *pcur;
+ sleb128_t value;
+ unsigned shift;
+
+ for (shift = 0, value = 0; cur < end; shift += 7) {
+ if (shift + 7 > 8 * sizeof(value)
+ && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
+ cur = end + 1;
+ break;
+ }
+ value |= (sleb128_t)(*cur & 0x7f) << shift;
+ if (!(*cur & 0x80)) {
+ value |= -(*cur++ & 0x40) << shift;
+ break;
+ }
+ }
+ *pcur = cur;
+
+ return value;
+}
+
+static unsigned long read_pointer(const u8 **pLoc,
+ const void *end,
+ signed ptrType)
+{
+ unsigned long value = 0;
+ union {
+ const u8 *p8;
+ const u16 *p16u;
+ const s16 *p16s;
+ const u32 *p32u;
+ const s32 *p32s;
+ const unsigned long *pul;
+ } ptr;
+
+ if (ptrType < 0 || ptrType == DW_EH_PE_omit)
+ return 0;
+ ptr.p8 = *pLoc;
+ switch(ptrType & DW_EH_PE_FORM) {
+ case DW_EH_PE_data2:
+ if (end < (const void *)(ptr.p16u + 1))
+ return 0;
+ if(ptrType & DW_EH_PE_signed)
+ value = get_unaligned(ptr.p16s++);
+ else
+ value = get_unaligned(ptr.p16u++);
+ break;
+ case DW_EH_PE_data4:
+#ifdef CONFIG_64BIT
+ if (end < (const void *)(ptr.p32u + 1))
+ return 0;
+ if(ptrType & DW_EH_PE_signed)
+ value = get_unaligned(ptr.p32s++);
+ else
+ value = get_unaligned(ptr.p32u++);
+ break;
+ case DW_EH_PE_data8:
+ BUILD_BUG_ON(sizeof(u64) != sizeof(value));
+#else
+ BUILD_BUG_ON(sizeof(u32) != sizeof(value));
+#endif
+ case DW_EH_PE_native:
+ if (end < (const void *)(ptr.pul + 1))
+ return 0;
+ value = get_unaligned(ptr.pul++);
+ break;
+ case DW_EH_PE_leb128:
+ BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
+ value = ptrType & DW_EH_PE_signed
+ ? get_sleb128(&ptr.p8, end)
+ : get_uleb128(&ptr.p8, end);
+ if ((const void *)ptr.p8 > end)
+ return 0;
+ break;
+ default:
+ return 0;
+ }
+ switch(ptrType & DW_EH_PE_ADJUST) {
+ case DW_EH_PE_abs:
+ break;
+ case DW_EH_PE_pcrel:
+ value += (unsigned long)*pLoc;
+ break;
+ default:
+ return 0;
+ }
+ if ((ptrType & DW_EH_PE_indirect)
+ && __get_user(value, (unsigned long *)value))
+ return 0;
+ *pLoc = ptr.p8;
+
+ return value;
+}
+
+static signed fde_pointer_type(const u32 *cie)
+{
+ const u8 *ptr = (const u8 *)(cie + 2);
+ unsigned version = *ptr;
+
+ if (version != 1)
+ return -1; /* unsupported */
+ if (*++ptr) {
+ const char *aug;
+ const u8 *end = (const u8 *)(cie + 1) + *cie;
+ uleb128_t len;
+
+ /* check if augmentation size is first (and thus present) */
+ if (*ptr != 'z')
+ return -1;
+ /* check if augmentation string is nul-terminated */
+ if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL)
+ return -1;
+ ++ptr; /* skip terminator */
+ get_uleb128(&ptr, end); /* skip code alignment */
+ get_sleb128(&ptr, end); /* skip data alignment */
+ /* skip return address column */
+ version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end);
+ len = get_uleb128(&ptr, end); /* augmentation length */
+ if (ptr + len < ptr || ptr + len > end)
+ return -1;
+ end = ptr + len;
+ while (*++aug) {
+ if (ptr >= end)
+ return -1;
+ switch(*aug) {
+ case 'L':
+ ++ptr;
+ break;
+ case 'P': {
+ signed ptrType = *ptr++;
+
+ if (!read_pointer(&ptr, end, ptrType) || ptr > end)
+ return -1;
+ }
+ break;
+ case 'R':
+ return *ptr;
+ default:
+ return -1;
+ }
+ }
+ }
+ return DW_EH_PE_native|DW_EH_PE_abs;
+}
+
+static int advance_loc(unsigned long delta, struct unwind_state *state)
+{
+ state->loc += delta * state->codeAlign;
+
+ return delta > 0;
+}
+
+static void set_rule(uleb128_t reg,
+ enum item_location where,
+ uleb128_t value,
+ struct unwind_state *state)
+{
+ if (reg < ARRAY_SIZE(state->regs)) {
+ state->regs[reg].where = where;
+ state->regs[reg].value = value;
+ }
+}
+
+static int processCFI(const u8 *start,
+ const u8 *end,
+ unsigned long targetLoc,
+ signed ptrType,
+ struct unwind_state *state)
+{
+ union {
+ const u8 *p8;
+ const u16 *p16;
+ const u32 *p32;
+ } ptr;
+ int result = 1;
+
+ if (start != state->cieStart) {
+ state->loc = state->org;
+ result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state);
+ if (targetLoc == 0 && state->label == NULL)
+ return result;
+ }
+ for (ptr.p8 = start; result && ptr.p8 < end; ) {
+ switch(*ptr.p8 >> 6) {
+ uleb128_t value;
+
+ case 0:
+ switch(*ptr.p8++) {
+ case DW_CFA_nop:
+ break;
+ case DW_CFA_set_loc:
+ if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
+ result = 0;
+ break;
+ case DW_CFA_advance_loc1:
+ result = ptr.p8 < end && advance_loc(*ptr.p8++, state);
+ break;
+ case DW_CFA_advance_loc2:
+ result = ptr.p8 <= end + 2
+ && advance_loc(*ptr.p16++, state);
+ break;
+ case DW_CFA_advance_loc4:
+ result = ptr.p8 <= end + 4
+ && advance_loc(*ptr.p32++, state);
+ break;
+ case DW_CFA_offset_extended:
+ value = get_uleb128(&ptr.p8, end);
+ set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
+ break;
+ case DW_CFA_val_offset:
+ value = get_uleb128(&ptr.p8, end);
+ set_rule(value, Value, get_uleb128(&ptr.p8, end), state);
+ break;
+ case DW_CFA_offset_extended_sf:
+ value = get_uleb128(&ptr.p8, end);
+ set_rule(value, Memory, get_sleb128(&ptr.p8, end), state);
+ break;
+ case DW_CFA_val_offset_sf:
+ value = get_uleb128(&ptr.p8, end);
+ set_rule(value, Value, get_sleb128(&ptr.p8, end), state);
+ break;
+ case DW_CFA_restore_extended:
+ case DW_CFA_undefined:
+ case DW_CFA_same_value:
+ set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state);
+ break;
+ case DW_CFA_register:
+ value = get_uleb128(&ptr.p8, end);
+ set_rule(value,
+ Register,
+ get_uleb128(&ptr.p8, end), state);
+ break;
+ case DW_CFA_remember_state:
+ if (ptr.p8 == state->label) {
+ state->label = NULL;
+ return 1;
+ }
+ if (state->stackDepth >= MAX_STACK_DEPTH)
+ return 0;
+ state->stack[state->stackDepth++] = ptr.p8;
+ break;
+ case DW_CFA_restore_state:
+ if (state->stackDepth) {
+ const uleb128_t loc = state->loc;
+ const u8 *label = state->label;
+
+ state->label = state->stack[state->stackDepth - 1];
+ memcpy(&state->cfa, &badCFA, sizeof(state->cfa));
+ memset(state->regs, 0, sizeof(state->regs));
+ state->stackDepth = 0;
+ result = processCFI(start, end, 0, ptrType, state);
+ state->loc = loc;
+ state->label = label;
+ } else
+ return 0;
+ break;
+ case DW_CFA_def_cfa:
+ state->cfa.reg = get_uleb128(&ptr.p8, end);
+ /*nobreak*/
+ case DW_CFA_def_cfa_offset:
+ state->cfa.offs = get_uleb128(&ptr.p8, end);
+ break;
+ case DW_CFA_def_cfa_sf:
+ state->cfa.reg = get_uleb128(&ptr.p8, end);
+ /*nobreak*/
+ case DW_CFA_def_cfa_offset_sf:
+ state->cfa.offs = get_sleb128(&ptr.p8, end)
+ * state->dataAlign;
+ break;
+ case DW_CFA_def_cfa_register:
+ state->cfa.reg = get_uleb128(&ptr.p8, end);
+ break;
+ /*todo case DW_CFA_def_cfa_expression: */
+ /*todo case DW_CFA_expression: */
+ /*todo case DW_CFA_val_expression: */
+ case DW_CFA_GNU_args_size:
+ get_uleb128(&ptr.p8, end);
+ break;
+ case DW_CFA_GNU_negative_offset_extended:
+ value = get_uleb128(&ptr.p8, end);
+ set_rule(value,
+ Memory,
+ (uleb128_t)0 - get_uleb128(&ptr.p8, end), state);
+ break;
+ case DW_CFA_GNU_window_save:
+ default:
+ result = 0;
+ break;
+ }
+ break;
+ case 1:
+ result = advance_loc(*ptr.p8++ & 0x3f, state);
+ break;
+ case 2:
+ value = *ptr.p8++ & 0x3f;
+ set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
+ break;
+ case 3:
+ set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
+ break;
+ }
+ if (ptr.p8 > end)
+ result = 0;
+ if (result && targetLoc != 0 && targetLoc < state->loc)
+ return 1;
+ }
+
+ return result
+ && ptr.p8 == end
+ && (targetLoc == 0
+ || (/*todo While in theory this should apply, gcc in practice omits
+ everything past the function prolog, and hence the location
+ never reaches the end of the function.
+ targetLoc < state->loc &&*/ state->label == NULL));
+}
+
+/* Unwind to previous to frame. Returns 0 if successful, negative
+ * number in case of an error. */
+int unwind(struct unwind_frame_info *frame)
+{
+#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
+ const u32 *fde = NULL, *cie = NULL;
+ const u8 *ptr = NULL, *end = NULL;
+ unsigned long pc = UNW_PC(frame) - frame->call_frame;
+ unsigned long startLoc = 0, endLoc = 0, cfa;
+ unsigned i;
+ signed ptrType = -1;
+ uleb128_t retAddrReg = 0;
+ struct unwind_table *table;
+ struct unwind_state state;
+
+ if (UNW_PC(frame) == 0)
+ return -EINVAL;
+ if ((table = find_table(pc)) != NULL
+ && !(table->size & (sizeof(*fde) - 1))) {
+ unsigned long tableSize = table->size;
+
+ for (fde = table->address;
+ tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
+ tableSize -= sizeof(*fde) + *fde,
+ fde += 1 + *fde / sizeof(*fde)) {
+ if (!*fde || (*fde & (sizeof(*fde) - 1)))
+ break;
+ if (!fde[1])
+ continue; /* this is a CIE */
+ if ((fde[1] & (sizeof(*fde) - 1))
+ || fde[1] > (unsigned long)(fde + 1)
+ - (unsigned long)table->address)
+ continue; /* this is not a valid FDE */
+ cie = fde + 1 - fde[1] / sizeof(*fde);
+ if (*cie <= sizeof(*cie) + 4
+ || *cie >= fde[1] - sizeof(*fde)
+ || (*cie & (sizeof(*cie) - 1))
+ || cie[1]
+ || (ptrType = fde_pointer_type(cie)) < 0) {
+ cie = NULL; /* this is not a (valid) CIE */
+ continue;
+ }
+ ptr = (const u8 *)(fde + 2);
+ startLoc = read_pointer(&ptr,
+ (const u8 *)(fde + 1) + *fde,
+ ptrType);
+ endLoc = startLoc
+ + read_pointer(&ptr,
+ (const u8 *)(fde + 1) + *fde,
+ ptrType & DW_EH_PE_indirect
+ ? ptrType
+ : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
+ if (pc >= startLoc && pc < endLoc)
+ break;
+ cie = NULL;
+ }
+ }
+ if (cie != NULL) {
+ memset(&state, 0, sizeof(state));
+ state.cieEnd = ptr; /* keep here temporarily */
+ ptr = (const u8 *)(cie + 2);
+ end = (const u8 *)(cie + 1) + *cie;
+ frame->call_frame = 1;
+ if ((state.version = *ptr) != 1)
+ cie = NULL; /* unsupported version */
+ else if (*++ptr) {
+ /* check if augmentation size is first (and thus present) */
+ if (*ptr == 'z') {
+ while (++ptr < end && *ptr) {
+ switch(*ptr) {
+ /* check for ignorable (or already handled)
+ * nul-terminated augmentation string */
+ case 'L':
+ case 'P':
+ case 'R':
+ continue;
+ case 'S':
+ frame->call_frame = 0;
+ continue;
+ default:
+ break;
+ }
+ break;
+ }
+ }
+ if (ptr >= end || *ptr)
+ cie = NULL;
+ }
+ ++ptr;
+ }
+ if (cie != NULL) {
+ /* get code aligment factor */
+ state.codeAlign = get_uleb128(&ptr, end);
+ /* get data aligment factor */
+ state.dataAlign = get_sleb128(&ptr, end);
+ if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
+ cie = NULL;
+ else {
+ retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
+ /* skip augmentation */
+ if (((const char *)(cie + 2))[1] == 'z')
+ ptr += get_uleb128(&ptr, end);
+ if (ptr > end
+ || retAddrReg >= ARRAY_SIZE(reg_info)
+ || REG_INVALID(retAddrReg)
+ || reg_info[retAddrReg].width != sizeof(unsigned long))
+ cie = NULL;
+ }
+ }
+ if (cie != NULL) {
+ state.cieStart = ptr;
+ ptr = state.cieEnd;
+ state.cieEnd = end;
+ end = (const u8 *)(fde + 1) + *fde;
+ /* skip augmentation */
+ if (((const char *)(cie + 2))[1] == 'z') {
+ uleb128_t augSize = get_uleb128(&ptr, end);
+
+ if ((ptr += augSize) > end)
+ fde = NULL;
+ }
+ }
+ if (cie == NULL || fde == NULL) {
+#ifdef CONFIG_FRAME_POINTER
+ unsigned long top, bottom;
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+ top = STACK_TOP(frame->task);
+ bottom = STACK_BOTTOM(frame->task);
+# if FRAME_RETADDR_OFFSET < 0
+ if (UNW_SP(frame) < top
+ && UNW_FP(frame) <= UNW_SP(frame)
+ && bottom < UNW_FP(frame)
+# else
+ if (UNW_SP(frame) > top
+ && UNW_FP(frame) >= UNW_SP(frame)
+ && bottom > UNW_FP(frame)
+# endif
+ && !((UNW_SP(frame) | UNW_FP(frame))
+ & (sizeof(unsigned long) - 1))) {
+ unsigned long link;
+
+ if (!__get_user(link,
+ (unsigned long *)(UNW_FP(frame)
+ + FRAME_LINK_OFFSET))
+# if FRAME_RETADDR_OFFSET < 0
+ && link > bottom && link < UNW_FP(frame)
+# else
+ && link > UNW_FP(frame) && link < bottom
+# endif
+ && !(link & (sizeof(link) - 1))
+ && !__get_user(UNW_PC(frame),
+ (unsigned long *)(UNW_FP(frame)
+ + FRAME_RETADDR_OFFSET))) {
+ UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
+# if FRAME_RETADDR_OFFSET < 0
+ -
+# else
+ +
+# endif
+ sizeof(UNW_PC(frame));
+ UNW_FP(frame) = link;
+ return 0;
+ }
+ }
+#endif
+ return -ENXIO;
+ }
+ state.org = startLoc;
+ memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
+ /* process instructions */
+ if (!processCFI(ptr, end, pc, ptrType, &state)
+ || state.loc > endLoc
+ || state.regs[retAddrReg].where == Nowhere
+ || state.cfa.reg >= ARRAY_SIZE(reg_info)
+ || reg_info[state.cfa.reg].width != sizeof(unsigned long)
+ || state.cfa.offs % sizeof(unsigned long))
+ return -EIO;
+ /* update frame */
+#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
+ if(frame->call_frame
+ && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign))
+ frame->call_frame = 0;
+#endif
+ cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
+ startLoc = min((unsigned long)UNW_SP(frame), cfa);
+ endLoc = max((unsigned long)UNW_SP(frame), cfa);
+ if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) {
+ startLoc = min(STACK_LIMIT(cfa), cfa);
+ endLoc = max(STACK_LIMIT(cfa), cfa);
+ }
+#ifndef CONFIG_64BIT
+# define CASES CASE(8); CASE(16); CASE(32)
+#else
+# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
+#endif
+ for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
+ if (REG_INVALID(i)) {
+ if (state.regs[i].where == Nowhere)
+ continue;
+ return -EIO;
+ }
+ switch(state.regs[i].where) {
+ default:
+ break;
+ case Register:
+ if (state.regs[i].value >= ARRAY_SIZE(reg_info)
+ || REG_INVALID(state.regs[i].value)
+ || reg_info[i].width > reg_info[state.regs[i].value].width)
+ return -EIO;
+ switch(reg_info[state.regs[i].value].width) {
+#define CASE(n) \
+ case sizeof(u##n): \
+ state.regs[i].value = FRAME_REG(state.regs[i].value, \
+ const u##n); \
+ break
+ CASES;
+#undef CASE
+ default:
+ return -EIO;
+ }
+ break;
+ }
+ }
+ for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
+ if (REG_INVALID(i))
+ continue;
+ switch(state.regs[i].where) {
+ case Nowhere:
+ if (reg_info[i].width != sizeof(UNW_SP(frame))
+ || &FRAME_REG(i, __typeof__(UNW_SP(frame)))
+ != &UNW_SP(frame))
+ continue;
+ UNW_SP(frame) = cfa;
+ break;
+ case Register:
+ switch(reg_info[i].width) {
+#define CASE(n) case sizeof(u##n): \
+ FRAME_REG(i, u##n) = state.regs[i].value; \
+ break
+ CASES;
+#undef CASE
+ default:
+ return -EIO;
+ }
+ break;
+ case Value:
+ if (reg_info[i].width != sizeof(unsigned long))
+ return -EIO;
+ FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
+ * state.dataAlign;
+ break;
+ case Memory: {
+ unsigned long addr = cfa + state.regs[i].value
+ * state.dataAlign;
+
+ if ((state.regs[i].value * state.dataAlign)
+ % sizeof(unsigned long)
+ || addr < startLoc
+ || addr + sizeof(unsigned long) < addr
+ || addr + sizeof(unsigned long) > endLoc)
+ return -EIO;
+ switch(reg_info[i].width) {
+#define CASE(n) case sizeof(u##n): \
+ __get_user(FRAME_REG(i, u##n), (u##n *)addr); \
+ break
+ CASES;
+#undef CASE
+ default:
+ return -EIO;
+ }
+ }
+ break;
+ }
+ }
+
+ return 0;
+#undef CASES
+#undef FRAME_REG
+}
+EXPORT_SYMBOL(unwind);
+
+int unwind_init_frame_info(struct unwind_frame_info *info,
+ struct task_struct *tsk,
+ /*const*/ struct pt_regs *regs)
+{
+ info->task = tsk;
+ info->call_frame = 0;
+ arch_unw_init_frame_info(info, regs);
+
+ return 0;
+}
+EXPORT_SYMBOL(unwind_init_frame_info);
+
+/*
+ * Prepare to unwind a blocked task.
+ */
+int unwind_init_blocked(struct unwind_frame_info *info,
+ struct task_struct *tsk)
+{
+ info->task = tsk;
+ info->call_frame = 0;
+ arch_unw_init_blocked(info);
+
+ return 0;
+}
+EXPORT_SYMBOL(unwind_init_blocked);
+
+/*
+ * Prepare to unwind the currently running thread.
+ */
+int unwind_init_running(struct unwind_frame_info *info,
+ asmlinkage int (*callback)(struct unwind_frame_info *,
+ void *arg),
+ void *arg)
+{
+ info->task = current;
+ info->call_frame = 0;
+
+ return arch_unwind_init_running(info, callback, arg);
+}
+EXPORT_SYMBOL(unwind_init_running);
+
+/*
+ * Unwind until the return pointer is in user-land (or until an error
+ * occurs). Returns 0 if successful, negative number in case of
+ * error.
+ */
+int unwind_to_user(struct unwind_frame_info *info)
+{
+ while (!arch_unw_user_mode(info)) {
+ int err = unwind(info);
+
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(unwind_to_user);
diff --git a/kernel/wait.c b/kernel/wait.c
index 791681cfea98..59a82f63275d 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -3,7 +3,6 @@
*
* (C) 2004 William Irwin, Oracle
*/
-#include <linux/config.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sched.h>
@@ -11,6 +10,14 @@
#include <linux/wait.h>
#include <linux/hash.h>
+void init_waitqueue_head(wait_queue_head_t *q)
+{
+ spin_lock_init(&q->lock);
+ INIT_LIST_HEAD(&q->task_list);
+}
+
+EXPORT_SYMBOL(init_waitqueue_head);
+
void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 740c5abceb07..835fe28b87a8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -51,7 +51,7 @@ struct cpu_workqueue_struct {
wait_queue_head_t work_done;
struct workqueue_struct *wq;
- task_t *thread;
+ struct task_struct *thread;
int run_depth; /* Detect run_workqueue() recursion depth */
} ____cacheline_aligned;
@@ -68,7 +68,7 @@ struct workqueue_struct {
/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
threads to each one as cpus come/go. */
-static DEFINE_SPINLOCK(workqueue_lock);
+static DEFINE_MUTEX(workqueue_mutex);
static LIST_HEAD(workqueues);
static int singlethread_cpu;
@@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
spin_unlock_irqrestore(&cwq->lock, flags);
}
-/*
- * Queue work on a workqueue. Return non-zero if it was successfully
- * added.
+/**
+ * queue_work - queue work on a workqueue
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * Returns non-zero if it was successfully added.
*
* We queue the work to the CPU it was submitted, but there is no
* guarantee that it will be processed by that CPU.
@@ -114,6 +117,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
put_cpu();
return ret;
}
+EXPORT_SYMBOL_GPL(queue_work);
static void delayed_work_timer_fn(unsigned long __data)
{
@@ -127,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data)
__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
}
+/**
+ * queue_delayed_work - queue work on a workqueue after delay
+ * @wq: workqueue to use
+ * @work: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Returns non-zero if it was successfully added.
+ */
int fastcall queue_delayed_work(struct workqueue_struct *wq,
struct work_struct *work, unsigned long delay)
{
@@ -147,6 +159,38 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
}
return ret;
}
+EXPORT_SYMBOL_GPL(queue_delayed_work);
+
+/**
+ * queue_delayed_work_on - queue work on specific CPU after delay
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @work: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Returns non-zero if it was successfully added.
+ */
+int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
+ struct work_struct *work, unsigned long delay)
+{
+ int ret = 0;
+ struct timer_list *timer = &work->timer;
+
+ if (!test_and_set_bit(0, &work->pending)) {
+ BUG_ON(timer_pending(timer));
+ BUG_ON(!list_empty(&work->entry));
+
+ /* This stores wq for the moment, for the timer_fn */
+ work->wq_data = wq;
+ timer->expires = jiffies + delay;
+ timer->data = (unsigned long)work;
+ timer->function = delayed_work_timer_fn;
+ add_timer_on(timer, cpu);
+ ret = 1;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_delayed_work_on);
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
@@ -251,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
}
}
-/*
+/**
* flush_workqueue - ensure that any scheduled work has run to completion.
+ * @wq: workqueue to flush
*
* Forces execution of the workqueue and blocks until its completion.
* This is typically used in driver shutdown handlers.
@@ -275,12 +320,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
} else {
int cpu;
- lock_cpu_hotplug();
+ mutex_lock(&workqueue_mutex);
for_each_online_cpu(cpu)
flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
- unlock_cpu_hotplug();
+ mutex_unlock(&workqueue_mutex);
}
}
+EXPORT_SYMBOL_GPL(flush_workqueue);
static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
int cpu)
@@ -325,8 +371,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
}
wq->name = name;
- /* We don't need the distraction of CPUs appearing and vanishing. */
- lock_cpu_hotplug();
+ mutex_lock(&workqueue_mutex);
if (singlethread) {
INIT_LIST_HEAD(&wq->list);
p = create_workqueue_thread(wq, singlethread_cpu);
@@ -335,9 +380,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
else
wake_up_process(p);
} else {
- spin_lock(&workqueue_lock);
list_add(&wq->list, &workqueues);
- spin_unlock(&workqueue_lock);
for_each_online_cpu(cpu) {
p = create_workqueue_thread(wq, cpu);
if (p) {
@@ -347,7 +390,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
destroy = 1;
}
}
- unlock_cpu_hotplug();
+ mutex_unlock(&workqueue_mutex);
/*
* Was there any error during startup? If yes then clean up:
@@ -358,6 +401,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
}
return wq;
}
+EXPORT_SYMBOL_GPL(__create_workqueue);
static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
{
@@ -374,6 +418,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
kthread_stop(p);
}
+/**
+ * destroy_workqueue - safely terminate a workqueue
+ * @wq: target workqueue
+ *
+ * Safely destroy a workqueue. All work currently pending will be done first.
+ */
void destroy_workqueue(struct workqueue_struct *wq)
{
int cpu;
@@ -381,69 +431,94 @@ void destroy_workqueue(struct workqueue_struct *wq)
flush_workqueue(wq);
/* We don't need the distraction of CPUs appearing and vanishing. */
- lock_cpu_hotplug();
+ mutex_lock(&workqueue_mutex);
if (is_single_threaded(wq))
cleanup_workqueue_thread(wq, singlethread_cpu);
else {
for_each_online_cpu(cpu)
cleanup_workqueue_thread(wq, cpu);
- spin_lock(&workqueue_lock);
list_del(&wq->list);
- spin_unlock(&workqueue_lock);
}
- unlock_cpu_hotplug();
+ mutex_unlock(&workqueue_mutex);
free_percpu(wq->cpu_wq);
kfree(wq);
}
+EXPORT_SYMBOL_GPL(destroy_workqueue);
static struct workqueue_struct *keventd_wq;
+/**
+ * schedule_work - put work task in global workqueue
+ * @work: job to be done
+ *
+ * This puts a job in the kernel-global workqueue.
+ */
int fastcall schedule_work(struct work_struct *work)
{
return queue_work(keventd_wq, work);
}
+EXPORT_SYMBOL(schedule_work);
+/**
+ * schedule_delayed_work - put work task in global workqueue after delay
+ * @work: job to be done
+ * @delay: number of jiffies to wait
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue.
+ */
int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
{
return queue_delayed_work(keventd_wq, work, delay);
}
+EXPORT_SYMBOL(schedule_delayed_work);
+/**
+ * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
+ * @cpu: cpu to use
+ * @work: job to be done
+ * @delay: number of jiffies to wait
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue on the specified CPU.
+ */
int schedule_delayed_work_on(int cpu,
struct work_struct *work, unsigned long delay)
{
- int ret = 0;
- struct timer_list *timer = &work->timer;
-
- if (!test_and_set_bit(0, &work->pending)) {
- BUG_ON(timer_pending(timer));
- BUG_ON(!list_empty(&work->entry));
- /* This stores keventd_wq for the moment, for the timer_fn */
- work->wq_data = keventd_wq;
- timer->expires = jiffies + delay;
- timer->data = (unsigned long)work;
- timer->function = delayed_work_timer_fn;
- add_timer_on(timer, cpu);
- ret = 1;
- }
- return ret;
+ return queue_delayed_work_on(cpu, keventd_wq, work, delay);
}
+EXPORT_SYMBOL(schedule_delayed_work_on);
-int schedule_on_each_cpu(void (*func) (void *info), void *info)
+/**
+ * schedule_on_each_cpu - call a function on each online CPU from keventd
+ * @func: the function to call
+ * @info: a pointer to pass to func()
+ *
+ * Returns zero on success.
+ * Returns -ve errno on failure.
+ *
+ * Appears to be racy against CPU hotplug.
+ *
+ * schedule_on_each_cpu() is very slow.
+ */
+int schedule_on_each_cpu(void (*func)(void *info), void *info)
{
int cpu;
- struct work_struct *work;
-
- work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL);
+ struct work_struct *works;
- if (!work)
+ works = alloc_percpu(struct work_struct);
+ if (!works)
return -ENOMEM;
+
+ mutex_lock(&workqueue_mutex);
for_each_online_cpu(cpu) {
- INIT_WORK(work + cpu, func, info);
+ INIT_WORK(per_cpu_ptr(works, cpu), func, info);
__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
- work + cpu);
+ per_cpu_ptr(works, cpu));
}
+ mutex_unlock(&workqueue_mutex);
flush_workqueue(keventd_wq);
- kfree(work);
+ free_percpu(works);
return 0;
}
@@ -451,6 +526,7 @@ void flush_scheduled_work(void)
{
flush_workqueue(keventd_wq);
}
+EXPORT_SYMBOL(flush_scheduled_work);
/**
* cancel_rearming_delayed_workqueue - reliably kill off a delayed
@@ -547,7 +623,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
}
/* We're holding the cpucontrol mutex here */
-static int workqueue_cpu_callback(struct notifier_block *nfb,
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
@@ -556,6 +632,7 @@ static int workqueue_cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
+ mutex_lock(&workqueue_mutex);
/* Create a new workqueue thread for it. */
list_for_each_entry(wq, &workqueues, list) {
if (!create_workqueue_thread(wq, hotcpu)) {
@@ -574,15 +651,27 @@ static int workqueue_cpu_callback(struct notifier_block *nfb,
kthread_bind(cwq->thread, hotcpu);
wake_up_process(cwq->thread);
}
+ mutex_unlock(&workqueue_mutex);
break;
case CPU_UP_CANCELED:
list_for_each_entry(wq, &workqueues, list) {
+ if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread)
+ continue;
/* Unbind so it can run. */
kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
any_online_cpu(cpu_online_map));
cleanup_workqueue_thread(wq, hotcpu);
}
+ mutex_unlock(&workqueue_mutex);
+ break;
+
+ case CPU_DOWN_PREPARE:
+ mutex_lock(&workqueue_mutex);
+ break;
+
+ case CPU_DOWN_FAILED:
+ mutex_unlock(&workqueue_mutex);
break;
case CPU_DEAD:
@@ -590,6 +679,7 @@ static int workqueue_cpu_callback(struct notifier_block *nfb,
cleanup_workqueue_thread(wq, hotcpu);
list_for_each_entry(wq, &workqueues, list)
take_over_work(wq, hotcpu);
+ mutex_unlock(&workqueue_mutex);
break;
}
@@ -605,13 +695,3 @@ void init_workqueues(void)
BUG_ON(!keventd_wq);
}
-EXPORT_SYMBOL_GPL(__create_workqueue);
-EXPORT_SYMBOL_GPL(queue_work);
-EXPORT_SYMBOL_GPL(queue_delayed_work);
-EXPORT_SYMBOL_GPL(flush_workqueue);
-EXPORT_SYMBOL_GPL(destroy_workqueue);
-
-EXPORT_SYMBOL(schedule_work);
-EXPORT_SYMBOL(schedule_delayed_work);
-EXPORT_SYMBOL(schedule_delayed_work_on);
-EXPORT_SYMBOL(flush_scheduled_work);
OpenPOWER on IntegriCloud