summaryrefslogtreecommitdiffstats
path: root/fs/proc
diff options
context:
space:
mode:
Diffstat (limited to 'fs/proc')
-rw-r--r--fs/proc/Kconfig15
-rw-r--r--fs/proc/array.c35
-rw-r--r--fs/proc/base.c270
-rw-r--r--fs/proc/fd.c2
-rw-r--r--fs/proc/internal.h4
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/task_mmu.c39
-rw-r--r--fs/proc/vmcore.c386
8 files changed, 550 insertions, 203 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 1ade1206bb89..0eaeb41453f5 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -43,6 +43,21 @@ config PROC_VMCORE
help
Exports the dump image of crashed kernel in ELF format.
+config PROC_VMCORE_DEVICE_DUMP
+ bool "Device Hardware/Firmware Log Collection"
+ depends on PROC_VMCORE
+ default n
+ help
+ After kernel panic, device drivers can collect the device
+ specific snapshot of their hardware or firmware before the
+ underlying devices are initialized in crash recovery kernel.
+ Note that the device driver must be present in the crash
+ recovery kernel's initramfs to collect its underlying device
+ snapshot.
+
+ If you say Y here, the collected device dumps will be added
+ as ELF notes to /proc/vmcore.
+
config PROC_SYSCTL
bool "Sysctl support (/proc/sys)" if EXPERT
depends on PROC_FS
diff --git a/fs/proc/array.c b/fs/proc/array.c
index e6d7f41b6684..0ceb3b6b37e7 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -96,22 +96,29 @@
#include <asm/processor.h>
#include "internal.h"
-static inline void task_name(struct seq_file *m, struct task_struct *p)
+void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
{
char *buf;
size_t size;
- char tcomm[sizeof(p->comm)];
+ char tcomm[64];
int ret;
- get_task_comm(tcomm, p);
-
- seq_puts(m, "Name:\t");
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_comm(tcomm, sizeof(tcomm), p);
+ else
+ __get_task_comm(tcomm, sizeof(tcomm), p);
size = seq_get_buf(m, &buf);
- ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
- seq_commit(m, ret < size ? ret : -1);
+ if (escape) {
+ ret = string_escape_str(tcomm, buf, size,
+ ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ if (ret >= size)
+ ret = -1;
+ } else {
+ ret = strscpy(buf, tcomm, size);
+ }
- seq_putc(m, '\n');
+ seq_commit(m, ret);
}
/*
@@ -261,7 +268,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
unsigned long flags;
sigset_t pending, shpending, blocked, ignored, caught;
int num_threads = 0;
- unsigned long qsize = 0;
+ unsigned int qsize = 0;
unsigned long qlim = 0;
sigemptyset(&pending);
@@ -390,7 +397,10 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
{
struct mm_struct *mm = get_task_mm(task);
- task_name(m, task);
+ seq_puts(m, "Name:\t");
+ proc_task_name(m, task, true);
+ seq_putc(m, '\n');
+
task_state(m, ns, pid, task);
if (mm) {
@@ -425,7 +435,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
u64 cutime, cstime, utime, stime;
u64 cgtime, gtime;
unsigned long rsslim = 0;
- char tcomm[sizeof(task->comm)];
unsigned long flags;
state = *get_task_state(task);
@@ -452,8 +461,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
}
}
- get_task_comm(tcomm, task);
-
sigemptyset(&sigign);
sigemptyset(&sigcatch);
cutime = cstime = utime = stime = 0;
@@ -520,7 +527,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
seq_puts(m, " (");
- seq_puts(m, tcomm);
+ proc_task_name(m, task, false);
seq_puts(m, ") ");
seq_putc(m, state);
seq_put_decimal_ll(m, " ", ppid);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 33ed1746927a..4aa9ce5df02f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -205,171 +205,129 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
return result;
}
-static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
- size_t _count, loff_t *pos)
+static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
+ size_t count, loff_t *ppos)
{
- struct task_struct *tsk;
- struct mm_struct *mm;
- char *page;
- unsigned long count = _count;
unsigned long arg_start, arg_end, env_start, env_end;
- unsigned long len1, len2, len;
- unsigned long p;
- char c;
- ssize_t rv;
-
- BUG_ON(*pos < 0);
+ unsigned long pos, len;
+ char *page;
- tsk = get_proc_task(file_inode(file));
- if (!tsk)
- return -ESRCH;
- mm = get_task_mm(tsk);
- put_task_struct(tsk);
- if (!mm)
- return 0;
/* Check if process spawned far enough to have cmdline. */
- if (!mm->env_end) {
- rv = 0;
- goto out_mmput;
- }
-
- page = (char *)__get_free_page(GFP_KERNEL);
- if (!page) {
- rv = -ENOMEM;
- goto out_mmput;
- }
+ if (!mm->env_end)
+ return 0;
- down_read(&mm->mmap_sem);
+ spin_lock(&mm->arg_lock);
arg_start = mm->arg_start;
arg_end = mm->arg_end;
env_start = mm->env_start;
env_end = mm->env_end;
- up_read(&mm->mmap_sem);
-
- BUG_ON(arg_start > arg_end);
- BUG_ON(env_start > env_end);
+ spin_unlock(&mm->arg_lock);
- len1 = arg_end - arg_start;
- len2 = env_end - env_start;
+ if (arg_start >= arg_end)
+ return 0;
- /* Empty ARGV. */
- if (len1 == 0) {
- rv = 0;
- goto out_free_page;
- }
/*
- * Inherently racy -- command line shares address space
- * with code and data.
+ * We have traditionally allowed the user to re-write
+ * the argument strings and overflow the end result
+ * into the environment section. But only do that if
+ * the environment area is contiguous to the arguments.
*/
- rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON);
- if (rv <= 0)
- goto out_free_page;
-
- rv = 0;
-
- if (c == '\0') {
- /* Command line (set of strings) occupies whole ARGV. */
- if (len1 <= *pos)
- goto out_free_page;
-
- p = arg_start + *pos;
- len = len1 - *pos;
- while (count > 0 && len > 0) {
- unsigned int _count;
- int nr_read;
-
- _count = min3(count, len, PAGE_SIZE);
- nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON);
- if (nr_read < 0)
- rv = nr_read;
- if (nr_read <= 0)
- goto out_free_page;
-
- if (copy_to_user(buf, page, nr_read)) {
- rv = -EFAULT;
- goto out_free_page;
- }
+ if (env_start != arg_end || env_start >= env_end)
+ env_start = env_end = arg_end;
- p += nr_read;
- len -= nr_read;
- buf += nr_read;
- count -= nr_read;
- rv += nr_read;
- }
- } else {
- /*
- * Command line (1 string) occupies ARGV and
- * extends into ENVP.
- */
- struct {
- unsigned long p;
- unsigned long len;
- } cmdline[2] = {
- { .p = arg_start, .len = len1 },
- { .p = env_start, .len = len2 },
- };
- loff_t pos1 = *pos;
- unsigned int i;
+ /* We're not going to care if "*ppos" has high bits set */
+ pos = arg_start + *ppos;
+
+ /* .. but we do check the result is in the proper range */
+ if (pos < arg_start || pos >= env_end)
+ return 0;
+
+ /* .. and we never go past env_end */
+ if (env_end - pos < count)
+ count = env_end - pos;
+
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ len = 0;
+ while (count) {
+ int got;
+ size_t size = min_t(size_t, PAGE_SIZE, count);
+
+ got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
+ if (got <= 0)
+ break;
+
+ /* Don't walk past a NUL character once you hit arg_end */
+ if (pos + got >= arg_end) {
+ int n = 0;
- i = 0;
- while (i < 2 && pos1 >= cmdline[i].len) {
- pos1 -= cmdline[i].len;
- i++;
+ /*
+ * If we started before 'arg_end' but ended up
+ * at or after it, we start the NUL character
+ * check at arg_end-1 (where we expect the normal
+ * EOF to be).
+ *
+ * NOTE! This is smaller than 'got', because
+ * pos + got >= arg_end
+ */
+ if (pos < arg_end)
+ n = arg_end - pos - 1;
+
+ /* Cut off at first NUL after 'n' */
+ got = n + strnlen(page+n, got-n);
+ if (!got)
+ break;
}
- while (i < 2) {
- p = cmdline[i].p + pos1;
- len = cmdline[i].len - pos1;
- while (count > 0 && len > 0) {
- unsigned int _count, l;
- int nr_read;
- bool final;
-
- _count = min3(count, len, PAGE_SIZE);
- nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON);
- if (nr_read < 0)
- rv = nr_read;
- if (nr_read <= 0)
- goto out_free_page;
-
- /*
- * Command line can be shorter than whole ARGV
- * even if last "marker" byte says it is not.
- */
- final = false;
- l = strnlen(page, nr_read);
- if (l < nr_read) {
- nr_read = l;
- final = true;
- }
-
- if (copy_to_user(buf, page, nr_read)) {
- rv = -EFAULT;
- goto out_free_page;
- }
-
- p += nr_read;
- len -= nr_read;
- buf += nr_read;
- count -= nr_read;
- rv += nr_read;
-
- if (final)
- goto out_free_page;
- }
- /* Only first chunk can be read partially. */
- pos1 = 0;
- i++;
+ got -= copy_to_user(buf, page, got);
+ if (unlikely(!got)) {
+ if (!len)
+ len = -EFAULT;
+ break;
}
+ pos += got;
+ buf += got;
+ len += got;
+ count -= got;
}
-out_free_page:
free_page((unsigned long)page);
-out_mmput:
+ return len;
+}
+
+static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
+ size_t count, loff_t *pos)
+{ /* Read tsk's command line into buf by delegating to get_mm_cmdline(). */
+ struct mm_struct *mm;
+ ssize_t ret;
+
+ mm = get_task_mm(tsk); /* paired with the mmput() below */
+ if (!mm)
+ return 0; /* no mm to read from: report an empty read */
+
+ ret = get_mm_cmdline(mm, buf, count, pos);
mmput(mm);
- if (rv > 0)
- *pos += rv;
- return rv;
+ return ret; /* whatever get_mm_cmdline() returned; *pos is not advanced here */
+}
+
+static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
+ size_t count, loff_t *pos)
+{ /* Look up the task behind this proc inode and read its command line. */
+ struct task_struct *tsk;
+ ssize_t ret;
+
+ BUG_ON(*pos < 0); /* a negative file offset is treated as a kernel bug */
+
+ tsk = get_proc_task(file_inode(file));
+ if (!tsk)
+ return -ESRCH; /* no task found for this inode */
+ ret = get_task_cmdline(tsk, buf, count, pos);
+ put_task_struct(tsk); /* paired with get_proc_task() above */
+ if (ret > 0)
+ *pos += ret; /* advance the file offset only when bytes were read */
+ return ret;
}
static const struct file_operations proc_pid_cmdline_ops = {
@@ -430,7 +388,6 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
struct stack_trace trace;
unsigned long *entries;
int err;
- int i;
entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
if (!entries)
@@ -443,6 +400,8 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
err = lock_trace(task);
if (!err) {
+ unsigned int i;
+
save_stack_trace_tsk(task, &trace);
for (i = 0; i < trace.nr_entries; i++) {
@@ -927,10 +886,10 @@ static ssize_t environ_read(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- down_read(&mm->mmap_sem);
+ spin_lock(&mm->arg_lock);
env_start = mm->env_start;
env_end = mm->env_end;
- up_read(&mm->mmap_sem);
+ spin_unlock(&mm->arg_lock);
while (count > 0) {
size_t this_len, max_len;
@@ -1563,9 +1522,8 @@ static int comm_show(struct seq_file *m, void *v)
if (!p)
return -ESRCH;
- task_lock(p);
- seq_printf(m, "%s\n", p->comm);
- task_unlock(p);
+ proc_task_name(m, p, false);
+ seq_putc(m, '\n');
put_task_struct(p);
@@ -1785,9 +1743,9 @@ int pid_getattr(const struct path *path, struct kstat *stat,
generic_fillattr(inode, stat);
- rcu_read_lock();
stat->uid = GLOBAL_ROOT_UID;
stat->gid = GLOBAL_ROOT_GID;
+ rcu_read_lock();
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) {
@@ -1876,7 +1834,7 @@ const struct dentry_operations pid_dentry_operations =
* by stat.
*/
bool proc_fill_cache(struct file *file, struct dir_context *ctx,
- const char *name, int len,
+ const char *name, unsigned int len,
instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
struct dentry *child, *dir = file->f_path.dentry;
@@ -1895,19 +1853,19 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
struct dentry *res;
res = instantiate(child, task, ptr);
d_lookup_done(child);
- if (IS_ERR(res))
- goto end_instantiate;
if (unlikely(res)) {
dput(child);
child = res;
+ if (IS_ERR(child))
+ goto end_instantiate;
}
}
}
inode = d_inode(child);
ino = inode->i_ino;
type = inode->i_mode >> 12;
-end_instantiate:
dput(child);
+end_instantiate:
return dir_emit(ctx, name, len, ino, type);
}
@@ -3252,7 +3210,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
iter.task;
iter.tgid += 1, iter = next_tgid(ns, iter)) {
char name[10 + 1];
- int len;
+ unsigned int len;
cond_resched();
if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE))
@@ -3579,7 +3537,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
task;
task = next_tid(task), ctx->pos++) {
char name[10 + 1];
- int len;
+ unsigned int len;
tid = task_pid_nr_ns(task, ns);
len = snprintf(name, sizeof(name), "%u", tid);
if (!proc_fill_cache(file, ctx, name, len,
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 05b9893e9a22..81882a13212d 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -248,7 +248,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
struct file *f;
struct fd_data data;
char name[10 + 1];
- int len;
+ unsigned int len;
f = fcheck_files(files, fd);
if (!f)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 43c70c9e6b62..50cb22a08c2f 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -136,6 +136,8 @@ unsigned name_to_int(const struct qstr *qstr);
*/
extern const struct file_operations proc_tid_children_operations;
+extern void proc_task_name(struct seq_file *m, struct task_struct *p,
+ bool escape);
extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
struct pid *, struct task_struct *);
extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
@@ -161,7 +163,7 @@ extern loff_t mem_lseek(struct file *, loff_t, int);
/* Lookups */
typedef struct dentry *instantiate_t(struct dentry *,
struct task_struct *, const void *);
-extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
+bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int,
instantiate_t, struct task_struct *, const void *);
/*
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1491918a33c3..792c78a49174 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -154,6 +154,8 @@ u64 stable_page_flags(struct page *page)
if (PageBalloon(page))
u |= 1 << KPF_BALLOON;
+ if (PageTable(page))
+ u |= 1 << KPF_PGTABLE;
if (page_is_idle(page))
u |= 1 << KPF_IDLE;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a20c6e495bb2..597969db9e90 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -18,6 +18,7 @@
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
+#include <linux/pkeys.h>
#include <asm/elf.h>
#include <asm/tlb.h>
@@ -673,13 +674,16 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MERGEABLE)] = "mg",
[ilog2(VM_UFFD_MISSING)]= "um",
[ilog2(VM_UFFD_WP)] = "uw",
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#ifdef CONFIG_ARCH_HAS_PKEYS
/* These come out via ProtectionKey: */
[ilog2(VM_PKEY_BIT0)] = "",
[ilog2(VM_PKEY_BIT1)] = "",
[ilog2(VM_PKEY_BIT2)] = "",
[ilog2(VM_PKEY_BIT3)] = "",
+#if VM_PKEY_BIT4
+ [ilog2(VM_PKEY_BIT4)] = "",
#endif
+#endif /* CONFIG_ARCH_HAS_PKEYS */
};
size_t i;
@@ -727,10 +731,6 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
}
#endif /* HUGETLB_PAGE */
-void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
-{
-}
-
#define SEQ_PUT_DEC(str, val) \
seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
static int show_smap(struct seq_file *m, void *v, int is_pid)
@@ -835,7 +835,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
seq_puts(m, " kB\n");
}
if (!rollup_mode) {
- arch_show_smap(m, vma);
+ if (arch_pkeys_enabled())
+ seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
show_smap_vma_flags(m, vma);
}
m_cache_vma(m, vma);
@@ -1258,8 +1259,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
if (pte_swp_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
entry = pte_to_swp_entry(pte);
- frame = swp_type(entry) |
- (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+ if (pm->show_pfn)
+ frame = swp_type(entry) |
+ (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
@@ -1310,11 +1312,14 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
else if (is_swap_pmd(pmd)) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
- unsigned long offset = swp_offset(entry);
+ unsigned long offset;
- offset += (addr & ~PMD_MASK) >> PAGE_SHIFT;
- frame = swp_type(entry) |
- (offset << MAX_SWAPFILES_SHIFT);
+ if (pm->show_pfn) {
+ offset = swp_offset(entry) +
+ ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ frame = swp_type(entry) |
+ (offset << MAX_SWAPFILES_SHIFT);
+ }
flags |= PM_SWAP;
if (pmd_swp_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
@@ -1332,10 +1337,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
- if (pm->show_pfn && (flags & PM_PRESENT))
- frame++;
- else if (flags & PM_SWAP)
- frame += (1 << MAX_SWAPFILES_SHIFT);
+ if (pm->show_pfn) {
+ if (flags & PM_PRESENT)
+ frame++;
+ else if (flags & PM_SWAP)
+ frame += (1 << MAX_SWAPFILES_SHIFT);
+ }
}
spin_unlock(ptl);
return err;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a45f0af22a60..cfb6674331fd 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/crash_dump.h>
#include <linux/list.h>
+#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
@@ -38,12 +39,23 @@ static size_t elfcorebuf_sz_orig;
static char *elfnotes_buf;
static size_t elfnotes_sz;
+/* Size of all notes minus the device dump notes */
+static size_t elfnotes_orig_sz;
/* Total size of vmcore file. */
static u64 vmcore_size;
static struct proc_dir_entry *proc_vmcore;
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/* Device Dump list and mutex to synchronize access to list */
+static LIST_HEAD(vmcoredd_list);
+static DEFINE_MUTEX(vmcoredd_mutex);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+/* Device Dump Size */
+static size_t vmcoredd_orig_sz;
+
/*
* Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
* The called function has to take care of module refcounting.
@@ -178,6 +190,77 @@ static int copy_to(void *target, void *src, size_t size, int userbuf)
return 0;
}
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf)
+{ /* Copy size bytes, beginning start bytes into the concatenated dumps, to dst. */
+ struct vmcoredd_node *dump;
+ u64 offset = 0; /* position of the current dump within the concatenation */
+ int ret = 0;
+ size_t tsz;
+ char *buf;
+
+ mutex_lock(&vmcoredd_mutex);
+ list_for_each_entry(dump, &vmcoredd_list, list) {
+ if (start < offset + dump->size) { /* requested range reaches into this dump */
+ tsz = min(offset + (u64)dump->size - start, (u64)size); /* rest of this dump, capped at size */
+ buf = dump->buf + start - offset;
+ if (copy_to(dst, buf, tsz, userbuf)) { /* userbuf is forwarded unchanged */
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ size -= tsz;
+ start += tsz;
+ dst += tsz;
+
+ /* Stop as soon as the requested number of bytes has been copied */
+ if (!size)
+ goto out_unlock;
+ }
+ offset += dump->size;
+ }
+
+out_unlock:
+ mutex_unlock(&vmcoredd_mutex);
+ return ret; /* 0, or -EFAULT if copy_to() failed; also 0 if the list ends first */
+}
+
+static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
+ u64 start, size_t size)
+{ /* Map size bytes, beginning start bytes into the concatenated dumps, at dst in vma. */
+ struct vmcoredd_node *dump;
+ u64 offset = 0; /* position of the current dump within the concatenation */
+ int ret = 0;
+ size_t tsz;
+ char *buf;
+
+ mutex_lock(&vmcoredd_mutex);
+ list_for_each_entry(dump, &vmcoredd_list, list) {
+ if (start < offset + dump->size) { /* requested range reaches into this dump */
+ tsz = min(offset + (u64)dump->size - start, (u64)size); /* rest of this dump, capped at size */
+ buf = dump->buf + start - offset;
+ if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) {
+ ret = -EFAULT; /* any remap failure is reported as -EFAULT */
+ goto out_unlock;
+ }
+
+ size -= tsz;
+ start += tsz;
+ dst += tsz;
+
+ /* Stop as soon as the requested number of bytes has been mapped */
+ if (!size)
+ goto out_unlock;
+ }
+ offset += dump->size;
+ }
+
+out_unlock:
+ mutex_unlock(&vmcoredd_mutex);
+ return ret; /* 0, or -EFAULT on remap failure; also 0 if the list ends first */
+}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
/* Read from the ELF header and then the crash dump. On error, negative value is
* returned otherwise number of bytes read are returned.
*/
@@ -215,10 +298,41 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
if (*fpos < elfcorebuf_sz + elfnotes_sz) {
void *kaddr;
+ /* We add device dumps before other elf notes because the
+ * other elf notes may not fill the elf notes buffer
+ * completely and we will end up with zero-filled data
+ * between the elf notes and the device dumps. Tools will
+ * then try to decode this zero-filled data as valid notes
+ * and we don't want that. Hence, adding device dumps before
+ * the other elf notes ensures that zero-filled data can be
+ * avoided.
+ */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+ /* Read device dumps */
+ if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) {
+ tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
+ (size_t)*fpos, buflen);
+ start = *fpos - elfcorebuf_sz;
+ if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf))
+ return -EFAULT;
+
+ buflen -= tsz;
+ *fpos += tsz;
+ buffer += tsz;
+ acc += tsz;
+
+ /* leave now if filled buffer already */
+ if (!buflen)
+ return acc;
+ }
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+ /* Read remaining elf notes */
tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
- kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
+ kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz;
if (copy_to(buffer, kaddr, tsz, userbuf))
return -EFAULT;
+
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
@@ -302,10 +416,8 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
};
/**
- * alloc_elfnotes_buf - allocate buffer for ELF note segment in
- * vmalloc memory
- *
- * @notes_sz: size of buffer
+ * vmcore_alloc_buf - allocate buffer in vmalloc memory
+ * @size: size of buffer
*
* If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
* the buffer to user-space by means of remap_vmalloc_range().
@@ -313,12 +425,12 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
* If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
* disabled and there's no need to allow users to mmap the buffer.
*/
-static inline char *alloc_elfnotes_buf(size_t notes_sz)
+static inline char *vmcore_alloc_buf(size_t size)
{
#ifdef CONFIG_MMU
- return vmalloc_user(notes_sz);
+ return vmalloc_user(size);
#else
- return vzalloc(notes_sz);
+ return vzalloc(size);
#endif
}
@@ -446,11 +558,46 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
if (start < elfcorebuf_sz + elfnotes_sz) {
void *kaddr;
+ /* We add device dumps before other elf notes because the
+ * other elf notes may not fill the elf notes buffer
+ * completely and we will end up with zero-filled data
+ * between the elf notes and the device dumps. Tools will
+ * then try to decode this zero-filled data as valid notes
+ * and we don't want that. Hence, adding device dumps before
+ * the other elf notes ensures that zero-filled data can be
+ * avoided. This also ensures that the device dumps and
+ * other elf notes can be properly mmaped at page aligned
+ * address.
+ */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+ /* Read device dumps */
+ if (start < elfcorebuf_sz + vmcoredd_orig_sz) {
+ u64 start_off;
+
+ tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
+ (size_t)start, size);
+ start_off = start - elfcorebuf_sz;
+ if (vmcoredd_mmap_dumps(vma, vma->vm_start + len,
+ start_off, tsz))
+ goto fail;
+
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ /* leave now if filled buffer already */
+ if (!size)
+ return 0;
+ }
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+ /* Read remaining elf notes */
tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
- kaddr = elfnotes_buf + start - elfcorebuf_sz;
+ kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz;
if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
kaddr, tsz))
goto fail;
+
size -= tsz;
start += tsz;
len += tsz;
@@ -502,8 +649,8 @@ static struct vmcore* __init get_new_element(void)
return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
}
-static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
- struct list_head *vc_list)
+static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
+ struct list_head *vc_list)
{
u64 size;
struct vmcore *m;
@@ -665,7 +812,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
return rc;
*notes_sz = roundup(phdr_sz, PAGE_SIZE);
- *notes_buf = alloc_elfnotes_buf(*notes_sz);
+ *notes_buf = vmcore_alloc_buf(*notes_sz);
if (!*notes_buf)
return -ENOMEM;
@@ -698,6 +845,11 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
/* Modify e_phnum to reflect merged headers. */
ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
+ /* Store the size of all notes. We need this to update the note
+ * header when the device dumps will be added.
+ */
+ elfnotes_orig_sz = phdr.p_memsz;
+
return 0;
}
@@ -851,7 +1003,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
return rc;
*notes_sz = roundup(phdr_sz, PAGE_SIZE);
- *notes_buf = alloc_elfnotes_buf(*notes_sz);
+ *notes_buf = vmcore_alloc_buf(*notes_sz);
if (!*notes_buf)
return -ENOMEM;
@@ -884,6 +1036,11 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
/* Modify e_phnum to reflect merged headers. */
ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
+ /* Store the size of all notes. We need this to update the note
+ * header when the device dumps will be added.
+ */
+ elfnotes_orig_sz = phdr.p_memsz;
+
return 0;
}
@@ -976,8 +1133,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
}
/* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
- struct list_head *vc_list)
+static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
+ struct list_head *vc_list)
{
loff_t vmcore_off;
struct vmcore *m;
@@ -1145,6 +1302,202 @@ static int __init parse_crash_elf_headers(void)
return 0;
}
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/**
+ * vmcoredd_write_header - Write vmcore device dump header at the
+ * beginning of the dump's buffer.
+ * @buf: Output buffer; the header is written at its start
+ * @data: Dump info; only @data->dump_name is read here
+ * @size: Number of bytes that follow the header in @buf
+ *
+ * Fills the beginning of the dump's buffer with the vmcore device dump header.
+ */
+static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
+ u32 size)
+{
+ struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf;
+
+ vdd_hdr->n_namesz = sizeof(vdd_hdr->name);
+ vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name); /* descriptor covers dump_name plus @size bytes */
+ vdd_hdr->n_type = NT_VMCOREDD;
+
+ strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME, /* NOTE(review): unterminated if the name fills the field - confirm sizes */
+ sizeof(vdd_hdr->name));
+ memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name));
+}
+
+/**
+ * vmcoredd_update_program_headers - Update all Elf program headers
+ * @elfptr: Pointer to elf header
+ * @elfnotesz: Size of elf notes aligned to page size
+ * @vmcoreddsz: Size of device dumps to be added to elf note header
+ *
+ * Determine type of Elf header (Elf64 or Elf32) and update the elf note size.
+ * Also recompute the file offset of every non-note program header.
+ */
+static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz,
+ size_t vmcoreddsz)
+{
+ unsigned char *e_ident = (unsigned char *)elfptr;
+ u64 start, end, size;
+ loff_t vmcore_off; /* running file offset handed to the next non-note segment */
+ u32 i;
+
+ vmcore_off = elfcorebuf_sz + elfnotesz; /* start right after elfcorebuf_sz + @elfnotesz bytes */
+
+ if (e_ident[EI_CLASS] == ELFCLASS64) {
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfptr;
+ Elf64_Phdr *phdr = (Elf64_Phdr *)(elfptr + sizeof(Elf64_Ehdr)); /* table is read from directly behind the Elf header */
+
+ /* Update all program headers */
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (phdr->p_type == PT_NOTE) {
+ /* Note size = original notes plus all device dumps */
+ phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz;
+ phdr->p_filesz = phdr->p_memsz;
+ continue;
+ }
+
+ start = rounddown(phdr->p_offset, PAGE_SIZE); /* page-align the segment's range */
+ end = roundup(phdr->p_offset + phdr->p_memsz,
+ PAGE_SIZE);
+ size = end - start;
+ phdr->p_offset = vmcore_off + (phdr->p_offset - start); /* keep the offset within the first page */
+ vmcore_off += size;
+ }
+ } else { /* every other EI_CLASS value is handled as Elf32 */
+ Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfptr;
+ Elf32_Phdr *phdr = (Elf32_Phdr *)(elfptr + sizeof(Elf32_Ehdr)); /* table is read from directly behind the Elf header */
+
+ /* Update all program headers */
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (phdr->p_type == PT_NOTE) {
+ /* Note size = original notes plus all device dumps */
+ phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz;
+ phdr->p_filesz = phdr->p_memsz;
+ continue;
+ }
+
+ start = rounddown(phdr->p_offset, PAGE_SIZE); /* page-align the segment's range */
+ end = roundup(phdr->p_offset + phdr->p_memsz,
+ PAGE_SIZE);
+ size = end - start;
+ phdr->p_offset = vmcore_off + (phdr->p_offset - start); /* keep the offset within the first page */
+ vmcore_off += size;
+ }
+ }
+}
+
+/**
+ * vmcoredd_update_size - Update the total size of the device dumps and update
+ * Elf header
+ * @dump_size: Size of the current device dump to be added to total size
+ *
+ * Update the total size of all the device dumps and update the Elf program
+ * headers. Calculate the new offsets for the vmcore list and update the
+ * total vmcore size, including the size stored in the proc entry.
+ */
+static void vmcoredd_update_size(size_t dump_size)
+{
+ vmcoredd_orig_sz += dump_size; /* running total of all device dumps */
+ elfnotes_sz = roundup(elfnotes_orig_sz, PAGE_SIZE) + vmcoredd_orig_sz;
+ vmcoredd_update_program_headers(elfcorebuf, elfnotes_sz,
+ vmcoredd_orig_sz);
+
+ /* Update vmcore list offsets */
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+
+ vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+ &vmcore_list);
+ proc_vmcore->size = vmcore_size; /* NOTE(review): assumes proc_vmcore is non-NULL here - confirm */
+}
+
+/**
+ * vmcore_add_device_dump - Add a buffer containing device dump to vmcore
+ * @data: dump info (name, payload size and collection callback).
+ *
+ * Allocate a page-aligned buffer, write the device dump Elf note header at
+ * its start, invoke the driver's collect callback and queue the dump on the
+ * global list. Returns 0, -EINVAL, -ENOMEM or the callback's error.
+ */
+int vmcore_add_device_dump(struct vmcoredd_data *data)
+{
+ struct vmcoredd_node *dump;
+ void *buf = NULL;
+ size_t data_size;
+ int ret;
+
+ if (!data || !strlen(data->dump_name) ||
+ !data->vmcoredd_callback || !data->size)
+ return -EINVAL; /* reject a missing name, callback or size */
+
+ dump = vzalloc(sizeof(*dump));
+ if (!dump) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ /* Keep size of the buffer page aligned so that it can be mmaped */
+ data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
+ PAGE_SIZE);
+
+ /* Allocate buffer for drivers to write their dumps */
+ buf = vmcore_alloc_buf(data_size);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ vmcoredd_write_header(buf, data, data_size - /* size excludes the header but includes the page padding */
+ sizeof(struct vmcoredd_header));
+
+ /* Invoke the driver's dump collection routine */
+ ret = data->vmcoredd_callback(data, buf +
+ sizeof(struct vmcoredd_header));
+ if (ret)
+ goto out_err;
+
+ dump->buf = buf;
+ dump->size = data_size;
+
+ /* Add the dump to the global device dump list */
+ mutex_lock(&vmcoredd_mutex);
+ list_add_tail(&dump->list, &vmcoredd_list);
+ mutex_unlock(&vmcoredd_mutex);
+
+ vmcoredd_update_size(data_size); /* NOTE(review): runs outside vmcoredd_mutex - confirm callers serialize */
+ return 0;
+
+out_err:
+ if (buf)
+ vfree(buf);
+
+ if (dump)
+ vfree(dump);
+
+ return ret;
+}
+EXPORT_SYMBOL(vmcore_add_device_dump);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+/* Free every queued device dump; empty without CONFIG_PROC_VMCORE_DEVICE_DUMP */
+static void vmcore_free_device_dumps(void)
+{
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+ mutex_lock(&vmcoredd_mutex);
+ while (!list_empty(&vmcoredd_list)) { /* pop from the head until the list is empty */
+ struct vmcoredd_node *dump;
+
+ dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node,
+ list);
+ list_del(&dump->list);
+ vfree(dump->buf); /* release the dump buffer, then the node itself */
+ vfree(dump);
+ }
+ mutex_unlock(&vmcoredd_mutex);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+}
+
/* Init function for vmcore module. */
static int __init vmcore_init(void)
{
@@ -1192,4 +1545,7 @@ void vmcore_cleanup(void)
kfree(m);
}
free_elfcorebuf();
+
+ /* clear vmcore device dump list */
+ vmcore_free_device_dumps();
}
OpenPOWER on IntegriCloud