diff options
Diffstat (limited to 'arch/s390/mm')
| -rw-r--r-- | arch/s390/mm/Makefile | 6 | ||||
| -rw-r--r-- | arch/s390/mm/cmm.c | 74 | ||||
| -rw-r--r-- | arch/s390/mm/dump_pagetables.c | 58 | ||||
| -rw-r--r-- | arch/s390/mm/extmem.c | 8 | ||||
| -rw-r--r-- | arch/s390/mm/fault.c | 74 | ||||
| -rw-r--r-- | arch/s390/mm/gmap.c | 468 | ||||
| -rw-r--r-- | arch/s390/mm/gup.c | 2 | ||||
| -rw-r--r-- | arch/s390/mm/hugetlbpage.c | 24 | ||||
| -rw-r--r-- | arch/s390/mm/init.c | 10 | ||||
| -rw-r--r-- | arch/s390/mm/kasan_init.c | 387 | ||||
| -rw-r--r-- | arch/s390/mm/maccess.c | 25 | ||||
| -rw-r--r-- | arch/s390/mm/mem_detect.c | 62 | ||||
| -rw-r--r-- | arch/s390/mm/mmap.c | 15 | ||||
| -rw-r--r-- | arch/s390/mm/page-states.c | 2 | ||||
| -rw-r--r-- | arch/s390/mm/pageattr.c | 6 | ||||
| -rw-r--r-- | arch/s390/mm/pgalloc.c | 321 | ||||
| -rw-r--r-- | arch/s390/mm/pgtable.c | 163 | ||||
| -rw-r--r-- | arch/s390/mm/vmem.c | 7 |
18 files changed, 1450 insertions, 262 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 33fe418506bc..f5880bfd1b0c 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -4,10 +4,12 @@ # obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o -obj-y += page-states.o gup.o pageattr.o mem_detect.o -obj-y += pgtable.o pgalloc.o +obj-y += page-states.o gup.o pageattr.o pgtable.o pgalloc.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o obj-$(CONFIG_PGSTE) += gmap.o + +KASAN_SANITIZE_kasan_init.o := n +obj-$(CONFIG_KASAN) += kasan_init.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 6cf024eb2085..510a18299196 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -191,12 +191,7 @@ static void cmm_set_timer(void) del_timer(&cmm_timer); return; } - if (timer_pending(&cmm_timer)) { - if (mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds*HZ)) - return; - } - cmm_timer.expires = jiffies + cmm_timeout_seconds*HZ; - add_timer(&cmm_timer); + mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ); } static void cmm_timer_fn(struct timer_list *unused) @@ -251,45 +246,42 @@ static int cmm_skip_blanks(char *cp, char **endp) return str != cp; } -static struct ctl_table cmm_table[]; - static int cmm_pages_handler(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - char buf[16], *p; - unsigned int len; - long nr; + long nr = cmm_get_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; - if (!*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; - if (write) { - len = *lenp; - if (copy_from_user(buf, buffer, - len > sizeof(buf) ? sizeof(buf) : len)) - return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - cmm_skip_blanks(buf, &p); - nr = simple_strtoul(p, &p, 0); - if (ctl == &cmm_table[0]) - cmm_set_pages(nr); - else - cmm_add_timed_pages(nr); - } else { - if (ctl == &cmm_table[0]) - nr = cmm_get_pages(); - else - nr = cmm_get_timed_pages(); - len = sprintf(buf, "%ld\n", nr); - if (len > *lenp) - len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; - } - *lenp = len; - *ppos += len; + cmm_set_pages(nr); + return 0; +} + +static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + long nr = cmm_get_timed_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; + + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + + cmm_add_timed_pages(nr); return 0; } @@ -338,7 +330,7 @@ static struct ctl_table cmm_table[] = { { .procname = "cmm_timed_pages", .mode = 0644, - .proc_handler = cmm_pages_handler, + .proc_handler = cmm_timed_pages_handler, }, { .procname = "cmm_timeout", diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 507f23ba2034..363f6470d742 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -3,6 +3,8 @@ #include <linux/debugfs.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/kasan.h> +#include <asm/kasan.h> #include <asm/sections.h> #include <asm/pgtable.h> @@ -17,18 +19,26 @@ enum address_markers_idx { IDENTITY_NR = 0, KERNEL_START_NR, KERNEL_END_NR, +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif VMEMMAP_NR, VMALLOC_NR, MODULES_NR, }; static struct addr_marker address_markers[] = { - [IDENTITY_NR] = {0, "Identity Mapping"}, - [KERNEL_START_NR] = {(unsigned long)&_stext, "Kernel Image Start"}, - [KERNEL_END_NR] = {(unsigned long)&_end, "Kernel Image End"}, - [VMEMMAP_NR] = {0, "vmemmap Area"}, - [VMALLOC_NR] = {0, "vmalloc Area"}, - [MODULES_NR] = {0, "Modules Area"}, + [IDENTITY_NR] = {0, "Identity Mapping"}, + [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, + [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, +#ifdef CONFIG_KASAN + [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, + [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, +#endif + [VMEMMAP_NR] = {0, "vmemmap Area"}, + [VMALLOC_NR] = {0, "vmalloc Area"}, + [MODULES_NR] = {0, "Modules Area"}, { -1, NULL } }; @@ -80,7 +90,7 @@ static void note_page(struct seq_file *m, struct pg_state *st, } else if (prot != cur || level != st->level || st->current_address >= st->marker[1].start_address) { /* Print the actual finished series */ - seq_printf(m, "0x%0*lx-0x%0*lx", + seq_printf(m, "0x%0*lx-0x%0*lx ", width, st->start_address, width, st->current_address); delta = (st->current_address - st->start_address) >> 10; @@ -90,7 +100,7 @@ static void note_page(struct seq_file *m, struct pg_state *st, } seq_printf(m, "%9lu%c ", delta, *unit); print_prot(m, st->current_prot, st->level); - if (st->current_address >= st->marker[1].start_address) { + while (st->current_address >= st->marker[1].start_address) { st->marker++; seq_printf(m, "---[ %s ]---\n", st->marker->name); } @@ -100,6 +110,17 @@ static void note_page(struct seq_file *m, struct pg_state *st, } } +#ifdef CONFIG_KASAN +static void note_kasan_zero_page(struct seq_file *m, struct pg_state *st) +{ + unsigned int prot; + + prot = pte_val(*kasan_zero_pte) & + (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC); + note_page(m, st, prot, 4); +} +#endif + /* * The actual page table walker functions. In order to keep the * implementation of print_prot() short, we only check and pass @@ -132,6 +153,13 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pmd_t *pmd; int i; +#ifdef CONFIG_KASAN + if ((pud_val(*pud) & PAGE_MASK) == __pa(kasan_zero_pmd)) { + note_kasan_zero_page(m, st); + return; + } +#endif + for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++) { st->current_address = addr; pmd = pmd_offset(pud, addr); @@ -156,6 +184,13 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pud_t *pud; int i; +#ifdef CONFIG_KASAN + if ((p4d_val(*p4d) & PAGE_MASK) == __pa(kasan_zero_pud)) { + note_kasan_zero_page(m, st); + return; + } +#endif + for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++) { st->current_address = addr; pud = pud_offset(p4d, addr); @@ -179,6 +214,13 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, p4d_t *p4d; int i; +#ifdef CONFIG_KASAN + if ((pgd_val(*pgd) & PAGE_MASK) == __pa(kasan_zero_p4d)) { + note_kasan_zero_page(m, st); + return; + } +#endif + for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++) { st->current_address = addr; p4d = p4d_offset(pgd, addr); diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 920d40894535..eba2def3414d 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -16,7 +16,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/ctype.h> #include <linux/ioport.h> #include <asm/diag.h> @@ -80,7 +80,7 @@ struct qin64 { struct dcss_segment { struct list_head list; char dcss_name[8]; - char res_name[15]; + char res_name[16]; unsigned long start_addr; unsigned long end; atomic_t ref_count; @@ -103,7 +103,7 @@ static int scode_set; static int dcss_set_subcodes(void) { - char *name = kmalloc(8 * sizeof(char), GFP_KERNEL | GFP_DMA); + char *name = kmalloc(8, GFP_KERNEL | GFP_DMA); unsigned long rx, ry; int rc; @@ -433,7 +433,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long memcpy(&seg->res_name, seg->dcss_name, 8); EBCASC(seg->res_name, 8); seg->res_name[8] = '\0'; - strncat(seg->res_name, " (DCSS)", 7); + strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); seg->res->name = seg->res_name; rc = seg->vm_segtype; if (rc == SEG_TYPE_SC || diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 93faeca52284..2b8f32f56e0c 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -265,14 +265,10 @@ void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) */ static noinline void do_sigsegv(struct pt_regs *regs, int si_code) { - struct siginfo si; - report_user_fault(regs, SIGSEGV, 1); - si.si_signo = SIGSEGV; - si.si_errno = 0; - si.si_code = si_code; - si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK); - force_sig_info(SIGSEGV, &si, current); + force_sig_fault(SIGSEGV, si_code, + (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK), + current); } static noinline void do_no_context(struct pt_regs *regs) @@ -316,18 +312,13 @@ static noinline void do_low_address(struct pt_regs *regs) static noinline void do_sigbus(struct pt_regs *regs) { - struct task_struct *tsk = current; - struct siginfo si; - /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ - si.si_signo = SIGBUS; - si.si_errno = 0; - si.si_code = BUS_ADRERR; - si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK); - force_sig_info(SIGBUS, &si, tsk); + force_sig_fault(SIGBUS, BUS_ADRERR, + (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK), + current); } static noinline int signal_return(struct pt_regs *regs) @@ -350,7 +341,8 @@ static noinline int signal_return(struct pt_regs *regs) return -EACCES; } -static noinline void do_fault_error(struct pt_regs *regs, int access, int fault) +static noinline void do_fault_error(struct pt_regs *regs, int access, + vm_fault_t fault) { int si_code; @@ -410,7 +402,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, int fault) * 11 Page translation -> Not present (nullification) * 3b Region third trans. -> Not present (nullification) */ -static inline int do_exception(struct pt_regs *regs, int access) +static inline vm_fault_t do_exception(struct pt_regs *regs, int access) { struct gmap *gmap; struct task_struct *tsk; @@ -420,7 +412,7 @@ static inline int do_exception(struct pt_regs *regs, int access) unsigned long trans_exc_code; unsigned long address; unsigned int flags; - int fault; + vm_fault_t fault; tsk = current; /* @@ -511,6 +503,8 @@ retry: /* No reason to continue if interrupted by SIGKILL. */ if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { fault = VM_FAULT_SIGNAL; + if (flags & FAULT_FLAG_RETRY_NOWAIT) + goto out_up; goto out; } if (unlikely(fault & VM_FAULT_ERROR)) @@ -571,7 +565,8 @@ out: void do_protection_exception(struct pt_regs *regs) { unsigned long trans_exc_code; - int access, fault; + int access; + vm_fault_t fault; trans_exc_code = regs->int_parm_long; /* @@ -606,7 +601,8 @@ NOKPROBE_SYMBOL(do_protection_exception); void do_dat_exception(struct pt_regs *regs) { - int access, fault; + int access; + vm_fault_t fault; access = VM_READ | VM_EXEC | VM_WRITE; fault = do_exception(regs, access); @@ -640,17 +636,19 @@ struct pfault_refbk { u64 reserved; } __attribute__ ((packed, aligned(8))); +static struct pfault_refbk pfault_init_refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_LPP, + .refselmk = 1ULL << 48, + .refcmpmk = 1ULL << 48, + .reserved = __PF_RES_FIELD +}; + int pfault_init(void) { - struct pfault_refbk refbk = { - .refdiagc = 0x258, - .reffcode = 0, - .refdwlen = 5, - .refversn = 2, - .refgaddr = __LC_LPP, - .refselmk = 1ULL << 48, - .refcmpmk = 1ULL << 48, - .reserved = __PF_RES_FIELD }; int rc; if (pfault_disable) @@ -662,18 +660,20 @@ int pfault_init(void) "1: la %0,8\n" "2:\n" EX_TABLE(0b,1b) - : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc"); + : "=d" (rc) + : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc"); return rc; } +static struct pfault_refbk pfault_fini_refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, +}; + void pfault_fini(void) { - struct pfault_refbk refbk = { - .refdiagc = 0x258, - .reffcode = 1, - .refdwlen = 5, - .refversn = 2, - }; if (pfault_disable) return; @@ -682,7 +682,7 @@ void pfault_fini(void) " diag %0,0,0x258\n" "0: nopr %%r7\n" EX_TABLE(0b,0b) - : : "a" (&refbk), "m" (refbk) : "cc"); + : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc"); } static DEFINE_SPINLOCK(pfault_lock); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 2c55a2b9d6c6..1e668b95e0c6 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2,8 +2,10 @@ /* * KVM guest address space mapping code * - * Copyright IBM Corp. 2007, 2016 + * Copyright IBM Corp. 2007, 2016, 2018 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * David Hildenbrand <david@redhat.com> + * Janosch Frank <frankja@linux.vnet.ibm.com> */ #include <linux/kernel.h> @@ -521,6 +523,9 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table, rcu_read_unlock(); } +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, + unsigned long gaddr); + /** * gmap_link - set up shadow page tables to connect a host to a guest address * @gmap: pointer to guest mapping meta data structure @@ -541,6 +546,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) p4d_t *p4d; pud_t *pud; pmd_t *pmd; + u64 unprot; int rc; BUG_ON(gmap_is_shadow(gmap)); @@ -584,8 +590,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) return -EFAULT; pmd = pmd_offset(pud, vmaddr); VM_BUG_ON(pmd_none(*pmd)); - /* large pmds cannot yet be handled */ - if (pmd_large(*pmd)) + /* Are we allowed to use huge pages? */ + if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) return -EFAULT; /* Link gmap segment table entry location to page table. */ rc = radix_tree_preload(GFP_KERNEL); @@ -596,10 +602,22 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) if (*table == _SEGMENT_ENTRY_EMPTY) { rc = radix_tree_insert(&gmap->host_to_guest, vmaddr >> PMD_SHIFT, table); - if (!rc) - *table = pmd_val(*pmd); - } else - rc = 0; + if (!rc) { + if (pmd_large(*pmd)) { + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) + | _SEGMENT_ENTRY_GMAP_UC; + } else + *table = pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS; + } + } else if (*table & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { + unprot = (u64)*table; + unprot &= ~_SEGMENT_ENTRY_PROTECT; + unprot |= _SEGMENT_ENTRY_GMAP_UC; + gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); + } spin_unlock(&gmap->guest_table_lock); spin_unlock(ptl); radix_tree_preload_end(); @@ -690,6 +708,14 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) vmaddr |= gaddr & ~PMD_MASK; /* Find vma in the parent mm */ vma = find_vma(gmap->mm, vmaddr); + if (!vma) + continue; + /* + * We do not discard pages that are backed by + * hugetlbfs, so we don't have to refault them. + */ + if (is_vm_hugetlb_page(vma)) + continue; size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); zap_page_range(vma, vmaddr, size); } @@ -864,7 +890,134 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, */ static void gmap_pte_op_end(spinlock_t *ptl) { - spin_unlock(ptl); + if (ptl) + spin_unlock(ptl); +} + +/** + * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock + * and return the pmd pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * + * Returns a pointer to the pmd for a guest address, or NULL + */ +static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) +{ + pmd_t *pmdp; + + BUG_ON(gmap_is_shadow(gmap)); + pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); + if (!pmdp) + return NULL; + + /* without huge pages, there is no need to take the table lock */ + if (!gmap->mm->context.allow_gmap_hpage_1m) + return pmd_none(*pmdp) ? NULL : pmdp; + + spin_lock(&gmap->guest_table_lock); + if (pmd_none(*pmdp)) { + spin_unlock(&gmap->guest_table_lock); + return NULL; + } + + /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ + if (!pmd_large(*pmdp)) + spin_unlock(&gmap->guest_table_lock); + return pmdp; +} + +/** + * gmap_pmd_op_end - release the guest_table_lock if needed + * @gmap: pointer to the guest mapping meta data structure + * @pmdp: pointer to the pmd + */ +static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) +{ + if (pmd_large(*pmdp)) + spin_unlock(&gmap->guest_table_lock); +} + +/* + * gmap_protect_pmd - remove access rights to memory and set pmd notification bits + * @pmdp: pointer to the pmd to be protected + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns: + * 0 if successfully protected + * -EAGAIN if a fixup is needed + * -EINVAL if unsupported notifier bits have been specified + * + * Expected to be called with sg->mm->mmap_sem in read and + * guest_table_lock held. + */ +static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; + int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; + pmd_t new = *pmdp; + + /* Fixup needed */ + if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) + return -EAGAIN; + + if (prot == PROT_NONE && !pmd_i) { + pmd_val(new) |= _SEGMENT_ENTRY_INVALID; + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (prot == PROT_READ && !pmd_p) { + pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID; + pmd_val(new) |= _SEGMENT_ENTRY_PROTECT; + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (bits & GMAP_NOTIFY_MPROT) + pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN; + + /* Shadow GMAP protection needs split PMDs */ + if (bits & GMAP_NOTIFY_SHADOW) + return -EINVAL; + + return 0; +} + +/* + * gmap_protect_pte - remove access rights to memory and set pgste bits + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @pmdp: pointer to the pmd associated with the pte + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EAGAIN if a fixup is needed. + * + * Expected to be called with sg->mm->mmap_sem in read + */ +static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int rc; + pte_t *ptep; + spinlock_t *ptl = NULL; + unsigned long pbits = 0; + + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return -EAGAIN; + + ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); + if (!ptep) + return -ENOMEM; + + pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; + pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; + /* Protect and unlock. */ + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); + gmap_pte_op_end(ptl); + return rc; } /* @@ -883,30 +1036,45 @@ static void gmap_pte_op_end(spinlock_t *ptl) static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, unsigned long len, int prot, unsigned long bits) { - unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; + unsigned long vmaddr, dist; + pmd_t *pmdp; int rc; BUG_ON(gmap_is_shadow(gmap)); while (len) { rc = -EAGAIN; - ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); - if (ptep) { - rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); - gmap_pte_op_end(ptl); + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (pmdp) { + if (!pmd_large(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, + bits); + if (!rc) { + len -= PAGE_SIZE; + gaddr += PAGE_SIZE; + } + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, + bits); + if (!rc) { + dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); + len = len < dist ? 0 : len - dist; + gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; + } + } + gmap_pmd_op_end(gmap, pmdp); } if (rc) { + if (rc == -EINVAL) + return rc; + + /* -EAGAIN, fixup of userspace mm and gmap */ vmaddr = __gmap_translate(gmap, gaddr); if (IS_ERR_VALUE(vmaddr)) return vmaddr; rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); if (rc) return rc; - continue; } - gaddr += PAGE_SIZE; - len -= PAGE_SIZE; } return 0; } @@ -935,7 +1103,7 @@ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, if (!MACHINE_HAS_ESOP && prot == PROT_READ) return -EINVAL; down_read(&gmap->mm->mmap_sem); - rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT); + rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); up_read(&gmap->mm->mmap_sem); return rc; } @@ -1474,6 +1642,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, unsigned long limit; int rc; + BUG_ON(parent->mm->context.allow_gmap_hpage_1m); BUG_ON(gmap_is_shadow(parent)); spin_lock(&parent->shadow_lock); sg = gmap_find_shadow(parent, asce, edat_level); @@ -1526,7 +1695,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, down_read(&parent->mm->mmap_sem); rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, - PROT_READ, PGSTE_VSIE_BIT); + PROT_READ, GMAP_NOTIFY_SHADOW); up_read(&parent->mm->mmap_sem); spin_lock(&parent->shadow_lock); new->initialized = true; @@ -2092,6 +2261,225 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, } EXPORT_SYMBOL_GPL(ptep_notify); +static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN; + gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); +} + +/** + * gmap_pmdp_xchg - exchange a gmap pmd with another + * @gmap: pointer to the guest address space structure + * @pmdp: pointer to the pmd entry + * @new: replacement entry + * @gaddr: the affected guest address + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, + unsigned long gaddr) +{ + gaddr &= HPAGE_MASK; + pmdp_notify_gmap(gmap, pmdp, gaddr); + pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN; + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, + IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); + *pmdp = new; +} + +static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, + int purge) +{ + pmd_t *pmdp; + struct gmap *gmap; + unsigned long gaddr; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (pmdp) { + gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); + if (purge) + __pmdp_csp(pmdp); + pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} + +/** + * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without + * flushing + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 0); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); + +/** + * gmap_pmdp_csp - csp all affected guest pmd entries + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 1); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_csp); + +/** + * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long *entry, gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (entry) { + pmdp = (pmd_t *)entry; + gaddr = __gmap_segment_gaddr(entry); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_LOCAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); + *entry = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); + +/** + * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long *entry, gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (entry) { + pmdp = (pmd_t *)entry; + gaddr = __gmap_segment_gaddr(entry); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); + *entry = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); + +/** + * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status + * @gmap: pointer to guest address space + * @pmdp: pointer to the pmd to be tested + * @gaddr: virtual address in the guest address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return false; + + /* Already protected memory, which did not change is clean */ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) + return false; + + /* Clear UC indication and reset protection */ + pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC; + gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); + return true; +} + +/** + * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment + * @gmap: pointer to guest address space + * @bitmap: dirty bitmap for this pmd + * @gaddr: virtual address in the guest address space + * @vmaddr: virtual address in the host address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], + unsigned long gaddr, unsigned long vmaddr) +{ + int i; + pmd_t *pmdp; + pte_t *ptep; + spinlock_t *ptl; + + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return; + + if (pmd_large(*pmdp)) { + if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) + bitmap_fill(bitmap, _PAGE_ENTRIES); + } else { + for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { + ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); + if (!ptep) + continue; + if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) + set_bit(i, bitmap); + spin_unlock(ptl); + } + } + gmap_pmd_op_end(gmap, pmdp); +} +EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); + static inline void thp_split_mm(struct mm_struct *mm) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2168,30 +2556,58 @@ EXPORT_SYMBOL_GPL(s390_enable_sie); * Enable storage key handling from now on and initialize the storage * keys with the default key. */ -static int __s390_enable_skey(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) +static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) { /* Clear storage key */ ptep_zap_key(walk->mm, addr, pte); return 0; } +static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, + unsigned long hmask, unsigned long next, + struct mm_walk *walk) +{ + pmd_t *pmd = (pmd_t *)pte; + unsigned long start, end; + struct page *page = pmd_page(*pmd); + + /* + * The write check makes sure we do not set a key on shared + * memory. This is needed as the walker does not differentiate + * between actual guest memory and the process executable or + * shared libraries. + */ + if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || + !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) + return 0; + + start = pmd_val(*pmd) & HPAGE_MASK; + end = start + HPAGE_SIZE - 1; + __storage_key_init_range(start, end); + set_bit(PG_arch_1, &page->flags); + return 0; +} + int s390_enable_skey(void) { - struct mm_walk walk = { .pte_entry = __s390_enable_skey }; + struct mm_walk walk = { + .hugetlb_entry = __s390_enable_skey_hugetlb, + .pte_entry = __s390_enable_skey_pte, + }; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc = 0; down_write(&mm->mmap_sem); - if (mm_use_skey(mm)) + if (mm_uses_skeys(mm)) goto out_up; - mm->context.use_skey = 1; + mm->context.uses_skeys = 1; for (vma = mm->mmap; vma; vma = vma->vm_next) { if (ksm_madvise(vma, vma->vm_start, vma->vm_end, MADV_UNMERGEABLE, &vma->vm_flags)) { - mm->context.use_skey = 0; + mm->context.uses_skeys = 0; rc = -ENOMEM; goto out_up; } diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 05c8abd864f1..2809d11c7a28 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -220,6 +220,8 @@ static inline int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. + * Note a difference with get_user_pages_fast: this always returns the + * number of pages pinned, 0 if no pages were pinned. */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index e804090f4470..b0246c705a19 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -123,6 +123,29 @@ static inline pte_t __rste_to_pte(unsigned long rste) return pte; } +static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) +{ + struct page *page; + unsigned long size, paddr; + + if (!mm_uses_skeys(mm) || + rste & _SEGMENT_ENTRY_INVALID) + return; + + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + page = pud_page(__pud(rste)); + size = PUD_SIZE; + paddr = rste & PUD_MASK; + } else { + page = pmd_page(__pmd(rste)); + size = PMD_SIZE; + paddr = rste & PMD_MASK; + } + + if (!test_and_set_bit(PG_arch_1, &page->flags)) + __storage_key_init_range(paddr, paddr + size - 1); +} + void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -137,6 +160,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE; else rste |= _SEGMENT_ENTRY_LARGE; + clear_huge_pte_skeys(mm, rste); pte_val(*ptep) = rste; } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3fa3e5323612..76d0708438e9 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -21,7 +21,7 @@ #include <linux/smp.h> #include <linux/init.h> #include <linux/pagemap.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/memory.h> #include <linux/pfn.h> #include <linux/poison.h> @@ -29,7 +29,6 @@ #include <linux/export.h> #include <linux/cma.h> #include <linux/gfp.h> -#include <linux/memblock.h> #include <asm/processor.h> #include <linux/uaccess.h> #include <asm/pgtable.h> @@ -42,6 +41,7 @@ #include <asm/ctl_reg.h> #include <asm/sclp.h> #include <asm/set_memory.h> +#include <asm/kasan.h> pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir); @@ -98,8 +98,9 @@ void __init paging_init(void) S390_lowcore.user_asce = S390_lowcore.kernel_asce; crst_table_init((unsigned long *) init_mm.pgd, pgd_type); vmem_map_init(); + kasan_copy_shadow(init_mm.pgd); - /* enable virtual mapping in kernel mode */ + /* enable virtual mapping in kernel mode */ __ctl_load(S390_lowcore.kernel_asce, 1, 1); __ctl_load(S390_lowcore.kernel_asce, 7, 7); __ctl_load(S390_lowcore.kernel_asce, 13, 13); @@ -107,6 +108,7 @@ void __init paging_init(void) psw_bits(psw).dat = 1; psw_bits(psw).as = PSW_BITS_AS_HOME; __load_psw_mask(psw.mask); + kasan_free_early_identity(); sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); @@ -136,7 +138,7 @@ void __init mem_init(void) cmma_init(); /* this will put all low memory onto the freelists */ - free_all_bootmem(); + memblock_free_all(); setup_zero_pages(); /* Setup zeroed pages. */ cmma_init_nodat(); diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c new file mode 100644 index 000000000000..acb9645b762b --- /dev/null +++ b/arch/s390/mm/kasan_init.c @@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kasan.h> +#include <linux/sched/task.h> +#include <linux/memblock.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/kasan.h> +#include <asm/mem_detect.h> +#include <asm/processor.h> +#include <asm/sclp.h> +#include <asm/facility.h> +#include <asm/sections.h> +#include <asm/setup.h> + +static unsigned long segment_pos __initdata; +static unsigned long segment_low __initdata; +static unsigned long pgalloc_pos __initdata; +static unsigned long pgalloc_low __initdata; +static unsigned long pgalloc_freeable __initdata; +static bool has_edat __initdata; +static bool has_nx __initdata; + +#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) + +static pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); + +static void __init kasan_early_panic(const char *reason) +{ + sclp_early_printk("The Linux kernel failed to boot with the KernelAddressSanitizer:\n"); + sclp_early_printk(reason); + disabled_wait(0); +} + +static void * __init kasan_early_alloc_segment(void) +{ + segment_pos -= _SEGMENT_SIZE; + + if (segment_pos < segment_low) + kasan_early_panic("out of memory during initialisation\n"); + + return (void *)segment_pos; +} + +static void * __init kasan_early_alloc_pages(unsigned int order) +{ + pgalloc_pos -= (PAGE_SIZE << order); + + if (pgalloc_pos < pgalloc_low) + kasan_early_panic("out of memory during initialisation\n"); + + return (void *)pgalloc_pos; +} + +static void * __init kasan_early_crst_alloc(unsigned long val) +{ + unsigned long *table; + + table = kasan_early_alloc_pages(CRST_ALLOC_ORDER); + if (table) + crst_table_init(table, val); + return table; +} + +static pte_t * __init kasan_early_pte_alloc(void) +{ + static void *pte_leftover; + pte_t *pte; + + BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE); + + if (!pte_leftover) { + pte_leftover = kasan_early_alloc_pages(0); + pte = pte_leftover + _PAGE_TABLE_SIZE; + } else { + pte = pte_leftover; + pte_leftover = NULL; + } + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + return pte; +} + +enum populate_mode { + POPULATE_ONE2ONE, + POPULATE_MAP, + POPULATE_ZERO_SHADOW +}; +static void __init kasan_early_vmemmap_populate(unsigned long address, + unsigned long end, + enum populate_mode mode) +{ + unsigned long pgt_prot_zero, pgt_prot, sgt_prot; + pgd_t *pg_dir; + p4d_t *p4_dir; + pud_t *pu_dir; + pmd_t *pm_dir; + pte_t *pt_dir; + + pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO); + if (!has_nx) + pgt_prot_zero &= ~_PAGE_NOEXEC; + pgt_prot = pgprot_val(PAGE_KERNEL_EXEC); + sgt_prot = pgprot_val(SEGMENT_KERNEL_EXEC); + + while (address < end) { + pg_dir = pgd_offset_k(address); + if (pgd_none(*pg_dir)) { + if (mode == POPULATE_ZERO_SHADOW && + IS_ALIGNED(address, PGDIR_SIZE) && + end - address >= PGDIR_SIZE) { + pgd_populate(&init_mm, pg_dir, kasan_zero_p4d); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + continue; + } + p4_dir = kasan_early_crst_alloc(_REGION2_ENTRY_EMPTY); + pgd_populate(&init_mm, pg_dir, p4_dir); + } + + p4_dir = p4d_offset(pg_dir, address); + if (p4d_none(*p4_dir)) { + if (mode == POPULATE_ZERO_SHADOW && + IS_ALIGNED(address, P4D_SIZE) && + end - address >= P4D_SIZE) { + p4d_populate(&init_mm, p4_dir, kasan_zero_pud); + address = (address + P4D_SIZE) & P4D_MASK; + continue; + } + pu_dir = kasan_early_crst_alloc(_REGION3_ENTRY_EMPTY); + p4d_populate(&init_mm, p4_dir, pu_dir); + } + + pu_dir = pud_offset(p4_dir, address); + if (pud_none(*pu_dir)) { + if (mode == POPULATE_ZERO_SHADOW && + IS_ALIGNED(address, PUD_SIZE) && + end - address >= PUD_SIZE) { + pud_populate(&init_mm, pu_dir, kasan_zero_pmd); + address = (address + PUD_SIZE) & PUD_MASK; + continue; + } + pm_dir = kasan_early_crst_alloc(_SEGMENT_ENTRY_EMPTY); + pud_populate(&init_mm, pu_dir, pm_dir); + } + + pm_dir = pmd_offset(pu_dir, address); + if (pmd_none(*pm_dir)) { + if (mode == POPULATE_ZERO_SHADOW && + IS_ALIGNED(address, PMD_SIZE) && + end - address >= PMD_SIZE) { + pmd_populate(&init_mm, pm_dir, kasan_zero_pte); + address = (address + PMD_SIZE) & PMD_MASK; + continue; + } + /* the first megabyte of 1:1 is mapped with 4k pages */ + if (has_edat && address && end - address >= PMD_SIZE && + mode != POPULATE_ZERO_SHADOW) { + void *page; + + if (mode == POPULATE_ONE2ONE) { + page = (void *)address; + } else { + page = kasan_early_alloc_segment(); + memset(page, 0, _SEGMENT_SIZE); + } + pmd_val(*pm_dir) = __pa(page) | sgt_prot; + address = (address + PMD_SIZE) & PMD_MASK; + continue; + } + + pt_dir = kasan_early_pte_alloc(); + pmd_populate(&init_mm, pm_dir, pt_dir); + } else if (pmd_large(*pm_dir)) { + address = (address + PMD_SIZE) & PMD_MASK; + continue; + } + + pt_dir = pte_offset_kernel(pm_dir, address); + if (pte_none(*pt_dir)) { + void *page; + + switch (mode) { + case POPULATE_ONE2ONE: + page = (void *)address; + pte_val(*pt_dir) = __pa(page) | pgt_prot; + break; + case POPULATE_MAP: + page = kasan_early_alloc_pages(0); + memset(page, 0, PAGE_SIZE); + pte_val(*pt_dir) = __pa(page) | pgt_prot; + break; + case POPULATE_ZERO_SHADOW: + page = kasan_zero_page; + pte_val(*pt_dir) = __pa(page) | pgt_prot_zero; + break; + } + } + address += PAGE_SIZE; + } +} + +static void __init kasan_set_pgd(pgd_t *pgd, unsigned long asce_type) +{ + unsigned long asce_bits; + + asce_bits = asce_type | _ASCE_TABLE_LENGTH; + S390_lowcore.kernel_asce = (__pa(pgd) & PAGE_MASK) | asce_bits; + S390_lowcore.user_asce = S390_lowcore.kernel_asce; + + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + __ctl_load(S390_lowcore.kernel_asce, 7, 7); + __ctl_load(S390_lowcore.kernel_asce, 13, 13); +} + +static void __init kasan_enable_dat(void) +{ + psw_t psw; + + psw.mask = __extract_psw(); + psw_bits(psw).dat = 1; + psw_bits(psw).as = PSW_BITS_AS_HOME; + __load_psw_mask(psw.mask); +} + +static void __init kasan_early_detect_facilities(void) +{ + __stfle(S390_lowcore.stfle_fac_list, + ARRAY_SIZE(S390_lowcore.stfle_fac_list)); + if (test_facility(8)) { + has_edat = true; + __ctl_set_bit(0, 23); + } + if (!noexec_disabled && test_facility(130)) { + has_nx = true; + __ctl_set_bit(0, 20); + } +} + +static unsigned long __init get_mem_detect_end(void) +{ + unsigned long start; + unsigned long end; + + if (mem_detect.count) { + __get_mem_detect_block(mem_detect.count - 1, &start, &end); + return end; + } + return 0; +} + +void __init kasan_early_init(void) +{ + unsigned long untracked_mem_end; + unsigned long shadow_alloc_size; + unsigned long initrd_end; + unsigned long asce_type; + unsigned long memsize; + unsigned long vmax; + unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO); + pte_t pte_z; + pmd_t pmd_z = __pmd(__pa(kasan_zero_pte) | _SEGMENT_ENTRY); + pud_t pud_z = __pud(__pa(kasan_zero_pmd) | _REGION3_ENTRY); + p4d_t p4d_z = __p4d(__pa(kasan_zero_pud) | _REGION2_ENTRY); + + kasan_early_detect_facilities(); + if (!has_nx) + pgt_prot &= ~_PAGE_NOEXEC; + pte_z = __pte(__pa(kasan_zero_page) | pgt_prot); + + memsize = get_mem_detect_end(); + if (!memsize) + kasan_early_panic("cannot detect physical memory size\n"); + /* respect mem= cmdline parameter */ + if (memory_end_set && memsize > memory_end) + memsize = memory_end; + memsize = min(memsize, KASAN_SHADOW_START); + + if (IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING)) { + /* 4 level paging */ + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); + crst_table_init((unsigned long *)early_pg_dir, + _REGION2_ENTRY_EMPTY); + untracked_mem_end = vmax = _REGION1_SIZE; + asce_type = _ASCE_TYPE_REGION2; + } else { + /* 3 level paging */ + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PUD_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PUD_SIZE)); + crst_table_init((unsigned long *)early_pg_dir, + _REGION3_ENTRY_EMPTY); + untracked_mem_end = vmax = _REGION2_SIZE; + asce_type = _ASCE_TYPE_REGION3; + } + + /* init kasan zero shadow */ + crst_table_init((unsigned long *)kasan_zero_p4d, p4d_val(p4d_z)); + crst_table_init((unsigned long *)kasan_zero_pud, pud_val(pud_z)); + crst_table_init((unsigned long *)kasan_zero_pmd, pmd_val(pmd_z)); + memset64((u64 *)kasan_zero_pte, pte_val(pte_z), PTRS_PER_PTE); + + shadow_alloc_size = memsize >> KASAN_SHADOW_SCALE_SHIFT; + pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE); + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { + initrd_end = + round_up(INITRD_START + INITRD_SIZE, _SEGMENT_SIZE); + pgalloc_low = max(pgalloc_low, initrd_end); + } + + if (pgalloc_low + shadow_alloc_size > memsize) + kasan_early_panic("out of memory during initialisation\n"); + + if (has_edat) { + segment_pos = round_down(memsize, _SEGMENT_SIZE); + segment_low = segment_pos - shadow_alloc_size; + pgalloc_pos = segment_low; + } else { + pgalloc_pos = memsize; + } + init_mm.pgd = early_pg_dir; + /* + * Current memory layout: + * +- 0 -------------+ +- shadow start -+ + * | 1:1 ram mapping | /| 1/8 ram | + * +- end of ram ----+ / +----------------+ + * | ... gap ... |/ | kasan | + * +- shadow start --+ | zero | + * | 1/8 addr space | | page | + * +- shadow end -+ | mapping | + * | ... gap ... |\ | (untracked) | + * +- modules vaddr -+ \ +----------------+ + * | 2Gb | \| unmapped | allocated per module + * +-----------------+ +- shadow end ---+ + */ + /* populate kasan shadow (for identity mapping and zero page mapping) */ + kasan_early_vmemmap_populate(__sha(0), __sha(memsize), POPULATE_MAP); + if (IS_ENABLED(CONFIG_MODULES)) + untracked_mem_end = vmax - MODULES_LEN; + kasan_early_vmemmap_populate(__sha(max_physmem_end), + __sha(untracked_mem_end), + POPULATE_ZERO_SHADOW); + /* memory allocated for identity mapping structs will be freed later */ + pgalloc_freeable = pgalloc_pos; + /* populate identity mapping */ + kasan_early_vmemmap_populate(0, memsize, POPULATE_ONE2ONE); + kasan_set_pgd(early_pg_dir, asce_type); + kasan_enable_dat(); + /* enable kasan */ + init_task.kasan_depth = 0; + memblock_reserve(pgalloc_pos, memsize - pgalloc_pos); + sclp_early_printk("KernelAddressSanitizer initialized\n"); +} + +void __init kasan_copy_shadow(pgd_t *pg_dir) +{ + /* + * At this point we are still running on early pages setup early_pg_dir, + * while swapper_pg_dir has just been initialized with identity mapping. + * Carry over shadow memory region from early_pg_dir to swapper_pg_dir. + */ + + pgd_t *pg_dir_src; + pgd_t *pg_dir_dst; + p4d_t *p4_dir_src; + p4d_t *p4_dir_dst; + pud_t *pu_dir_src; + pud_t *pu_dir_dst; + + pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START); + pg_dir_dst = pgd_offset_raw(pg_dir, KASAN_SHADOW_START); + p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START); + p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START); + if (!p4d_folded(*p4_dir_src)) { + /* 4 level paging */ + memcpy(p4_dir_dst, p4_dir_src, + (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t)); + return; + } + /* 3 level paging */ + pu_dir_src = pud_offset(p4_dir_src, KASAN_SHADOW_START); + pu_dir_dst = pud_offset(p4_dir_dst, KASAN_SHADOW_START); + memcpy(pu_dir_dst, pu_dir_src, + (KASAN_SHADOW_SIZE >> PUD_SHIFT) * sizeof(pud_t)); +} + +void __init kasan_free_early_identity(void) +{ + memblock_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); +} diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 7be06475809b..97b3ee53852b 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -89,10 +89,8 @@ static int __memcpy_real(void *dest, void *src, size_t count) return rc; } -/* - * Copy memory in real mode (kernel to kernel) - */ -int memcpy_real(void *dest, void *src, size_t count) +static unsigned long _memcpy_real(unsigned long dest, unsigned long src, + unsigned long count) { int irqs_disabled, rc; unsigned long flags; @@ -103,7 +101,7 @@ int memcpy_real(void *dest, void *src, size_t count) irqs_disabled = arch_irqs_disabled_flags(flags); if (!irqs_disabled) trace_hardirqs_off(); - rc = __memcpy_real(dest, src, count); + rc = __memcpy_real((void *) dest, (void *) src, (size_t) count); if (!irqs_disabled) trace_hardirqs_on(); __arch_local_irq_ssm(flags); @@ -111,6 +109,23 @@ int memcpy_real(void *dest, void *src, size_t count) } /* + * Copy memory in real mode (kernel to kernel) + */ +int memcpy_real(void *dest, void *src, size_t count) +{ + if (S390_lowcore.nodat_stack != 0) + return CALL_ON_STACK(_memcpy_real, S390_lowcore.nodat_stack, + 3, dest, src, count); + /* + * This is a really early memcpy_real call, the stacks are + * not set up yet. Just call _memcpy_real on the early boot + * stack + */ + return _memcpy_real((unsigned long) dest,(unsigned long) src, + (unsigned long) count); +} + +/* * Copy memory in absolute mode (kernel to kernel) */ void memcpy_absolute(void *dest, void *src, size_t count) diff --git a/arch/s390/mm/mem_detect.c b/arch/s390/mm/mem_detect.c deleted file mode 100644 index 21f6c82c8296..000000000000 --- a/arch/s390/mm/mem_detect.c +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright IBM Corp. 2008, 2009 - * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> - */ - -#include <linux/kernel.h> -#include <linux/memblock.h> -#include <linux/init.h> -#include <linux/debugfs.h> -#include <linux/seq_file.h> -#include <asm/ipl.h> -#include <asm/sclp.h> -#include <asm/setup.h> - -#define CHUNK_READ_WRITE 0 -#define CHUNK_READ_ONLY 1 - -static inline void memblock_physmem_add(phys_addr_t start, phys_addr_t size) -{ - memblock_dbg("memblock_physmem_add: [%#016llx-%#016llx]\n", - start, start + size - 1); - memblock_add_range(&memblock.memory, start, size, 0, 0); - memblock_add_range(&memblock.physmem, start, size, 0, 0); -} - -void __init detect_memory_memblock(void) -{ - unsigned long memsize, rnmax, rzm, addr, size; - int type; - - rzm = sclp.rzm; - rnmax = sclp.rnmax; - memsize = rzm * rnmax; - if (!rzm) - rzm = 1UL << 17; - max_physmem_end = memsize; - addr = 0; - /* keep memblock lists close to the kernel */ - memblock_set_bottom_up(true); - do { - size = 0; - /* assume lowcore is writable */ - type = addr ? tprot(addr) : CHUNK_READ_WRITE; - do { - size += rzm; - if (max_physmem_end && addr + size >= max_physmem_end) - break; - } while (type == tprot(addr + size)); - if (type == CHUNK_READ_WRITE || type == CHUNK_READ_ONLY) { - if (max_physmem_end && (addr + size > max_physmem_end)) - size = max_physmem_end - addr; - memblock_physmem_add(addr, size); - } - addr += size; - } while (addr < max_physmem_end); - memblock_set_bottom_up(false); - if (!max_physmem_end) - max_physmem_end = memblock_end_of_DRAM(); - memblock_dump_all(); -} diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 831bdcf407bb..0a7627cdb34e 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -37,11 +37,11 @@ static unsigned long stack_maxrandom_size(void) #define MIN_GAP (32*1024*1024) #define MAX_GAP (STACK_TOP/6*5) -static inline int mmap_is_legacy(void) +static inline int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + if (rlim_stack->rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; } @@ -56,9 +56,10 @@ static unsigned long mmap_base_legacy(unsigned long rnd) return TASK_UNMAPPED_BASE + rnd; } -static inline unsigned long mmap_base(unsigned long rnd) +static inline unsigned long mmap_base(unsigned long rnd, + struct rlimit *rlim_stack) { - unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap = rlim_stack->rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -184,7 +185,7 @@ check_asce_limit: * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -195,11 +196,11 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: */ - if (mmap_is_legacy()) { + if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = mmap_base_legacy(random_factor); mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 382153ff17e3..dc3cede7f2ec 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -271,7 +271,7 @@ void arch_set_page_states(int make_stable) list_for_each(l, &zone->free_area[order].free_list[t]) { page = list_entry(l, struct page, lru); if (make_stable) - set_page_stable_dat(page, 0); + set_page_stable_dat(page, order); else set_page_unused(page, order); } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index c44171588d08..f8c6faab41f4 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -14,7 +14,7 @@ static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) { - asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],9,0" + asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0" : [addr] "+a" (addr) : [skey] "d" (skey)); return addr; } @@ -23,8 +23,6 @@ void __storage_key_init_range(unsigned long start, unsigned long end) { unsigned long boundary, size; - if (!PAGE_DEFAULT_KEY) - return; while (start < end) { if (MACHINE_HAS_EDAT1) { /* set storage keys for a 1MB frame */ @@ -37,7 +35,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end) continue; } } - page_set_storage_key(start, PAGE_DEFAULT_KEY, 0); + page_set_storage_key(start, PAGE_DEFAULT_KEY, 1); start += PAGE_SIZE; } } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index cb364153c43c..814f26520aa2 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -6,8 +6,9 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/mm.h> #include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/mm.h> #include <asm/mmu_context.h> #include <asm/pgalloc.h> #include <asm/gmap.h> @@ -27,7 +28,7 @@ static struct ctl_table page_table_sysctl[] = { .data = &page_table_allocate_pgste, .maxlen = sizeof(int), .mode = S_IRUGO | S_IWUSR, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &page_table_allocate_pgste_min, .extra2 = &page_table_allocate_pgste_max, }, @@ -100,6 +101,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) mm->context.asce_limit = _REGION1_SIZE; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + mm_inc_nr_puds(mm); } else { crst_table_init(table, _REGION1_ENTRY_EMPTY); pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd); @@ -189,14 +191,15 @@ unsigned long *page_table_alloc(struct mm_struct *mm) if (!list_empty(&mm->context.pgtable_list)) { page = list_first_entry(&mm->context.pgtable_list, struct page, lru); - mask = atomic_read(&page->_mapcount); + mask = atomic_read(&page->_refcount) >> 24; mask = (mask | (mask >> 4)) & 3; if (mask != 3) { table = (unsigned long *) page_to_phys(page); bit = mask & 1; /* =1 -> second 2K */ if (bit) table += PTRS_PER_PTE; - atomic_xor_bits(&page->_mapcount, 1U << bit); + atomic_xor_bits(&page->_refcount, + 1U << (bit + 24)); list_del(&page->lru); } } @@ -217,12 +220,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = (unsigned long *) page_to_phys(page); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ - atomic_set(&page->_mapcount, 3); + atomic_xor_bits(&page->_refcount, 3 << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } else { /* Return the first 2K fragment of the page */ - atomic_set(&page->_mapcount, 1); + atomic_xor_bits(&page->_refcount, 1 << 24); memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); spin_lock_bh(&mm->context.lock); list_add(&page->lru, &mm->context.pgtable_list); @@ -241,7 +244,8 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) /* Free 2K page table fragment of a 4K page */ bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_mapcount, 1U << bit); + mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24)); + mask >>= 24; if (mask & 3) list_add(&page->lru, &mm->context.pgtable_list); else @@ -249,10 +253,11 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) spin_unlock_bh(&mm->context.lock); if (mask != 0) return; + } else { + atomic_xor_bits(&page->_refcount, 3U << 24); } pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); __free_page(page); } @@ -273,7 +278,8 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, } bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); + mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask >>= 24; if (mask & 3) list_add_tail(&page->lru, &mm->context.pgtable_list); else @@ -295,12 +301,15 @@ static void __tlb_remove_table(void *_table) break; case 1: /* lower 2K of a 4K page table */ case 2: /* higher 2K of a 4K page table */ - if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0) + mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); + mask >>= 24; + if (mask != 0) break; /* fallthrough */ case 3: /* 4K page table with pgstes */ + if (mask & 3) + atomic_xor_bits(&page->_refcount, 3 << 24); pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); __free_page(page); break; } @@ -366,3 +375,293 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) if ((*batch)->nr == MAX_TABLE_BATCH) tlb_flush_mmu(tlb); } + +/* + * Base infrastructure required to generate basic asces, region, segment, + * and page tables that do not make use of enhanced features like EDAT1. + */ + +static struct kmem_cache *base_pgt_cache; + +static unsigned long base_pgt_alloc(void) +{ + u64 *table; + + table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL); + if (table) + memset64(table, _PAGE_INVALID, PTRS_PER_PTE); + return (unsigned long) table; +} + +static void base_pgt_free(unsigned long table) +{ + kmem_cache_free(base_pgt_cache, (void *) table); +} + +static unsigned long base_crst_alloc(unsigned long val) +{ + unsigned long table; + + table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + if (table) + crst_table_init((unsigned long *)table, val); + return table; +} + +static void base_crst_free(unsigned long table) +{ + free_pages(table, CRST_ALLOC_ORDER); +} + +#define BASE_ADDR_END_FUNC(NAME, SIZE) \ +static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \ + unsigned long end) \ +{ \ + unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1); \ + \ + return (next - 1) < (end - 1) ? next : end; \ +} + +BASE_ADDR_END_FUNC(page, _PAGE_SIZE) +BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE) +BASE_ADDR_END_FUNC(region3, _REGION3_SIZE) +BASE_ADDR_END_FUNC(region2, _REGION2_SIZE) +BASE_ADDR_END_FUNC(region1, _REGION1_SIZE) + +static inline unsigned long base_lra(unsigned long address) +{ + unsigned long real; + + asm volatile( + " lra %0,0(%1)\n" + : "=d" (real) : "a" (address) : "cc"); + return real; +} + +static int base_page_walk(unsigned long origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *pte, next; + + if (!alloc) + return 0; + pte = (unsigned long *) origin; + pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT; + do { + next = base_page_addr_end(addr, end); + *pte = base_lra(addr); + } while (pte++, addr = next, addr < end); + return 0; +} + +static int base_segment_walk(unsigned long origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *ste, next, table; + int rc; + + ste = (unsigned long *) origin; + ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + do { + next = base_segment_addr_end(addr, end); + if (*ste & _SEGMENT_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_pgt_alloc(); + if (!table) + return -ENOMEM; + *ste = table | _SEGMENT_ENTRY; + } + table = *ste & _SEGMENT_ENTRY_ORIGIN; + rc = base_page_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_pgt_free(table); + cond_resched(); + } while (ste++, addr = next, addr < end); + return 0; +} + +static int base_region3_walk(unsigned long origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rtte, next, table; + int rc; + + rtte = (unsigned long *) origin; + rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT; + do { + next = base_region3_addr_end(addr, end); + if (*rtte & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rtte = table | _REGION3_ENTRY; + } + table = *rtte & _REGION_ENTRY_ORIGIN; + rc = base_segment_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rtte++, addr = next, addr < end); + return 0; +} + +static int base_region2_walk(unsigned long origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rste, next, table; + int rc; + + rste = (unsigned long *) origin; + rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT; + do { + next = base_region2_addr_end(addr, end); + if (*rste & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rste = table | _REGION2_ENTRY; + } + table = *rste & _REGION_ENTRY_ORIGIN; + rc = base_region3_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rste++, addr = next, addr < end); + return 0; +} + +static int base_region1_walk(unsigned long origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rfte, next, table; + int rc; + + rfte = (unsigned long *) origin; + rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT; + do { + next = base_region1_addr_end(addr, end); + if (*rfte & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rfte = table | _REGION1_ENTRY; + } + table = *rfte & _REGION_ENTRY_ORIGIN; + rc = base_region2_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rfte++, addr = next, addr < end); + return 0; +} + +/** + * base_asce_free - free asce and tables returned from base_asce_alloc() + * @asce: asce to be freed + * + * Frees all region, segment, and page tables that were allocated with a + * corresponding base_asce_alloc() call. + */ +void base_asce_free(unsigned long asce) +{ + unsigned long table = asce & _ASCE_ORIGIN; + + if (!asce) + return; + switch (asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_SEGMENT: + base_segment_walk(table, 0, _REGION3_SIZE, 0); + break; + case _ASCE_TYPE_REGION3: + base_region3_walk(table, 0, _REGION2_SIZE, 0); + break; + case _ASCE_TYPE_REGION2: + base_region2_walk(table, 0, _REGION1_SIZE, 0); + break; + case _ASCE_TYPE_REGION1: + base_region1_walk(table, 0, -_PAGE_SIZE, 0); + break; + } + base_crst_free(table); +} + +static int base_pgt_cache_init(void) +{ + static DEFINE_MUTEX(base_pgt_cache_mutex); + unsigned long sz = _PAGE_TABLE_SIZE; + + if (base_pgt_cache) + return 0; + mutex_lock(&base_pgt_cache_mutex); + if (!base_pgt_cache) + base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL); + mutex_unlock(&base_pgt_cache_mutex); + return base_pgt_cache ? 0 : -ENOMEM; +} + +/** + * base_asce_alloc - create kernel mapping without enhanced DAT features + * @addr: virtual start address of kernel mapping + * @num_pages: number of consecutive pages + * + * Generate an asce, including all required region, segment and page tables, + * that can be used to access the virtual kernel mapping. The difference is + * that the returned asce does not make use of any enhanced DAT features like + * e.g. large pages. This is required for some I/O functions that pass an + * asce, like e.g. some service call requests. + * + * Note: the returned asce may NEVER be attached to any cpu. It may only be + * used for I/O requests. tlb entries that might result because the + * asce was attached to a cpu won't be cleared. + */ +unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) +{ + unsigned long asce, table, end; + int rc; + + if (base_pgt_cache_init()) + return 0; + end = addr + num_pages * PAGE_SIZE; + if (end <= _REGION3_SIZE) { + table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_segment_walk(table, addr, end, 1); + asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; + } else if (end <= _REGION2_SIZE) { + table = base_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region3_walk(table, addr, end, 1); + asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } else if (end <= _REGION1_SIZE) { + table = base_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region2_walk(table, addr, end, 1); + asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + table = base_crst_alloc(_REGION1_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region1_walk(table, addr, end, 1); + asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; + } + if (rc) { + base_asce_free(asce); + asce = 0; + } + return asce; +} diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4f2b65d01a70..f2cc7da473e4 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -158,7 +158,7 @@ static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, #ifdef CONFIG_PGSTE unsigned long address, bits, skey; - if (!mm_use_skey(mm) || pte_val(pte) & _PAGE_INVALID) + if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID) return pgste; address = pte_val(pte) & PAGE_MASK; skey = (unsigned long) page_get_storage_key(address); @@ -180,7 +180,7 @@ static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry, unsigned long address; unsigned long nkey; - if (!mm_use_skey(mm) || pte_val(entry) & _PAGE_INVALID) + if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID) return; VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID)); address = pte_val(entry) & PAGE_MASK; @@ -347,18 +347,27 @@ static inline void pmdp_idte_local(struct mm_struct *mm, mm->context.asce, IDTE_LOCAL); else __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_local(mm, addr); } static inline void pmdp_idte_global(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (MACHINE_HAS_TLB_GUEST) + if (MACHINE_HAS_TLB_GUEST) { __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_global(mm, addr); + } else if (MACHINE_HAS_IDTE) { __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); - else + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_global(mm, addr); + } else { __pmdp_csp(pmdp); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_csp(mm, addr); + } } static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, @@ -392,6 +401,8 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, cpumask_of(smp_processor_id()))) { pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; mm->context.flush_mm = 1; + if (mm_has_pgste(mm)) + gmap_pmdp_invalidate(mm, addr); } else { pmdp_idte_global(mm, addr, pmdp); } @@ -399,6 +410,24 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, return old; } +static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return NULL; + pmd = pmd_alloc(mm, pud, addr); + return pmd; +} + pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t new) { @@ -693,40 +722,14 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) /* * Test and reset if a guest page is dirty */ -bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) +bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { - spinlock_t *ptl; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; pgste_t pgste; - pte_t *ptep; pte_t pte; bool dirty; int nodat; - pgd = pgd_offset(mm, addr); - p4d = p4d_alloc(mm, pgd, addr); - if (!p4d) - return false; - pud = pud_alloc(mm, p4d, addr); - if (!pud) - return false; - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - return false; - /* We can't run guests backed by huge pages, but userspace can - * still set them up and then try to migrate them without any - * migration support. - */ - if (pmd_large(*pmd)) - return true; - - ptep = pte_alloc_map_lock(mm, pmd, addr, &ptl); - if (unlikely(!ptep)) - return false; - pgste = pgste_get_lock(ptep); dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); pgste_val(pgste) &= ~PGSTE_UC_BIT; @@ -742,21 +745,43 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) *ptep = pte; } pgste_set_unlock(ptep, pgste); - - spin_unlock(ptl); return dirty; } -EXPORT_SYMBOL_GPL(test_and_clear_guest_dirty); +EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq) { - unsigned long keyul; + unsigned long keyul, paddr; spinlock_t *ptl; pgste_t old, new; + pmd_t *pmdp; pte_t *ptep; - ptep = get_locked_pte(mm, addr, &ptl); + pmdp = pmd_alloc_map(mm, addr); + if (unlikely(!pmdp)) + return -EFAULT; + + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return -EFAULT; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + /* + * Huge pmds need quiescing operations, they are + * always mapped. + */ + page_set_storage_key(paddr, key, 1); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + + ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -767,14 +792,14 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; if (!(pte_val(*ptep) & _PAGE_INVALID)) { - unsigned long address, bits, skey; + unsigned long bits, skey; - address = pte_val(*ptep) & PAGE_MASK; - skey = (unsigned long) page_get_storage_key(address); + paddr = pte_val(*ptep) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(paddr); bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); /* Set storage key ACC and FP */ - page_set_storage_key(address, skey, !nq); + page_set_storage_key(paddr, skey, !nq); /* Merge host changed & referenced into pgste */ pgste_val(new) |= bits << 52; } @@ -830,11 +855,32 @@ EXPORT_SYMBOL(cond_set_guest_storage_key); int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) { spinlock_t *ptl; + unsigned long paddr; pgste_t old, new; + pmd_t *pmdp; pte_t *ptep; int cc = 0; - ptep = get_locked_pte(mm, addr, &ptl); + pmdp = pmd_alloc_map(mm, addr); + if (unlikely(!pmdp)) + return -EFAULT; + + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return -EFAULT; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + cc = page_reset_referenced(paddr); + spin_unlock(ptl); + return cc; + } + spin_unlock(ptl); + + ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -843,7 +889,8 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) pgste_val(new) &= ~PGSTE_GR_BIT; if (!(pte_val(*ptep) & _PAGE_INVALID)) { - cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK); + paddr = pte_val(*ptep) & PAGE_MASK; + cc = page_reset_referenced(paddr); /* Merge real referenced bit into host-set */ pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; } @@ -862,18 +909,42 @@ EXPORT_SYMBOL(reset_guest_reference_bit); int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char *key) { + unsigned long paddr; spinlock_t *ptl; pgste_t pgste; + pmd_t *pmdp; pte_t *ptep; - ptep = get_locked_pte(mm, addr, &ptl); + pmdp = pmd_alloc_map(mm, addr); + if (unlikely(!pmdp)) + return -EFAULT; + + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + /* Not yet mapped memory has a zero key */ + spin_unlock(ptl); + *key = 0; + return 0; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + *key = page_get_storage_key(paddr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + + ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); if (unlikely(!ptep)) return -EFAULT; pgste = pgste_get_lock(ptep); *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + paddr = pte_val(*ptep) & PAGE_MASK; if (!(pte_val(*ptep) & _PAGE_INVALID)) - *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK); + *key = page_get_storage_key(paddr); /* Reflect guest's logical view, not physical */ *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; pgste_set_unlock(ptep, pgste); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index db55561c5981..0472e27febdf 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -4,14 +4,13 @@ * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/pfn.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/list.h> #include <linux/hugetlb.h> #include <linux/slab.h> -#include <linux/memblock.h> #include <asm/cacheflush.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> @@ -36,7 +35,7 @@ static void __ref *vmem_alloc_pages(unsigned int order) if (slab_is_available()) return (void *)__get_free_pages(GFP_KERNEL, order); - return (void *) memblock_alloc(size, size); + return (void *) memblock_phys_alloc(size, size); } void *vmem_crst_alloc(unsigned long val) @@ -57,7 +56,7 @@ pte_t __ref *vmem_pte_alloc(void) if (slab_is_available()) pte = (pte_t *) page_table_alloc(&init_mm); else - pte = (pte_t *) memblock_alloc(size, size); + pte = (pte_t *) memblock_phys_alloc(size, size); if (!pte) return NULL; memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); |

