From ae531c26c5c2a28ca1b35a75b39b3b256850f2c8 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 24 Apr 2008 23:40:47 +0200 Subject: x86: introduce /dev/mem restrictions with a config option This patch introduces a restriction on /dev/mem: Only non-memory can be read or written unless the newly introduced config option is set. The X server needs access to /dev/mem for the PCI space, but it doesn't need access to memory; both the file permissions and SELinux permissions of /dev/mem just make X effectively super-super powerful. With the exception of the BIOS area, there's just no valid app that uses /dev/mem on actual memory. Other popular users of /dev/mem are rootkits and the like. (note: mmap access of memory via /dev/mem was already not allowed since a really long time) People who want to use /dev/mem for kernel debugging can enable the config option. The restrictions of this patch have been in the Fedora and RHEL kernels for at least 4 years without any problems. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_32.c | 19 +++++++++++++++++++ arch/x86/mm/init_64.c | 20 ++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9ec62da85fd7..39852d539018 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -227,6 +227,25 @@ static inline int page_kills_ppro(unsigned long pagenr) return 0; } +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; pgprot_t kmap_prot; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ff7906a9a4d..49c274ee2fba 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -664,6 +664,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + + static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; -- cgit v1.2.1 From e045fb2a988a9a1964059b0d33dbaf18d12f925f Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:15 -0700 Subject: x86: PAT avoid aliasing in /dev/mem read/write Add xlate and unxlate around /dev/mem read/write. This sets up the mapping that can be used for /dev/mem read and write without aliasing worries. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 3a4baf95e24d..caac7d5699a7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -336,6 +336,35 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +void *xlate_dev_mem_ptr(unsigned long phys) +{ + void *addr; + unsigned long start = phys & PAGE_MASK; + + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ + if (page_is_ram(start >> PAGE_SHIFT)) + return __va(phys); + + addr = (void *)ioremap(start, PAGE_SIZE); + if (addr) + addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); + + return addr; +} + +void unxlate_dev_mem_ptr(unsigned long phys, void *addr) +{ + if (page_is_ram(phys >> PAGE_SHIFT)) + return; + + iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return; +} + #ifdef CONFIG_X86_32 int __initdata early_ioremap_debug; -- cgit v1.2.1 From f0970c13b6a5b01189aeb196ebb573cf87d95839 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:20 -0700 Subject: x86: PAT phys_mem_access_prot_allowed for dev/mem mmap Introduce phys_mem_access_prot_allowed(), which checks whether the mapping is possible, without any conflicts and returns success or failure based on that. phys_mem_access_prot() by itself does not allow failure case. This ability to return error is needed for PAT where we may have aliasing conflicts. x86 setup __HAVE_PHYS_MEM_ACCESS_PROT and move x86 specific code out of /dev/mem into arch specific area. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 72c0f6097402..64cc0c18233e 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -419,3 +419,42 @@ int free_memtype(u64 start, u64 end) return err; } + +/* /dev/mem interface. Use the previous mapping */ +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) +{ + return vma_prot; +} + +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot) +{ + + if (file->f_flags & O_SYNC) { + *vma_prot = pgprot_noncached(*vma_prot); + return 1; + } + +#ifdef CONFIG_X86_32 + /* + * On the PPro and successors, the MTRRs are used to set + * memory types for physical addresses outside main memory, + * so blindly setting UC or PWT on those pages is wrong. + * For Pentiums and earlier, the surround logic should disable + * caching for the high addresses through the KEN pin, but + * we maintain the tradition of paranoia in this code. + */ + if (!pat_wc_enabled && + ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && + (pfn << PAGE_SHIFT) >= __pa(high_memory)) { + *vma_prot = pgprot_noncached(*vma_prot); + return 1; + } +#endif + + return 1; +} -- cgit v1.2.1 From e7f260a276f2c9184fe753732d834b1f6fbe9f17 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:21 -0700 Subject: x86: PAT use reserve free memtype in mmap of /dev/mem Use reserve_memtype and free_memtype wrappers for /dev/mem mmaps. The memtype is slightly complicated here, given that we have to support existing X mappings. We fallback on UC_MINUS for that. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 118 insertions(+), 10 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 64cc0c18233e..1489aafbfa71 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -21,6 +22,7 @@ #include #include #include +#include int pat_wc_enabled = 1; @@ -190,6 +192,21 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, return 0; } +/* + * req_type typically has one of the: + * - _PAGE_CACHE_WB + * - _PAGE_CACHE_WC + * - _PAGE_CACHE_UC_MINUS + * - _PAGE_CACHE_UC + * + * req_type will have a special case value '-1', when requester want to inherit + * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. + * + * If ret_type is NULL, function will return an error if it cannot reserve the + * region with req_type. If ret_type is non-null, function will return + * available type in ret_type in case of no error. In case of any error + * it will return a negative return value. + */ int reserve_memtype(u64 start, u64 end, unsigned long req_type, unsigned long *ret_type) { @@ -200,9 +217,14 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, /* Only track when pat_wc_enabled */ if (!pat_wc_enabled) { - if (ret_type) - *ret_type = req_type; - + /* This is identical to page table setting without PAT */ + if (ret_type) { + if (req_type == -1) { + *ret_type = _PAGE_CACHE_WB; + } else { + *ret_type = req_type; + } + } return 0; } @@ -214,8 +236,29 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return 0; } - req_type &= _PAGE_CACHE_MASK; - err = pat_x_mtrr_type(start, end, req_type, &actual_type); + if (req_type == -1) { + /* + * Special case where caller wants to inherit from mtrr or + * existing pat mapping, defaulting to UC_MINUS in case of + * no match. + */ + u8 mtrr_type = mtrr_type_lookup(start, end); + if (mtrr_type == 0xFE) { /* MTRR match error */ + err = -1; + } + + if (mtrr_type == MTRR_TYPE_WRBACK) { + req_type = _PAGE_CACHE_WB; + actual_type = _PAGE_CACHE_WB; + } else { + req_type = _PAGE_CACHE_UC_MINUS; + actual_type = _PAGE_CACHE_UC_MINUS; + } + } else { + req_type &= _PAGE_CACHE_MASK; + err = pat_x_mtrr_type(start, end, req_type, &actual_type); + } + if (err) { if (ret_type) *ret_type = actual_type; @@ -420,7 +463,14 @@ int free_memtype(u64 start, u64 end) } -/* /dev/mem interface. Use the previous mapping */ +/* + * /dev/mem mmap interface. The memtype used for mapping varies: + * - Use UC for mappings with O_SYNC flag + * - Without O_SYNC flag, if there is any conflict in reserve_memtype, + * inherit the memtype from existing mapping. + * - Else use UC_MINUS memtype (for backward compatibility with existing + * X drivers. + */ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -430,10 +480,13 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { + u64 offset = ((u64) pfn) << PAGE_SHIFT; + unsigned long flags = _PAGE_CACHE_UC_MINUS; + unsigned long ret_flags; + int retval; if (file->f_flags & O_SYNC) { - *vma_prot = pgprot_noncached(*vma_prot); - return 1; + flags = _PAGE_CACHE_UC; } #ifdef CONFIG_X86_32 @@ -451,10 +504,65 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && (pfn << PAGE_SHIFT) >= __pa(high_memory)) { - *vma_prot = pgprot_noncached(*vma_prot); - return 1; + flags = _PAGE_CACHE_UC; } #endif + /* + * With O_SYNC, we can only take UC mapping. Fail if we cannot. + * Without O_SYNC, we want to get + * - WB for WB-able memory and no other conflicting mappings + * - UC_MINUS for non-WB-able memory with no other conflicting mappings + * - Inherit from confliting mappings otherwise + */ + if (flags != _PAGE_CACHE_UC_MINUS) { + retval = reserve_memtype(offset, offset + size, flags, NULL); + } else { + retval = reserve_memtype(offset, offset + size, -1, &ret_flags); + } + + if (retval < 0) + return 0; + + flags = ret_flags; + + if (pfn <= max_pfn_mapped && + ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { + free_memtype(offset, offset + size); + printk(KERN_DEBUG + "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + offset, offset + size); + return 0; + } + + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | + flags); return 1; } + +void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) +{ + u64 addr = (u64)pfn << PAGE_SHIFT; + unsigned long flags; + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); + + reserve_memtype(addr, addr + size, want_flags, &flags); + if (flags != want_flags) { + printk(KERN_DEBUG + "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + addr, addr + size, + cattr_name(flags)); + } +} + +void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) +{ + u64 addr = (u64)pfn << PAGE_SHIFT; + + free_memtype(addr, addr + size); +} + -- cgit v1.2.1 From 28eb559b5b0b9b51b9165a9b8faa75b0bb91ca8d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 3 Apr 2008 10:14:33 +0200 Subject: pat: cleanups Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1489aafbfa71..ef8b64b89c7d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -284,7 +284,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, struct memtype *saved_ptr; if (parse->start >= end) { - printk("New Entry\n"); + pr_debug("New Entry\n"); list_add(&new_entry->nd, parse->nd.prev); new_entry = NULL; break; @@ -386,7 +386,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, break; } - printk("Overlap at 0x%Lx-0x%Lx\n", + printk(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); /* No conflict. Go ahead and add this new entry */ list_add(&new_entry->nd, &saved_ptr->nd); @@ -396,7 +396,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (err) { - printk( + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", start, end, cattr_name(new_entry->type), cattr_name(req_type)); @@ -408,16 +408,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_entry) { /* No conflict. Not yet added to the list. Add to the tail */ list_add_tail(&new_entry->nd, &memtype_list); - printk("New Entry\n"); - } + pr_debug("New Entry\n"); + } if (ret_type) { - printk( + pr_debug( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", start, end, cattr_name(actual_type), cattr_name(req_type), cattr_name(*ret_type)); } else { - printk( + pr_debug( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", start, end, cattr_name(actual_type), cattr_name(req_type)); @@ -454,11 +454,11 @@ int free_memtype(u64 start, u64 end) spin_unlock(&memtype_lock); if (err) { - printk(KERN_DEBUG "%s:%d freeing invalid memtype %Lx-%Lx\n", + printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", current->comm, current->pid, start, end); } - printk( "free_memtype request 0x%Lx-0x%Lx\n", start, end); + pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); return err; } @@ -529,7 +529,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (pfn <= max_pfn_mapped && ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { free_memtype(offset, offset + size); - printk(KERN_DEBUG + printk(KERN_INFO "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", current->comm, current->pid, cattr_name(flags), @@ -550,7 +550,7 @@ void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) reserve_memtype(addr, addr + size, want_flags, &flags); if (flags != want_flags) { - printk(KERN_DEBUG + printk(KERN_INFO "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", current->comm, current->pid, cattr_name(want_flags), -- cgit v1.2.1