summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig4
-rw-r--r--arch/x86/boot/boot.h18
-rw-r--r--arch/x86/boot/cmdline.c12
-rw-r--r--arch/x86/boot/compressed/cmdline.c12
-rw-r--r--arch/x86/boot/compressed/head_64.S48
-rw-r--r--arch/x86/boot/header.S10
-rw-r--r--arch/x86/include/asm/init.h28
-rw-r--r--arch/x86/include/asm/kexec.h6
-rw-r--r--arch/x86/include/asm/mmzone_32.h6
-rw-r--r--arch/x86/include/asm/numa.h2
-rw-r--r--arch/x86/include/asm/numa_64.h6
-rw-r--r--arch/x86/include/asm/page.h7
-rw-r--r--arch/x86/include/asm/page_32.h1
-rw-r--r--arch/x86/include/asm/page_64.h36
-rw-r--r--arch/x86/include/asm/page_64_types.h22
-rw-r--r--arch/x86/include/asm/page_types.h2
-rw-r--r--arch/x86/include/asm/pgtable.h17
-rw-r--r--arch/x86/include/asm/pgtable_64.h5
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h4
-rw-r--r--arch/x86/include/asm/pgtable_types.h4
-rw-r--r--arch/x86/include/asm/processor.h1
-rw-r--r--arch/x86/include/asm/realmode.h3
-rw-r--r--arch/x86/include/asm/uaccess.h55
-rw-r--r--arch/x86/include/asm/x86_init.h12
-rw-r--r--arch/x86/kernel/acpi/boot.c1
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/amd_gart_64.c5
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c1
-rw-r--r--arch/x86/kernel/cpu/amd.c9
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/kernel/e820.c16
-rw-r--r--arch/x86/kernel/ftrace.c4
-rw-r--r--arch/x86/kernel/head32.c20
-rw-r--r--arch/x86/kernel/head64.c131
-rw-r--r--arch/x86/kernel/head_64.S210
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c1
-rw-r--r--arch/x86/kernel/kvm.c11
-rw-r--r--arch/x86/kernel/kvmclock.c4
-rw-r--r--arch/x86/kernel/machine_kexec_64.c171
-rw-r--r--arch/x86/kernel/setup.c260
-rw-r--r--arch/x86/kernel/traps.c9
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c3
-rw-r--r--arch/x86/kernel/x86_init.c4
-rw-r--r--arch/x86/lguest/boot.c3
-rw-r--r--arch/x86/lib/getuser.S43
-rw-r--r--arch/x86/mm/init.c459
-rw-r--r--arch/x86/mm/init_32.c106
-rw-r--r--arch/x86/mm/init_64.c255
-rw-r--r--arch/x86/mm/mm_internal.h19
-rw-r--r--arch/x86/mm/numa.c32
-rw-r--r--arch/x86/mm/numa_32.c161
-rw-r--r--arch/x86/mm/numa_64.c13
-rw-r--r--arch/x86/mm/numa_internal.h6
-rw-r--r--arch/x86/mm/pageattr.c66
-rw-r--r--arch/x86/mm/pat.c4
-rw-r--r--arch/x86/mm/pgtable.c7
-rw-r--r--arch/x86/mm/physaddr.c60
-rw-r--r--arch/x86/platform/efi/efi.c11
-rw-r--r--arch/x86/power/hibernate_32.c2
-rw-r--r--arch/x86/power/hibernate_64.c66
-rw-r--r--arch/x86/realmode/init.c49
-rw-r--r--arch/x86/xen/mmu.c28
62 files changed, 1384 insertions, 1192 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b44c0b50e569..ff0e5f3c844e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1277,10 +1277,6 @@ config NODES_SHIFT
Specify the maximum number of NUMA Nodes available on the target
system. Increases memory reserved to accommodate various tables.
-config HAVE_ARCH_ALLOC_REMAP
- def_bool y
- depends on X86_32 && NUMA
-
config ARCH_HAVE_MEMORY_PRESENT
def_bool y
depends on X86_32 && DISCONTIGMEM
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 18997e5a1053..5b7531966b84 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -285,16 +285,26 @@ struct biosregs {
void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
/* cmdline.c */
-int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize);
-int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option);
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize);
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option);
static inline int cmdline_find_option(const char *option, char *buffer, int bufsize)
{
- return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize);
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+ if (cmd_line_ptr >= 0x100000)
+ return -1; /* inaccessible */
+
+ return __cmdline_find_option(cmd_line_ptr, option, buffer, bufsize);
}
static inline int cmdline_find_option_bool(const char *option)
{
- return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option);
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+ if (cmd_line_ptr >= 0x100000)
+ return -1; /* inaccessible */
+
+ return __cmdline_find_option_bool(cmd_line_ptr, option);
}
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index 6b3b6f708c04..625d21b0cd3f 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -27,7 +27,7 @@ static inline int myisspace(u8 c)
* Returns the length of the argument (regardless of if it was
* truncated to fit in the buffer), or -1 on not found.
*/
-int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize)
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize)
{
addr_t cptr;
char c;
@@ -41,8 +41,8 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int
st_bufcpy /* Copying this to buffer */
} state = st_wordstart;
- if (!cmdline_ptr || cmdline_ptr >= 0x100000)
- return -1; /* No command line, or inaccessible */
+ if (!cmdline_ptr)
+ return -1; /* No command line */
cptr = cmdline_ptr & 0xf;
set_fs(cmdline_ptr >> 4);
@@ -99,7 +99,7 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int
* Returns the position of that option (starts counting with 1)
* or 0 on not found
*/
-int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option)
{
addr_t cptr;
char c;
@@ -111,8 +111,8 @@ int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
st_wordskip, /* Miscompare, skip */
} state = st_wordstart;
- if (!cmdline_ptr || cmdline_ptr >= 0x100000)
- return -1; /* No command line, or inaccessible */
+ if (!cmdline_ptr)
+ return -1; /* No command line */
cptr = cmdline_ptr & 0xf;
set_fs(cmdline_ptr >> 4);
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index 10f6b1178c68..bffd73b45b1f 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -13,13 +13,21 @@ static inline char rdfs8(addr_t addr)
return *((char *)(fs + addr));
}
#include "../cmdline.c"
+static unsigned long get_cmd_line_ptr(void)
+{
+ unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr;
+
+ cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32;
+
+ return cmd_line_ptr;
+}
int cmdline_find_option(const char *option, char *buffer, int bufsize)
{
- return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize);
+ return __cmdline_find_option(get_cmd_line_ptr(), option, buffer, bufsize);
}
int cmdline_find_option_bool(const char *option)
{
- return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
+ return __cmdline_find_option_bool(get_cmd_line_ptr(), option);
}
#endif
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index f5d1aaa0dec8..c1d383d1fb7e 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -37,6 +37,12 @@
__HEAD
.code32
ENTRY(startup_32)
+ /*
+ * 32bit entry is 0 and it is ABI so immutable!
+ * If we come here directly from a bootloader,
+ * kernel(text+data+bss+brk) ramdisk, zero_page, command line
+ * all need to be under the 4G limit.
+ */
cld
/*
* Test KEEP_SEGMENTS flag to see if the bootloader is asking
@@ -154,6 +160,12 @@ ENTRY(startup_32)
btsl $_EFER_LME, %eax
wrmsr
+ /* After gdt is loaded */
+ xorl %eax, %eax
+ lldt %ax
+ movl $0x20, %eax
+ ltr %ax
+
/*
* Setup for the jump to 64bit mode
*
@@ -176,28 +188,18 @@ ENTRY(startup_32)
lret
ENDPROC(startup_32)
-no_longmode:
- /* This isn't an x86-64 CPU so hang */
-1:
- hlt
- jmp 1b
-
-#include "../../kernel/verify_cpu.S"
-
- /*
- * Be careful here startup_64 needs to be at a predictable
- * address so I can export it in an ELF header. Bootloaders
- * should look at the ELF header to find this address, as
- * it may change in the future.
- */
.code64
.org 0x200
ENTRY(startup_64)
/*
+ * 64bit entry is 0x200 and it is ABI so immutable!
* We come here either from startup_32 or directly from a
- * 64bit bootloader. If we come here from a bootloader we depend on
- * an identity mapped page table being provied that maps our
- * entire text+data+bss and hopefully all of memory.
+ * 64bit bootloader.
+ * If we come here from a bootloader, kernel(text+data+bss+brk),
+ * ramdisk, zero_page, command line could be above 4G.
+ * We depend on an identity mapped page table being provided
+ * that maps our entire kernel(text+data+bss+brk), zero page
+ * and command line.
*/
#ifdef CONFIG_EFI_STUB
/*
@@ -247,9 +249,6 @@ preferred_addr:
movl %eax, %ss
movl %eax, %fs
movl %eax, %gs
- lldt %ax
- movl $0x20, %eax
- ltr %ax
/*
* Compute the decompressed kernel start address. It is where
@@ -349,6 +348,15 @@ relocated:
*/
jmp *%rbp
+ .code32
+no_longmode:
+ /* This isn't an x86-64 CPU so hang */
+1:
+ hlt
+ jmp 1b
+
+#include "../../kernel/verify_cpu.S"
+
.data
gdt:
.word gdt_end - gdt
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 944ce595f767..9ec06a1f6d61 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -374,6 +374,14 @@ xloadflags:
#else
# define XLF0 0
#endif
+
+#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64)
+ /* kernel/boot_param/ramdisk could be loaded above 4g */
+# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G
+#else
+# define XLF1 0
+#endif
+
#ifdef CONFIG_EFI_STUB
# ifdef CONFIG_X86_64
# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */
@@ -383,7 +391,7 @@ xloadflags:
#else
# define XLF23 0
#endif
- .word XLF0 | XLF23
+ .word XLF0 | XLF1 | XLF23
cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
#added with boot protocol
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index adcc0ae73d09..223042086f4e 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -1,20 +1,14 @@
-#ifndef _ASM_X86_INIT_32_H
-#define _ASM_X86_INIT_32_H
+#ifndef _ASM_X86_INIT_H
+#define _ASM_X86_INIT_H
-#ifdef CONFIG_X86_32
-extern void __init early_ioremap_page_table_range_init(void);
-#endif
+struct x86_mapping_info {
+ void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
+ void *context; /* context for alloc_pgt_page */
+ unsigned long pmd_flag; /* page flag for PMD entry */
+ bool kernel_mapping; /* kernel mapping or ident mapping */
+};
-extern void __init zone_sizes_init(void);
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+ unsigned long addr, unsigned long end);
-extern unsigned long __init
-kernel_physical_mapping_init(unsigned long start,
- unsigned long end,
- unsigned long page_size_mask);
-
-
-extern unsigned long __initdata pgt_buf_start;
-extern unsigned long __meminitdata pgt_buf_end;
-extern unsigned long __meminitdata pgt_buf_top;
-
-#endif /* _ASM_X86_INIT_32_H */
+#endif /* _ASM_X86_INIT_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 6080d2694bad..17483a492f18 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -48,11 +48,11 @@
# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
#else
/* Maximum physical address we can use pages from */
-# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+# define KEXEC_SOURCE_MEMORY_LIMIT (MAXMEM-1)
/* Maximum address we can reach in physical address mode */
-# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1)
/* Maximum address we can use for the control pages */
-# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
/* Allocate one page for the pdp and the second for the code */
# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index eb05fb3b02fb..8a9b3e288cb4 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -14,12 +14,6 @@ extern struct pglist_data *node_data[];
#include <asm/numaq.h>
-extern void resume_map_numa_kva(pgd_t *pgd);
-
-#else /* !CONFIG_NUMA */
-
-static inline void resume_map_numa_kva(pgd_t *pgd) {}
-
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DISCONTIGMEM
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 49119fcea2dc..52560a2038e1 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -54,8 +54,6 @@ static inline int numa_cpu_node(int cpu)
#ifdef CONFIG_X86_32
# include <asm/numa_32.h>
-#else
-# include <asm/numa_64.h>
#endif
#ifdef CONFIG_NUMA
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
deleted file mode 100644
index 0c05f7ae46e8..000000000000
--- a/arch/x86/include/asm/numa_64.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_X86_NUMA_64_H
-#define _ASM_X86_NUMA_64_H
-
-extern unsigned long numa_free_all_bootmem(void);
-
-#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 8ca82839288a..c87892442e53 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -17,6 +17,10 @@
struct page;
+#include <linux/range.h>
+extern struct range pfn_mapped[];
+extern int nr_pfn_mapped;
+
static inline void clear_user_page(void *page, unsigned long vaddr,
struct page *pg)
{
@@ -44,7 +48,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
* case properly. Once all supported versions of gcc understand it, we can
* remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
*/
-#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
+#define __pa_symbol(x) \
+ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))
#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index da4e762406f7..4d550d04b609 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -15,6 +15,7 @@ extern unsigned long __phys_addr(unsigned long);
#else
#define __phys_addr(x) __phys_addr_nodebug(x)
#endif
+#define __phys_addr_symbol(x) __phys_addr(x)
#define __phys_reloc_hide(x) RELOC_HIDE((x), 0)
#ifdef CONFIG_FLATMEM
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 072694ed81a5..0f1ddee6a0ce 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -3,4 +3,40 @@
#include <asm/page_64_types.h>
+#ifndef __ASSEMBLY__
+
+/* duplicated to the one in bootmem.h */
+extern unsigned long max_pfn;
+extern unsigned long phys_base;
+
+static inline unsigned long __phys_addr_nodebug(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));
+
+ return x;
+}
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+extern unsigned long __phys_addr(unsigned long);
+extern unsigned long __phys_addr_symbol(unsigned long);
+#else
+#define __phys_addr(x) __phys_addr_nodebug(x)
+#define __phys_addr_symbol(x) \
+ ((unsigned long)(x) - __START_KERNEL_map + phys_base)
+#endif
+
+#define __phys_reloc_hide(x) (x)
+
+#ifdef CONFIG_FLATMEM
+#define pfn_valid(pfn) ((pfn) < max_pfn)
+#endif
+
+void clear_page(void *page);
+void copy_page(void *to, void *from);
+
+#endif /* !__ASSEMBLY__ */
+
#endif /* _ASM_X86_PAGE_64_H */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 320f7bb95f76..8b491e66eaa8 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -50,26 +50,4 @@
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
-#ifndef __ASSEMBLY__
-void clear_page(void *page);
-void copy_page(void *to, void *from);
-
-/* duplicated to the one in bootmem.h */
-extern unsigned long max_pfn;
-extern unsigned long phys_base;
-
-extern unsigned long __phys_addr(unsigned long);
-#define __phys_reloc_hide(x) (x)
-
-#define vmemmap ((struct page *)VMEMMAP_START)
-
-extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
-extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
-
-#endif /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_FLATMEM
-#define pfn_valid(pfn) ((pfn) < max_pfn)
-#endif
-
#endif /* _ASM_X86_PAGE_64_DEFS_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index e21fdd10479f..54c97879195e 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,6 +51,8 @@ static inline phys_addr_t get_max_mapped(void)
return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
}
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
+
extern unsigned long init_memory_mapping(unsigned long start,
unsigned long end);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index fc304279b559..1e672234c4ff 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -395,6 +395,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
#ifndef __ASSEMBLY__
#include <linux/mm_types.h>
+#include <linux/log2.h>
static inline int pte_none(pte_t pte)
{
@@ -620,6 +621,8 @@ static inline int pgd_none(pgd_t pgd)
#ifndef __ASSEMBLY__
extern int direct_gbpages;
+void init_mem_mapping(void);
+void early_alloc_pgt_buf(void);
/* local pte updates need not use xchg for locking */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
@@ -786,6 +789,20 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
memcpy(dst, src, count * sizeof(pgd_t));
}
+#define PTE_SHIFT ilog2(PTRS_PER_PTE)
+static inline int page_level_shift(enum pg_level level)
+{
+ return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT;
+}
+static inline unsigned long page_level_size(enum pg_level level)
+{
+ return 1UL << page_level_shift(level);
+}
+static inline unsigned long page_level_mask(enum pg_level level)
+{
+ return ~(page_level_size(level) - 1);
+}
+
/*
* The x86 doesn't have any external MMU info: the kernel page
* tables contain all the necessary information.
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 615b0c78449f..e22c1dbf7feb 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -180,6 +180,11 @@ extern void cleanup_highmap(void);
#define __HAVE_ARCH_PTE_SAME
+#define vmemmap ((struct page *)VMEMMAP_START)
+
+extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
+extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 766ea16fbbbd..2d883440cb9a 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_PGTABLE_64_DEFS_H
#define _ASM_X86_PGTABLE_64_DEFS_H
+#include <asm/sparsemem.h>
+
#ifndef __ASSEMBLY__
#include <linux/types.h>
@@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+#define EARLY_DYNAMIC_PAGE_TABLES 64
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 3c32db8c539d..e6423002c10b 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -321,7 +321,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);
-extern void native_pagetable_reserve(u64 start, u64 end);
#ifdef CONFIG_X86_32
extern void native_pagetable_init(void);
#else
@@ -331,7 +330,7 @@ extern void native_pagetable_init(void);
struct seq_file;
extern void arch_report_meminfo(struct seq_file *m);
-enum {
+enum pg_level {
PG_LEVEL_NONE,
PG_LEVEL_4K,
PG_LEVEL_2M,
@@ -352,6 +351,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
* as a pte too.
*/
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern phys_addr_t slow_virt_to_phys(void *__address);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index d172588efae5..8277941cbe99 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -721,6 +721,7 @@ extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
extern void early_trap_init(void);
+void early_trap_pf_init(void);
/* Defined in head.S */
extern struct desc_ptr early_gdt_descr;
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index fe1ec5bcd846..9c6b890d5e7a 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -58,6 +58,7 @@ extern unsigned char boot_gdt[];
extern unsigned char secondary_startup_64[];
#endif
-extern void __init setup_real_mode(void);
+void reserve_real_mode(void);
+void setup_real_mode(void);
#endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 1709801d18ec..5ee26875baea 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -125,13 +125,12 @@ extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_bad(void);
-#define __get_user_x(size, ret, x, ptr) \
- asm volatile("call __get_user_" #size \
- : "=a" (ret), "=d" (x) \
- : "0" (ptr)) \
-
-/* Careful: we have to cast the result to the type of the pointer
- * for sign reasons */
+/*
+ * This is a type: either unsigned long, if the argument fits into
+ * that type, or otherwise unsigned long long.
+ */
+#define __inttype(x) \
+__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
/**
* get_user: - Get a simple variable from user space.
@@ -150,38 +149,26 @@ extern int __get_user_bad(void);
* Returns zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
-#ifdef CONFIG_X86_32
-#define __get_user_8(__ret_gu, __val_gu, ptr) \
- __get_user_x(X, __ret_gu, __val_gu, ptr)
-#else
-#define __get_user_8(__ret_gu, __val_gu, ptr) \
- __get_user_x(8, __ret_gu, __val_gu, ptr)
-#endif
-
+/*
+ * Careful: we have to cast the result to the type of the pointer
+ * for sign reasons.
+ *
+ * The use of %edx as the register specifier is a bit of a
+ * simplification, as gcc only cares about it as the starting point
+ * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
+ * (%ecx being the next register in gcc's x86 register sequence), and
+ * %rdx on 64 bits.
+ */
#define get_user(x, ptr) \
({ \
int __ret_gu; \
- unsigned long __val_gu; \
+ register __inttype(*(ptr)) __val_gu asm("%edx"); \
__chk_user_ptr(ptr); \
might_fault(); \
- switch (sizeof(*(ptr))) { \
- case 1: \
- __get_user_x(1, __ret_gu, __val_gu, ptr); \
- break; \
- case 2: \
- __get_user_x(2, __ret_gu, __val_gu, ptr); \
- break; \
- case 4: \
- __get_user_x(4, __ret_gu, __val_gu, ptr); \
- break; \
- case 8: \
- __get_user_8(__ret_gu, __val_gu, ptr); \
- break; \
- default: \
- __get_user_x(X, __ret_gu, __val_gu, ptr); \
- break; \
- } \
- (x) = (__typeof__(*(ptr)))__val_gu; \
+ asm volatile("call __get_user_%P3" \
+ : "=a" (__ret_gu), "=r" (__val_gu) \
+ : "0" (ptr), "i" (sizeof(*(ptr)))); \
+ (x) = (__typeof__(*(ptr))) __val_gu; \
__ret_gu; \
})
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 7669941cc9d2..d8d99222b36a 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -69,17 +69,6 @@ struct x86_init_oem {
};
/**
- * struct x86_init_mapping - platform specific initial kernel pagetable setup
- * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
- *
- * For more details on the purpose of this hook, look in
- * init_memory_mapping and the commit that added it.
- */
-struct x86_init_mapping {
- void (*pagetable_reserve)(u64 start, u64 end);
-};
-
-/**
* struct x86_init_paging - platform specific paging functions
* @pagetable_init: platform specific paging initialization call to setup
* the kernel pagetables and prepare accessors functions.
@@ -136,7 +125,6 @@ struct x86_init_ops {
struct x86_init_mpparse mpparse;
struct x86_init_irqs irqs;
struct x86_init_oem oem;
- struct x86_init_mapping mapping;
struct x86_init_paging paging;
struct x86_init_timers timers;
struct x86_init_iommu iommu;
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index bacf4b0d91f4..cfc755dc1607 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
# include <asm/proto.h>
-# include <asm/numa_64.h>
#endif /* X86 */
#define BAD_MADT_ENTRY(entry, end) ( \
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index d5e0d717005a..0532f5d6e4ef 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -69,7 +69,7 @@ int acpi_suspend_lowlevel(void)
#ifndef CONFIG_64BIT
header->pmode_entry = (u32)&wakeup_pmode_return;
- header->pmode_cr3 = (u32)__pa(&initial_page_table);
+ header->pmode_cr3 = (u32)__pa_symbol(initial_page_table);
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index e66311200cbd..b574b295a2f9 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -768,10 +768,9 @@ int __init gart_iommu_init(void)
aper_base = info.aper_base;
end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
- if (end_pfn > max_low_pfn_mapped) {
- start_pfn = (aper_base>>PAGE_SHIFT);
+ start_pfn = PFN_DOWN(aper_base);
+ if (!pfn_range_is_mapped(start_pfn, end_pfn))
init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
- }
pr_info("PCI-DMA: using GART IOMMU.\n");
iommu_size = check_iommu_size(info.aper_base, aper_size);
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 9c2aa89a11cb..9a9110918ca7 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -28,6 +28,7 @@
#include <asm/apic.h>
#include <asm/ipi.h>
#include <asm/apic_flat_64.h>
+#include <asm/pgtable.h>
static int numachip_system __read_mostly;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 84bee67141ad..edd77e7508b3 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -12,7 +12,6 @@
#include <asm/pci-direct.h>
#ifdef CONFIG_X86_64
-# include <asm/numa_64.h>
# include <asm/mmconfig.h>
# include <asm/cacheflush.h>
#endif
@@ -680,12 +679,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* benefit in doing so.
*/
if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ unsigned long pfn = tseg >> PAGE_SHIFT;
+
printk(KERN_DEBUG "tseg: %010llx\n", tseg);
- if ((tseg>>PMD_SHIFT) <
- (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
- ((tseg>>PMD_SHIFT) <
- (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
- (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
+ if (pfn_range_is_mapped(pfn, pfn + 1))
set_memory_4k((unsigned long)__va(tseg), 1);
}
}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fcaabd0432c5..1905ce98bee0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -17,7 +17,6 @@
#ifdef CONFIG_X86_64
#include <linux/topology.h>
-#include <asm/numa_64.h>
#endif
#include "cpu.h"
@@ -168,7 +167,7 @@ int __cpuinit ppro_with_ram_bug(void)
#ifdef CONFIG_X86_F00F_BUG
static void __cpuinit trap_init_f00f_bug(void)
{
- __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
+ __set_fixmap(FIX_F00F_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
/*
* Update the IDT descriptor and reload the IDT so that
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index df06ade26bef..d32abeabbda5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -835,7 +835,7 @@ static int __init parse_memopt(char *p)
}
early_param("mem", parse_memopt);
-static int __init parse_memmap_opt(char *p)
+static int __init parse_memmap_one(char *p)
{
char *oldp;
u64 start_at, mem_size;
@@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p)
return *p == '\0' ? 0 : -EINVAL;
}
+static int __init parse_memmap_opt(char *str)
+{
+ while (str) {
+ char *k = strchr(str, ',');
+
+ if (k)
+ *k++ = 0;
+
+ parse_memmap_one(str);
+ str = k;
+ }
+
+ return 0;
+}
early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 1d414029f1d8..42a392a9fd02 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -89,7 +89,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
* kernel identity mapping to modify code.
*/
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa(ip));
+ ip = (unsigned long)__va(__pa_symbol(ip));
return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
}
@@ -279,7 +279,7 @@ static int ftrace_write(unsigned long ip, const char *val, int size)
* kernel identity mapping to modify code.
*/
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa(ip));
+ ip = (unsigned long)__va(__pa_symbol(ip));
return probe_kernel_write((void *)ip, val, size);
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 6773c918b8cc..138463a24877 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -33,20 +33,6 @@ void __init i386_start_kernel(void)
{
sanitize_boot_params(&boot_params);
- memblock_reserve(__pa_symbol(&_text),
- __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
-
-#ifdef CONFIG_BLK_DEV_INITRD
- /* Reserve INITRD */
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- /* Assume only end is not page aligned */
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
- }
-#endif
-
/* Call the subarch specific early setup function */
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_MRST:
@@ -60,11 +46,5 @@ void __init i386_start_kernel(void)
break;
}
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
-
start_kernel();
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 849fc9e63c2f..57334f4cd3af 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,11 +27,81 @@
#include <asm/bios_ebda.h>
#include <asm/bootparam_utils.h>
-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2;
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
+{
+ unsigned long i;
+
+ for (i = 0; i < PTRS_PER_PGD-1; i++)
+ early_level4_pgt[i].pgd = 0;
+
+ next_early_pgt = 0;
+
+ write_cr3(__pa(early_level4_pgt));
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
{
- pgd_t *pgd = pgd_offset_k(0UL);
- pgd_clear(pgd);
- __flush_tlb_all();
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ unsigned long i;
+ pgdval_t pgd, *pgd_p;
+ pudval_t pud, *pud_p;
+ pmdval_t pmd, *pmd_p;
+
+ /* Invalid address or early pgt is done ? */
+ if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+ return -1;
+
+again:
+ pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
+ pgd = *pgd_p;
+
+ /*
+ * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+ * critical -- __PAGE_OFFSET would point us back into the dynamic
+ * range and we might end up looping forever...
+ */
+ if (pgd)
+ pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ pud_p[i] = 0;
+ *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pud_p += pud_index(address);
+ pud = *pud_p;
+
+ if (pud)
+ pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ pmd_p[i] = 0;
+ *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+ pmd_p[pmd_index(address)] = pmd;
+
+ return 0;
}
/* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -42,14 +112,25 @@ static void __init clear_bss(void)
(unsigned long) __bss_stop - (unsigned long) __bss_start);
}
+static unsigned long get_cmd_line_ptr(void)
+{
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+ cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
+
+ return cmd_line_ptr;
+}
+
static void __init copy_bootdata(char *real_mode_data)
{
char * command_line;
+ unsigned long cmd_line_ptr;
memcpy(&boot_params, real_mode_data, sizeof boot_params);
sanitize_boot_params(&boot_params);
- if (boot_params.hdr.cmd_line_ptr) {
- command_line = __va(boot_params.hdr.cmd_line_ptr);
+ cmd_line_ptr = get_cmd_line_ptr();
+ if (cmd_line_ptr) {
+ command_line = __va(cmd_line_ptr);
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
}
}
@@ -72,14 +153,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
+ /* Kill off the identity-map trampoline */
+ reset_early_page_tables();
+
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
- /* Make NULL pointers segfault */
- zap_identity_mappings();
-
- max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
-
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
#ifdef CONFIG_EARLY_PRINTK
set_intr_gate(i, &early_idt_handlers[i]);
@@ -89,37 +168,25 @@ void __init x86_64_start_kernel(char * real_mode_data)
}
load_idt((const struct desc_ptr *)&idt_descr);
+ copy_bootdata(__va(real_mode_data));
+
if (console_loglevel == 10)
early_printk("Kernel alive\n");
+ clear_page(init_level4_pgt);
+ /* set init_level4_pgt kernel high mapping*/
+ init_level4_pgt[511] = early_level4_pgt[511];
+
x86_64_start_reservations(real_mode_data);
}
void __init x86_64_start_reservations(char *real_mode_data)
{
- copy_bootdata(__va(real_mode_data));
-
- memblock_reserve(__pa_symbol(&_text),
- __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
-
-#ifdef CONFIG_BLK_DEV_INITRD
- /* Reserve INITRD */
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- /* Assume only end is not page aligned */
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
- }
-#endif
+ /* version is always not zero if it is copied */
+ if (!boot_params.hdr.version)
+ copy_bootdata(__va(real_mode_data));
reserve_ebda_region();
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
-
start_kernel();
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c4b9cc..d94f6d68be2a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
.code64
.globl startup_64
startup_64:
-
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
* and someone has loaded an identity mapped page table
* for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory.
*
- * %esi holds a physical pointer to real_mode_data.
+ * %rsi holds a physical pointer to real_mode_data.
*
* We come here either directly from a 64bit bootloader, or from
* arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
* tables and then reload them.
*/
- /* Compute the delta between the address I am compiled to run at and the
+ /*
+ * Compute the delta between the address I am compiled to run at and the
* address I am actually running at.
*/
leaq _text(%rip), %rbp
@@ -78,45 +78,62 @@ startup_64:
testl %eax, %eax
jnz bad_address
- /* Is the address too large? */
- leaq _text(%rip), %rdx
- movq $PGDIR_SIZE, %rax
- cmpq %rax, %rdx
- jae bad_address
-
- /* Fixup the physical addresses in the page table
+ /*
+ * Is the address too large?
*/
- addq %rbp, init_level4_pgt + 0(%rip)
- addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
- addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+ leaq _text(%rip), %rax
+ shrq $MAX_PHYSMEM_BITS, %rax
+ jnz bad_address
- addq %rbp, level3_ident_pgt + 0(%rip)
+ /*
+ * Fixup the physical addresses in the page table
+ */
+ addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
addq %rbp, level3_kernel_pgt + (510*8)(%rip)
addq %rbp, level3_kernel_pgt + (511*8)(%rip)
addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
- /* Add an Identity mapping if I am above 1G */
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
leaq _text(%rip), %rdi
- andq $PMD_PAGE_MASK, %rdi
+ leaq early_level4_pgt(%rip), %rbx
movq %rdi, %rax
- shrq $PUD_SHIFT, %rax
- andq $(PTRS_PER_PUD - 1), %rax
- jz ident_complete
+ shrq $PGDIR_SHIFT, %rax
- leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
- leaq level3_ident_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
+ leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx
+ movq %rdx, 0(%rbx,%rax,8)
+ movq %rdx, 8(%rbx,%rax,8)
+ addq $4096, %rdx
movq %rdi, %rax
- shrq $PMD_SHIFT, %rax
- andq $(PTRS_PER_PMD - 1), %rax
- leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
- leaq level2_spare_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
-ident_complete:
+ shrq $PUD_SHIFT, %rax
+ andl $(PTRS_PER_PUD-1), %eax
+ movq %rdx, (4096+0)(%rbx,%rax,8)
+ movq %rdx, (4096+8)(%rbx,%rax,8)
+
+ addq $8192, %rbx
+ movq %rdi, %rax
+ shrq $PMD_SHIFT, %rdi
+ addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
+ leaq (_end - 1)(%rip), %rcx
+ shrq $PMD_SHIFT, %rcx
+ subq %rdi, %rcx
+ incl %ecx
+
+1:
+ andq $(PTRS_PER_PMD - 1), %rdi
+ movq %rax, (%rbx,%rdi,8)
+ incq %rdi
+ addq $PMD_SIZE, %rax
+ decl %ecx
+ jnz 1b
/*
* Fixup the kernel text+data virtual addresses. Note that
@@ -124,7 +141,6 @@ ident_complete:
* cleanup_highmap() fixes this up along with the mappings
* beyond _end.
*/
-
leaq level2_kernel_pgt(%rip), %rdi
leaq 4096(%rdi), %r8
/* See if it is a valid page table entry */
@@ -139,17 +155,14 @@ ident_complete:
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
- /* Due to ENTRY(), sometimes the empty space gets filled with
- * zeros. Better take a jmp than relying on empty space being
- * filled with 0x90 (nop)
- */
- jmp secondary_startup_64
+ movq $(early_level4_pgt - __START_KERNEL_map), %rax
+ jmp 1f
ENTRY(secondary_startup_64)
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
* and someone has loaded a mapped page table.
*
- * %esi holds a physical pointer to real_mode_data.
+ * %rsi holds a physical pointer to real_mode_data.
*
* We come here either from startup_64 (using physical addresses)
* or from trampoline.S (using virtual addresses).
@@ -159,12 +172,14 @@ ENTRY(secondary_startup_64)
* after the boot processor executes this code.
*/
+ movq $(init_level4_pgt - __START_KERNEL_map), %rax
+1:
+
/* Enable PAE mode and PGE */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %eax
- movq %rax, %cr4
+ movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ movq %rcx, %cr4
/* Setup early boot stage 4 level pagetables. */
- movq $(init_level4_pgt - __START_KERNEL_map), %rax
addq phys_base(%rip), %rax
movq %rax, %cr3
@@ -196,7 +211,7 @@ ENTRY(secondary_startup_64)
movq %rax, %cr0
/* Setup a boot time stack */
- movq stack_start(%rip),%rsp
+ movq stack_start(%rip), %rsp
/* zero EFLAGS after setting rsp */
pushq $0
@@ -236,15 +251,33 @@ ENTRY(secondary_startup_64)
movl initial_gs+4(%rip),%edx
wrmsr
- /* esi is pointer to real mode structure with interesting info.
+ /* rsi is pointer to real mode structure with interesting info.
pass it to C */
- movl %esi, %edi
+ movq %rsi, %rdi
/* Finally jump to run C code and to be on real kernel address
* Since we are running on identity-mapped space we have to jump
* to the full 64bit address, this is only possible as indirect
* jump. In addition we need to ensure %cs is set so we make this
* a far return.
+ *
+ * Note: do not change to far jump indirect with 64bit offset.
+ *
+ * AMD does not support far jump indirect with 64bit offset.
+ * AMD64 Architecture Programmer's Manual, Volume 3: states only
+ * JMP FAR mem16:16 FF /5 Far jump indirect,
+ * with the target specified by a far pointer in memory.
+ * JMP FAR mem16:32 FF /5 Far jump indirect,
+ * with the target specified by a far pointer in memory.
+ *
+ * Intel64 does support 64bit offset.
+ * Software Developer Manual Vol 2: states:
+ * FF /5 JMP m16:16 Jump far, absolute indirect,
+ * address given in m16:16
+ * FF /5 JMP m16:32 Jump far, absolute indirect,
+ * address given in m16:32.
+ * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
+ * address given in m16:64.
*/
movq initial_code(%rip),%rax
pushq $0 # fake return address to stop unwinder
@@ -270,13 +303,13 @@ ENDPROC(start_cpu0)
/* SMP bootup changes these two */
__REFDATA
- .align 8
- ENTRY(initial_code)
+ .balign 8
+ GLOBAL(initial_code)
.quad x86_64_start_kernel
- ENTRY(initial_gs)
+ GLOBAL(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)
- ENTRY(stack_start)
+ GLOBAL(stack_start)
.quad init_thread_union+THREAD_SIZE-8
.word 0
__FINITDATA
@@ -284,7 +317,7 @@ ENDPROC(start_cpu0)
bad_address:
jmp bad_address
- .section ".init.text","ax"
+ __INIT
.globl early_idt_handlers
early_idt_handlers:
# 104(%rsp) %rflags
@@ -321,14 +354,22 @@ ENTRY(early_idt_handler)
pushq %r11 # 0(%rsp)
cmpl $__KERNEL_CS,96(%rsp)
- jne 10f
+ jne 11f
+
+ cmpl $14,72(%rsp) # Page fault?
+ jnz 10f
+ GET_CR2_INTO(%rdi) # can clobber any volatile register if pv
+ call early_make_pgtable
+ andl %eax,%eax
+ jz 20f # All good
+10:
leaq 88(%rsp),%rdi # Pointer to %rip
call early_fixup_exception
andl %eax,%eax
jnz 20f # Found an exception entry
-10:
+11:
#ifdef CONFIG_EARLY_PRINTK
GET_CR2_INTO(%r9) # can clobber any volatile register if pv
movl 80(%rsp),%r8d # error code
@@ -350,7 +391,7 @@ ENTRY(early_idt_handler)
1: hlt
jmp 1b
-20: # Exception table entry found
+20: # Exception table entry found or page table generated
popq %r11
popq %r10
popq %r9
@@ -364,6 +405,8 @@ ENTRY(early_idt_handler)
decl early_recursion_flag(%rip)
INTERRUPT_RETURN
+ __INITDATA
+
.balign 4
early_recursion_flag:
.long 0
@@ -374,11 +417,10 @@ early_idt_msg:
early_idt_ripmsg:
.asciz "RIP %s\n"
#endif /* CONFIG_EARLY_PRINTK */
- .previous
#define NEXT_PAGE(name) \
.balign PAGE_SIZE; \
-ENTRY(name)
+GLOBAL(name)
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
@@ -388,24 +430,37 @@ ENTRY(name)
i = i + 1 ; \
.endr
+ __INITDATA
+NEXT_PAGE(early_level4_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+
+NEXT_PAGE(early_dynamic_pgts)
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+
.data
- /*
- * This default setting generates an ident mapping at address 0x100000
- * and a mapping for the kernel that precisely maps virtual address
- * 0xffffffff80000000 to physical address 0x000000. (always using
- * 2Mbyte large pages provided by PAE mode)
- */
+
+#ifndef CONFIG_XEN
NEXT_PAGE(init_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_START_KERNEL*8, 0
+ .fill 512,8,0
+#else
+NEXT_PAGE(init_level4_pgt)
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .org init_level4_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 511,8,0
+ .fill 511, 8, 0
+NEXT_PAGE(level2_ident_pgt)
+ /* Since I easily can, map the first 1G.
+ * Don't set NX because code runs from these pages.
+ */
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#endif
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
@@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt)
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-NEXT_PAGE(level2_fixmap_pgt)
- .fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
- .fill 5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
- .fill 512,8,0
-
-NEXT_PAGE(level2_ident_pgt)
- /* Since I easily can, map the first 1G.
- * Don't set NX because code runs from these pages.
- */
- PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-
NEXT_PAGE(level2_kernel_pgt)
/*
* 512 MB kernel mapping. We spend a full page on this pagetable
@@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt)
PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
KERNEL_IMAGE_SIZE/PMD_SIZE)
-NEXT_PAGE(level2_spare_pgt)
- .fill 512, 8, 0
+NEXT_PAGE(level2_fixmap_pgt)
+ .fill 506,8,0
+ .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+ /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+ .fill 5,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+ .fill 512,8,0
#undef PMDS
-#undef NEXT_PAGE
.data
.align 16
@@ -472,6 +517,5 @@ ENTRY(nmi_idt_table)
.skip IDT_ENTRIES * 16
__PAGE_ALIGNED_BSS
- .align PAGE_SIZE
-ENTRY(empty_zero_page)
+NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 9c3bd4a2050e..0fa69127209a 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(csum_partial_copy_generic);
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
EXPORT_SYMBOL(__get_user_4);
+EXPORT_SYMBOL(__get_user_8);
EXPORT_SYMBOL(__put_user_1);
EXPORT_SYMBOL(__put_user_2);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 2b44ea5f269d..b686a904d7c3 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -297,9 +297,9 @@ static void kvm_register_steal_time(void)
memset(st, 0, sizeof(*st));
- wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
- printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
- cpu, __pa(st));
+ wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
+ pr_info("kvm-stealtime: cpu %d, msr %llx\n",
+ cpu, (unsigned long long) slow_virt_to_phys(st));
}
static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
@@ -324,7 +324,7 @@ void __cpuinit kvm_guest_cpu_init(void)
return;
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
- u64 pa = __pa(&__get_cpu_var(apf_reason));
+ u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason));
#ifdef CONFIG_PREEMPT
pa |= KVM_ASYNC_PF_SEND_ALWAYS;
@@ -340,7 +340,8 @@ void __cpuinit kvm_guest_cpu_init(void)
/* Size alignment is implied but just to make it explicit. */
BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
__get_cpu_var(kvm_apic_eoi) = 0;
- pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED;
+ pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi))
+ | KVM_MSR_ENABLED;
wrmsrl(MSR_KVM_PV_EOI_EN, pa);
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 220a360010f8..9f966dc0b9e4 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -162,8 +162,8 @@ int kvm_register_clock(char *txt)
int low, high, ret;
struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
- low = (int)__pa(src) | 1;
- high = ((u64)__pa(src) >> 32);
+ low = (int)slow_virt_to_phys(src) | 1;
+ high = ((u64)slow_virt_to_phys(src) >> 32);
ret = native_write_msr_safe(msr_kvm_system_time, low, high);
printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
cpu, high, low, txt);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index b3ea9db39db6..4eabc160696f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -16,125 +16,12 @@
#include <linux/io.h>
#include <linux/suspend.h>
+#include <asm/init.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/debugreg.h>
-static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
- unsigned long addr)
-{
- pud_t *pud;
- pmd_t *pmd;
- struct page *page;
- int result = -ENOMEM;
-
- addr &= PMD_MASK;
- pgd += pgd_index(addr);
- if (!pgd_present(*pgd)) {
- page = kimage_alloc_control_pages(image, 0);
- if (!page)
- goto out;
- pud = (pud_t *)page_address(page);
- clear_page(pud);
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
- }
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud)) {
- page = kimage_alloc_control_pages(image, 0);
- if (!page)
- goto out;
- pmd = (pmd_t *)page_address(page);
- clear_page(pmd);
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
- }
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- result = 0;
-out:
- return result;
-}
-
-static void init_level2_page(pmd_t *level2p, unsigned long addr)
-{
- unsigned long end_addr;
-
- addr &= PAGE_MASK;
- end_addr = addr + PUD_SIZE;
- while (addr < end_addr) {
- set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- addr += PMD_SIZE;
- }
-}
-
-static int init_level3_page(struct kimage *image, pud_t *level3p,
- unsigned long addr, unsigned long last_addr)
-{
- unsigned long end_addr;
- int result;
-
- result = 0;
- addr &= PAGE_MASK;
- end_addr = addr + PGDIR_SIZE;
- while ((addr < last_addr) && (addr < end_addr)) {
- struct page *page;
- pmd_t *level2p;
-
- page = kimage_alloc_control_pages(image, 0);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- level2p = (pmd_t *)page_address(page);
- init_level2_page(level2p, addr);
- set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
- addr += PUD_SIZE;
- }
- /* clear the unused entries */
- while (addr < end_addr) {
- pud_clear(level3p++);
- addr += PUD_SIZE;
- }
-out:
- return result;
-}
-
-
-static int init_level4_page(struct kimage *image, pgd_t *level4p,
- unsigned long addr, unsigned long last_addr)
-{
- unsigned long end_addr;
- int result;
-
- result = 0;
- addr &= PAGE_MASK;
- end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
- while ((addr < last_addr) && (addr < end_addr)) {
- struct page *page;
- pud_t *level3p;
-
- page = kimage_alloc_control_pages(image, 0);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- level3p = (pud_t *)page_address(page);
- result = init_level3_page(image, level3p, addr, last_addr);
- if (result)
- goto out;
- set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
- addr += PGDIR_SIZE;
- }
- /* clear the unused entries */
- while (addr < end_addr) {
- pgd_clear(level4p++);
- addr += PGDIR_SIZE;
- }
-out:
- return result;
-}
-
static void free_transition_pgtable(struct kimage *image)
{
free_page((unsigned long)image->arch.pud);
@@ -184,22 +71,62 @@ err:
return result;
}
+static void *alloc_pgt_page(void *data)
+{
+ struct kimage *image = (struct kimage *)data;
+ struct page *page;
+ void *p = NULL;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (page) {
+ p = page_address(page);
+ clear_page(p);
+ }
+
+ return p;
+}
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .context = image,
+ .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
+ };
+ unsigned long mstart, mend;
pgd_t *level4p;
int result;
+ int i;
+
level4p = (pgd_t *)__va(start_pgtable);
- result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
- if (result)
- return result;
+ clear_page(level4p);
+ for (i = 0; i < nr_pfn_mapped; i++) {
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+
+ result = kernel_ident_mapping_init(&info,
+ level4p, mstart, mend);
+ if (result)
+ return result;
+ }
+
/*
- * image->start may be outside 0 ~ max_pfn, for example when
- * jump back to original kernel from kexeced kernel
+ * segments's mem ranges could be outside 0 ~ max_pfn,
+ * for example when jump back to original kernel from kexeced kernel.
+ * or first kernel is booted with user mem map, and second kernel
+ * could be loaded out of that range.
*/
- result = init_one_level2_page(image, level4p, image->start);
- if (result)
- return result;
+ for (i = 0; i < image->nr_segments; i++) {
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+
+ result = kernel_ident_mapping_init(&info,
+ level4p, mstart, mend);
+
+ if (result)
+ return result;
+ }
+
return init_transition_pgtable(image, level4p);
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 8b24289cc10c..915f5efefcf5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -108,17 +108,16 @@
#include <asm/topology.h>
#include <asm/apicdef.h>
#include <asm/amd_nb.h>
-#ifdef CONFIG_X86_64
-#include <asm/numa_64.h>
-#endif
#include <asm/mce.h>
#include <asm/alternative.h>
#include <asm/prom.h>
/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+ * max_pfn_mapped: highest direct mapped pfn over 4GB
+ *
+ * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped
*/
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
@@ -276,18 +275,7 @@ void * __init extend_brk(size_t size, size_t align)
return ret;
}
-#ifdef CONFIG_X86_64
-static void __init init_gbpages(void)
-{
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-}
-#else
-static inline void init_gbpages(void)
-{
-}
+#ifdef CONFIG_X86_32
static void __init cleanup_highmap(void)
{
}
@@ -296,8 +284,8 @@ static void __init cleanup_highmap(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- memblock_reserve(__pa(_brk_start),
- __pa(_brk_end) - __pa(_brk_start));
+ memblock_reserve(__pa_symbol(_brk_start),
+ _brk_end - _brk_start);
/* Mark brk area as locked down and no longer taking any
new allocations */
@@ -306,27 +294,43 @@ static void __init reserve_brk(void)
#ifdef CONFIG_BLK_DEV_INITRD
+static u64 __init get_ramdisk_image(void)
+{
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+
+ ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
+
+ return ramdisk_image;
+}
+static u64 __init get_ramdisk_size(void)
+{
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+
+ ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
+
+ return ramdisk_size;
+}
+
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
static void __init relocate_initrd(void)
{
/* Assume only end is not page aligned */
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
u64 area_size = PAGE_ALIGN(ramdisk_size);
- u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
u64 ramdisk_here;
unsigned long slop, clen, mapaddr;
char *p, *q;
- /* We need to move the initrd down into lowmem */
- ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
- PAGE_SIZE);
+ /* We need to move the initrd down into directly mapped mem */
+ ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ area_size, PAGE_SIZE);
if (!ramdisk_here)
panic("Cannot find place for new RAMDISK of size %lld\n",
ramdisk_size);
- /* Note: this includes all the lowmem currently occupied by
+ /* Note: this includes all the mem currently occupied by
the initrd, we rely on that fact to keep the data intact. */
memblock_reserve(ramdisk_here, area_size);
initrd_start = ramdisk_here + PAGE_OFFSET;
@@ -336,17 +340,7 @@ static void __init relocate_initrd(void)
q = (char *)initrd_start;
- /* Copy any lowmem portion of the initrd */
- if (ramdisk_image < end_of_lowmem) {
- clen = end_of_lowmem - ramdisk_image;
- p = (char *)__va(ramdisk_image);
- memcpy(q, p, clen);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
-
- /* Copy the highmem portion of the initrd */
+ /* Copy the initrd */
while (ramdisk_size) {
slop = ramdisk_image & ~PAGE_MASK;
clen = ramdisk_size;
@@ -360,22 +354,35 @@ static void __init relocate_initrd(void)
ramdisk_image += clen;
ramdisk_size -= clen;
}
- /* high pages is not converted by early_res_to_bootmem */
- ramdisk_image = boot_params.hdr.ramdisk_image;
- ramdisk_size = boot_params.hdr.ramdisk_size;
+
+ ramdisk_image = get_ramdisk_image();
+ ramdisk_size = get_ramdisk_size();
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
" [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
ramdisk_here, ramdisk_here + ramdisk_size - 1);
}
+static void __init early_reserve_initrd(void)
+{
+ /* Assume only end is not page aligned */
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
+ u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
+
+ if (!boot_params.hdr.type_of_loader ||
+ !ramdisk_image || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
+
+ memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
+}
static void __init reserve_initrd(void)
{
/* Assume only end is not page aligned */
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
+ u64 mapped_size;
if (!boot_params.hdr.type_of_loader ||
!ramdisk_image || !ramdisk_size)
@@ -383,22 +390,18 @@ static void __init reserve_initrd(void)
initrd_start = 0;
- if (ramdisk_size >= (end_of_lowmem>>1)) {
+ mapped_size = memblock_mem_size(max_pfn_mapped);
+ if (ramdisk_size >= (mapped_size>>1))
panic("initrd too large to handle, "
"disabling initrd (%lld needed, %lld available)\n",
- ramdisk_size, end_of_lowmem>>1);
- }
+ ramdisk_size, mapped_size>>1);
printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
ramdisk_end - 1);
-
- if (ramdisk_end <= end_of_lowmem) {
- /* All in lowmem, easy case */
- /*
- * don't need to reserve again, already reserved early
- * in i386_start_kernel
- */
+ if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
+ PFN_DOWN(ramdisk_end))) {
+ /* All are mapped, easy case */
initrd_start = ramdisk_image + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
return;
@@ -409,6 +412,9 @@ static void __init reserve_initrd(void)
memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
}
#else
+static void __init early_reserve_initrd(void)
+{
+}
static void __init reserve_initrd(void)
{
}
@@ -419,8 +425,6 @@ static void __init parse_setup_data(void)
struct setup_data *data;
u64 pa_data;
- if (boot_params.hdr.version < 0x0209)
- return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
u32 data_len, map_len;
@@ -456,8 +460,6 @@ static void __init e820_reserve_setup_data(void)
u64 pa_data;
int found = 0;
- if (boot_params.hdr.version < 0x0209)
- return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
@@ -481,8 +483,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
struct setup_data *data;
u64 pa_data;
- if (boot_params.hdr.version < 0x0209)
- return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
@@ -501,17 +501,51 @@ static void __init memblock_x86_reserve_range_setup_data(void)
/*
* Keep the crash kernel below this limit. On 32 bits earlier kernels
* would limit the kernel to the low 512 MiB due to mapping restrictions.
- * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
- * limit once kexec-tools are fixed.
*/
#ifdef CONFIG_X86_32
# define CRASH_KERNEL_ADDR_MAX (512 << 20)
#else
-# define CRASH_KERNEL_ADDR_MAX (896 << 20)
+# define CRASH_KERNEL_ADDR_MAX MAXMEM
+#endif
+
+static void __init reserve_crashkernel_low(void)
+{
+#ifdef CONFIG_X86_64
+ const unsigned long long alignment = 16<<20; /* 16M */
+ unsigned long long low_base = 0, low_size = 0;
+ unsigned long total_low_mem;
+ unsigned long long base;
+ int ret;
+
+ total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
+ ret = parse_crashkernel_low(boot_command_line, total_low_mem,
+ &low_size, &base);
+ if (ret != 0 || low_size <= 0)
+ return;
+
+ low_base = memblock_find_in_range(low_size, (1ULL<<32),
+ low_size, alignment);
+
+ if (!low_base) {
+ pr_info("crashkernel low reservation failed - No suitable area found.\n");
+
+ return;
+ }
+
+ memblock_reserve(low_base, low_size);
+ pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
+ (unsigned long)(low_size >> 20),
+ (unsigned long)(low_base >> 20),
+ (unsigned long)(total_low_mem >> 20));
+ crashk_low_res.start = low_base;
+ crashk_low_res.end = low_base + low_size - 1;
+ insert_resource(&iomem_resource, &crashk_low_res);
#endif
+}
static void __init reserve_crashkernel(void)
{
+ const unsigned long long alignment = 16<<20; /* 16M */
unsigned long long total_mem;
unsigned long long crash_size, crash_base;
int ret;
@@ -525,8 +559,6 @@ static void __init reserve_crashkernel(void)
/* 0 means: find the address automatically */
if (crash_base <= 0) {
- const unsigned long long alignment = 16<<20; /* 16M */
-
/*
* kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
*/
@@ -537,6 +569,7 @@ static void __init reserve_crashkernel(void)
pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
}
+
} else {
unsigned long long start;
@@ -558,6 +591,9 @@ static void __init reserve_crashkernel(void)
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
insert_resource(&iomem_resource, &crashk_res);
+
+ if (crash_base >= (1ULL<<32))
+ reserve_crashkernel_low();
}
#else
static void __init reserve_crashkernel(void)
@@ -608,8 +644,6 @@ static __init void reserve_ibft_region(void)
memblock_reserve(addr, size);
}
-static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
-
static bool __init snb_gfx_workaround_needed(void)
{
#ifdef CONFIG_PCI
@@ -698,8 +732,7 @@ static void __init trim_bios_range(void)
* since some BIOSes are known to corrupt low memory. See the
* Kconfig help text for X86_RESERVE_LOW.
*/
- e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
- E820_RAM, E820_RESERVED);
+ e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
/*
* special case: Some BIOSen report the PC BIOS
@@ -711,6 +744,29 @@ static void __init trim_bios_range(void)
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
}
+/* called before trim_bios_range() to spare extra sanitize */
+static void __init e820_add_kernel_range(void)
+{
+ u64 start = __pa_symbol(_text);
+ u64 size = __pa_symbol(_end) - start;
+
+ /*
+ * Complain if .text .data and .bss are not marked as E820_RAM and
+ * attempt to fix it by adding the range. We may have a confused BIOS,
+ * or the user may have used memmap=exactmap or memmap=xxM$yyM to
+ * exclude kernel range. If we really are running on top non-RAM,
+ * we will crash later anyways.
+ */
+ if (e820_all_mapped(start, start + size, E820_RAM))
+ return;
+
+ pr_warn(".text .data .bss are not marked as E820_RAM!\n");
+ e820_remove_range(start, size, E820_RAM, 0);
+ e820_add_region(start, size, E820_RAM);
+}
+
+static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
+
static int __init parse_reservelow(char *p)
{
unsigned long long size;
@@ -733,6 +789,11 @@ static int __init parse_reservelow(char *p)
early_param("reservelow", parse_reservelow);
+static void __init trim_low_memory_range(void)
+{
+ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -748,6 +809,17 @@ early_param("reservelow", parse_reservelow);
void __init setup_arch(char **cmdline_p)
{
+ memblock_reserve(__pa_symbol(_text),
+ (unsigned long)__bss_stop - (unsigned long)_text);
+
+ early_reserve_initrd();
+
+ /*
+ * At this point everything still needed from the boot loader
+ * or BIOS or kernel text should be early reserved or marked not
+ * RAM in e820. All other memory is free game.
+ */
+
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
@@ -835,12 +907,12 @@ void __init setup_arch(char **cmdline_p)
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = _brk_end;
- code_resource.start = virt_to_phys(_text);
- code_resource.end = virt_to_phys(_etext)-1;
- data_resource.start = virt_to_phys(_etext);
- data_resource.end = virt_to_phys(_edata)-1;
- bss_resource.start = virt_to_phys(&__bss_start);
- bss_resource.end = virt_to_phys(&__bss_stop)-1;
+ code_resource.start = __pa_symbol(_text);
+ code_resource.end = __pa_symbol(_etext)-1;
+ data_resource.start = __pa_symbol(_etext);
+ data_resource.end = __pa_symbol(_edata)-1;
+ bss_resource.start = __pa_symbol(__bss_start);
+ bss_resource.end = __pa_symbol(__bss_stop)-1;
#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
@@ -906,6 +978,7 @@ void __init setup_arch(char **cmdline_p)
insert_resource(&iomem_resource, &data_resource);
insert_resource(&iomem_resource, &bss_resource);
+ e820_add_kernel_range();
trim_bios_range();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
@@ -955,6 +1028,8 @@ void __init setup_arch(char **cmdline_p)
reserve_ibft_region();
+ early_alloc_pgt_buf();
+
/*
* Need to conclude brk, before memblock_x86_fill()
* it could use memblock_find_in_range, could overlap with
@@ -964,7 +1039,7 @@ void __init setup_arch(char **cmdline_p)
cleanup_highmap();
- memblock.current_limit = get_max_mapped();
+ memblock.current_limit = ISA_END_ADDRESS;
memblock_x86_fill();
/*
@@ -981,41 +1056,22 @@ void __init setup_arch(char **cmdline_p)
setup_bios_corruption_check();
#endif
+#ifdef CONFIG_X86_32
printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
(max_pfn_mapped<<PAGE_SHIFT) - 1);
+#endif
- setup_real_mode();
+ reserve_real_mode();
trim_platform_memory_ranges();
+ trim_low_memory_range();
- init_gbpages();
-
- /* max_pfn_mapped is updated here */
- max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
- max_pfn_mapped = max_low_pfn_mapped;
-
-#ifdef CONFIG_X86_64
- if (max_pfn > max_low_pfn) {
- int i;
- unsigned long start, end;
- unsigned long start_pfn, end_pfn;
-
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn,
- NULL) {
+ init_mem_mapping();
- end = PFN_PHYS(end_pfn);
- if (end <= (1UL<<32))
- continue;
+ early_trap_pf_init();
- start = PFN_PHYS(start_pfn);
- max_pfn_mapped = init_memory_mapping(
- max((1UL<<32), start), end);
- }
+ setup_real_mode();
- /* can we preseve max_low_pfn ?*/
- max_low_pfn = max_pfn;
- }
-#endif
memblock.current_limit = get_max_mapped();
dma_contiguous_reserve(0);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ecffca11f4e9..68bda7a84159 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -688,10 +688,19 @@ void __init early_trap_init(void)
set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
/* int3 can be called from all */
set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+#ifdef CONFIG_X86_32
set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
load_idt(&idt_descr);
}
+void __init early_trap_pf_init(void)
+{
+#ifdef CONFIG_X86_64
+ set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
+}
+
void __init trap_init(void)
{
int i;
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1330dd102950..b014d9414d08 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -59,6 +59,9 @@ EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(memmove);
+#ifndef CONFIG_DEBUG_VIRTUAL
+EXPORT_SYMBOL(phys_base);
+#endif
EXPORT_SYMBOL(empty_zero_page);
#ifndef CONFIG_PARAVIRT
EXPORT_SYMBOL(native_load_gs_index);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index d065d67c2672..45a14dbbddaf 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -63,10 +63,6 @@ struct x86_init_ops x86_init __initdata = {
.banner = default_banner,
},
- .mapping = {
- .pagetable_reserve = native_pagetable_reserve,
- },
-
.paging = {
.pagetable_init = native_pagetable_init,
},
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index df4176cdbb32..1cbd89ca5569 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -552,7 +552,8 @@ static void lguest_write_cr3(unsigned long cr3)
current_cr3 = cr3;
/* These two page tables are simple, linear, and used during boot */
- if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
+ if (cr3 != __pa_symbol(swapper_pg_dir) &&
+ cr3 != __pa_symbol(initial_page_table))
cr3_changed = true;
}
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 156b9c804670..a4512359656a 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -15,11 +15,10 @@
* __get_user_X
*
* Inputs: %[r|e]ax contains the address.
- * The register is modified, but all changes are undone
- * before returning because the C code doesn't know about it.
*
* Outputs: %[r|e]ax is error code (0 or -EFAULT)
* %[r|e]dx contains zero-extended value
+ * %ecx contains the high half for 32-bit __get_user_8
*
*
* These functions should not modify any other registers,
@@ -42,7 +41,7 @@ ENTRY(__get_user_1)
cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
ASM_STAC
-1: movzb (%_ASM_AX),%edx
+1: movzbl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
ret
@@ -72,29 +71,42 @@ ENTRY(__get_user_4)
cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
ASM_STAC
-3: mov -3(%_ASM_AX),%edx
+3: movl -3(%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
ret
CFI_ENDPROC
ENDPROC(__get_user_4)
-#ifdef CONFIG_X86_64
ENTRY(__get_user_8)
CFI_STARTPROC
+#ifdef CONFIG_X86_64
add $7,%_ASM_AX
jc bad_get_user
GET_THREAD_INFO(%_ASM_DX)
cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
- jae bad_get_user
+ jae bad_get_user
ASM_STAC
-4: movq -7(%_ASM_AX),%_ASM_DX
+4: movq -7(%_ASM_AX),%rdx
xor %eax,%eax
ASM_CLAC
ret
+#else
+ add $7,%_ASM_AX
+ jc bad_get_user_8
+ GET_THREAD_INFO(%_ASM_DX)
+ cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
+ jae bad_get_user_8
+ ASM_STAC
+4: movl -7(%_ASM_AX),%edx
+5: movl -3(%_ASM_AX),%ecx
+ xor %eax,%eax
+ ASM_CLAC
+ ret
+#endif
CFI_ENDPROC
ENDPROC(__get_user_8)
-#endif
+
bad_get_user:
CFI_STARTPROC
@@ -105,9 +117,24 @@ bad_get_user:
CFI_ENDPROC
END(bad_get_user)
+#ifdef CONFIG_X86_32
+bad_get_user_8:
+ CFI_STARTPROC
+ xor %edx,%edx
+ xor %ecx,%ecx
+ mov $(-EFAULT),%_ASM_AX
+ ASM_CLAC
+ ret
+ CFI_ENDPROC
+END(bad_get_user_8)
+#endif
+
_ASM_EXTABLE(1b,bad_get_user)
_ASM_EXTABLE(2b,bad_get_user)
_ASM_EXTABLE(3b,bad_get_user)
#ifdef CONFIG_X86_64
_ASM_EXTABLE(4b,bad_get_user)
+#else
+ _ASM_EXTABLE(4b,bad_get_user_8)
+ _ASM_EXTABLE(5b,bad_get_user_8)
#endif
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d7aea41563b3..d41815265a0b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -17,86 +17,132 @@
#include <asm/proto.h>
#include <asm/dma.h> /* for MAX_DMA_PFN */
-unsigned long __initdata pgt_buf_start;
-unsigned long __meminitdata pgt_buf_end;
-unsigned long __meminitdata pgt_buf_top;
+#include "mm_internal.h"
-int after_bootmem;
+static unsigned long __initdata pgt_buf_start;
+static unsigned long __initdata pgt_buf_end;
+static unsigned long __initdata pgt_buf_top;
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
- = 1
-#endif
-;
+static unsigned long min_pfn_mapped;
-struct map_range {
- unsigned long start;
- unsigned long end;
- unsigned page_size_mask;
-};
+static bool __initdata can_use_brk_pgt = true;
/*
- * First calculate space needed for kernel direct mapping page tables to cover
- * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB
- * pages. Then find enough contiguous space for those page tables.
+ * Pages returned are already directly mapped.
+ *
+ * Changing that is likely to break Xen, see commit:
+ *
+ * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
+ *
+ * for detailed information.
*/
-static void __init find_early_table_space(struct map_range *mr, int nr_range)
+__ref void *alloc_low_pages(unsigned int num)
{
+ unsigned long pfn;
int i;
- unsigned long puds = 0, pmds = 0, ptes = 0, tables;
- unsigned long start = 0, good_end;
- phys_addr_t base;
- for (i = 0; i < nr_range; i++) {
- unsigned long range, extra;
+ if (after_bootmem) {
+ unsigned int order;
- range = mr[i].end - mr[i].start;
- puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
+ order = get_order((unsigned long)num << PAGE_SHIFT);
+ return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
+ __GFP_ZERO, order);
+ }
- if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
- extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
- pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
- } else {
- pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
- }
+ if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
+ unsigned long ret;
+ if (min_pfn_mapped >= max_pfn_mapped)
+ panic("alloc_low_page: ran out of memory");
+ ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+ max_pfn_mapped << PAGE_SHIFT,
+ PAGE_SIZE * num , PAGE_SIZE);
+ if (!ret)
+ panic("alloc_low_page: can not alloc memory");
+ memblock_reserve(ret, PAGE_SIZE * num);
+ pfn = ret >> PAGE_SHIFT;
+ } else {
+ pfn = pgt_buf_end;
+ pgt_buf_end += num;
+ printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
+ pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
+ }
- if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
- extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
- extra += PMD_SIZE;
-#endif
- ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else {
- ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
- }
+ for (i = 0; i < num; i++) {
+ void *adr;
+
+ adr = __va((pfn + i) << PAGE_SHIFT);
+ clear_page(adr);
}
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+ return __va(pfn << PAGE_SHIFT);
+}
-#ifdef CONFIG_X86_32
- /* for fixmap */
- tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
- good_end = max_pfn_mapped << PAGE_SHIFT;
+/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void __init early_alloc_pgt_buf(void)
+{
+ unsigned long tables = INIT_PGT_BUF_SIZE;
+ phys_addr_t base;
- base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
- if (!base)
- panic("Cannot find space for the kernel page tables");
+ base = __pa(extend_brk(tables, PAGE_SIZE));
pgt_buf_start = base >> PAGE_SHIFT;
pgt_buf_end = pgt_buf_start;
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
+
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
- printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n",
- mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT,
- (pgt_buf_top << PAGE_SHIFT) - 1);
+static void __init init_gbpages(void)
+{
+#ifdef CONFIG_X86_64
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+#endif
}
-void __init native_pagetable_reserve(u64 start, u64 end)
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+static int page_size_mask;
+
+static void __init probe_page_size_mask(void)
{
- memblock_reserve(start, end - start);
+ init_gbpages();
+
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ if (direct_gbpages)
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ if (cpu_has_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ set_in_cr4(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ set_in_cr4(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
}
#ifdef CONFIG_X86_32
@@ -122,58 +168,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
}
/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
+ * adjust the page_size_mask for small range to go with
+ * big page size instead small one if nearby are ram too.
*/
-unsigned long __init_refok init_memory_mapping(unsigned long start,
- unsigned long end)
+static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
+ int nr_range)
{
- unsigned long page_size_mask = 0;
- unsigned long start_pfn, end_pfn;
- unsigned long ret = 0;
- unsigned long pos;
-
- struct map_range mr[NR_RANGE_MR];
- int nr_range, i;
- int use_pse, use_gbpages;
+ int i;
- printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
- start, end - 1);
+ for (i = 0; i < nr_range; i++) {
+ if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
+ unsigned long start = round_down(mr[i].start, PMD_SIZE);
+ unsigned long end = round_up(mr[i].end, PMD_SIZE);
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
- /*
- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
- * This will simplify cpa(), which otherwise needs to support splitting
- * large pages into small in interrupt context, etc.
- */
- use_pse = use_gbpages = 0;
-#else
- use_pse = cpu_has_pse;
- use_gbpages = direct_gbpages;
+#ifdef CONFIG_X86_32
+ if ((end >> PAGE_SHIFT) > max_low_pfn)
+ continue;
#endif
- /* Enable PSE if available */
- if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
+ }
+ if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
+ unsigned long start = round_down(mr[i].start, PUD_SIZE);
+ unsigned long end = round_up(mr[i].end, PUD_SIZE);
- /* Enable PGE if available */
- if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
+ }
}
+}
- if (use_gbpages)
- page_size_mask |= 1 << PG_LEVEL_1G;
- if (use_pse)
- page_size_mask |= 1 << PG_LEVEL_2M;
+static int __meminit split_mem_range(struct map_range *mr, int nr_range,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long start_pfn, end_pfn, limit_pfn;
+ unsigned long pfn;
+ int i;
- memset(mr, 0, sizeof(mr));
- nr_range = 0;
+ limit_pfn = PFN_DOWN(end);
/* head if not big page alignment ? */
- start_pfn = start >> PAGE_SHIFT;
- pos = start_pfn << PAGE_SHIFT;
+ pfn = start_pfn = PFN_DOWN(start);
#ifdef CONFIG_X86_32
/*
* Don't use a large page for the first 2/4MB of memory
@@ -181,66 +220,60 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
* and overlapping MTRRs into large pages can cause
* slowdowns.
*/
- if (pos == 0)
- end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+ if (pfn == 0)
+ end_pfn = PFN_DOWN(PMD_SIZE);
else
- end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
- end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#endif
- if (end_pfn > (end >> PAGE_SHIFT))
- end_pfn = end >> PAGE_SHIFT;
+ if (end_pfn > limit_pfn)
+ end_pfn = limit_pfn;
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
/* big page (2M) range */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
- end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
- end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
- end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
#ifdef CONFIG_X86_64
/* big page (1G) range */
- start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
- << (PUD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
/* tail is not big page (1G) alignment */
- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
- << (PMD_SHIFT - PAGE_SHIFT);
- end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
- pos = end_pfn << PAGE_SHIFT;
+ pfn = end_pfn;
}
#endif
/* tail is not big page (2M) alignment */
- start_pfn = pos>>PAGE_SHIFT;
- end_pfn = end>>PAGE_SHIFT;
+ start_pfn = pfn;
+ end_pfn = limit_pfn;
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
/* try to merge same page size and continuous */
@@ -257,59 +290,169 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
nr_range--;
}
+ if (!after_bootmem)
+ adjust_range_page_size_mask(mr, nr_range);
+
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
mr[i].start, mr[i].end - 1,
(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
- /*
- * Find space for the kernel direct mapping tables.
- *
- * Later we should allocate these tables in the local node of the
- * memory mapped. Unfortunately this is done currently before the
- * nodes are discovered.
- */
- if (!after_bootmem)
- find_early_table_space(mr, nr_range);
+ return nr_range;
+}
+
+struct range pfn_mapped[E820_X_MAX];
+int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+ nr_pfn_mapped, start_pfn, end_pfn);
+ nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+ max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+ if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
+ max_low_pfn_mapped = max(max_low_pfn_mapped,
+ min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+
+ for (i = 0; i < nr_pfn_mapped; i++)
+ if ((start_pfn >= pfn_mapped[i].start) &&
+ (end_pfn <= pfn_mapped[i].end))
+ return true;
+
+ return false;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ struct map_range mr[NR_RANGE_MR];
+ unsigned long ret = 0;
+ int nr_range, i;
+
+ pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
+ start, end - 1);
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = split_mem_range(mr, 0, start, end);
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask);
-#ifdef CONFIG_X86_32
- early_ioremap_page_table_range_init();
+ add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
- load_cr3(swapper_pg_dir);
-#endif
+ return ret >> PAGE_SHIFT;
+}
- __flush_tlb_all();
+/*
+ * would have hole in the middle or ends, and only ram parts will be mapped.
+ */
+static unsigned long __init init_range_memory_mapping(
+ unsigned long r_start,
+ unsigned long r_end)
+{
+ unsigned long start_pfn, end_pfn;
+ unsigned long mapped_ram_size = 0;
+ int i;
- /*
- * Reserve the kernel pagetable pages we used (pgt_buf_start -
- * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
- * so that they can be reused for other purposes.
- *
- * On native it just means calling memblock_reserve, on Xen it also
- * means marking RW the pagetable pages that we allocated before
- * but that haven't been used.
- *
- * In fact on xen we mark RO the whole range pgt_buf_start -
- * pgt_buf_top, because we have to make sure that when
- * init_memory_mapping reaches the pagetable pages area, it maps
- * RO all the pagetable pages, including the ones that are beyond
- * pgt_buf_end at that time.
- */
- if (!after_bootmem && pgt_buf_end > pgt_buf_start)
- x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
- PFN_PHYS(pgt_buf_end));
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
+ u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
+ if (start >= end)
+ continue;
- if (!after_bootmem)
- early_memtest(start, end);
+ /*
+ * if it is overlapping with brk pgt, we need to
+ * alloc pgt buf from memblock instead.
+ */
+ can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
+ min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
+ init_memory_mapping(start, end);
+ mapped_ram_size += end - start;
+ can_use_brk_pgt = true;
+ }
- return ret >> PAGE_SHIFT;
+ return mapped_ram_size;
}
+/* (PUD_SHIFT-PMD_SHIFT)/2 */
+#define STEP_SIZE_SHIFT 5
+void __init init_mem_mapping(void)
+{
+ unsigned long end, real_end, start, last_start;
+ unsigned long step_size;
+ unsigned long addr;
+ unsigned long mapped_ram_size = 0;
+ unsigned long new_mapped_ram_size;
+
+ probe_page_size_mask();
+
+#ifdef CONFIG_X86_64
+ end = max_pfn << PAGE_SHIFT;
+#else
+ end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+ /* the ISA range is always mapped regardless of memory holes */
+ init_memory_mapping(0, ISA_END_ADDRESS);
+
+ /* xen has big range in reserved near end of ram, skip it at first */
+ addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
+ PAGE_SIZE);
+ real_end = addr + PMD_SIZE;
+
+ /* step_size need to be small so pgt_buf from BRK could cover it */
+ step_size = PMD_SIZE;
+ max_pfn_mapped = 0; /* will get exact value next */
+ min_pfn_mapped = real_end >> PAGE_SHIFT;
+ last_start = start = real_end;
+ while (last_start > ISA_END_ADDRESS) {
+ if (last_start > step_size) {
+ start = round_down(last_start - 1, step_size);
+ if (start < ISA_END_ADDRESS)
+ start = ISA_END_ADDRESS;
+ } else
+ start = ISA_END_ADDRESS;
+ new_mapped_ram_size = init_range_memory_mapping(start,
+ last_start);
+ last_start = start;
+ min_pfn_mapped = last_start >> PAGE_SHIFT;
+ /* only increase step_size after big range get mapped */
+ if (new_mapped_ram_size > mapped_ram_size)
+ step_size <<= STEP_SIZE_SHIFT;
+ mapped_ram_size += new_mapped_ram_size;
+ }
+
+ if (real_end < end)
+ init_range_memory_mapping(real_end, end);
+
+#ifdef CONFIG_X86_64
+ if (max_pfn > max_low_pfn) {
+ /* can we preseve max_low_pfn ?*/
+ max_low_pfn = max_pfn;
+ }
+#else
+ early_ioremap_page_table_range_init();
+#endif
+
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+
+ early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
/*
* devmem_is_allowed() checks to see if /dev/mem access to a certain address
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 745d66b843c8..b299724f6e34 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -53,25 +53,14 @@
#include <asm/page_types.h>
#include <asm/init.h>
+#include "mm_internal.h"
+
unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
bool __read_mostly __vmalloc_start_set = false;
-static __init void *alloc_low_page(void)
-{
- unsigned long pfn = pgt_buf_end++;
- void *adr;
-
- if (pfn >= pgt_buf_top)
- panic("alloc_low_page: ran out of memory");
-
- adr = __va(pfn * PAGE_SIZE);
- clear_page(adr);
- return adr;
-}
-
/*
* Creates a middle page table and puts a pointer to it in the
* given global directory entry. This only returns the gd entry
@@ -84,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- if (after_bootmem)
- pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
- else
- pmd_table = (pmd_t *)alloc_low_page();
+ pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
@@ -109,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
- pte_t *page_table = NULL;
-
- if (after_bootmem) {
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
- page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
-#endif
- if (!page_table)
- page_table =
- (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
- } else
- page_table = (pte_t *)alloc_low_page();
+ pte_t *page_table = (pte_t *)alloc_low_page();
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -146,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)
return one_page_table_init(pmd) + pte_idx;
}
+static unsigned long __init
+page_table_range_init_count(unsigned long start, unsigned long end)
+{
+ unsigned long count = 0;
+#ifdef CONFIG_HIGHMEM
+ int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+ int pgd_idx, pmd_idx;
+ unsigned long vaddr;
+
+ if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
+ return 0;
+
+ vaddr = start;
+ pgd_idx = pgd_index(vaddr);
+
+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ pmd_idx++) {
+ if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
+ (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
+ count++;
+ vaddr += PMD_SIZE;
+ }
+ pmd_idx = 0;
+ }
+#endif
+ return count;
+}
+
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
- unsigned long vaddr, pte_t *lastpte)
+ unsigned long vaddr, pte_t *lastpte,
+ void **adr)
{
#ifdef CONFIG_HIGHMEM
/*
@@ -161,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
- && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
- && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
- || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
+ && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
pte_t *newpte;
int i;
BUG_ON(after_bootmem);
- newpte = alloc_low_page();
+ newpte = *adr;
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
+ *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
@@ -204,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte = NULL;
+ unsigned long count = page_table_range_init_count(start, end);
+ void *adr = NULL;
+
+ if (count)
+ adr = alloc_low_pages(count);
vaddr = start;
pgd_idx = pgd_index(vaddr);
@@ -216,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd++, pmd_idx++) {
pte = page_table_kmap_check(one_page_table_init(pmd),
- pmd, vaddr, pte);
+ pmd, vaddr, pte, &adr);
vaddr += PMD_SIZE;
}
@@ -310,6 +321,7 @@ repeat:
__pgprot(PTE_IDENT_ATTR |
_PAGE_PSE);
+ pfn &= PMD_MASK >> PAGE_SHIFT;
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
@@ -455,9 +467,14 @@ void __init native_pagetable_init(void)
/*
* Remove any mappings which extend past the end of physical
- * memory from the boot time page table:
+ * memory from the boot time page table.
+ * In virtual address space, we should have at least two pages
+ * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END
+ * definition. And max_low_pfn is set to VMALLOC_END physical
+ * address. If initial memory mapping is doing right job, we
+ * should have pte used near max_low_pfn or one pmd is not present.
*/
- for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+ for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
pgd = base + pgd_index(va);
if (!pgd_present(*pgd))
@@ -468,10 +485,19 @@ void __init native_pagetable_init(void)
if (!pmd_present(*pmd))
break;
+ /* should not be large page here */
+ if (pmd_large(*pmd)) {
+ pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
+ pfn, pmd, __pa(pmd));
+ BUG_ON(1);
+ }
+
pte = pte_offset_kernel(pmd, va);
if (!pte_present(*pte))
break;
+ printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
+ pfn, pmd, __pa(pmd), pte, __pa(pte));
pte_clear(NULL, va, pte);
}
paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
@@ -550,7 +576,7 @@ early_param("highmem", parse_highmem);
* artificially via the highmem=x boot parameter then create
* it:
*/
-void __init lowmem_pfn_init(void)
+static void __init lowmem_pfn_init(void)
{
/* max_low_pfn is 0, we already have early_res support */
max_low_pfn = max_pfn;
@@ -586,7 +612,7 @@ void __init lowmem_pfn_init(void)
* We have more RAM than fits into lowmem - we try to put it into
* highmem, also taking the highmem=x boot parameter into account:
*/
-void __init highmem_pfn_init(void)
+static void __init highmem_pfn_init(void)
{
max_low_pfn = MAXMEM_PFN;
@@ -669,8 +695,6 @@ void __init setup_bootmem_allocator(void)
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
-
- after_bootmem = 1;
}
/*
@@ -753,6 +777,8 @@ void __init mem_init(void)
if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
reservedpages++;
+ after_bootmem = 1;
+
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d6eeead43758..3eba7f429880 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -54,6 +54,82 @@
#include <asm/uv/uv.h>
#include <asm/setup.h>
+#include "mm_internal.h"
+
+static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+ unsigned long addr, unsigned long end)
+{
+ addr &= PMD_MASK;
+ for (; addr < end; addr += PMD_SIZE) {
+ pmd_t *pmd = pmd_page + pmd_index(addr);
+
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, __pmd(addr | pmd_flag));
+ }
+}
+static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next) {
+ pud_t *pud = pud_page + pud_index(addr);
+ pmd_t *pmd;
+
+ next = (addr & PUD_MASK) + PUD_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pud_present(*pud)) {
+ pmd = pmd_offset(pud, 0);
+ ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ continue;
+ }
+ pmd = (pmd_t *)info->alloc_pgt_page(info->context);
+ if (!pmd)
+ return -ENOMEM;
+ ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
+
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ int result;
+ int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
+
+ for (; addr < end; addr = next) {
+ pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+ pud_t *pud;
+
+ next = (addr & PGDIR_MASK) + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, 0);
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+ continue;
+ }
+
+ pud = (pud_t *)info->alloc_pgt_page(info->context);
+ if (!pud)
+ return -ENOMEM;
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
+
static int __init parse_direct_gbpages_off(char *arg)
{
direct_gbpages = 0;
@@ -302,10 +378,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
void __init cleanup_highmap(void)
{
unsigned long vaddr = __START_KERNEL_map;
- unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+ unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
pmd_t *pmd = level2_kernel_pgt;
+ /*
+ * Native path, max_pfn_mapped is not set yet.
+ * Xen has valid max_pfn_mapped set in
+ * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
+ */
+ if (max_pfn_mapped)
+ vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+
for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
if (pmd_none(*pmd))
continue;
@@ -314,69 +398,24 @@ void __init cleanup_highmap(void)
}
}
-static __ref void *alloc_low_page(unsigned long *phys)
-{
- unsigned long pfn = pgt_buf_end++;
- void *adr;
-
- if (after_bootmem) {
- adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
- *phys = __pa(adr);
-
- return adr;
- }
-
- if (pfn >= pgt_buf_top)
- panic("alloc_low_page: ran out of memory");
-
- adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
- clear_page(adr);
- *phys = pfn * PAGE_SIZE;
- return adr;
-}
-
-static __ref void *map_low_page(void *virt)
-{
- void *adr;
- unsigned long phys, left;
-
- if (after_bootmem)
- return virt;
-
- phys = __pa(virt);
- left = phys & (PAGE_SIZE - 1);
- adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
- adr = (void *)(((unsigned long)adr) | left);
-
- return adr;
-}
-
-static __ref void unmap_low_page(void *adr)
-{
- if (after_bootmem)
- return;
-
- early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
-}
-
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
pgprot_t prot)
{
- unsigned pages = 0;
+ unsigned long pages = 0, next;
unsigned long last_map_addr = end;
int i;
pte_t *pte = pte_page + pte_index(addr);
- for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
-
+ for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
+ next = (addr & PAGE_MASK) + PAGE_SIZE;
if (addr >= end) {
- if (!after_bootmem) {
- for(; i < PTRS_PER_PTE; i++, pte++)
- set_pte(pte, __pte(0));
- }
- break;
+ if (!after_bootmem &&
+ !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
+ !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
+ set_pte(pte, __pte(0));
+ continue;
}
/*
@@ -414,28 +453,25 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
int i = pmd_index(address);
for (; i < PTRS_PER_PMD; i++, address = next) {
- unsigned long pte_phys;
pmd_t *pmd = pmd_page + pmd_index(address);
pte_t *pte;
pgprot_t new_prot = prot;
+ next = (address & PMD_MASK) + PMD_SIZE;
if (address >= end) {
- if (!after_bootmem) {
- for (; i < PTRS_PER_PMD; i++, pmd++)
- set_pmd(pmd, __pmd(0));
- }
- break;
+ if (!after_bootmem &&
+ !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
+ !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
+ set_pmd(pmd, __pmd(0));
+ continue;
}
- next = (address & PMD_MASK) + PMD_SIZE;
-
if (pmd_val(*pmd)) {
if (!pmd_large(*pmd)) {
spin_lock(&init_mm.page_table_lock);
- pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+ pte = (pte_t *)pmd_page_vaddr(*pmd);
last_map_addr = phys_pte_init(pte, address,
end, prot);
- unmap_low_page(pte);
spin_unlock(&init_mm.page_table_lock);
continue;
}
@@ -464,19 +500,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pmd,
- pfn_pte(address >> PAGE_SHIFT,
+ pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
__pgprot(pgprot_val(prot) | _PAGE_PSE)));
spin_unlock(&init_mm.page_table_lock);
last_map_addr = next;
continue;
}
- pte = alloc_low_page(&pte_phys);
+ pte = alloc_low_page();
last_map_addr = phys_pte_init(pte, address, end, new_prot);
- unmap_low_page(pte);
spin_lock(&init_mm.page_table_lock);
- pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+ pmd_populate_kernel(&init_mm, pmd, pte);
spin_unlock(&init_mm.page_table_lock);
}
update_page_count(PG_LEVEL_2M, pages);
@@ -492,27 +527,24 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
int i = pud_index(addr);
for (; i < PTRS_PER_PUD; i++, addr = next) {
- unsigned long pmd_phys;
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
pgprot_t prot = PAGE_KERNEL;
- if (addr >= end)
- break;
-
next = (addr & PUD_MASK) + PUD_SIZE;
-
- if (!after_bootmem && !e820_any_mapped(addr, next, 0)) {
- set_pud(pud, __pud(0));
+ if (addr >= end) {
+ if (!after_bootmem &&
+ !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
+ !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
+ set_pud(pud, __pud(0));
continue;
}
if (pud_val(*pud)) {
if (!pud_large(*pud)) {
- pmd = map_low_page(pmd_offset(pud, 0));
+ pmd = pmd_offset(pud, 0);
last_map_addr = phys_pmd_init(pmd, addr, end,
page_size_mask, prot);
- unmap_low_page(pmd);
__flush_tlb_all();
continue;
}
@@ -541,19 +573,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pud,
- pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE));
spin_unlock(&init_mm.page_table_lock);
last_map_addr = next;
continue;
}
- pmd = alloc_low_page(&pmd_phys);
+ pmd = alloc_low_page();
last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
prot);
- unmap_low_page(pmd);
spin_lock(&init_mm.page_table_lock);
- pud_populate(&init_mm, pud, __va(pmd_phys));
+ pud_populate(&init_mm, pud, pmd);
spin_unlock(&init_mm.page_table_lock);
}
__flush_tlb_all();
@@ -578,28 +610,23 @@ kernel_physical_mapping_init(unsigned long start,
for (; start < end; start = next) {
pgd_t *pgd = pgd_offset_k(start);
- unsigned long pud_phys;
pud_t *pud;
- next = (start + PGDIR_SIZE) & PGDIR_MASK;
- if (next > end)
- next = end;
+ next = (start & PGDIR_MASK) + PGDIR_SIZE;
if (pgd_val(*pgd)) {
- pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
last_map_addr = phys_pud_init(pud, __pa(start),
__pa(end), page_size_mask);
- unmap_low_page(pud);
continue;
}
- pud = alloc_low_page(&pud_phys);
- last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+ pud = alloc_low_page();
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
page_size_mask);
- unmap_low_page(pud);
spin_lock(&init_mm.page_table_lock);
- pgd_populate(&init_mm, pgd, __va(pud_phys));
+ pgd_populate(&init_mm, pgd, pud);
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
@@ -664,13 +691,11 @@ int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
- unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- last_mapped_pfn = init_memory_mapping(start, start + size);
- if (last_mapped_pfn > max_pfn_mapped)
- max_pfn_mapped = last_mapped_pfn;
+ init_memory_mapping(start, start + size);
ret = __add_pages(nid, zone, start_pfn, nr_pages);
WARN_ON_ONCE(ret);
@@ -686,6 +711,16 @@ EXPORT_SYMBOL_GPL(arch_add_memory);
static struct kcore_list kcore_vsyscall;
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NUMA
+ int i;
+
+ for_each_online_node(i)
+ register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
+
void __init mem_init(void)
{
long codesize, reservedpages, datasize, initsize;
@@ -698,11 +733,8 @@ void __init mem_init(void)
reservedpages = 0;
/* this will put all low memory onto the freelists */
-#ifdef CONFIG_NUMA
- totalram_pages = numa_free_all_bootmem();
-#else
+ register_page_bootmem_info();
totalram_pages = free_all_bootmem();
-#endif
absent_pages = absent_pages_in_range(0, max_pfn);
reservedpages = max_pfn - totalram_pages - absent_pages;
@@ -772,12 +804,11 @@ void set_kernel_text_ro(void)
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
- unsigned long rodata_start =
- ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+ unsigned long rodata_start = PFN_ALIGN(__start_rodata);
unsigned long end = (unsigned long) &__end_rodata_hpage_align;
- unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
- unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
- unsigned long data_start = (unsigned long) &_sdata;
+ unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
+ unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
+ unsigned long all_end = PFN_ALIGN(&_end);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -786,10 +817,10 @@ void mark_rodata_ro(void)
kernel_set_to_readonly = 1;
/*
- * The rodata section (but not the kernel text!) should also be
- * not-executable.
+ * The rodata/data/bss/brk section (but not the kernel text!)
+ * should also be not-executable.
*/
- set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
+ set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
rodata_test();
@@ -802,12 +833,12 @@ void mark_rodata_ro(void)
#endif
free_init_pages("unused kernel memory",
- (unsigned long) page_address(virt_to_page(text_end)),
- (unsigned long)
- page_address(virt_to_page(rodata_start)));
+ (unsigned long) __va(__pa_symbol(text_end)),
+ (unsigned long) __va(__pa_symbol(rodata_start)));
+
free_init_pages("unused kernel memory",
- (unsigned long) page_address(virt_to_page(rodata_end)),
- (unsigned long) page_address(virt_to_page(data_start)));
+ (unsigned long) __va(__pa_symbol(rodata_end)),
+ (unsigned long) __va(__pa_symbol(_sdata)));
}
#endif
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644
index 000000000000..6b563a118891
--- /dev/null
+++ b/arch/x86/mm/mm_internal.h
@@ -0,0 +1,19 @@
+#ifndef __X86_MM_INTERNAL_H
+#define __X86_MM_INTERNAL_H
+
+void *alloc_low_pages(unsigned int num);
+static inline void *alloc_low_page(void)
+{
+ return alloc_low_pages(1);
+}
+
+void early_ioremap_page_table_range_init(void);
+
+unsigned long kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask);
+void zone_sizes_init(void);
+
+extern int after_bootmem;
+
+#endif /* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 2d125be1bae9..8504f3698753 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -193,7 +193,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
static void __init setup_node_data(int nid, u64 start, u64 end)
{
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- bool remapped = false;
u64 nd_pa;
void *nd;
int tnid;
@@ -205,37 +204,28 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
if (end && (end - start) < NODE_MIN_SIZE)
return;
- /* initialize remap allocator before aligning to ZONE_ALIGN */
- init_alloc_remap(nid, start, end);
-
start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
nid, start, end - 1);
/*
- * Allocate node data. Try remap allocator first, node-local
- * memory and then any node. Never allocate in DMA zone.
+ * Allocate node data. Try node-local memory and then any node.
+ * Never allocate in DMA zone.
*/
- nd = alloc_remap(nid, nd_size);
- if (nd) {
- nd_pa = __pa(nd);
- remapped = true;
- } else {
- nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
- if (!nd_pa) {
- pr_err("Cannot find %zu bytes in node %d\n",
- nd_size, nid);
- return;
- }
- nd = __va(nd_pa);
+ nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa) {
+ pr_err("Cannot find %zu bytes in node %d\n",
+ nd_size, nid);
+ return;
}
+ nd = __va(nd_pa);
/* report and initialize */
- printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]%s\n",
- nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
+ printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n",
+ nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
- if (!remapped && tnid != nid)
+ if (tnid != nid)
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
node_data[nid] = nd;
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 534255a36b6b..73a6d7395bd3 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
extern unsigned long highend_pfn, highstart_pfn;
-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
-static void *node_remap_start_vaddr[MAX_NUMNODES];
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-/*
- * Remap memory allocator
- */
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-
-/**
- * alloc_remap - Allocate remapped memory
- * @nid: NUMA node to allocate memory from
- * @size: The size of allocation
- *
- * Allocate @size bytes from the remap area of NUMA node @nid. The
- * size of the remap area is predetermined by init_alloc_remap() and
- * only the callers considered there should call this function. For
- * more info, please read the comment on top of init_alloc_remap().
- *
- * The caller must be ready to handle allocation failure from this
- * function and fall back to regular memory allocator in such cases.
- *
- * CONTEXT:
- * Single CPU early boot context.
- *
- * RETURNS:
- * Pointer to the allocated memory on success, %NULL on failure.
- */
-void *alloc_remap(int nid, unsigned long size)
-{
- void *allocation = node_remap_alloc_vaddr[nid];
-
- size = ALIGN(size, L1_CACHE_BYTES);
-
- if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
- return NULL;
-
- node_remap_alloc_vaddr[nid] += size;
- memset(allocation, 0, size);
-
- return allocation;
-}
-
-#ifdef CONFIG_HIBERNATION
-/**
- * resume_map_numa_kva - add KVA mapping to the temporary page tables created
- * during resume from hibernation
- * @pgd_base - temporary resume page directory
- */
-void resume_map_numa_kva(pgd_t *pgd_base)
-{
- int node;
-
- for_each_online_node(node) {
- unsigned long start_va, start_pfn, nr_pages, pfn;
-
- start_va = (unsigned long)node_remap_start_vaddr[node];
- start_pfn = node_remap_start_pfn[node];
- nr_pages = (node_remap_end_vaddr[node] -
- node_remap_start_vaddr[node]) >> PAGE_SHIFT;
-
- printk(KERN_DEBUG "%s: node %d\n", __func__, node);
-
- for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
- unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
- pgd_t *pgd = pgd_base + pgd_index(vaddr);
- pud_t *pud = pud_offset(pgd, vaddr);
- pmd_t *pmd = pmd_offset(pud, vaddr);
-
- set_pmd(pmd, pfn_pmd(start_pfn + pfn,
- PAGE_KERNEL_LARGE_EXEC));
-
- printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
- __func__, vaddr, start_pfn + pfn);
- }
- }
-}
-#endif
-
-/**
- * init_alloc_remap - Initialize remap allocator for a NUMA node
- * @nid: NUMA node to initizlie remap allocator for
- *
- * NUMA nodes may end up without any lowmem. As allocating pgdat and
- * memmap on a different node with lowmem is inefficient, a special
- * remap allocator is implemented which can be used by alloc_remap().
- *
- * For each node, the amount of memory which will be necessary for
- * pgdat and memmap is calculated and two memory areas of the size are
- * allocated - one in the node and the other in lowmem; then, the area
- * in the node is remapped to the lowmem area.
- *
- * As pgdat and memmap must be allocated in lowmem anyway, this
- * doesn't waste lowmem address space; however, the actual lowmem
- * which gets remapped over is wasted. The amount shouldn't be
- * problematic on machines this feature will be used.
- *
- * Initialization failure isn't fatal. alloc_remap() is used
- * opportunistically and the callers will fall back to other memory
- * allocation mechanisms on failure.
- */
-void __init init_alloc_remap(int nid, u64 start, u64 end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long end_pfn = end >> PAGE_SHIFT;
- unsigned long size, pfn;
- u64 node_pa, remap_pa;
- void *remap_va;
-
- /*
- * The acpi/srat node info can show hot-add memroy zones where
- * memory could be added but not currently present.
- */
- printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
- nid, start_pfn, end_pfn);
-
- /* calculate the necessary space aligned to large page size */
- size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
- size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
- size = ALIGN(size, LARGE_PAGE_BYTES);
-
- /* allocate node memory and the lowmem remap area */
- node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
- if (!node_pa) {
- pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
- size, nid);
- return;
- }
- memblock_reserve(node_pa, size);
-
- remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
- max_low_pfn << PAGE_SHIFT,
- size, LARGE_PAGE_BYTES);
- if (!remap_pa) {
- pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
- size, nid);
- memblock_free(node_pa, size);
- return;
- }
- memblock_reserve(remap_pa, size);
- remap_va = phys_to_virt(remap_pa);
-
- /* perform actual remap */
- for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
- set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
- (node_pa >> PAGE_SHIFT) + pfn,
- PAGE_KERNEL_LARGE);
-
- /* initialize remap allocator parameters */
- node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
- node_remap_start_vaddr[nid] = remap_va;
- node_remap_end_vaddr[nid] = remap_va + size;
- node_remap_alloc_vaddr[nid] = remap_va;
-
- printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
- nid, node_pa, node_pa + size, remap_va, remap_va + size);
-}
-
void __init initmem_init(void)
{
x86_numa_init();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 92e27119ee1a..9405ffc91502 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -10,16 +10,3 @@ void __init initmem_init(void)
{
x86_numa_init();
}
-
-unsigned long __init numa_free_all_bootmem(void)
-{
- unsigned long pages = 0;
- int i;
-
- for_each_online_node(i)
- pages += free_all_bootmem_node(NODE_DATA(i));
-
- pages += free_low_memory_core_early(MAX_NUMNODES);
-
- return pages;
-}
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 7178c3afe05e..ad86ec91e640 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -21,12 +21,6 @@ void __init numa_reset_distance(void);
void __init x86_numa_init(void);
-#ifdef CONFIG_X86_64
-static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
-#else
-void __init init_alloc_remap(int nid, u64 start, u64 end);
-#endif
-
#ifdef CONFIG_NUMA_EMU
void __init numa_emulation(struct numa_meminfo *numa_meminfo,
int numa_dist_cnt);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a718e0d23503..a1b1c88f9caf 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -94,12 +94,12 @@ static inline void split_page_count(int level) { }
static inline unsigned long highmap_start_pfn(void)
{
- return __pa(_text) >> PAGE_SHIFT;
+ return __pa_symbol(_text) >> PAGE_SHIFT;
}
static inline unsigned long highmap_end_pfn(void)
{
- return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
+ return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}
#endif
@@ -276,8 +276,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
* The .rodata section needs to be read-only. Using the pfn
* catches all aliases.
*/
- if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
- __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
+ if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
+ __pa_symbol(__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
@@ -364,6 +364,37 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
EXPORT_SYMBOL_GPL(lookup_address);
/*
+ * This is necessary because __pa() does not work on some
+ * kinds of memory, like vmalloc() or the alloc_remap()
+ * areas on 32-bit NUMA systems. The percpu areas can
+ * end up in this kind of memory, for instance.
+ *
+ * This could be optimized, but it is only intended to be
+ * used at inititalization time, and keeping it
+ * unoptimized should increase the testing coverage for
+ * the more obscure platforms.
+ */
+phys_addr_t slow_virt_to_phys(void *__virt_addr)
+{
+ unsigned long virt_addr = (unsigned long)__virt_addr;
+ phys_addr_t phys_addr;
+ unsigned long offset;
+ enum pg_level level;
+ unsigned long psize;
+ unsigned long pmask;
+ pte_t *pte;
+
+ pte = lookup_address(virt_addr, &level);
+ BUG_ON(!pte);
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+ offset = virt_addr & ~pmask;
+ phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
+ return (phys_addr | offset);
+}
+EXPORT_SYMBOL_GPL(slow_virt_to_phys);
+
+/*
* Set the new pmd in all the pgds we know about:
*/
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
@@ -396,7 +427,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
pte_t new_pte, old_pte, *tmp;
pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
- unsigned int level;
+ enum pg_level level;
if (cpa->force_split)
return 1;
@@ -412,15 +443,12 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
switch (level) {
case PG_LEVEL_2M:
- psize = PMD_PAGE_SIZE;
- pmask = PMD_PAGE_MASK;
- break;
#ifdef CONFIG_X86_64
case PG_LEVEL_1G:
- psize = PUD_PAGE_SIZE;
- pmask = PUD_PAGE_MASK;
- break;
#endif
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+ break;
default:
do_split = -EINVAL;
goto out_unlock;
@@ -551,16 +579,10 @@ static int split_large_page(pte_t *kpte, unsigned long address)
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
- if (address >= (unsigned long)__va(0) &&
- address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+ if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
+ PFN_DOWN(__pa(address)) + 1))
split_page_count(level);
-#ifdef CONFIG_X86_64
- if (address >= (unsigned long)__va(1UL<<32) &&
- address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
- split_page_count(level);
-#endif
-
/*
* Install the new, split up pagetable.
*
@@ -729,13 +751,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
unsigned long vaddr;
int ret;
- if (cpa->pfn >= max_pfn_mapped)
+ if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
return 0;
-#ifdef CONFIG_X86_64
- if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
- return 0;
-#endif
/*
* No need to redo, when the primary call touched the direct
* mapping already:
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 0eb572eda406..2610bd93c896 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -560,10 +560,10 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
{
unsigned long id_sz;
- if (base >= __pa(high_memory))
+ if (base > __pa(high_memory-1))
return 0;
- id_sz = (__pa(high_memory) < base + size) ?
+ id_sz = (__pa(high_memory-1) <= base + size) ?
__pa(high_memory) - base :
size;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e27fbf887f3b..193350b51f90 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -334,7 +334,12 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
if (changed && dirty) {
*pmdp = entry;
pmd_update_defer(vma->vm_mm, address, pmdp);
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ /*
+ * We had a write-protection fault here and changed the pmd
+ * to to more permissive. No need to flush the TLB for that,
+ * #PF is architecturally guaranteed to do that and in the
+ * worst-case we'll generate a spurious fault.
+ */
}
return changed;
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index d2e2735327b4..e666cbbb9261 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,3 +1,4 @@
+#include <linux/bootmem.h>
#include <linux/mmdebug.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -8,33 +9,54 @@
#ifdef CONFIG_X86_64
+#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
- x += phys_base;
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
} else {
- VIRTUAL_BUG_ON(x < PAGE_OFFSET);
- x -= PAGE_OFFSET;
- VIRTUAL_BUG_ON(!phys_addr_valid(x));
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
}
+
return x;
}
EXPORT_SYMBOL(__phys_addr);
+unsigned long __phys_addr_symbol(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* only check upper bounds since lower bounds will trigger carry */
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+
+ return y + phys_base;
+}
+EXPORT_SYMBOL(__phys_addr_symbol);
+#endif
+
bool __virt_addr_valid(unsigned long x)
{
- if (x >= __START_KERNEL_map) {
- x -= __START_KERNEL_map;
- if (x >= KERNEL_IMAGE_SIZE)
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ if (y >= KERNEL_IMAGE_SIZE)
return false;
- x += phys_base;
} else {
- if (x < PAGE_OFFSET)
- return false;
- x -= PAGE_OFFSET;
- if (!phys_addr_valid(x))
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ if ((x > y) || !phys_addr_valid(x))
return false;
}
@@ -47,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid);
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
+ unsigned long phys_addr = x - PAGE_OFFSET;
/* VMALLOC_* aren't constants */
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
- return x - PAGE_OFFSET;
+ /* max_low_pfn is set early, but not _that_ early */
+ if (max_low_pfn) {
+ VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
+ BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
+ }
+ return phys_addr;
}
EXPORT_SYMBOL(__phys_addr);
#endif
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 928bf837040a..70b2a3a305d6 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -416,8 +416,8 @@ void __init efi_reserve_boot_services(void)
* - Not within any part of the kernel
* - Not the bios reserved area
*/
- if ((start+size >= virt_to_phys(_text)
- && start <= virt_to_phys(_end)) ||
+ if ((start+size >= __pa_symbol(_text)
+ && start <= __pa_symbol(_end)) ||
!e820_all_mapped(start, start+size, E820_RAM) ||
memblock_is_region_reserved(start, size)) {
/* Could not reserve, skip it */
@@ -843,7 +843,7 @@ void __init efi_enter_virtual_mode(void)
efi_memory_desc_t *md, *prev_md = NULL;
efi_status_t status;
unsigned long size;
- u64 end, systab, end_pfn;
+ u64 end, systab, start_pfn, end_pfn;
void *p, *va, *new_memmap = NULL;
int count = 0;
@@ -896,10 +896,9 @@ void __init efi_enter_virtual_mode(void)
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
+ start_pfn = PFN_DOWN(md->phys_addr);
end_pfn = PFN_UP(end);
- if (end_pfn <= max_low_pfn_mapped
- || (end_pfn > (1UL << (32 - PAGE_SHIFT))
- && end_pfn <= max_pfn_mapped)) {
+ if (pfn_range_is_mapped(start_pfn, end_pfn)) {
va = __va(md->phys_addr);
if (!(md->attribute & EFI_MEMORY_WB))
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 74202c1910cd..7d28c885d238 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -129,8 +129,6 @@ static int resume_physical_mapping_init(pgd_t *pgd_base)
}
}
- resume_map_numa_kva(pgd_base);
-
return 0;
}
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 460f314d13e5..a0fde91c16cf 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -11,6 +11,8 @@
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/suspend.h>
+
+#include <asm/init.h>
#include <asm/proto.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -39,41 +41,21 @@ pgd_t *temp_level4_pgt;
void *relocated_restore_code;
-static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static void *alloc_pgt_page(void *context)
{
- long i, j;
-
- i = pud_index(address);
- pud = pud + i;
- for (; i < PTRS_PER_PUD; pud++, i++) {
- unsigned long paddr;
- pmd_t *pmd;
-
- paddr = address + i*PUD_SIZE;
- if (paddr >= end)
- break;
-
- pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
- if (!pmd)
- return -ENOMEM;
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
- for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
- unsigned long pe;
-
- if (paddr >= end)
- break;
- pe = __PAGE_KERNEL_LARGE_EXEC | paddr;
- pe &= __supported_pte_mask;
- set_pmd(pmd, __pmd(pe));
- }
- }
- return 0;
+ return (void *)get_safe_page(GFP_ATOMIC);
}
static int set_up_temporary_mappings(void)
{
- unsigned long start, end, next;
- int error;
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernel_mapping = true,
+ };
+ unsigned long mstart, mend;
+ int result;
+ int i;
temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
if (!temp_level4_pgt)
@@ -84,21 +66,17 @@ static int set_up_temporary_mappings(void)
init_level4_pgt[pgd_index(__START_KERNEL_map)]);
/* Set up the direct mapping from scratch */
- start = (unsigned long)pfn_to_kaddr(0);
- end = (unsigned long)pfn_to_kaddr(max_pfn);
-
- for (; start < end; start = next) {
- pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
- if (!pud)
- return -ENOMEM;
- next = start + PGDIR_SIZE;
- if (next > end)
- next = end;
- if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
- return error;
- set_pgd(temp_level4_pgt + pgd_index(start),
- mk_kernel_pgd(__pa(pud)));
+ for (i = 0; i < nr_pfn_mapped; i++) {
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+
+ result = kernel_ident_mapping_init(&info, temp_level4_pgt,
+ mstart, mend);
+
+ if (result)
+ return result;
}
+
return 0;
}
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index cbca565af5bd..a44f457e70a1 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -8,9 +8,26 @@
struct real_mode_header *real_mode_header;
u32 *trampoline_cr4_features;
-void __init setup_real_mode(void)
+void __init reserve_real_mode(void)
{
phys_addr_t mem;
+ unsigned char *base;
+ size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+
+ /* Has to be under 1M so we can execute real-mode AP code. */
+ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+ if (!mem)
+ panic("Cannot allocate trampoline\n");
+
+ base = __va(mem);
+ memblock_reserve(mem, size);
+ real_mode_header = (struct real_mode_header *) base;
+ printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
+ base, (unsigned long long)mem, size);
+}
+
+void __init setup_real_mode(void)
+{
u16 real_mode_seg;
u32 *rel;
u32 count;
@@ -25,16 +42,7 @@ void __init setup_real_mode(void)
u64 efer;
#endif
- /* Has to be in very low memory so we can execute real-mode AP code. */
- mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
- if (!mem)
- panic("Cannot allocate trampoline\n");
-
- base = __va(mem);
- memblock_reserve(mem, size);
- real_mode_header = (struct real_mode_header *) base;
- printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
- base, (unsigned long long)mem, size);
+ base = (unsigned char *)real_mode_header;
memcpy(base, real_mode_blob, size);
@@ -62,9 +70,9 @@ void __init setup_real_mode(void)
__va(real_mode_header->trampoline_header);
#ifdef CONFIG_X86_32
- trampoline_header->start = __pa(startup_32_smp);
+ trampoline_header->start = __pa_symbol(startup_32_smp);
trampoline_header->gdt_limit = __BOOT_DS + 7;
- trampoline_header->gdt_base = __pa(boot_gdt);
+ trampoline_header->gdt_base = __pa_symbol(boot_gdt);
#else
/*
* Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR
@@ -78,16 +86,18 @@ void __init setup_real_mode(void)
*trampoline_cr4_features = read_cr4();
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
- trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE;
- trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE;
+ trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
+ trampoline_pgd[511] = init_level4_pgt[511].pgd;
#endif
}
/*
- * set_real_mode_permissions() gets called very early, to guarantee the
- * availability of low memory. This is before the proper kernel page
+ * reserve_real_mode() gets called very early, to guarantee the
+ * availability of low memory. This is before the proper kernel page
* tables are set up, so we cannot set page permissions in that
- * function. Thus, we use an arch_initcall instead.
+ * function. Also trampoline code will be executed by APs so we
+ * need to mark it executable at do_pre_smp_initcalls() at least,
+ * thus run it as a early_initcall().
*/
static int __init set_real_mode_permissions(void)
{
@@ -111,5 +121,4 @@ static int __init set_real_mode_permissions(void)
return 0;
}
-
-arch_initcall(set_real_mode_permissions);
+early_initcall(set_real_mode_permissions);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 01de35c77221..f5e86eee4e0e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm)
static void xen_post_allocator_init(void);
-static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
-{
- /* reserve the range used */
- native_pagetable_reserve(start, end);
-
- /* set as RW the rest */
- printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
- PFN_PHYS(pgt_buf_top));
- while (end < PFN_PHYS(pgt_buf_top)) {
- make_lowmem_page_readwrite(__va(end));
- end += PAGE_SIZE;
- }
-}
-
#ifdef CONFIG_X86_64
static void __init xen_cleanhighmap(unsigned long vaddr,
unsigned long vaddr_end)
@@ -1503,19 +1489,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
#else /* CONFIG_X86_64 */
static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{
- unsigned long pfn = pte_pfn(pte);
-
- /*
- * If the new pfn is within the range of the newly allocated
- * kernel pagetable, and it isn't being mapped into an
- * early_ioremap fixmap slot as a freshly allocated page, make sure
- * it is RO.
- */
- if (((!is_early_ioremap_ptep(ptep) &&
- pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
- (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
- pte = pte_wrprotect(pte);
-
return pte;
}
#endif /* CONFIG_X86_64 */
@@ -2197,7 +2170,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
void __init xen_init_mmu_ops(void)
{
- x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
x86_init.paging.pagetable_init = xen_pagetable_init;
pv_mmu_ops = xen_mmu_ops;
OpenPOWER on IntegriCloud