diff options
Diffstat (limited to 'arch/powerpc/platforms/pseries')
28 files changed, 1797 insertions, 852 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index f7b484f55553..24c18362e5ea 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -21,7 +21,6 @@ config PPC_PSERIES select PPC_DOORBELL select HOTPLUG_CPU select ARCH_RANDOM - select PPC_DOORBELL select FORCE_SMP select SWIOTLB default y @@ -108,6 +107,7 @@ config PPC_SMLPAR config CMM tristate "Collaborative memory management" depends on PPC_SMLPAR + select MEMORY_BALLOON default y help Select this option, if you want to enable the kernel interface @@ -145,3 +145,17 @@ config PAPR_SCM tristate "Support for the PAPR Storage Class Memory interface" help Enable access to hypervisor provided storage class memory. + +config PPC_SVM + bool "Secure virtual machine (SVM) support for POWER" + depends on PPC_PSERIES + select SWIOTLB + select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED + help + There are certain POWER platforms which support secure guests using + the Protected Execution Facility, with the help of an Ultravisor + executing below the hypervisor layer. This enables support for + those guests. + + If unsure, say "N". diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index ab3d59aeacca..a3c74a5cf20d 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -26,6 +26,8 @@ obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_PAPR_SCM) += papr_scm.o obj-$(CONFIG_PPC_SPLPAR) += vphn.o +obj-$(CONFIG_PPC_SVM) += svm.o +obj-$(CONFIG_FA_DUMP) += rtas-fadump.o ifdef CONFIG_PPC_PSERIES obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index b33251d75927..9dba7e880885 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -19,6 +19,10 @@ #include <linux/stringify.h> #include <linux/swap.h> #include <linux/device.h> +#include <linux/mount.h> +#include <linux/pseudo_fs.h> +#include <linux/magic.h> +#include <linux/balloon_compaction.h> #include <asm/firmware.h> #include <asm/hvcall.h> #include <asm/mmu.h> @@ -38,12 +42,8 @@ #define CMM_MIN_MEM_MB 256 #define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10)) #define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) -/* - * The priority level tries to ensure that this notifier is called as - * late as possible to reduce thrashing in the shared memory pool. - */ + #define CMM_MEM_HOTPLUG_PRI 1 -#define CMM_MEM_ISOLATE_PRI 15 static unsigned int delay = CMM_DEFAULT_DELAY; static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY; @@ -51,6 +51,8 @@ static unsigned int oom_kb = CMM_OOM_KB; static unsigned int cmm_debug = CMM_DEBUG; static unsigned int cmm_disabled = CMM_DISABLE; static unsigned long min_mem_mb = CMM_MIN_MEM_MB; +static bool __read_mostly simulate; +static unsigned long simulate_loan_target_kb; static struct device cmm_dev; MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>"); @@ -74,35 +76,31 @@ MODULE_PARM_DESC(min_mem_mb, "Minimum amount of memory (in MB) to not balloon. " module_param_named(debug, cmm_debug, uint, 0644); MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. " "[Default=" __stringify(CMM_DEBUG) "]"); - -#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long)) +module_param_named(simulate, simulate, bool, 0444); +MODULE_PARM_DESC(simulate, "Enable simulation mode (no communication with hw)."); #define cmm_dbg(...) if (cmm_debug) { printk(KERN_INFO "cmm: "__VA_ARGS__); } -struct cmm_page_array { - struct cmm_page_array *next; - unsigned long index; - unsigned long page[CMM_NR_PAGES]; -}; - -static unsigned long loaned_pages; +static atomic_long_t loaned_pages; static unsigned long loaned_pages_target; static unsigned long oom_freed_pages; -static struct cmm_page_array *cmm_page_list; -static DEFINE_SPINLOCK(cmm_lock); - static DEFINE_MUTEX(hotplug_mutex); static int hotplug_occurred; /* protected by the hotplug mutex */ static struct task_struct *cmm_thread_ptr; +static struct balloon_dev_info b_dev_info; -static long plpar_page_set_loaned(unsigned long vpa) +static long plpar_page_set_loaned(struct page *page) { + const unsigned long vpa = page_to_phys(page); unsigned long cmo_page_sz = cmo_get_page_size(); long rc = 0; int i; + if (unlikely(simulate)) + return 0; + for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz) rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa + i, 0); @@ -113,12 +111,16 @@ static long plpar_page_set_loaned(unsigned long vpa) return rc; } -static long plpar_page_set_active(unsigned long vpa) +static long plpar_page_set_active(struct page *page) { + const unsigned long vpa = page_to_phys(page); unsigned long cmo_page_sz = cmo_get_page_size(); long rc = 0; int i; + if (unlikely(simulate)) + return 0; + for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz) rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa + i, 0); @@ -138,8 +140,7 @@ static long plpar_page_set_active(unsigned long vpa) **/ static long cmm_alloc_pages(long nr) { - struct cmm_page_array *pa, *npa; - unsigned long addr; + struct page *page; long rc; cmm_dbg("Begin request for %ld pages\n", nr); @@ -156,46 +157,19 @@ static long cmm_alloc_pages(long nr) break; } - addr = __get_free_page(GFP_NOIO | __GFP_NOWARN | - __GFP_NORETRY | __GFP_NOMEMALLOC); - if (!addr) + page = balloon_page_alloc(); + if (!page) break; - spin_lock(&cmm_lock); - pa = cmm_page_list; - if (!pa || pa->index >= CMM_NR_PAGES) { - /* Need a new page for the page list. */ - spin_unlock(&cmm_lock); - npa = (struct cmm_page_array *)__get_free_page( - GFP_NOIO | __GFP_NOWARN | - __GFP_NORETRY | __GFP_NOMEMALLOC); - if (!npa) { - pr_info("%s: Can not allocate new page list\n", __func__); - free_page(addr); - break; - } - spin_lock(&cmm_lock); - pa = cmm_page_list; - - if (!pa || pa->index >= CMM_NR_PAGES) { - npa->next = pa; - npa->index = 0; - pa = npa; - cmm_page_list = pa; - } else - free_page((unsigned long) npa); - } - - if ((rc = plpar_page_set_loaned(__pa(addr)))) { + rc = plpar_page_set_loaned(page); + if (rc) { pr_err("%s: Can not set page to loaned. rc=%ld\n", __func__, rc); - spin_unlock(&cmm_lock); - free_page(addr); + __free_page(page); break; } - pa->page[pa->index++] = addr; - loaned_pages++; - totalram_pages_dec(); - spin_unlock(&cmm_lock); + balloon_page_enqueue(&b_dev_info, page); + atomic_long_inc(&loaned_pages); + adjust_managed_page_count(page, -1); nr--; } @@ -212,30 +186,19 @@ static long cmm_alloc_pages(long nr) **/ static long cmm_free_pages(long nr) { - struct cmm_page_array *pa; - unsigned long addr; + struct page *page; cmm_dbg("Begin free of %ld pages.\n", nr); - spin_lock(&cmm_lock); - pa = cmm_page_list; while (nr) { - if (!pa || pa->index <= 0) + page = balloon_page_dequeue(&b_dev_info); + if (!page) break; - addr = pa->page[--pa->index]; - - if (pa->index == 0) { - pa = pa->next; - free_page((unsigned long) cmm_page_list); - cmm_page_list = pa; - } - - plpar_page_set_active(__pa(addr)); - free_page(addr); - loaned_pages--; + plpar_page_set_active(page); + adjust_managed_page_count(page, 1); + __free_page(page); + atomic_long_dec(&loaned_pages); nr--; - totalram_pages_inc(); } - spin_unlock(&cmm_lock); cmm_dbg("End request with %ld pages unfulfilled\n", nr); return nr; } @@ -257,7 +220,7 @@ static int cmm_oom_notify(struct notifier_block *self, cmm_dbg("OOM processing started\n"); nr = cmm_free_pages(nr); - loaned_pages_target = loaned_pages; + loaned_pages_target = atomic_long_read(&loaned_pages); *freed += KB2PAGES(oom_kb) - nr; oom_freed_pages += KB2PAGES(oom_kb) - nr; cmm_dbg("OOM processing complete\n"); @@ -274,19 +237,24 @@ static int cmm_oom_notify(struct notifier_block *self, **/ static void cmm_get_mpp(void) { + const long __loaned_pages = atomic_long_read(&loaned_pages); + const long total_pages = totalram_pages() + __loaned_pages; int rc; struct hvcall_mpp_data mpp_data; signed long active_pages_target, page_loan_request, target; - signed long total_pages = totalram_pages() + loaned_pages; signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE; - rc = h_get_mpp(&mpp_data); - - if (rc != H_SUCCESS) - return; - - page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE); - target = page_loan_request + (signed long)loaned_pages; + if (likely(!simulate)) { + rc = h_get_mpp(&mpp_data); + if (rc != H_SUCCESS) + return; + page_loan_request = div_s64((s64)mpp_data.loan_request, + PAGE_SIZE); + target = page_loan_request + __loaned_pages; + } else { + target = KB2PAGES(simulate_loan_target_kb); + page_loan_request = target - __loaned_pages; + } if (target < 0 || total_pages < min_mem_pages) target = 0; @@ -307,7 +275,7 @@ static void cmm_get_mpp(void) loaned_pages_target = target; cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n", - page_loan_request, loaned_pages, loaned_pages_target, + page_loan_request, __loaned_pages, loaned_pages_target, oom_freed_pages, totalram_pages()); } @@ -325,6 +293,7 @@ static struct notifier_block cmm_oom_nb = { static int cmm_thread(void *dummy) { unsigned long timeleft; + long __loaned_pages; while (1) { timeleft = msleep_interruptible(delay * 1000); @@ -355,11 +324,12 @@ static int cmm_thread(void *dummy) cmm_get_mpp(); - if (loaned_pages_target > loaned_pages) { - if (cmm_alloc_pages(loaned_pages_target - loaned_pages)) - loaned_pages_target = loaned_pages; - } else if (loaned_pages_target < loaned_pages) - cmm_free_pages(loaned_pages - loaned_pages_target); + __loaned_pages = atomic_long_read(&loaned_pages); + if (loaned_pages_target > __loaned_pages) { + if (cmm_alloc_pages(loaned_pages_target - __loaned_pages)) + loaned_pages_target = __loaned_pages; + } else if (loaned_pages_target < __loaned_pages) + cmm_free_pages(__loaned_pages - loaned_pages_target); } return 0; } @@ -373,7 +343,7 @@ static int cmm_thread(void *dummy) } \ static DEVICE_ATTR(name, 0444, show_##name, NULL) -CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages)); +CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(atomic_long_read(&loaned_pages))); CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target)); static ssize_t show_oom_pages(struct device *dev, @@ -406,11 +376,18 @@ static struct device_attribute *cmm_attrs[] = { &dev_attr_oom_freed_kb, }; +static DEVICE_ULONG_ATTR(simulate_loan_target_kb, 0644, + simulate_loan_target_kb); + static struct bus_type cmm_subsys = { .name = "cmm", .dev_name = "cmm", }; +static void cmm_release_device(struct device *dev) +{ +} + /** * cmm_sysfs_register - Register with sysfs * @@ -426,6 +403,7 @@ static int cmm_sysfs_register(struct device *dev) dev->id = 0; dev->bus = &cmm_subsys; + dev->release = cmm_release_device; if ((rc = device_register(dev))) goto subsys_unregister; @@ -435,6 +413,11 @@ static int cmm_sysfs_register(struct device *dev) goto fail; } + if (!simulate) + return 0; + rc = device_create_file(dev, &dev_attr_simulate_loan_target_kb.attr); + if (rc) + goto fail; return 0; fail: @@ -471,7 +454,7 @@ static int cmm_reboot_notifier(struct notifier_block *nb, if (cmm_thread_ptr) kthread_stop(cmm_thread_ptr); cmm_thread_ptr = NULL; - cmm_free_pages(loaned_pages); + cmm_free_pages(atomic_long_read(&loaned_pages)); } return NOTIFY_DONE; } @@ -481,142 +464,6 @@ static struct notifier_block cmm_reboot_nb = { }; /** - * cmm_count_pages - Count the number of pages loaned in a particular range. - * - * @arg: memory_isolate_notify structure with address range and count - * - * Return value: - * 0 on success - **/ -static unsigned long cmm_count_pages(void *arg) -{ - struct memory_isolate_notify *marg = arg; - struct cmm_page_array *pa; - unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn); - unsigned long end = start + (marg->nr_pages << PAGE_SHIFT); - unsigned long idx; - - spin_lock(&cmm_lock); - pa = cmm_page_list; - while (pa) { - if ((unsigned long)pa >= start && (unsigned long)pa < end) - marg->pages_found++; - for (idx = 0; idx < pa->index; idx++) - if (pa->page[idx] >= start && pa->page[idx] < end) - marg->pages_found++; - pa = pa->next; - } - spin_unlock(&cmm_lock); - return 0; -} - -/** - * cmm_memory_isolate_cb - Handle memory isolation notifier calls - * @self: notifier block struct - * @action: action to take - * @arg: struct memory_isolate_notify data for handler - * - * Return value: - * NOTIFY_OK or notifier error based on subfunction return value - **/ -static int cmm_memory_isolate_cb(struct notifier_block *self, - unsigned long action, void *arg) -{ - int ret = 0; - - if (action == MEM_ISOLATE_COUNT) - ret = cmm_count_pages(arg); - - return notifier_from_errno(ret); -} - -static struct notifier_block cmm_mem_isolate_nb = { - .notifier_call = cmm_memory_isolate_cb, - .priority = CMM_MEM_ISOLATE_PRI -}; - -/** - * cmm_mem_going_offline - Unloan pages where memory is to be removed - * @arg: memory_notify structure with page range to be offlined - * - * Return value: - * 0 on success - **/ -static int cmm_mem_going_offline(void *arg) -{ - struct memory_notify *marg = arg; - unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn); - unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT); - struct cmm_page_array *pa_curr, *pa_last, *npa; - unsigned long idx; - unsigned long freed = 0; - - cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n", - start_page, marg->nr_pages); - spin_lock(&cmm_lock); - - /* Search the page list for pages in the range to be offlined */ - pa_last = pa_curr = cmm_page_list; - while (pa_curr) { - for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) { - if ((pa_curr->page[idx] < start_page) || - (pa_curr->page[idx] >= end_page)) - continue; - - plpar_page_set_active(__pa(pa_curr->page[idx])); - free_page(pa_curr->page[idx]); - freed++; - loaned_pages--; - totalram_pages_inc(); - pa_curr->page[idx] = pa_last->page[--pa_last->index]; - if (pa_last->index == 0) { - if (pa_curr == pa_last) - pa_curr = pa_last->next; - pa_last = pa_last->next; - free_page((unsigned long)cmm_page_list); - cmm_page_list = pa_last; - } - } - pa_curr = pa_curr->next; - } - - /* Search for page list structures in the range to be offlined */ - pa_last = NULL; - pa_curr = cmm_page_list; - while (pa_curr) { - if (((unsigned long)pa_curr >= start_page) && - ((unsigned long)pa_curr < end_page)) { - npa = (struct cmm_page_array *)__get_free_page( - GFP_NOIO | __GFP_NOWARN | - __GFP_NORETRY | __GFP_NOMEMALLOC); - if (!npa) { - spin_unlock(&cmm_lock); - cmm_dbg("Failed to allocate memory for list " - "management. Memory hotplug " - "failed.\n"); - return -ENOMEM; - } - memcpy(npa, pa_curr, PAGE_SIZE); - if (pa_curr == cmm_page_list) - cmm_page_list = npa; - if (pa_last) - pa_last->next = npa; - free_page((unsigned long) pa_curr); - freed++; - pa_curr = npa; - } - - pa_last = pa_curr; - pa_curr = pa_curr->next; - } - - spin_unlock(&cmm_lock); - cmm_dbg("Released %ld pages in the search range.\n", freed); - - return 0; -} - -/** * cmm_memory_cb - Handle memory hotplug notifier calls * @self: notifier block struct * @action: action to take @@ -635,7 +482,6 @@ static int cmm_memory_cb(struct notifier_block *self, case MEM_GOING_OFFLINE: mutex_lock(&hotplug_mutex); hotplug_occurred = 1; - ret = cmm_mem_going_offline(arg); break; case MEM_OFFLINE: case MEM_CANCEL_OFFLINE: @@ -656,6 +502,116 @@ static struct notifier_block cmm_mem_nb = { .priority = CMM_MEM_HOTPLUG_PRI }; +#ifdef CONFIG_BALLOON_COMPACTION +static struct vfsmount *balloon_mnt; + +static int cmm_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, PPC_CMM_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type balloon_fs = { + .name = "ppc-cmm", + .init_fs_context = cmm_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int cmm_migratepage(struct balloon_dev_info *b_dev_info, + struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + unsigned long flags; + + /* + * loan/"inflate" the newpage first. + * + * We might race against the cmm_thread who might discover after our + * loan request that another page is to be unloaned. However, once + * the cmm_thread runs again later, this error will automatically + * be corrected. + */ + if (plpar_page_set_loaned(newpage)) { + /* Unlikely, but possible. Tell the caller not to retry now. */ + pr_err_ratelimited("%s: Cannot set page to loaned.", __func__); + return -EBUSY; + } + + /* balloon page list reference */ + get_page(newpage); + + /* + * When we migrate a page to a different zone, we have to fixup the + * count of both involved zones as we adjusted the managed page count + * when inflating. + */ + if (page_zone(page) != page_zone(newpage)) { + adjust_managed_page_count(page, 1); + adjust_managed_page_count(newpage, -1); + } + + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + balloon_page_insert(b_dev_info, newpage); + balloon_page_delete(page); + b_dev_info->isolated_pages--; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + + /* + * activate/"deflate" the old page. We ignore any errors just like the + * other callers. + */ + plpar_page_set_active(page); + + /* balloon page list reference */ + put_page(page); + + return MIGRATEPAGE_SUCCESS; +} + +static int cmm_balloon_compaction_init(void) +{ + int rc; + + balloon_devinfo_init(&b_dev_info); + b_dev_info.migratepage = cmm_migratepage; + + balloon_mnt = kern_mount(&balloon_fs); + if (IS_ERR(balloon_mnt)) { + rc = PTR_ERR(balloon_mnt); + balloon_mnt = NULL; + return rc; + } + + b_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb); + if (IS_ERR(b_dev_info.inode)) { + rc = PTR_ERR(b_dev_info.inode); + b_dev_info.inode = NULL; + kern_unmount(balloon_mnt); + balloon_mnt = NULL; + return rc; + } + + b_dev_info.inode->i_mapping->a_ops = &balloon_aops; + return 0; +} +static void cmm_balloon_compaction_deinit(void) +{ + if (b_dev_info.inode) + iput(b_dev_info.inode); + b_dev_info.inode = NULL; + kern_unmount(balloon_mnt); + balloon_mnt = NULL; +} +#else /* CONFIG_BALLOON_COMPACTION */ +static int cmm_balloon_compaction_init(void) +{ + return 0; +} + +static void cmm_balloon_compaction_deinit(void) +{ +} +#endif /* CONFIG_BALLOON_COMPACTION */ + /** * cmm_init - Module initialization * @@ -664,26 +620,31 @@ static struct notifier_block cmm_mem_nb = { **/ static int cmm_init(void) { - int rc = -ENOMEM; + int rc; - if (!firmware_has_feature(FW_FEATURE_CMO)) + if (!firmware_has_feature(FW_FEATURE_CMO) && !simulate) return -EOPNOTSUPP; - if ((rc = register_oom_notifier(&cmm_oom_nb)) < 0) + rc = cmm_balloon_compaction_init(); + if (rc) return rc; + rc = register_oom_notifier(&cmm_oom_nb); + if (rc < 0) + goto out_balloon_compaction; + if ((rc = register_reboot_notifier(&cmm_reboot_nb))) goto out_oom_notifier; if ((rc = cmm_sysfs_register(&cmm_dev))) goto out_reboot_notifier; - if (register_memory_notifier(&cmm_mem_nb) || - register_memory_isolate_notifier(&cmm_mem_isolate_nb)) + rc = register_memory_notifier(&cmm_mem_nb); + if (rc) goto out_unregister_notifier; if (cmm_disabled) - return rc; + return 0; cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); if (IS_ERR(cmm_thread_ptr)) { @@ -691,16 +652,16 @@ static int cmm_init(void) goto out_unregister_notifier; } - return rc; - + return 0; out_unregister_notifier: unregister_memory_notifier(&cmm_mem_nb); - unregister_memory_isolate_notifier(&cmm_mem_isolate_nb); cmm_unregister_sysfs(&cmm_dev); out_reboot_notifier: unregister_reboot_notifier(&cmm_reboot_nb); out_oom_notifier: unregister_oom_notifier(&cmm_oom_nb); +out_balloon_compaction: + cmm_balloon_compaction_deinit(); return rc; } @@ -717,9 +678,9 @@ static void cmm_exit(void) unregister_oom_notifier(&cmm_oom_nb); unregister_reboot_notifier(&cmm_reboot_nb); unregister_memory_notifier(&cmm_mem_nb); - unregister_memory_isolate_notifier(&cmm_mem_isolate_nb); - cmm_free_pages(loaned_pages); + cmm_free_pages(atomic_long_read(&loaned_pages)); cmm_unregister_sysfs(&cmm_dev); + cmm_balloon_compaction_deinit(); } /** @@ -739,7 +700,7 @@ static int cmm_set_disable(const char *val, const struct kernel_param *kp) if (cmm_thread_ptr) kthread_stop(cmm_thread_ptr); cmm_thread_ptr = NULL; - cmm_free_pages(loaned_pages); + cmm_free_pages(atomic_long_read(&loaned_pages)); } else if (!disable && cmm_disabled) { cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); if (IS_ERR(cmm_thread_ptr)) diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 2b87480f2837..eab8aa293743 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -19,7 +19,6 @@ struct dtl { struct dtl_entry *buf; - struct dentry *file; int cpu; int buf_entries; u64 last_idx; @@ -320,46 +319,28 @@ static const struct file_operations dtl_fops = { static struct dentry *dtl_dir; -static int dtl_setup_file(struct dtl *dtl) +static void dtl_setup_file(struct dtl *dtl) { char name[10]; sprintf(name, "cpu-%d", dtl->cpu); - dtl->file = debugfs_create_file(name, 0400, dtl_dir, dtl, &dtl_fops); - if (!dtl->file) - return -ENOMEM; - - return 0; + debugfs_create_file(name, 0400, dtl_dir, dtl, &dtl_fops); } static int dtl_init(void) { - struct dentry *event_mask_file, *buf_entries_file; - int rc, i; + int i; if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return -ENODEV; /* set up common debugfs structure */ - rc = -ENOMEM; dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root); - if (!dtl_dir) { - printk(KERN_WARNING "%s: can't create dtl root dir\n", - __func__); - goto err; - } - event_mask_file = debugfs_create_x8("dtl_event_mask", 0600, - dtl_dir, &dtl_event_mask); - buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0400, - dtl_dir, &dtl_buf_entries); - - if (!event_mask_file || !buf_entries_file) { - printk(KERN_WARNING "%s: can't create dtl files\n", __func__); - goto err_remove_dir; - } + debugfs_create_x8("dtl_event_mask", 0600, dtl_dir, &dtl_event_mask); + debugfs_create_u32("dtl_buf_entries", 0400, dtl_dir, &dtl_buf_entries); /* set up the per-cpu log structures */ for_each_possible_cpu(i) { @@ -367,16 +348,9 @@ static int dtl_init(void) spin_lock_init(&dtl->lock); dtl->cpu = i; - rc = dtl_setup_file(dtl); - if (rc) - goto err_remove_dir; + dtl_setup_file(dtl); } return 0; - -err_remove_dir: - debugfs_remove_recursive(dtl_dir); -err: - return rc; } machine_arch_initcall(pseries, dtl_init); diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 9edae1863e2f..893ba3f562c4 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -42,42 +42,44 @@ static int ibm_get_config_addr_info; static int ibm_get_config_addr_info2; static int ibm_configure_pe; -#ifdef CONFIG_PCI_IOV void pseries_pcibios_bus_add_device(struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); - struct pci_dn *physfn_pdn; - struct eeh_dev *edev; - if (!pdev->is_virtfn) + if (eeh_has_flag(EEH_FORCE_DISABLED)) return; - pdn->device_id = pdev->device; - pdn->vendor_id = pdev->vendor; - pdn->class_code = pdev->class; - /* - * Last allow unfreeze return code used for retrieval - * by user space in eeh-sysfs to show the last command - * completion from platform. - */ - pdn->last_allow_rc = 0; - physfn_pdn = pci_get_pdn(pdev->physfn); - pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index]; - edev = pdn_to_eeh_dev(pdn); + dev_dbg(&pdev->dev, "EEH: Setting up device\n"); +#ifdef CONFIG_PCI_IOV + if (pdev->is_virtfn) { + struct pci_dn *physfn_pdn; - /* - * The following operations will fail if VF's sysfs files - * aren't created or its resources aren't finalized. - */ + pdn->device_id = pdev->device; + pdn->vendor_id = pdev->vendor; + pdn->class_code = pdev->class; + /* + * Last allow unfreeze return code used for retrieval + * by user space in eeh-sysfs to show the last command + * completion from platform. + */ + pdn->last_allow_rc = 0; + physfn_pdn = pci_get_pdn(pdev->physfn); + pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index]; + } +#endif eeh_add_device_early(pdn); eeh_add_device_late(pdev); - edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8); - eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */ - eeh_add_to_parent_pe(edev); /* Add as VF PE type */ - eeh_sysfs_add_device(pdev); +#ifdef CONFIG_PCI_IOV + if (pdev->is_virtfn) { + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); -} + edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8); + eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */ + eeh_add_to_parent_pe(edev); /* Add as VF PE type */ + } #endif + eeh_sysfs_add_device(pdev); +} /* * Buffer for reporting slot-error-detail rtas calls. Its here @@ -144,10 +146,8 @@ static int pseries_eeh_init(void) /* Set EEH probe mode */ eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); -#ifdef CONFIG_PCI_IOV /* Set EEH machine dependent code */ ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device; -#endif return 0; } @@ -251,6 +251,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + eeh_edev_dbg(edev, "Probing device\n"); + /* * Update class code and mode of eeh device. We need * correctly reflects that current device is root port @@ -280,8 +282,11 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8); /* Enable EEH on the device */ + eeh_edev_dbg(edev, "Enabling EEH on device\n"); ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); - if (!ret) { + if (ret) { + eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret); + } else { /* Retrieve PE address */ edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); pe.addr = edev->pe_config_addr; @@ -297,11 +302,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) if (enable) { eeh_add_flag(EEH_ENABLED); eeh_add_to_parent_pe(edev); - - pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n", - __func__, pdn->busno, PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), pe.phb->global_number, - pe.addr); } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) && (pdn_to_eeh_dev(pdn->parent))->pe) { /* This device doesn't support EEH, but it may have an @@ -310,6 +310,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr; eeh_add_to_parent_pe(edev); } + eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n", + (enable ? "enabled" : "unsupported"), ret); } /* Save memory bars */ diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index d4a8f1702417..3e49cc23a97a 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -22,6 +22,7 @@ #include <asm/firmware.h> #include <asm/prom.h> #include <asm/udbg.h> +#include <asm/svm.h> #include "pseries.h" @@ -55,7 +56,8 @@ hypertas_fw_features_table[] = { {FW_FEATURE_LLAN, "hcall-lLAN"}, {FW_FEATURE_BULK_REMOVE, "hcall-bulk"}, {FW_FEATURE_XDABR, "hcall-xdabr"}, - {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, + {FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE, + "hcall-multi-tce"}, {FW_FEATURE_SPLPAR, "hcall-splpar"}, {FW_FEATURE_VPHN, "hcall-vphn"}, {FW_FEATURE_SET_MODE, "hcall-set-mode"}, @@ -100,6 +102,12 @@ static void __init fw_hypertas_feature_init(const char *hypertas, } } + if (is_secure_guest() && + (powerpc_firmware_features & FW_FEATURE_PUT_TCE_IND)) { + powerpc_firmware_features &= ~FW_FEATURE_PUT_TCE_IND; + pr_debug("SVM: disabling PUT_TCE_IND firmware feature\n"); + } + pr_debug(" <- fw_hypertas_feature_init()\n"); } diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index bbda646b63b5..3e8cbfe7a80f 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -338,6 +338,62 @@ static void pseries_remove_processor(struct device_node *np) cpu_maps_update_done(); } +static int dlpar_offline_cpu(struct device_node *dn) +{ + int rc = 0; + unsigned int cpu; + int len, nthreads, i; + const __be32 *intserv; + u32 thread; + + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return -EINVAL; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + thread = be32_to_cpu(intserv[i]); + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != thread) + continue; + + if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE) + break; + + if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) { + set_preferred_offline_state(cpu, + CPU_STATE_OFFLINE); + cpu_maps_update_done(); + timed_topology_update(1); + rc = device_offline(get_cpu_device(cpu)); + if (rc) + goto out; + cpu_maps_update_begin(); + break; + } + + /* + * The cpu is in CPU_STATE_INACTIVE. + * Upgrade it's state to CPU_STATE_OFFLINE. + */ + set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); + WARN_ON(plpar_hcall_norets(H_PROD, thread) != H_SUCCESS); + __cpu_die(cpu); + break; + } + if (cpu == num_possible_cpus()) { + pr_warn("Could not find cpu to offline with physical id 0x%x\n", + thread); + } + } + cpu_maps_update_done(); + +out: + return rc; +} + static int dlpar_online_cpu(struct device_node *dn) { int rc = 0; @@ -364,8 +420,10 @@ static int dlpar_online_cpu(struct device_node *dn) timed_topology_update(1); find_and_online_cpu_nid(cpu); rc = device_online(get_cpu_device(cpu)); - if (rc) + if (rc) { + dlpar_offline_cpu(dn); goto out; + } cpu_maps_update_begin(); break; @@ -407,17 +465,67 @@ static bool dlpar_cpu_exists(struct device_node *parent, u32 drc_index) return found; } +static bool drc_info_valid_index(struct device_node *parent, u32 drc_index) +{ + struct property *info; + struct of_drc_info drc; + const __be32 *value; + u32 index; + int count, i, j; + + info = of_find_property(parent, "ibm,drc-info", NULL); + if (!info) + return false; + + value = of_prop_next_u32(info, NULL, &count); + + /* First value of ibm,drc-info is number of drc-info records */ + if (value) + value++; + else + return false; + + for (i = 0; i < count; i++) { + if (of_read_drc_info_cell(&info, &value, &drc)) + return false; + + if (strncmp(drc.drc_type, "CPU", 3)) + break; + + if (drc_index > drc.last_drc_index) + continue; + + index = drc.drc_index_start; + for (j = 0; j < drc.num_sequential_elems; j++) { + if (drc_index == index) + return true; + + index += drc.sequential_inc; + } + } + + return false; +} + static bool valid_cpu_drc_index(struct device_node *parent, u32 drc_index) { bool found = false; int rc, index; - index = 0; + if (of_find_property(parent, "ibm,drc-info", NULL)) + return drc_info_valid_index(parent, drc_index); + + /* Note that the format of the ibm,drc-indexes array is + * the number of entries in the array followed by the array + * of drc values so we start looking at index = 1. + */ + index = 1; while (!found) { u32 drc; rc = of_property_read_u32_index(parent, "ibm,drc-indexes", index++, &drc); + if (rc) break; @@ -505,63 +613,6 @@ static ssize_t dlpar_cpu_add(u32 drc_index) return rc; } -static int dlpar_offline_cpu(struct device_node *dn) -{ - int rc = 0; - unsigned int cpu; - int len, nthreads, i; - const __be32 *intserv; - u32 thread; - - intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); - if (!intserv) - return -EINVAL; - - nthreads = len / sizeof(u32); - - cpu_maps_update_begin(); - for (i = 0; i < nthreads; i++) { - thread = be32_to_cpu(intserv[i]); - for_each_present_cpu(cpu) { - if (get_hard_smp_processor_id(cpu) != thread) - continue; - - if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE) - break; - - if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) { - set_preferred_offline_state(cpu, - CPU_STATE_OFFLINE); - cpu_maps_update_done(); - timed_topology_update(1); - rc = device_offline(get_cpu_device(cpu)); - if (rc) - goto out; - cpu_maps_update_begin(); - break; - - } - - /* - * The cpu is in CPU_STATE_INACTIVE. - * Upgrade it's state to CPU_STATE_OFFLINE. - */ - set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); - BUG_ON(plpar_hcall_norets(H_PROD, thread) - != H_SUCCESS); - __cpu_die(cpu); - break; - } - if (cpu == num_possible_cpus()) - printk(KERN_WARNING "Could not find cpu to offline with physical id 0x%x\n", thread); - } - cpu_maps_update_done(); - -out: - return rc; - -} - static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) { int rc; @@ -717,19 +768,52 @@ static int dlpar_cpu_remove_by_count(u32 cpus_to_remove) return rc; } -static int find_dlpar_cpus_to_add(u32 *cpu_drcs, u32 cpus_to_add) +static int find_drc_info_cpus_to_add(struct device_node *cpus, + struct property *info, + u32 *cpu_drcs, u32 cpus_to_add) { - struct device_node *parent; + struct of_drc_info drc; + const __be32 *value; + u32 count, drc_index; int cpus_found = 0; - int index, rc; + int i, j; - parent = of_find_node_by_path("/cpus"); - if (!parent) { - pr_warn("Could not find CPU root node in device tree\n"); - kfree(cpu_drcs); + if (!info) return -1; + + value = of_prop_next_u32(info, NULL, &count); + if (value) + value++; + + for (i = 0; i < count; i++) { + of_read_drc_info_cell(&info, &value, &drc); + if (strncmp(drc.drc_type, "CPU", 3)) + break; + + drc_index = drc.drc_index_start; + for (j = 0; j < drc.num_sequential_elems; j++) { + if (dlpar_cpu_exists(cpus, drc_index)) + continue; + + cpu_drcs[cpus_found++] = drc_index; + + if (cpus_found == cpus_to_add) + return cpus_found; + + drc_index += drc.sequential_inc; + } } + return cpus_found; +} + +static int find_drc_index_cpus_to_add(struct device_node *cpus, + u32 *cpu_drcs, u32 cpus_to_add) +{ + int cpus_found = 0; + int index, rc; + u32 drc_index; + /* Search the ibm,drc-indexes array for possible CPU drcs to * add. Note that the format of the ibm,drc-indexes array is * the number of entries in the array followed by the array @@ -737,25 +821,25 @@ static int find_dlpar_cpus_to_add(u32 *cpu_drcs, u32 cpus_to_add) */ index = 1; while (cpus_found < cpus_to_add) { - u32 drc; + rc = of_property_read_u32_index(cpus, "ibm,drc-indexes", + index++, &drc_index); - rc = of_property_read_u32_index(parent, "ibm,drc-indexes", - index++, &drc); if (rc) break; - if (dlpar_cpu_exists(parent, drc)) + if (dlpar_cpu_exists(cpus, drc_index)) continue; - cpu_drcs[cpus_found++] = drc; + cpu_drcs[cpus_found++] = drc_index; } - of_node_put(parent); return cpus_found; } static int dlpar_cpu_add_by_count(u32 cpus_to_add) { + struct device_node *parent; + struct property *info; u32 *cpu_drcs; int cpus_added = 0; int cpus_found; @@ -767,7 +851,21 @@ static int dlpar_cpu_add_by_count(u32 cpus_to_add) if (!cpu_drcs) return -EINVAL; - cpus_found = find_dlpar_cpus_to_add(cpu_drcs, cpus_to_add); + parent = of_find_node_by_path("/cpus"); + if (!parent) { + pr_warn("Could not find CPU root node in device tree\n"); + kfree(cpu_drcs); + return -1; + } + + info = of_find_property(parent, "ibm,drc-info", NULL); + if (info) + cpus_found = find_drc_info_cpus_to_add(parent, info, cpu_drcs, cpus_to_add); + else + cpus_found = find_drc_index_cpus_to_add(parent, cpu_drcs, cpus_to_add); + + of_node_put(parent); + if (cpus_found < cpus_to_add) { pr_warn("Failed to find enough CPUs (%d of %d) to add\n", cpus_found, cpus_to_add); diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 46d0d35b9ca4..a4d40a3ceea3 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -338,7 +338,7 @@ static int pseries_remove_mem_node(struct device_node *np) static bool lmb_is_removable(struct drmem_lmb *lmb) { int i, scns_per_block; - int rc = 1; + bool rc = true; unsigned long pfn, block_sz; u64 phys_addr; @@ -360,14 +360,16 @@ static bool lmb_is_removable(struct drmem_lmb *lmb) for (i = 0; i < scns_per_block; i++) { pfn = PFN_DOWN(phys_addr); - if (!pfn_present(pfn)) + if (!pfn_present(pfn)) { + phys_addr += MIN_MEMORY_BLOCK_SIZE; continue; + } - rc &= is_mem_section_removable(pfn, PAGES_PER_SECTION); + rc = rc && is_mem_section_removable(pfn, PAGES_PER_SECTION); phys_addr += MIN_MEMORY_BLOCK_SIZE; } - return rc ? true : false; + return rc; } static int dlpar_add_lmb(struct drmem_lmb *); @@ -880,34 +882,44 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) switch (hp_elog->action) { case PSERIES_HP_ELOG_ACTION_ADD: - if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) { + switch (hp_elog->id_type) { + case PSERIES_HP_ELOG_ID_DRC_COUNT: count = hp_elog->_drc_u.drc_count; rc = dlpar_memory_add_by_count(count); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { + break; + case PSERIES_HP_ELOG_ID_DRC_INDEX: drc_index = hp_elog->_drc_u.drc_index; rc = dlpar_memory_add_by_index(drc_index); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) { + break; + case PSERIES_HP_ELOG_ID_DRC_IC: count = hp_elog->_drc_u.ic.count; drc_index = hp_elog->_drc_u.ic.index; rc = dlpar_memory_add_by_ic(count, drc_index); - } else { + break; + default: rc = -EINVAL; + break; } break; case PSERIES_HP_ELOG_ACTION_REMOVE: - if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) { + switch (hp_elog->id_type) { + case PSERIES_HP_ELOG_ID_DRC_COUNT: count = hp_elog->_drc_u.drc_count; rc = dlpar_memory_remove_by_count(count); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { + break; + case PSERIES_HP_ELOG_ID_DRC_INDEX: drc_index = hp_elog->_drc_u.drc_index; rc = dlpar_memory_remove_by_index(drc_index); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) { + break; + case PSERIES_HP_ELOG_ID_DRC_IC: count = hp_elog->_drc_u.ic.count; drc_index = hp_elog->_drc_u.ic.index; rc = dlpar_memory_remove_by_ic(count, drc_index); - } else { + break; + default: rc = -EINVAL; + break; } break; diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index bcc1b67417a8..c40c62ec432e 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -129,7 +129,6 @@ static void probe_hcall_exit(void *ignored, unsigned long opcode, long retval, static int __init hcall_inst_init(void) { struct dentry *hcall_root; - struct dentry *hcall_file; char cpu_name_buf[CPU_NAME_BUF_SIZE]; int cpu; @@ -145,17 +144,12 @@ static int __init hcall_inst_init(void) } hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL); - if (!hcall_root) - return -ENOMEM; for_each_possible_cpu(cpu) { snprintf(cpu_name_buf, CPU_NAME_BUF_SIZE, "cpu%d", cpu); - hcall_file = debugfs_create_file(cpu_name_buf, 0444, - hcall_root, - per_cpu(hcall_stats, cpu), - &hcall_inst_seq_fops); - if (!hcall_file) - return -ENOMEM; + debugfs_create_file(cpu_name_buf, 0444, hcall_root, + per_cpu(hcall_stats, cpu), + &hcall_inst_seq_fops); } return 0; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 889dc2e44b89..2e0a8eab5588 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -132,10 +132,10 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) return be64_to_cpu(*tcep); } -static void tce_free_pSeriesLP(struct iommu_table*, long, long); +static void tce_free_pSeriesLP(unsigned long liobn, long, long); static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); -static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, +static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, long npages, unsigned long uaddr, enum dma_data_direction direction, unsigned long attrs) @@ -146,25 +146,25 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, int ret = 0; long tcenum_start = tcenum, npages_start = npages; - rpn = __pa(uaddr) >> TCE_SHIFT; + rpn = __pa(uaddr) >> tceshift; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; while (npages--) { - tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; - rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce); + tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift; + rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; - tce_free_pSeriesLP(tbl, tcenum_start, + tce_free_pSeriesLP(liobn, tcenum_start, (npages_start - (npages + 1))); break; } if (rc && printk_ratelimit()) { printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); - printk("\tindex = 0x%llx\n", (u64)tbl->it_index); + printk("\tindex = 0x%llx\n", (u64)liobn); printk("\ttcenum = 0x%llx\n", (u64)tcenum); printk("\ttce val = 0x%llx\n", tce ); dump_stack(); @@ -192,8 +192,9 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, int ret = 0; unsigned long flags; - if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) { - return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { + return tce_build_pSeriesLP(tbl->it_index, tcenum, + tbl->it_page_shift, npages, uaddr, direction, attrs); } @@ -209,8 +210,9 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, /* If allocation fails, fall back to the loop implementation */ if (!tcep) { local_irq_restore(flags); - return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, - direction, attrs); + return tce_build_pSeriesLP(tbl->it_index, tcenum, + tbl->it_page_shift, + npages, uaddr, direction, attrs); } __this_cpu_write(tce_page, tcep); } @@ -261,16 +263,16 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } -static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) +static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long npages) { u64 rc; while (npages--) { - rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0); + rc = plpar_tce_put((u64)liobn, (u64)tcenum << 12, 0); if (rc && printk_ratelimit()) { printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); - printk("\tindex = 0x%llx\n", (u64)tbl->it_index); + printk("\tindex = 0x%llx\n", (u64)liobn); printk("\ttcenum = 0x%llx\n", (u64)tcenum); dump_stack(); } @@ -284,8 +286,8 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n { u64 rc; - if (!firmware_has_feature(FW_FEATURE_MULTITCE)) - return tce_free_pSeriesLP(tbl, tcenum, npages); + if (!firmware_has_feature(FW_FEATURE_STUFF_TCE)) + return tce_free_pSeriesLP(tbl->it_index, tcenum, npages); rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages); @@ -400,6 +402,19 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, u64 rc = 0; long l, limit; + if (!firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { + unsigned long tceshift = be32_to_cpu(maprange->tce_shift); + unsigned long dmastart = (start_pfn << PAGE_SHIFT) + + be64_to_cpu(maprange->dma_base); + unsigned long tcenum = dmastart >> tceshift; + unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift; + void *uaddr = __va(start_pfn << PAGE_SHIFT); + + return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn), + tcenum, tceshift, npages, (unsigned long) uaddr, + DMA_BIDIRECTIONAL, 0); + } + local_irq_disable(); /* to protect tcep and the page behind it */ tcep = __this_cpu_read(tce_page); @@ -609,7 +624,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) iommu_table_setparms(pci->phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - iommu_init_table(tbl, pci->phb->node); + iommu_init_table(tbl, pci->phb->node, 0, 0); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -621,7 +636,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) #ifdef CONFIG_IOMMU_API static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned - long *tce, enum dma_data_direction *direction) + long *tce, enum dma_data_direction *direction, + bool realmode) { long rc; unsigned long ioba = (unsigned long) index << tbl->it_page_shift; @@ -649,7 +665,7 @@ static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned struct iommu_table_ops iommu_table_lpar_multi_ops = { .set = tce_buildmulti_pSeriesLP, #ifdef CONFIG_IOMMU_API - .exchange = tce_exchange_pseries, + .xchg_no_kill = tce_exchange_pseries, #endif .clear = tce_freemulti_pSeriesLP, .get = tce_get_pSeriesLP @@ -690,7 +706,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) iommu_table_setparms_lpar(ppci->phb, pdn, tbl, ppci->table_group, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - iommu_init_table(tbl, ppci->phb->node); + iommu_init_table(tbl, ppci->phb->node, 0, 0); iommu_register_group(ppci->table_group, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->table_group); @@ -719,7 +735,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) tbl = PCI_DN(dn)->table_group->tables[0]; iommu_table_setparms(phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - iommu_init_table(tbl, phb->node); + iommu_init_table(tbl, phb->node, 0, 0); set_iommu_table_base(&dev->dev, tbl); return; } @@ -1169,7 +1185,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) iommu_table_setparms_lpar(pci->phb, pdn, tbl, pci->table_group, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - iommu_init_table(tbl, pci->phb->node); + iommu_init_table(tbl, pci->phb->node, 0, 0); iommu_register_group(pci->table_group, pci_domain_nr(pci->phb->bus), 0); pr_debug(" created table: %p\n", pci->table_group); @@ -1325,9 +1341,11 @@ static int __init disable_multitce(char *str) { if (strcmp(str, "off") == 0 && firmware_has_feature(FW_FEATURE_LPAR) && - firmware_has_feature(FW_FEATURE_MULTITCE)) { + (firmware_has_feature(FW_FEATURE_PUT_TCE_IND) || + firmware_has_feature(FW_FEATURE_STUFF_TCE))) { printk(KERN_INFO "Disabling MULTITCE firmware feature\n"); - powerpc_firmware_features &= ~FW_FEATURE_MULTITCE; + powerpc_firmware_features &= + ~(FW_FEATURE_PUT_TCE_IND | FW_FEATURE_STUFF_TCE); } return 1; } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 09bb878c21e0..3c3da25b445c 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -56,6 +56,22 @@ EXPORT_SYMBOL(plpar_hcall); EXPORT_SYMBOL(plpar_hcall9); EXPORT_SYMBOL(plpar_hcall_norets); +/* + * H_BLOCK_REMOVE supported block size for this page size in segment who's base + * page size is that page size. + * + * The first index is the segment base page size, the second one is the actual + * page size. + */ +static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init; + +/* + * Due to the involved complexity, and that the current hypervisor is only + * returning this value or 0, we are limiting the support of the H_BLOCK_REMOVE + * buffer size to 8 size block. + */ +#define HBLKRM_SUPPORTED_BLOCK_SIZE 8 + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static u8 dtl_mask = DTL_LOG_PREEMPT; #else @@ -566,12 +582,12 @@ static int vcpudispatch_stats_open(struct inode *inode, struct file *file) return single_open(file, vcpudispatch_stats_display, NULL); } -static const struct file_operations vcpudispatch_stats_proc_ops = { - .open = vcpudispatch_stats_open, - .read = seq_read, - .write = vcpudispatch_stats_write, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops vcpudispatch_stats_proc_ops = { + .proc_open = vcpudispatch_stats_open, + .proc_read = seq_read, + .proc_write = vcpudispatch_stats_write, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; static ssize_t vcpudispatch_stats_freq_write(struct file *file, @@ -610,12 +626,12 @@ static int vcpudispatch_stats_freq_open(struct inode *inode, struct file *file) return single_open(file, vcpudispatch_stats_freq_display, NULL); } -static const struct file_operations vcpudispatch_stats_freq_proc_ops = { - .open = vcpudispatch_stats_freq_open, - .read = seq_read, - .write = vcpudispatch_stats_freq_write, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops vcpudispatch_stats_freq_proc_ops = { + .proc_open = vcpudispatch_stats_freq_open, + .proc_read = seq_read, + .proc_write = vcpudispatch_stats_freq_write, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; static int __init vcpudispatch_stats_procfs_init(void) @@ -758,7 +774,7 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group) /* don't remove a bolted entry */ lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset, - (0x1UL << 4), &dummy1, &dummy2); + HPTE_V_BOLTED, &dummy1, &dummy2); if (lpar_rc == H_SUCCESS) return i; @@ -922,11 +938,19 @@ static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize) hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); want_v = hpte_encode_avpn(vpn, psize, ssize); - /* Bolted entries are always in the primary group */ + /* + * We try to keep bolted entries always in primary hash + * But in some case we can find them in secondary too. + */ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot = __pSeries_lpar_hpte_find(want_v, hpte_group); - if (slot < 0) - return -1; + if (slot < 0) { + /* Try in secondary */ + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = __pSeries_lpar_hpte_find(want_v, hpte_group); + if (slot < 0) + return -1; + } return hpte_group + slot; } @@ -984,6 +1008,17 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, #define HBLKR_CTRL_ERRNOTFOUND 0x8800000000000000UL #define HBLKR_CTRL_ERRBUSY 0xa000000000000000UL +/* + * Returned true if we are supporting this block size for the specified segment + * base page size and actual page size. + * + * Currently, we only support 8 size block. + */ +static inline bool is_supported_hlbkrm(int bpsize, int psize) +{ + return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE); +} + /** * H_BLOCK_REMOVE caller. * @idx should point to the latest @param entry set with a PTEX. @@ -1143,7 +1178,8 @@ static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot, if (lock_tlbie) spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); - if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) + /* Assuming THP size is 16M */ + if (is_supported_hlbkrm(psize, MMU_PAGE_16M)) hugepage_block_invalidate(slot, vpn, count, psize, ssize); else hugepage_bulk_invalidate(slot, vpn, count, psize, ssize); @@ -1312,6 +1348,140 @@ static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch, } /* + * TLB Block Invalidate Characteristics + * + * These characteristics define the size of the block the hcall H_BLOCK_REMOVE + * is able to process for each couple segment base page size, actual page size. + * + * The ibm,get-system-parameter properties is returning a buffer with the + * following layout: + * + * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ] + * ----------------- + * TLB Block Invalidate Specifiers: + * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ] + * [ 1 byte Number of page sizes (N) that are supported for the specified + * TLB invalidate block size ] + * [ 1 byte Encoded segment base page size and actual page size + * MSB=0 means 4k segment base page size and actual page size + * MSB=1 the penc value in mmu_psize_def ] + * ... + * ----------------- + * Next TLB Block Invalidate Specifiers... + * ----------------- + * [ 0 ] + */ +static inline void set_hblkrm_bloc_size(int bpsize, int psize, + unsigned int block_size) +{ + if (block_size > hblkrm_size[bpsize][psize]) + hblkrm_size[bpsize][psize] = block_size; +} + +/* + * Decode the Encoded segment base page size and actual page size. + * PAPR specifies: + * - bit 7 is the L bit + * - bits 0-5 are the penc value + * If the L bit is 0, this means 4K segment base page size and actual page size + * otherwise the penc value should be read. + */ +#define HBLKRM_L_MASK 0x80 +#define HBLKRM_PENC_MASK 0x3f +static inline void __init check_lp_set_hblkrm(unsigned int lp, + unsigned int block_size) +{ + unsigned int bpsize, psize; + + /* First, check the L bit, if not set, this means 4K */ + if ((lp & HBLKRM_L_MASK) == 0) { + set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size); + return; + } + + lp &= HBLKRM_PENC_MASK; + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) { + struct mmu_psize_def *def = &mmu_psize_defs[bpsize]; + + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + if (def->penc[psize] == lp) { + set_hblkrm_bloc_size(bpsize, psize, block_size); + return; + } + } + } +} + +#define SPLPAR_TLB_BIC_TOKEN 50 + +/* + * The size of the TLB Block Invalidate Characteristics is variable. But at the + * maximum it will be the number of possible page sizes *2 + 10 bytes. + * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size + * (128 bytes) for the buffer to get plenty of space. + */ +#define SPLPAR_TLB_BIC_MAXLENGTH 128 + +void __init pseries_lpar_read_hblkrm_characteristics(void) +{ + unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH]; + int call_status, len, idx, bpsize; + + if (!firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) + return; + + spin_lock(&rtas_data_buf_lock); + memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE); + call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + SPLPAR_TLB_BIC_TOKEN, + __pa(rtas_data_buf), + RTAS_DATA_BUF_SIZE); + memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH); + local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0'; + spin_unlock(&rtas_data_buf_lock); + + if (call_status != 0) { + pr_warn("%s %s Error calling get-system-parameter (0x%x)\n", + __FILE__, __func__, call_status); + return; + } + + /* + * The first two (2) bytes of the data in the buffer are the length of + * the returned data, not counting these first two (2) bytes. + */ + len = be16_to_cpu(*((u16 *)local_buffer)) + 2; + if (len > SPLPAR_TLB_BIC_MAXLENGTH) { + pr_warn("%s too large returned buffer %d", __func__, len); + return; + } + + idx = 2; + while (idx < len) { + u8 block_shift = local_buffer[idx++]; + u32 block_size; + unsigned int npsize; + + if (!block_shift) + break; + + block_size = 1 << block_shift; + + for (npsize = local_buffer[idx++]; + npsize > 0 && idx < len; npsize--) + check_lp_set_hblkrm((unsigned int) local_buffer[idx++], + block_size); + } + + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) + for (idx = 0; idx < MMU_PAGE_COUNT; idx++) + if (hblkrm_size[bpsize][idx]) + pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d", + bpsize, idx, hblkrm_size[bpsize][idx]); +} + +/* * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie * lock. */ @@ -1330,7 +1500,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) if (lock_tlbie) spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); - if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) { + if (is_supported_hlbkrm(batch->psize, batch->psize)) { do_block_remove(number, batch, param); goto out; } @@ -1413,7 +1583,10 @@ static int pseries_lpar_resize_hpt_commit(void *data) return 0; } -/* Must be called in user context */ +/* + * Must be called in process context. The caller must hold the + * cpus_lock. + */ static int pseries_lpar_resize_hpt(unsigned long shift) { struct hpt_resize_state state = { @@ -1467,7 +1640,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift) t1 = ktime_get(); - rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL); + rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit, + &state, NULL); t2 = ktime_get(); @@ -1527,16 +1701,24 @@ void __init hpte_init_pseries(void) mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range; mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all; mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; - register_process_table = pseries_lpar_register_process_table; if (firmware_has_feature(FW_FEATURE_HPT_RESIZE)) mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt; + + /* + * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall + * to inform the hypervisor that we wish to use the HPT. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pseries_lpar_register_process_table(0, 0, 0); } void radix_init_pseries(void) { pr_info("Using radix MMU under hypervisor\n"); - register_process_table = pseries_lpar_register_process_table; + + pseries_lpar_register_process_table(__pa(process_tb), + 0, PRTB_SIZE_SHIFT - 12); } #ifdef CONFIG_PPC_SMLPAR @@ -1818,30 +2000,17 @@ static int __init vpa_debugfs_init(void) { char name[16]; long i; - static struct dentry *vpa_dir; + struct dentry *vpa_dir; if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return 0; vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root); - if (!vpa_dir) { - pr_warn("%s: can't create vpa root dir\n", __func__); - return -ENOMEM; - } /* set up the per-cpu vpa file*/ for_each_possible_cpu(i) { - struct dentry *d; - sprintf(name, "cpu-%ld", i); - - d = debugfs_create_file(name, 0400, vpa_dir, (void *)i, - &vpa_fops); - if (!d) { - pr_warn("%s: can't create per-cpu vpa file\n", - __func__); - return -ENOMEM; - } + debugfs_create_file(name, 0400, vpa_dir, (void *)i, &vpa_fops); } return 0; diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index e33e8bc4b69b..b8d28ab88178 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -435,10 +435,10 @@ static void maxmem_data(struct seq_file *m) { unsigned long maxmem = 0; - maxmem += drmem_info->n_lmbs * drmem_info->lmb_size; + maxmem += (unsigned long)drmem_info->n_lmbs * drmem_info->lmb_size; maxmem += hugetlb_total_pages() * PAGE_SIZE; - seq_printf(m, "MaxMem=%ld\n", maxmem); + seq_printf(m, "MaxMem=%lu\n", maxmem); } static int pseries_lparcfg_data(struct seq_file *m, void *v) @@ -698,12 +698,12 @@ static int lparcfg_open(struct inode *inode, struct file *file) return single_open(file, lparcfg_data, NULL); } -static const struct file_operations lparcfg_fops = { - .read = seq_read, - .write = lparcfg_write, - .open = lparcfg_open, - .release = single_release, - .llseek = seq_lseek, +static const struct proc_ops lparcfg_proc_ops = { + .proc_read = seq_read, + .proc_write = lparcfg_write, + .proc_open = lparcfg_open, + .proc_release = single_release, + .proc_lseek = seq_lseek, }; static int __init lparcfg_init(void) @@ -714,7 +714,7 @@ static int __init lparcfg_init(void) if (firmware_has_feature(FW_FEATURE_SPLPAR)) mode |= 0200; - if (!proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops)) { + if (!proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_proc_ops)) { printk(KERN_ERR "Failed to create powerpc/lparcfg\n"); return -EIO; } diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index fe812bebdf5e..b571285f6c14 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -9,6 +9,7 @@ #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/kobject.h> +#include <linux/sched.h> #include <linux/smp.h> #include <linux/stat.h> #include <linux/completion.h> @@ -207,7 +208,11 @@ static int update_dt_node(__be32 phandle, s32 scope) prop_data += vd; } + + cond_resched(); } + + cond_resched(); } while (rtas_rc == 1); of_node_put(dn); @@ -310,8 +315,12 @@ int pseries_devicetree_update(s32 scope) add_dt_node(phandle, drc_index); break; } + + cond_resched(); } } + + cond_resched(); } while (rc == 1); kfree(rtas_buf); diff --git a/arch/powerpc/platforms/pseries/of_helpers.c b/arch/powerpc/platforms/pseries/of_helpers.c index 6df192f38f80..66dfd8256712 100644 --- a/arch/powerpc/platforms/pseries/of_helpers.c +++ b/arch/powerpc/platforms/pseries/of_helpers.c @@ -45,14 +45,14 @@ struct device_node *pseries_of_derive_parent(const char *path) int of_read_drc_info_cell(struct property **prop, const __be32 **curval, struct of_drc_info *data) { - const char *p; + const char *p = (char *)(*curval); const __be32 *p2; if (!data) return -EINVAL; /* Get drc-type:encode-string */ - p = data->drc_type = (char*) (*curval); + data->drc_type = (char *)p; p = of_prop_next_string(*prop, p); if (!p) return -EINVAL; @@ -65,9 +65,7 @@ int of_read_drc_info_cell(struct property **prop, const __be32 **curval, /* Get drc-index-start:encode-int */ p2 = (const __be32 *)p; - p2 = of_prop_next_u32(*prop, p2, &data->drc_index_start); - if (!p2) - return -EINVAL; + data->drc_index_start = be32_to_cpu(*p2); /* Get drc-name-suffix-start:encode-int */ p2 = of_prop_next_u32(*prop, p2, &data->drc_name_suffix_start); diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index a5ac371a3f06..0b4467e378e5 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -65,29 +65,22 @@ static int drc_pmem_bind(struct papr_scm_priv *p) cond_resched(); } while (rc == H_BUSY); - if (rc) { - /* H_OVERLAP needs a separate error path */ - if (rc == H_OVERLAP) - return -EBUSY; - - dev_err(&p->pdev->dev, "bind err: %lld\n", rc); - return -ENXIO; - } + if (rc) + return rc; p->bound_addr = saved; - - dev_dbg(&p->pdev->dev, "bound drc %x to %pR\n", p->drc_index, &p->res); - - return 0; + dev_dbg(&p->pdev->dev, "bound drc 0x%x to 0x%lx\n", + p->drc_index, (unsigned long)saved); + return rc; } -static int drc_pmem_unbind(struct papr_scm_priv *p) +static void drc_pmem_unbind(struct papr_scm_priv *p) { unsigned long ret[PLPAR_HCALL_BUFSIZE]; uint64_t token = 0; int64_t rc; - dev_dbg(&p->pdev->dev, "unbind drc %x\n", p->drc_index); + dev_dbg(&p->pdev->dev, "unbind drc 0x%x\n", p->drc_index); /* NB: unbind has the same retry requirements as drc_pmem_bind() */ do { @@ -110,12 +103,48 @@ static int drc_pmem_unbind(struct papr_scm_priv *p) if (rc) dev_err(&p->pdev->dev, "unbind error: %lld\n", rc); else - dev_dbg(&p->pdev->dev, "unbind drc %x complete\n", + dev_dbg(&p->pdev->dev, "unbind drc 0x%x complete\n", p->drc_index); - return rc == H_SUCCESS ? 0 : -ENXIO; + return; +} + +static int drc_pmem_query_n_bind(struct papr_scm_priv *p) +{ + unsigned long start_addr; + unsigned long end_addr; + unsigned long ret[PLPAR_HCALL_BUFSIZE]; + int64_t rc; + + + rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret, + p->drc_index, 0); + if (rc) + goto err_out; + start_addr = ret[0]; + + /* Make sure the full region is bound. */ + rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret, + p->drc_index, p->blocks - 1); + if (rc) + goto err_out; + end_addr = ret[0]; + + if ((end_addr - start_addr) != ((p->blocks - 1) * p->block_size)) + goto err_out; + + p->bound_addr = start_addr; + dev_dbg(&p->pdev->dev, "bound drc 0x%x to 0x%lx\n", p->drc_index, start_addr); + return rc; + +err_out: + dev_info(&p->pdev->dev, + "Failed to query, trying an unbind followed by bind"); + drc_pmem_unbind(p); + return drc_pmem_bind(p); } + static int papr_scm_meta_get(struct papr_scm_priv *p, struct nd_cmd_get_config_data_hdr *hdr) { @@ -124,7 +153,7 @@ static int papr_scm_meta_get(struct papr_scm_priv *p, int len, read; int64_t ret; - if ((hdr->in_offset + hdr->in_length) >= p->metadata_size) + if ((hdr->in_offset + hdr->in_length) > p->metadata_size) return -EINVAL; for (len = hdr->in_length; len; len -= read) { @@ -178,7 +207,7 @@ static int papr_scm_meta_set(struct papr_scm_priv *p, __be64 data_be; int64_t ret; - if ((hdr->in_offset + hdr->in_length) >= p->metadata_size) + if ((hdr->in_offset + hdr->in_length) > p->metadata_size) return -EINVAL; for (len = hdr->in_length; len; len -= wrote) { @@ -256,25 +285,6 @@ int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, return 0; } -static const struct attribute_group *region_attr_groups[] = { - &nd_region_attribute_group, - &nd_device_attribute_group, - &nd_mapping_attribute_group, - &nd_numa_attribute_group, - NULL, -}; - -static const struct attribute_group *bus_attr_groups[] = { - &nvdimm_bus_attribute_group, - NULL, -}; - -static const struct attribute_group *papr_scm_dimm_groups[] = { - &nvdimm_attribute_group, - &nd_device_attribute_group, - NULL, -}; - static inline int papr_scm_node(int node) { int min_dist = INT_MAX, dist; @@ -305,7 +315,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) p->bus_desc.ndctl = papr_scm_ndctl; p->bus_desc.module = THIS_MODULE; p->bus_desc.of_node = p->pdev->dev.of_node; - p->bus_desc.attr_groups = bus_attr_groups; p->bus_desc.provider_name = kstrdup(p->pdev->name, GFP_KERNEL); if (!p->bus_desc.provider_name) @@ -314,14 +323,15 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) p->bus = nvdimm_bus_register(NULL, &p->bus_desc); if (!p->bus) { dev_err(dev, "Error creating nvdimm bus %pOF\n", p->dn); + kfree(p->bus_desc.provider_name); return -ENXIO; } dimm_flags = 0; set_bit(NDD_ALIASING, &dimm_flags); - p->nvdimm = nvdimm_create(p->bus, p, papr_scm_dimm_groups, - dimm_flags, PAPR_SCM_DIMM_CMD_MASK, 0, NULL); + p->nvdimm = nvdimm_create(p->bus, p, NULL, dimm_flags, + PAPR_SCM_DIMM_CMD_MASK, 0, NULL); if (!p->nvdimm) { dev_err(dev, "Error creating DIMM object for %pOF\n", p->dn); goto err; @@ -338,7 +348,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) mapping.size = p->blocks * p->block_size; // XXX: potential overflow? memset(&ndr_desc, 0, sizeof(ndr_desc)); - ndr_desc.attr_groups = region_attr_groups; target_nid = dev_to_node(&p->pdev->dev); online_nid = papr_scm_node(target_nid); ndr_desc.numa_node = online_nid; @@ -349,7 +358,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) ndr_desc.mapping = &mapping; ndr_desc.num_mappings = 1; ndr_desc.nd_set = &p->nd_set; - set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); if (p->is_volatile) p->region = nvdimm_volatile_region_create(p->bus, &ndr_desc); @@ -436,14 +444,14 @@ static int papr_scm_probe(struct platform_device *pdev) rc = drc_pmem_bind(p); /* If phyp says drc memory still bound then force unbound and retry */ - if (rc == -EBUSY) { - dev_warn(&pdev->dev, "Retrying bind after unbinding\n"); - drc_pmem_unbind(p); - rc = drc_pmem_bind(p); - } + if (rc == H_OVERLAP) + rc = drc_pmem_query_n_bind(p); - if (rc) + if (rc != H_SUCCESS) { + dev_err(&p->pdev->dev, "bind err: %d\n", rc); + rc = -ENXIO; goto err; + } /* setup the resource for the newly bound range */ p->res.start = p->bound_addr; @@ -470,6 +478,7 @@ static int papr_scm_remove(struct platform_device *pdev) nvdimm_bus_unregister(p->bus); drc_pmem_unbind(p); + kfree(p->bus_desc.provider_name); kfree(p); return 0; @@ -485,7 +494,6 @@ static struct platform_driver papr_scm_driver = { .remove = papr_scm_remove, .driver = { .name = "papr_scm", - .owner = THIS_MODULE, .of_match_table = papr_scm_match, }, }; diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 1eae1d09980c..911534b89c85 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -192,7 +192,7 @@ int pseries_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) int pseries_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) { /* Allocate PCI data */ - add_dev_pci_data(pdev); + add_sriov_vf_pdns(pdev); return pseries_pci_sriov_enable(pdev, num_vfs); } @@ -204,7 +204,7 @@ int pseries_pcibios_sriov_disable(struct pci_dev *pdev) /* Releasing pe_num_map */ kfree(pdn->pe_num_map); /* Release PCI data */ - remove_dev_pci_data(pdev); + remove_sriov_vf_pdns(pdev); pci_vf_drivers_autoprobe(pdev, true); return 0; } @@ -229,8 +229,7 @@ void __init pSeries_final_fixup(void) pSeries_request_regions(); - eeh_probe_devices(); - eeh_addr_cache_build(); + eeh_show_enabled(); #ifdef CONFIG_PCI_IOV ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable; diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 561917fa54a8..361986e4354e 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PCI Dynamic LPAR, PCI Hot Plug and PCI EEH recovery code * for RPA-compliant PPC64 platform. @@ -6,23 +7,6 @@ * * Updates, 2005, John Rose <johnrose@austin.ibm.com> * Updates, 2005, Linas Vepstas <linas@austin.ibm.com> - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/pci.h> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index a6624d4bd9d0..13fa370a87e4 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -112,5 +112,6 @@ static inline unsigned long cmo_get_page_size(void) int dlpar_workqueue_init(void); void pseries_setup_rfi_flush(void); +void pseries_lpar_read_hblkrm_characteristics(void); #endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c index a96874f9492f..09e98d301db0 100644 --- a/arch/powerpc/platforms/pseries/pseries_energy.c +++ b/arch/powerpc/platforms/pseries/pseries_energy.c @@ -36,6 +36,7 @@ static int sysfs_entries; static u32 cpu_to_drc_index(int cpu) { struct device_node *dn = NULL; + struct property *info; int thread_index; int rc = 1; u32 ret = 0; @@ -47,20 +48,18 @@ static u32 cpu_to_drc_index(int cpu) /* Convert logical cpu number to core number */ thread_index = cpu_core_index_of_thread(cpu); - if (firmware_has_feature(FW_FEATURE_DRC_INFO)) { - struct property *info = NULL; + info = of_find_property(dn, "ibm,drc-info", NULL); + if (info) { struct of_drc_info drc; int j; u32 num_set_entries; const __be32 *value; - info = of_find_property(dn, "ibm,drc-info", NULL); - if (info == NULL) - goto err_of_node_put; - value = of_prop_next_u32(info, NULL, &num_set_entries); if (!value) goto err_of_node_put; + else + value++; for (j = 0; j < num_set_entries; j++) { @@ -110,6 +109,7 @@ err: static int drc_index_to_cpu(u32 drc_index) { struct device_node *dn = NULL; + struct property *info; const int *indexes; int thread_index = 0, cpu = 0; int rc = 1; @@ -117,21 +117,18 @@ static int drc_index_to_cpu(u32 drc_index) dn = of_find_node_by_path("/cpus"); if (dn == NULL) goto err; - - if (firmware_has_feature(FW_FEATURE_DRC_INFO)) { - struct property *info = NULL; + info = of_find_property(dn, "ibm,drc-info", NULL); + if (info) { struct of_drc_info drc; int j; u32 num_set_entries; const __be32 *value; - info = of_find_property(dn, "ibm,drc-info", NULL); - if (info == NULL) - goto err_of_node_put; - value = of_prop_next_u32(info, NULL, &num_set_entries); if (!value) goto err_of_node_put; + else + value++; for (j = 0; j < num_set_entries; j++) { diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f16fdd0f71f7..1d7f973c647b 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -76,6 +76,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_UE 0x00 #define MC_ERROR_TYPE_SLB 0x01 #define MC_ERROR_TYPE_ERAT 0x02 +#define MC_ERROR_TYPE_UNKNOWN 0x03 #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 @@ -87,6 +88,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_UE_LOAD_STORE 3 #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 +#define UE_EFFECTIVE_ADDR_PROVIDED 0x40 +#define UE_LOGICAL_ADDR_PROVIDED 0x20 + #define MC_ERROR_SLB_PARITY 0 #define MC_ERROR_SLB_MULTIHIT 1 #define MC_ERROR_SLB_INDETERMINATE 2 @@ -113,27 +117,6 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) } } -static -inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog) -{ - __be64 addr = 0; - - switch (mlog->error_type) { - case MC_ERROR_TYPE_UE: - if (mlog->sub_err_type & 0x40) - addr = mlog->effective_address; - break; - case MC_ERROR_TYPE_SLB: - case MC_ERROR_TYPE_ERAT: - case MC_ERROR_TYPE_TLB: - if (mlog->sub_err_type & 0x80) - addr = mlog->effective_address; - default: - break; - } - return be64_to_cpu(addr); -} - /* * Enable the hotplug interrupt late because processing them may touch other * devices or systems (e.g. hugepages) that have not been initialized at the @@ -272,7 +255,7 @@ static void rtas_parse_epow_errlog(struct rtas_error_log *log) break; case EPOW_SYSTEM_SHUTDOWN: - handle_system_shutdown(epow_log->event_modifier); + handle_system_shutdown(modifier); break; case EPOW_SYSTEM_HALT: @@ -511,160 +494,165 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } -#define VAL_TO_STRING(ar, val) \ - (((val) < ARRAY_SIZE(ar)) ? ar[(val)] : "Unknown") -static void pseries_print_mce_info(struct pt_regs *regs, - struct rtas_error_log *errp) +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) { - const char *level, *sevstr; + struct mce_error_info mce_err = { 0 }; + unsigned long eaddr = 0, paddr = 0; struct pseries_errorlog *pseries_log; struct pseries_mc_errorlog *mce_log; - u8 error_type, err_sub_type; - u64 addr; - u8 initiator = rtas_error_initiator(errp); int disposition = rtas_error_disposition(errp); + int initiator = rtas_error_initiator(errp); + int severity = rtas_error_severity(errp); + u8 error_type, err_sub_type; - static const char * const initiators[] = { - [0] = "Unknown", - [1] = "CPU", - [2] = "PCI", - [3] = "ISA", - [4] = "Memory", - [5] = "Power Mgmt", - }; - static const char * const mc_err_types[] = { - [0] = "UE", - [1] = "SLB", - [2] = "ERAT", - [3] = "Unknown", - [4] = "TLB", - [5] = "D-Cache", - [6] = "Unknown", - [7] = "I-Cache", - }; - static const char * const mc_ue_types[] = { - [0] = "Indeterminate", - [1] = "Instruction fetch", - [2] = "Page table walk ifetch", - [3] = "Load/Store", - [4] = "Page table walk Load/Store", - }; - - /* SLB sub errors valid values are 0x0, 0x1, 0x2 */ - static const char * const mc_slb_types[] = { - [0] = "Parity", - [1] = "Multihit", - [2] = "Indeterminate", - }; - - /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */ - static const char * const mc_soft_types[] = { - [0] = "Unknown", - [1] = "Parity", - [2] = "Multihit", - [3] = "Indeterminate", - }; - - if (!rtas_error_extended(errp)) { - pr_err("Machine check interrupt: Missing extended error log\n"); - return; - } + if (initiator == RTAS_INITIATOR_UNKNOWN) + mce_err.initiator = MCE_INITIATOR_UNKNOWN; + else if (initiator == RTAS_INITIATOR_CPU) + mce_err.initiator = MCE_INITIATOR_CPU; + else if (initiator == RTAS_INITIATOR_PCI) + mce_err.initiator = MCE_INITIATOR_PCI; + else if (initiator == RTAS_INITIATOR_ISA) + mce_err.initiator = MCE_INITIATOR_ISA; + else if (initiator == RTAS_INITIATOR_MEMORY) + mce_err.initiator = MCE_INITIATOR_MEMORY; + else if (initiator == RTAS_INITIATOR_POWERMGM) + mce_err.initiator = MCE_INITIATOR_POWERMGM; + else + mce_err.initiator = MCE_INITIATOR_UNKNOWN; + + if (severity == RTAS_SEVERITY_NO_ERROR) + mce_err.severity = MCE_SEV_NO_ERROR; + else if (severity == RTAS_SEVERITY_EVENT) + mce_err.severity = MCE_SEV_WARNING; + else if (severity == RTAS_SEVERITY_WARNING) + mce_err.severity = MCE_SEV_WARNING; + else if (severity == RTAS_SEVERITY_ERROR_SYNC) + mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_ERROR) + mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_FATAL) + mce_err.severity = MCE_SEV_FATAL; + else + mce_err.severity = MCE_SEV_FATAL; + + if (severity <= RTAS_SEVERITY_ERROR_SYNC) + mce_err.sync_error = true; + else + mce_err.sync_error = false; + + mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; + mce_err.error_class = MCE_ECLASS_UNKNOWN; + + if (!rtas_error_extended(errp)) + goto out; pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); if (pseries_log == NULL) - return; + goto out; mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - error_type = mce_log->error_type; err_sub_type = rtas_mc_error_sub_type(mce_log); - switch (rtas_error_severity(errp)) { - case RTAS_SEVERITY_NO_ERROR: - level = KERN_INFO; - sevstr = "Harmless"; - break; - case RTAS_SEVERITY_WARNING: - level = KERN_WARNING; - sevstr = ""; - break; - case RTAS_SEVERITY_ERROR: - case RTAS_SEVERITY_ERROR_SYNC: - level = KERN_ERR; - sevstr = "Severe"; - break; - case RTAS_SEVERITY_FATAL: - default: - level = KERN_ERR; - sevstr = "Fatal"; - break; - } + switch (mce_log->error_type) { + case MC_ERROR_TYPE_UE: + mce_err.error_type = MCE_ERROR_TYPE_UE; + switch (err_sub_type) { + case MC_ERROR_UE_IFETCH: + mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH; + break; + case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH: + mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; + break; + case MC_ERROR_UE_LOAD_STORE: + mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; + break; + case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE: + mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; + break; + case MC_ERROR_UE_INDETERMINATE: + default: + mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) + eaddr = be64_to_cpu(mce_log->effective_address); -#ifdef CONFIG_PPC_BOOK3S_64 - /* Display faulty slb contents for SLB errors. */ - if (error_type == MC_ERROR_TYPE_SLB) - slb_dump_contents(local_paca->mce_faulty_slbs); -#endif + if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { + paddr = be64_to_cpu(mce_log->logical_address); + } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { + unsigned long pfn; - printk("%s%s Machine check interrupt [%s]\n", level, sevstr, - disposition == RTAS_DISP_FULLY_RECOVERED ? - "Recovered" : "Not recovered"); - if (user_mode(regs)) { - printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level, - regs->nip, current->pid, current->comm); - } else { - printk("%s NIP [%016lx]: %pS\n", level, regs->nip, - (void *)regs->nip); - } - printk("%s Initiator: %s\n", level, - VAL_TO_STRING(initiators, initiator)); + pfn = addr_to_pfn(regs, eaddr); + if (pfn != ULONG_MAX) + paddr = pfn << PAGE_SHIFT; + } - switch (error_type) { - case MC_ERROR_TYPE_UE: - printk("%s Error type: %s [%s]\n", level, - VAL_TO_STRING(mc_err_types, error_type), - VAL_TO_STRING(mc_ue_types, err_sub_type)); break; case MC_ERROR_TYPE_SLB: - printk("%s Error type: %s [%s]\n", level, - VAL_TO_STRING(mc_err_types, error_type), - VAL_TO_STRING(mc_slb_types, err_sub_type)); + mce_err.error_type = MCE_ERROR_TYPE_SLB; + switch (err_sub_type) { + case MC_ERROR_SLB_PARITY: + mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY; + break; + case MC_ERROR_SLB_MULTIHIT: + mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; + break; + case MC_ERROR_SLB_INDETERMINATE: + default: + mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_ERAT: + mce_err.error_type = MCE_ERROR_TYPE_ERAT; + switch (err_sub_type) { + case MC_ERROR_ERAT_PARITY: + mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY; + break; + case MC_ERROR_ERAT_MULTIHIT: + mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; + break; + case MC_ERROR_ERAT_INDETERMINATE: + default: + mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + break; case MC_ERROR_TYPE_TLB: - printk("%s Error type: %s [%s]\n", level, - VAL_TO_STRING(mc_err_types, error_type), - VAL_TO_STRING(mc_soft_types, err_sub_type)); + mce_err.error_type = MCE_ERROR_TYPE_TLB; + switch (err_sub_type) { + case MC_ERROR_TLB_PARITY: + mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY; + break; + case MC_ERROR_TLB_MULTIHIT: + mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; + break; + case MC_ERROR_TLB_INDETERMINATE: + default: + mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + break; + case MC_ERROR_TYPE_D_CACHE: + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; + case MC_ERROR_TYPE_I_CACHE: + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; + break; + case MC_ERROR_TYPE_UNKNOWN: default: - printk("%s Error type: %s\n", level, - VAL_TO_STRING(mc_err_types, error_type)); + mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; break; } - addr = rtas_mc_get_effective_addr(mce_log); - if (addr) - printk("%s Effective address: %016llx\n", level, addr); -} - -static int mce_handle_error(struct rtas_error_log *errp) -{ - struct pseries_errorlog *pseries_log; - struct pseries_mc_errorlog *mce_log; - int disposition = rtas_error_disposition(errp); - u8 error_type; - - if (!rtas_error_extended(errp)) - goto out; - - pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); - if (pseries_log == NULL) - goto out; - - mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - error_type = mce_log->error_type; - #ifdef CONFIG_PPC_BOOK3S_64 if (disposition == RTAS_DISP_NOT_RECOVERED) { switch (error_type) { @@ -682,98 +670,24 @@ static int mce_handle_error(struct rtas_error_log *errp) slb_save_contents(local_paca->mce_faulty_slbs); flush_and_reload_slb(); disposition = RTAS_DISP_FULLY_RECOVERED; - rtas_set_disposition_recovered(errp); break; default: break; } + } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { + /* Platform corrected itself but could be degraded */ + printk(KERN_ERR "MCE: limited recovery, system may " + "be degraded\n"); + disposition = RTAS_DISP_FULLY_RECOVERED; } #endif out: - return disposition; -} - -#ifdef CONFIG_MEMORY_FAILURE - -static DEFINE_PER_CPU(int, rtas_ue_count); -static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]); + save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED, + &mce_err, regs->nip, eaddr, paddr); -#define UE_EFFECTIVE_ADDR_PROVIDED 0x40 -#define UE_LOGICAL_ADDR_PROVIDED 0x20 - - -static void pseries_hwpoison_work_fn(struct work_struct *work) -{ - unsigned long paddr; - int index; - - while (__this_cpu_read(rtas_ue_count) > 0) { - index = __this_cpu_read(rtas_ue_count) - 1; - paddr = __this_cpu_read(rtas_ue_paddr[index]); - memory_failure(paddr >> PAGE_SHIFT, 0); - __this_cpu_dec(rtas_ue_count); - } -} - -static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn); - -static void queue_ue_paddr(unsigned long paddr) -{ - int index; - - index = __this_cpu_inc_return(rtas_ue_count) - 1; - if (index >= MAX_MC_EVT) { - __this_cpu_dec(rtas_ue_count); - return; - } - this_cpu_write(rtas_ue_paddr[index], paddr); - schedule_work(&hwpoison_work); -} - -static void pseries_do_memory_failure(struct pt_regs *regs, - struct pseries_mc_errorlog *mce_log) -{ - unsigned long paddr; - - if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { - paddr = be64_to_cpu(mce_log->logical_address); - } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { - unsigned long pfn; - - pfn = addr_to_pfn(regs, - be64_to_cpu(mce_log->effective_address)); - if (pfn == ULONG_MAX) - return; - paddr = pfn << PAGE_SHIFT; - } else { - return; - } - queue_ue_paddr(paddr); -} - -static void pseries_process_ue(struct pt_regs *regs, - struct rtas_error_log *errp) -{ - struct pseries_errorlog *pseries_log; - struct pseries_mc_errorlog *mce_log; - - if (!rtas_error_extended(errp)) - return; - - pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); - if (!pseries_log) - return; - - mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - - if (mce_log->error_type == MC_ERROR_TYPE_UE) - pseries_do_memory_failure(regs, mce_log); + return disposition; } -#else -static inline void pseries_process_ue(struct pt_regs *regs, - struct rtas_error_log *errp) { } -#endif /*CONFIG_MEMORY_FAILURE */ /* * Process MCE rtas errlog event. @@ -795,49 +709,51 @@ static void mce_process_errlog_event(struct irq_work *work) * Return 1 if corrected (or delivered a signal). * Return 0 if there is nothing we can do. */ -static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) +static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt) { int recovered = 0; - int disposition = rtas_error_disposition(err); - - pseries_print_mce_info(regs, err); if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; - - } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { + } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { /* Platform corrected itself */ recovered = 1; + } else if (evt->severity == MCE_SEV_FATAL) { + /* Fatal machine check */ + pr_err("Machine check interrupt is fatal\n"); + recovered = 0; + } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - printk(KERN_ERR "MCE: limited recovery, system may " - "be degraded\n"); - recovered = 1; - - } else if (user_mode(regs) && !is_global_init(current) && - rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { - + if (!recovered && evt->sync_error) { /* - * If we received a synchronous error when in userspace - * kill the task. Firmware may report details of the fail - * asynchronously, so we can't rely on the target and type - * fields being valid here. + * Try to kill processes if we get a synchronous machine check + * (e.g., one caused by execution of this instruction). This + * will devolve into a panic if we try to kill init or are in + * an interrupt etc. + * + * TODO: Queue up this address for hwpoisioning later. + * TODO: This is not quite right for d-side machine + * checks ->nip is not necessarily the important + * address. */ - printk(KERN_ERR "MCE: uncorrectable error, killing task " - "%s:%d\n", current->comm, current->pid); - - _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); - recovered = 1; + if ((user_mode(regs))) { + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } else if (die_will_crash()) { + /* + * die() would kill the kernel, so better to go via + * the platform reboot code that will log the + * machine check. + */ + recovered = 0; + } else { + die("Machine check", regs, SIGBUS); + recovered = 1; + } } - pseries_process_ue(regs, err); - - /* Queue irq work to log this rtas event later. */ - irq_work_queue(&mce_errlog_process_work); - return recovered; } @@ -853,14 +769,21 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) */ int pSeries_machine_check_exception(struct pt_regs *regs) { - struct rtas_error_log *errp; + struct machine_check_event evt; - if (fwnmi_active) { - fwnmi_release_errinfo(); - errp = fwnmi_get_errlog(); - if (errp && recover_mce(regs, errp)) - return 1; + if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) + return 0; + + /* Print things out */ + if (evt.version != MCE_V1) { + pr_err("Machine Check Exception, Unknown event version %d !\n", + evt.version); + return 0; } + machine_check_print_event_info(&evt, user_mode(regs), false); + + if (recover_mce(regs, &evt)) + return 1; return 0; } @@ -877,7 +800,12 @@ long pseries_machine_check_realmode(struct pt_regs *regs) * to panic. Hence we will call it as soon as we go into * virtual mode. */ - disposition = mce_handle_error(errp); + disposition = mce_handle_error(regs, errp); + fwnmi_release_errinfo(); + + /* Queue irq work to log this rtas event later. */ + irq_work_queue(&mce_errlog_process_work); + if (disposition == RTAS_DISP_FULLY_RECOVERED) return 1; } diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 8a9c4fb95b8b..7f7369fec46b 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -391,9 +391,9 @@ out: return rv ? rv : count; } -static const struct file_operations ofdt_fops = { - .write = ofdt_write, - .llseek = noop_llseek, +static const struct proc_ops ofdt_proc_ops = { + .proc_write = ofdt_write, + .proc_lseek = noop_llseek, }; /* create /proc/powerpc/ofdt write-only by root */ @@ -401,7 +401,7 @@ static int proc_ppc64_create_ofdt(void) { struct proc_dir_entry *ent; - ent = proc_create("powerpc/ofdt", 0200, NULL, &ofdt_fops); + ent = proc_create("powerpc/ofdt", 0200, NULL, &ofdt_proc_ops); if (ent) proc_set_size(ent, 0); diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c new file mode 100644 index 000000000000..70c3013fdd07 --- /dev/null +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Firmware-Assisted Dump support on POWERVM platform. + * + * Copyright 2011, Mahesh Salgaonkar, IBM Corporation. + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#define pr_fmt(fmt) "rtas fadump: " fmt + +#include <linux/string.h> +#include <linux/memblock.h> +#include <linux/delay.h> +#include <linux/seq_file.h> +#include <linux/crash_dump.h> + +#include <asm/page.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/fadump.h> +#include <asm/fadump-internal.h> + +#include "rtas-fadump.h" + +static struct rtas_fadump_mem_struct fdm; +static const struct rtas_fadump_mem_struct *fdm_active; + +static void rtas_fadump_update_config(struct fw_dump *fadump_conf, + const struct rtas_fadump_mem_struct *fdm) +{ + fadump_conf->boot_mem_dest_addr = + be64_to_cpu(fdm->rmr_region.destination_address); + + fadump_conf->fadumphdr_addr = (fadump_conf->boot_mem_dest_addr + + fadump_conf->boot_memory_size); +} + +/* + * This function is called in the capture kernel to get configuration details + * setup in the first kernel and passed to the f/w. + */ +static void rtas_fadump_get_config(struct fw_dump *fadump_conf, + const struct rtas_fadump_mem_struct *fdm) +{ + fadump_conf->boot_mem_addr[0] = + be64_to_cpu(fdm->rmr_region.source_address); + fadump_conf->boot_mem_sz[0] = be64_to_cpu(fdm->rmr_region.source_len); + fadump_conf->boot_memory_size = fadump_conf->boot_mem_sz[0]; + + fadump_conf->boot_mem_top = fadump_conf->boot_memory_size; + fadump_conf->boot_mem_regs_cnt = 1; + + /* + * Start address of reserve dump area (permanent reservation) for + * re-registering FADump after dump capture. + */ + fadump_conf->reserve_dump_area_start = + be64_to_cpu(fdm->cpu_state_data.destination_address); + + rtas_fadump_update_config(fadump_conf, fdm); +} + +static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) +{ + u64 addr = fadump_conf->reserve_dump_area_start; + + memset(&fdm, 0, sizeof(struct rtas_fadump_mem_struct)); + addr = addr & PAGE_MASK; + + fdm.header.dump_format_version = cpu_to_be32(0x00000001); + fdm.header.dump_num_sections = cpu_to_be16(3); + fdm.header.dump_status_flag = 0; + fdm.header.offset_first_dump_section = + cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, + cpu_state_data)); + + /* + * Fields for disk dump option. + * We are not using disk dump option, hence set these fields to 0. + */ + fdm.header.dd_block_size = 0; + fdm.header.dd_block_offset = 0; + fdm.header.dd_num_blocks = 0; + fdm.header.dd_offset_disk_path = 0; + + /* set 0 to disable an automatic dump-reboot. */ + fdm.header.max_time_auto = 0; + + /* Kernel dump sections */ + /* cpu state data section. */ + fdm.cpu_state_data.request_flag = + cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.cpu_state_data.source_data_type = + cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA); + fdm.cpu_state_data.source_address = 0; + fdm.cpu_state_data.source_len = + cpu_to_be64(fadump_conf->cpu_state_data_size); + fdm.cpu_state_data.destination_address = cpu_to_be64(addr); + addr += fadump_conf->cpu_state_data_size; + + /* hpte region section */ + fdm.hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.hpte_region.source_data_type = + cpu_to_be16(RTAS_FADUMP_HPTE_REGION); + fdm.hpte_region.source_address = 0; + fdm.hpte_region.source_len = + cpu_to_be64(fadump_conf->hpte_region_size); + fdm.hpte_region.destination_address = cpu_to_be64(addr); + addr += fadump_conf->hpte_region_size; + + /* RMA region section */ + fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.rmr_region.source_data_type = + cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION); + fdm.rmr_region.source_address = cpu_to_be64(0); + fdm.rmr_region.source_len = cpu_to_be64(fadump_conf->boot_memory_size); + fdm.rmr_region.destination_address = cpu_to_be64(addr); + addr += fadump_conf->boot_memory_size; + + rtas_fadump_update_config(fadump_conf, &fdm); + + return addr; +} + +static u64 rtas_fadump_get_bootmem_min(void) +{ + return RTAS_FADUMP_MIN_BOOT_MEM; +} + +static int rtas_fadump_register(struct fw_dump *fadump_conf) +{ + unsigned int wait_time; + int rc, err = -EIO; + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, + NULL, FADUMP_REGISTER, &fdm, + sizeof(struct rtas_fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + + } while (wait_time); + + switch (rc) { + case 0: + pr_info("Registration is successful!\n"); + fadump_conf->dump_registered = 1; + err = 0; + break; + case -1: + pr_err("Failed to register. Hardware Error(%d).\n", rc); + break; + case -3: + if (!is_fadump_boot_mem_contiguous()) + pr_err("Can't have holes in boot memory area.\n"); + else if (!is_fadump_reserved_mem_contiguous()) + pr_err("Can't have holes in reserved memory area.\n"); + + pr_err("Failed to register. Parameter Error(%d).\n", rc); + err = -EINVAL; + break; + case -9: + pr_err("Already registered!\n"); + fadump_conf->dump_registered = 1; + err = -EEXIST; + break; + default: + pr_err("Failed to register. Unknown Error(%d).\n", rc); + break; + } + + return err; +} + +static int rtas_fadump_unregister(struct fw_dump *fadump_conf) +{ + unsigned int wait_time; + int rc; + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, + NULL, FADUMP_UNREGISTER, &fdm, + sizeof(struct rtas_fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + pr_err("Failed to un-register - unexpected error(%d).\n", rc); + return -EIO; + } + + fadump_conf->dump_registered = 0; + return 0; +} + +static int rtas_fadump_invalidate(struct fw_dump *fadump_conf) +{ + unsigned int wait_time; + int rc; + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, + NULL, FADUMP_INVALIDATE, fdm_active, + sizeof(struct rtas_fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + pr_err("Failed to invalidate - unexpected error (%d).\n", rc); + return -EIO; + } + + fadump_conf->dump_active = 0; + fdm_active = NULL; + return 0; +} + +#define RTAS_FADUMP_GPR_MASK 0xffffff0000000000 +static inline int rtas_fadump_gpr_index(u64 id) +{ + char str[3]; + int i = -1; + + if ((id & RTAS_FADUMP_GPR_MASK) == fadump_str_to_u64("GPR")) { + /* get the digits at the end */ + id &= ~RTAS_FADUMP_GPR_MASK; + id >>= 24; + str[2] = '\0'; + str[1] = id & 0xff; + str[0] = (id >> 8) & 0xff; + if (kstrtoint(str, 10, &i)) + i = -EINVAL; + if (i > 31) + i = -1; + } + return i; +} + +void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val) +{ + int i; + + i = rtas_fadump_gpr_index(reg_id); + if (i >= 0) + regs->gpr[i] = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("NIA")) + regs->nip = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("MSR")) + regs->msr = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("CTR")) + regs->ctr = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("LR")) + regs->link = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("XER")) + regs->xer = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("CR")) + regs->ccr = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("DAR")) + regs->dar = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("DSISR")) + regs->dsisr = (unsigned long)reg_val; +} + +static struct rtas_fadump_reg_entry* +rtas_fadump_read_regs(struct rtas_fadump_reg_entry *reg_entry, + struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + + while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) { + rtas_fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id), + be64_to_cpu(reg_entry->reg_value)); + reg_entry++; + } + reg_entry++; + return reg_entry; +} + +/* + * Read CPU state dump data and convert it into ELF notes. + * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be + * used to access the data to allow for additional fields to be added without + * affecting compatibility. Each list of registers for a CPU starts with + * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes, + * 8 Byte ASCII identifier and 8 Byte register value. The register entry + * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part + * of register value. For more details refer to PAPR document. + * + * Only for the crashing cpu we ignore the CPU dump data and get exact + * state from fadump crash info structure populated by first kernel at the + * time of crash. + */ +static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf) +{ + struct rtas_fadump_reg_save_area_header *reg_header; + struct fadump_crash_info_header *fdh = NULL; + struct rtas_fadump_reg_entry *reg_entry; + u32 num_cpus, *note_buf; + int i, rc = 0, cpu = 0; + struct pt_regs regs; + unsigned long addr; + void *vaddr; + + addr = be64_to_cpu(fdm_active->cpu_state_data.destination_address); + vaddr = __va(addr); + + reg_header = vaddr; + if (be64_to_cpu(reg_header->magic_number) != + fadump_str_to_u64("REGSAVE")) { + pr_err("Unable to read register save area.\n"); + return -ENOENT; + } + + pr_debug("--------CPU State Data------------\n"); + pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number)); + pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset)); + + vaddr += be32_to_cpu(reg_header->num_cpu_offset); + num_cpus = be32_to_cpu(*((__be32 *)(vaddr))); + pr_debug("NumCpus : %u\n", num_cpus); + vaddr += sizeof(u32); + reg_entry = (struct rtas_fadump_reg_entry *)vaddr; + + rc = fadump_setup_cpu_notes_buf(num_cpus); + if (rc != 0) + return rc; + + note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr; + + if (fadump_conf->fadumphdr_addr) + fdh = __va(fadump_conf->fadumphdr_addr); + + for (i = 0; i < num_cpus; i++) { + if (be64_to_cpu(reg_entry->reg_id) != + fadump_str_to_u64("CPUSTRT")) { + pr_err("Unable to read CPU state data\n"); + rc = -ENOENT; + goto error_out; + } + /* Lower 4 bytes of reg_value contains logical cpu id */ + cpu = (be64_to_cpu(reg_entry->reg_value) & + RTAS_FADUMP_CPU_ID_MASK); + if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) { + RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry); + continue; + } + pr_debug("Reading register data for cpu %d...\n", cpu); + if (fdh && fdh->crashing_cpu == cpu) { + regs = fdh->regs; + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry); + } else { + reg_entry++; + reg_entry = rtas_fadump_read_regs(reg_entry, ®s); + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + } + } + final_note(note_buf); + + if (fdh) { + pr_debug("Updating elfcore header (%llx) with cpu notes\n", + fdh->elfcorehdr_addr); + fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr)); + } + return 0; + +error_out: + fadump_free_cpu_notes_buf(); + return rc; + +} + +/* + * Validate and process the dump data stored by firmware before exporting + * it through '/proc/vmcore'. + */ +static int __init rtas_fadump_process(struct fw_dump *fadump_conf) +{ + struct fadump_crash_info_header *fdh; + int rc = 0; + + if (!fdm_active || !fadump_conf->fadumphdr_addr) + return -EINVAL; + + /* Check if the dump data is valid. */ + if ((be16_to_cpu(fdm_active->header.dump_status_flag) == + RTAS_FADUMP_ERROR_FLAG) || + (fdm_active->cpu_state_data.error_flags != 0) || + (fdm_active->rmr_region.error_flags != 0)) { + pr_err("Dump taken by platform is not valid\n"); + return -EINVAL; + } + if ((fdm_active->rmr_region.bytes_dumped != + fdm_active->rmr_region.source_len) || + !fdm_active->cpu_state_data.bytes_dumped) { + pr_err("Dump taken by platform is incomplete\n"); + return -EINVAL; + } + + /* Validate the fadump crash info header */ + fdh = __va(fadump_conf->fadumphdr_addr); + if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { + pr_err("Crash info header is not valid.\n"); + return -EINVAL; + } + + rc = rtas_fadump_build_cpu_notes(fadump_conf); + if (rc) + return rc; + + /* + * We are done validating dump info and elfcore header is now ready + * to be exported. set elfcorehdr_addr so that vmcore module will + * export the elfcore header through '/proc/vmcore'. + */ + elfcorehdr_addr = fdh->elfcorehdr_addr; + + return 0; +} + +static void rtas_fadump_region_show(struct fw_dump *fadump_conf, + struct seq_file *m) +{ + const struct rtas_fadump_section *cpu_data_section; + const struct rtas_fadump_mem_struct *fdm_ptr; + + if (fdm_active) + fdm_ptr = fdm_active; + else + fdm_ptr = &fdm; + + cpu_data_section = &(fdm_ptr->cpu_state_data); + seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(cpu_data_section->destination_address), + be64_to_cpu(cpu_data_section->destination_address) + + be64_to_cpu(cpu_data_section->source_len) - 1, + be64_to_cpu(cpu_data_section->source_len), + be64_to_cpu(cpu_data_section->bytes_dumped)); + + seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(fdm_ptr->hpte_region.destination_address), + be64_to_cpu(fdm_ptr->hpte_region.destination_address) + + be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1, + be64_to_cpu(fdm_ptr->hpte_region.source_len), + be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped)); + + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", + be64_to_cpu(fdm_ptr->rmr_region.source_address), + be64_to_cpu(fdm_ptr->rmr_region.destination_address)); + seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", + be64_to_cpu(fdm_ptr->rmr_region.source_len), + be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped)); + + /* Dump is active. Show reserved area start address. */ + if (fdm_active) { + seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n", + fadump_conf->reserve_dump_area_start); + } +} + +static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh, + const char *msg) +{ + /* Call ibm,os-term rtas call to trigger firmware assisted dump */ + rtas_os_term((char *)msg); +} + +static struct fadump_ops rtas_fadump_ops = { + .fadump_init_mem_struct = rtas_fadump_init_mem_struct, + .fadump_get_bootmem_min = rtas_fadump_get_bootmem_min, + .fadump_register = rtas_fadump_register, + .fadump_unregister = rtas_fadump_unregister, + .fadump_invalidate = rtas_fadump_invalidate, + .fadump_process = rtas_fadump_process, + .fadump_region_show = rtas_fadump_region_show, + .fadump_trigger = rtas_fadump_trigger, +}; + +void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + int i, size, num_sections; + const __be32 *sections; + const __be32 *token; + + /* + * Check if Firmware Assisted dump is supported. if yes, check + * if dump has been initiated on last reboot. + */ + token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL); + if (!token) + return; + + fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token); + fadump_conf->ops = &rtas_fadump_ops; + fadump_conf->fadump_supported = 1; + + /* Firmware supports 64-bit value for size, align it to pagesize. */ + fadump_conf->max_copy_size = _ALIGN_DOWN(U64_MAX, PAGE_SIZE); + + /* + * The 'ibm,kernel-dump' rtas node is present only if there is + * dump data waiting for us. + */ + fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL); + if (fdm_active) { + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; + rtas_fadump_get_config(fadump_conf, (void *)__pa(fdm_active)); + } + + /* Get the sizes required to store dump data for the firmware provided + * dump sections. + * For each dump section type supported, a 32bit cell which defines + * the ID of a supported section followed by two 32 bit cells which + * gives the size of the section in bytes. + */ + sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", + &size); + + if (!sections) + return; + + num_sections = size / (3 * sizeof(u32)); + + for (i = 0; i < num_sections; i++, sections += 3) { + u32 type = (u32)of_read_number(sections, 1); + + switch (type) { + case RTAS_FADUMP_CPU_STATE_DATA: + fadump_conf->cpu_state_data_size = + of_read_ulong(§ions[1], 2); + break; + case RTAS_FADUMP_HPTE_REGION: + fadump_conf->hpte_region_size = + of_read_ulong(§ions[1], 2); + break; + } + } +} diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h new file mode 100644 index 000000000000..fd59bd7ca9c3 --- /dev/null +++ b/arch/powerpc/platforms/pseries/rtas-fadump.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Firmware-Assisted Dump support on POWERVM platform. + * + * Copyright 2011, Mahesh Salgaonkar, IBM Corporation. + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#ifndef _PSERIES_RTAS_FADUMP_H +#define _PSERIES_RTAS_FADUMP_H + +/* + * On some Power systems where RMO is 128MB, it still requires minimum of + * 256MB for kernel to boot successfully. When kdump infrastructure is + * configured to save vmcore over network, we run into OOM issue while + * loading modules related to network setup. Hence we need additional 64M + * of memory to avoid OOM issue. + */ +#define RTAS_FADUMP_MIN_BOOT_MEM ((0x1UL << 28) + (0x1UL << 26)) + +/* Firmware provided dump sections */ +#define RTAS_FADUMP_CPU_STATE_DATA 0x0001 +#define RTAS_FADUMP_HPTE_REGION 0x0002 +#define RTAS_FADUMP_REAL_MODE_REGION 0x0011 + +/* Dump request flag */ +#define RTAS_FADUMP_REQUEST_FLAG 0x00000001 + +/* Dump status flag */ +#define RTAS_FADUMP_ERROR_FLAG 0x2000 + +/* Kernel Dump section info */ +struct rtas_fadump_section { + __be32 request_flag; + __be16 source_data_type; + __be16 error_flags; + __be64 source_address; + __be64 source_len; + __be64 bytes_dumped; + __be64 destination_address; +}; + +/* ibm,configure-kernel-dump header. */ +struct rtas_fadump_section_header { + __be32 dump_format_version; + __be16 dump_num_sections; + __be16 dump_status_flag; + __be32 offset_first_dump_section; + + /* Fields for disk dump option. */ + __be32 dd_block_size; + __be64 dd_block_offset; + __be64 dd_num_blocks; + __be32 dd_offset_disk_path; + + /* Maximum time allowed to prevent an automatic dump-reboot. */ + __be32 max_time_auto; +}; + +/* + * Firmware Assisted dump memory structure. This structure is required for + * registering future kernel dump with power firmware through rtas call. + * + * No disk dump option. Hence disk dump path string section is not included. + */ +struct rtas_fadump_mem_struct { + struct rtas_fadump_section_header header; + + /* Kernel dump sections */ + struct rtas_fadump_section cpu_state_data; + struct rtas_fadump_section hpte_region; + + /* + * TODO: Extend multiple boot memory regions support in the kernel + * for this platform. + */ + struct rtas_fadump_section rmr_region; +}; + +/* + * The firmware-assisted dump format. + * + * The register save area is an area in the partition's memory used to preserve + * the register contents (CPU state data) for the active CPUs during a firmware + * assisted dump. The dump format contains register save area header followed + * by register entries. Each list of registers for a CPU starts with "CPUSTRT" + * and ends with "CPUEND". + */ + +/* Register save area header. */ +struct rtas_fadump_reg_save_area_header { + __be64 magic_number; + __be32 version; + __be32 num_cpu_offset; +}; + +/* Register entry. */ +struct rtas_fadump_reg_entry { + __be64 reg_id; + __be64 reg_value; +}; + +/* Utility macros */ +#define RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry) \ +({ \ + while (be64_to_cpu(reg_entry->reg_id) != \ + fadump_str_to_u64("CPUEND")) \ + reg_entry++; \ + reg_entry++; \ +}) + +#define RTAS_FADUMP_CPU_ID_MASK ((1UL << 32) - 1) + +#endif /* _PSERIES_RTAS_FADUMP_H */ diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c index a0001280503c..2879c4f0ceb7 100644 --- a/arch/powerpc/platforms/pseries/scanlog.c +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -152,13 +152,12 @@ static int scanlog_release(struct inode * inode, struct file * file) return 0; } -static const struct file_operations scanlog_fops = { - .owner = THIS_MODULE, - .read = scanlog_read, - .write = scanlog_write, - .open = scanlog_open, - .release = scanlog_release, - .llseek = noop_llseek, +static const struct proc_ops scanlog_proc_ops = { + .proc_read = scanlog_read, + .proc_write = scanlog_write, + .proc_open = scanlog_open, + .proc_release = scanlog_release, + .proc_lseek = noop_llseek, }; static int __init scanlog_init(void) @@ -176,7 +175,7 @@ static int __init scanlog_init(void) goto err; ent = proc_create("powerpc/rtas/scan-log-dump", 0400, NULL, - &scanlog_fops); + &scanlog_proc_ops); if (!ent) goto err; return 0; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index f5940cc71c37..0c8421dd01ab 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -69,10 +69,14 @@ #include <asm/security_features.h> #include <asm/asm-const.h> #include <asm/swiotlb.h> +#include <asm/svm.h> #include "pseries.h" #include "../../../../drivers/pci/pci.h" +DEFINE_STATIC_KEY_FALSE(shared_processor); +EXPORT_SYMBOL_GPL(shared_processor); + int CMO_PrPSP = -1; int CMO_SecPSP = -1; unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K); @@ -141,17 +145,19 @@ static void __init fwnmi_init(void) } #ifdef CONFIG_PPC_BOOK3S_64 - /* Allocate per cpu slb area to save old slb contents during MCE */ - size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus; - slb_ptr = memblock_alloc_try_nid_raw(size, sizeof(struct slb_entry), - MEMBLOCK_LOW_LIMIT, ppc64_rma_size, - NUMA_NO_NODE); - if (!slb_ptr) - panic("Failed to allocate %zu bytes below %pa for slb area\n", - size, &ppc64_rma_size); - - for_each_possible_cpu(i) - paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i); + if (!radix_enabled()) { + /* Allocate per cpu area to save old slb contents during MCE */ + size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus; + slb_ptr = memblock_alloc_try_nid_raw(size, + sizeof(struct slb_entry), MEMBLOCK_LOW_LIMIT, + ppc64_rma_size, NUMA_NO_NODE); + if (!slb_ptr) + panic("Failed to allocate %zu bytes below %pa for slb area\n", + size, &ppc64_rma_size); + + for_each_possible_cpu(i) + paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i); + } #endif } @@ -297,8 +303,10 @@ static inline int alloc_dispatch_logs(void) static int alloc_dispatch_log_kmem_cache(void) { + void (*ctor)(void *) = get_dtl_cache_ctor(); + dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, - DISPATCH_LOG_BYTES, 0, NULL); + DISPATCH_LOG_BYTES, 0, ctor); if (!dtl_cache) { pr_warn("Failed to create dispatch trace log buffer cache\n"); pr_warn("Stolen time statistics will be unreliable\n"); @@ -316,6 +324,9 @@ static void pseries_lpar_idle(void) * low power mode by ceding processor to hypervisor */ + if (!prep_irq_for_idle()) + return; + /* Indicate to hypervisor that we are idle. */ get_lppaca()->idle = 1; @@ -736,6 +747,7 @@ static void __init pSeries_setup_arch(void) pseries_setup_rfi_flush(); setup_stf_barrier(); + pseries_lpar_read_hblkrm_characteristics(); /* By default, only probe PCI (can be overridden by rtas_pci) */ pci_add_flags(PCI_PROBE_ONLY); @@ -749,6 +761,10 @@ static void __init pSeries_setup_arch(void) if (firmware_has_feature(FW_FEATURE_LPAR)) { vpa_init(boot_cpuid); + + if (lppaca_shared_proc(get_lppaca())) + static_branch_enable(&shared_processor); + ppc_md.power_save = pseries_lpar_idle; ppc_md.enable_pmcs = pseries_lpar_enable_pmcs; #ifdef CONFIG_PCI_IOV diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 4b3ef8d9c63f..ad61e90032da 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -41,6 +41,7 @@ #include <asm/dbell.h> #include <asm/plpar_wrappers.h> #include <asm/code-patching.h> +#include <asm/svm.h> #include "pseries.h" #include "offline_states.h" @@ -221,7 +222,7 @@ static __init void pSeries_smp_probe_xics(void) { xics_smp_probe(); - if (cpu_has_feature(CPU_FTR_DBELL)) + if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest()) smp_ops->cause_ipi = smp_pseries_cause_ipi; else smp_ops->cause_ipi = icp_ops->cause_ipi; diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c new file mode 100644 index 000000000000..40c0637203d5 --- /dev/null +++ b/arch/powerpc/platforms/pseries/svm.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Secure VM platform + * + * Copyright 2018 IBM Corporation + * Author: Anshuman Khandual <khandual@linux.vnet.ibm.com> + */ + +#include <linux/mm.h> +#include <asm/machdep.h> +#include <asm/svm.h> +#include <asm/swiotlb.h> +#include <asm/ultravisor.h> + +static int __init init_svm(void) +{ + if (!is_secure_guest()) + return 0; + + /* Don't release the SWIOTLB buffer. */ + ppc_swiotlb_enable = 1; + + /* + * Since the guest memory is inaccessible to the host, devices always + * need to use the SWIOTLB buffer for DMA even if dma_capable() says + * otherwise. + */ + swiotlb_force = SWIOTLB_FORCE; + + /* Share the SWIOTLB buffer with the host. */ + swiotlb_update_mem_attributes(); + + return 0; +} +machine_early_initcall(pseries, init_svm); + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + uv_unshare_page(PHYS_PFN(__pa(addr)), numpages); + + return 0; +} + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + uv_share_page(PHYS_PFN(__pa(addr)), numpages); + + return 0; +} + +/* There's one dispatch log per CPU. */ +#define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE) + +static struct page *dtl_page_store[NR_DTL_PAGE]; +static long dtl_nr_pages; + +static bool is_dtl_page_shared(struct page *page) +{ + long i; + + for (i = 0; i < dtl_nr_pages; i++) + if (dtl_page_store[i] == page) + return true; + + return false; +} + +void dtl_cache_ctor(void *addr) +{ + unsigned long pfn = PHYS_PFN(__pa(addr)); + struct page *page = pfn_to_page(pfn); + + if (!is_dtl_page_shared(page)) { + dtl_page_store[dtl_nr_pages] = page; + dtl_nr_pages++; + WARN_ON(dtl_nr_pages >= NR_DTL_PAGE); + uv_share_page(pfn, 1); + } +} diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 6601b9d404dc..f682b7babc09 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -605,6 +605,8 @@ static const struct dma_map_ops vio_dma_mapping_ops = { .unmap_page = vio_dma_iommu_unmap_page, .dma_supported = dma_iommu_dma_supported, .get_required_mask = dma_iommu_get_required_mask, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, }; /** @@ -1174,6 +1176,8 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) if (tbl == NULL) return NULL; + kref_init(&tbl->it_kref); + of_parse_dma_window(dev->dev.of_node, dma_window, &tbl->it_index, &offset, &size); @@ -1191,7 +1195,7 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) else tbl->it_ops = &iommu_table_pseries_ops; - return iommu_init_table(tbl, -1); + return iommu_init_table(tbl, -1, 0, 0); } /** |