From 6b5c19c55266f6efd10ffac0e9f9f2b7aa420a58 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy
Date: Wed, 22 Mar 2017 15:21:47 +1100
Subject: powerpc/mmu: Add real mode support for IOMMU preregistered memory

This makes mm_iommu_lookup() able to work in realmode by replacing
list_for_each_entry_rcu() (which can do debug stuff which can fail in
real mode) with list_for_each_entry_lockless().

This adds realmode version of mm_iommu_ua_to_hpa() which adds explicit
vmalloc'd-to-linear address conversion. Unlike mm_iommu_ua_to_hpa(),
mm_iommu_ua_to_hpa_rm() can fail.

This changes mm_iommu_preregistered() to receive @mm as in real mode
@current does not always have a correct pointer.

This adds realmode version of mm_iommu_lookup() which receives @mm
(for the same reason as for mm_iommu_preregistered()) and uses
lockless version of list_for_each_entry_rcu().

Signed-off-by: Alexey Kardashevskiy
Reviewed-by: David Gibson
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/mmu_context.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/powerpc/include/asm/mmu_context.h')

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b9e3f0aca261..c70c8272523d 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -29,10 +29,14 @@ extern void mm_iommu_init(struct mm_struct *mm);
 extern void mm_iommu_cleanup(struct mm_struct *mm);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
 		unsigned long ua, unsigned long size);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
+		struct mm_struct *mm, unsigned long ua, unsigned long size);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
 		unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
 		unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
 #endif
--
cgit v1.2.1
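As a reader aid, the realmode translation helper declared above could look
roughly like the sketch below. This is only an illustration built from the
commit description (an explicit vmalloc'd-to-linear conversion that can
fail); the mem->ua/entries/hpas fields and the vmalloc_to_page() based
conversion are assumptions about the backing implementation, not something
shown in this header change.

long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
		unsigned long ua, unsigned long *hpa)
{
	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
	void *va = &mem->hpas[entry];	/* hpas[] assumed to be vmalloc'd */
	struct page *page;
	unsigned long *pa;

	if (entry >= mem->entries)
		return -EFAULT;

	/*
	 * Real mode cannot safely dereference a vmalloc'd address, so
	 * translate it to its physical equivalent first (in real mode
	 * effective == real, so that pointer is usable directly).
	 * This lookup is also what allows the _rm variant to fail.
	 */
	page = vmalloc_to_page(va);
	if (!page)
		return -EFAULT;

	pa = (unsigned long *)(page_to_phys(page) + offset_in_page(va));
	*hpa = *pa | (ua & ~PAGE_MASK);

	return 0;
}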
From a336f2f5b05c3c02876a365b8f17b3d10920dbd5 Mon Sep 17 00:00:00 2001
From: Michael Ellerman
Date: Wed, 29 Mar 2017 22:00:46 +1100
Subject: powerpc/mm/hash: Abstract context id allocation for KVM

KVM wants to be able to allocate an MMU context id, which it does
currently by calling __init_new_context(). We're about to rework that
code, so provide a wrapper for KVM so it doesn't have to worry about
the details.

Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/mmu_context.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc/include/asm/mmu_context.h')

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b9e3f0aca261..7d721101ec78 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -51,7 +51,7 @@ static inline void switch_mmu_context(struct mm_struct *prev,
 	return switch_slb(tsk, next);
 }
 
-extern int __init_new_context(void);
+extern int hash__alloc_context_id(void);
 extern void __destroy_context(int context_id);
 static inline void mmu_context_init(void) { }
 #else
--
cgit v1.2.1

From 82228e362f9b7f4b876d0fbb1036c235797c6b1d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V"
Date: Wed, 22 Mar 2017 09:07:00 +0530
Subject: powerpc/pseries: Skip using reserved virtual address range

Now that we use all the available virtual address range, we need to
make sure we don't generate a VSID such that it overlaps with the
reserved vsid range. The reserved vsid range includes the virtual
address range used by the adjunct partition and also the VRMA virtual
segment.

We find the context value that can result in generating such a VSID
and reserve it early in boot.

We don't look at the adjunct range, because for now we disable the
adjunct usage in a Linux LPAR via the CAS interface.

Signed-off-by: Aneesh Kumar K.V
[mpe: Rewrite hash__reserve_context_id(), move the rest into pseries]
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/mmu_context.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/powerpc/include/asm/mmu_context.h')

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 7d721101ec78..78803a7ebdd9 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -52,6 +52,7 @@ static inline void switch_mmu_context(struct mm_struct *prev,
 }
 
 extern int hash__alloc_context_id(void);
+extern void hash__reserve_context_id(int id);
 extern void __destroy_context(int context_id);
 static inline void mmu_context_init(void) { }
 #else
--
cgit v1.2.1
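To make the pair of context-id interfaces above concrete, here is a minimal,
illustrative sketch of how hash__alloc_context_id() and
hash__reserve_context_id() could be backed by a single IDA. The ctx_ida name,
the MIN/MAX limits and the use of ida_alloc_range() (a helper that postdates
these 2017 patches) are assumptions made for illustration; this is not the
code added by the patches themselves.

#include <linux/idr.h>
#include <linux/kernel.h>

#define MIN_CTX_ID	2		/* assumed lower bound for user contexts */
#define MAX_CTX_ID	0x7ffff		/* assumed upper bound */

static DEFINE_IDA(ctx_ida);

/* Hand out a free context id, or a negative errno on failure. */
int hash__alloc_context_id(void)
{
	return ida_alloc_range(&ctx_ida, MIN_CTX_ID, MAX_CTX_ID, GFP_KERNEL);
}

/*
 * Claim one specific id early in boot (e.g. a context whose VSIDs would
 * collide with the reserved VRMA range) so the allocator never hands it
 * out to a user context.
 */
void hash__reserve_context_id(int id)
{
	int got = ida_alloc_range(&ctx_ida, id, id, GFP_KERNEL);

	WARN(got != id, "Failed to reserve context id %d (got %d)\n", id, got);
}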
From 9765ad134a00a01cbcc69c78ff6defbfad209bc5 Mon Sep 17 00:00:00 2001
From: David Gibson
Date: Wed, 19 Apr 2017 16:38:26 +1000
Subject: powerpc/mm: Ensure IRQs are off in switch_mm()

powerpc expects IRQs to already be (soft) disabled when switch_mm() is
called, as made clear in the commit message of 9c1e105238c4 ("powerpc:
Allow perf_counters to access user memory at interrupt time").

Aside from any race conditions that might exist between switch_mm() and
an IRQ, there is also an unconditional hard_irq_disable() in
switch_slb(). If that isn't followed at some point by an IRQ enable
then interrupts will remain disabled until we return to userspace.

It is true that when switch_mm() is called from the scheduler IRQs are
off, but not when it's called by use_mm(). Looking closer we see that
last year in commit f98db6013c55 ("sched/core: Add switch_mm_irqs_off()
and use it in the scheduler") this was made more explicit by the
addition of switch_mm_irqs_off() which is now called by the scheduler,
vs switch_mm() which is used by use_mm().

Arguably it is a bug in use_mm() to call switch_mm() in a different
context than it expects, but fixing that will take time.

This was discovered recently when vhost started throwing warnings such
as:

  BUG: sleeping function called from invalid context at kernel/mutex.c:578
  in_atomic(): 0, irqs_disabled(): 1, pid: 10768, name: vhost-10760
  no locks held by vhost-10760/10768.
  irq event stamp: 10
  hardirqs last enabled at (9): _raw_spin_unlock_irq+0x40/0x80
  hardirqs last disabled at (10): switch_slb+0x2e4/0x490
  softirqs last enabled at (0): copy_process+0x5e8/0x1260
  softirqs last disabled at (0): (null)
  Call Trace:
    show_stack+0x88/0x390 (unreliable)
    dump_stack+0x30/0x44
    __might_sleep+0x1c4/0x2d0
    mutex_lock_nested+0x74/0x5c0
    cgroup_attach_task_all+0x5c/0x180
    vhost_attach_cgroups_work+0x58/0x80 [vhost]
    vhost_worker+0x24c/0x3d0 [vhost]
    kthread+0xec/0x100
    ret_from_kernel_thread+0x5c/0xd4

Prior to commit 04b96e5528ca ("vhost: lockless enqueuing") (Aug 2016)
the vhost_worker() would do a spin_unlock_irq() not long after calling
use_mm(), which had the effect of reenabling IRQs. Since that commit
removed the locking in vhost_worker() the body of the vhost_worker()
loop now runs with interrupts off causing the warnings.

This patch addresses the problem by making the powerpc code mirror the
x86 code, ie. we disable interrupts in switch_mm(), and optimise the
scheduler case by defining switch_mm_irqs_off().

Cc: stable@vger.kernel.org # v4.7+
Signed-off-by: David Gibson
[mpe: Flesh out/rewrite change log, add stable]
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/mmu_context.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include/asm/mmu_context.h')

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 78803a7ebdd9..a114248de2ee 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -71,8 +71,9 @@ extern void drop_cop(unsigned long acop, struct mm_struct *mm);
  * switch_mm is the entry point called from the architecture independent
  * code in kernel/sched/core.c
  */
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
-			     struct task_struct *tsk)
+static inline void switch_mm_irqs_off(struct mm_struct *prev,
+				      struct mm_struct *next,
+				      struct task_struct *tsk)
 {
 	/* Mark this context has been used on the new CPU */
 	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next)))
@@ -111,6 +112,18 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	switch_mmu_context(prev, next, tsk);
 }
 
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+			     struct task_struct *tsk)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	switch_mm_irqs_off(prev, next, tsk);
+	local_irq_restore(flags);
+}
+#define switch_mm_irqs_off switch_mm_irqs_off
+
+
 #define deactivate_mm(tsk,mm)	do { } while (0)
 
 /*
--
cgit v1.2.1
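One detail in the final hunk worth spelling out is the
"#define switch_mm_irqs_off switch_mm_irqs_off" line: generic code only calls
an architecture's IRQs-off variant when the architecture advertises one, and
otherwise falls back to plain switch_mm(). The generic fallback introduced by
f98db6013c55 looks approximately like the fragment below; it is quoted from
memory here, so treat it as a sketch rather than an exact copy of
include/linux/mmu_context.h.

/* Generic fallback: without the arch #define, both names mean switch_mm(). */
#ifndef switch_mm_irqs_off
# define switch_mm_irqs_off switch_mm
#endif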