Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp  60
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrMVE.td        3
2 files changed, 52 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index e4a47edc750..08355c853c4 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13162,7 +13162,7 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
}
bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
- unsigned,
+ unsigned Alignment,
MachineMemOperand::Flags,
bool *Fast) const {
// Depends what it gets converted into if the type is weird.
@@ -13171,23 +13171,18 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
// The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
- switch (VT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- case MVT::i16:
- case MVT::i32: {
+ auto Ty = VT.getSimpleVT().SimpleTy;
+ if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
// Unaligned access can use (for example) LDRB, LDRH, LDR
if (AllowsUnaligned) {
if (Fast)
*Fast = Subtarget->hasV7Ops();
return true;
}
- return false;
}
- case MVT::f64:
- case MVT::v2f64: {
+
+ if (Ty == MVT::f64 || Ty == MVT::v2f64) {
// For any little-endian targets with neon, we can support unaligned ld/st
// of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
// A big-endian target may also explicitly support unaligned accesses
@@ -13196,9 +13191,52 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
*Fast = true;
return true;
}
- return false;
}
+
+ if (!Subtarget->hasMVEIntegerOps())
+ return false;
+ if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 &&
+ Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 &&
+ Ty != MVT::v2f64)
+ return false;
+
+ if (Subtarget->isLittle()) {
+ // In little-endian MVE, the store instructions VSTRB.U8,
+ // VSTRH.U16 and VSTRW.U32 all store the vector register in
+ // exactly the same format, and differ only in the range of
+ // their immediate offset field and the required alignment.
+ //
+ // In particular, VSTRB.U8 can store a vector at byte alignment.
+ // So at this stage we can simply say that loads/stores of all
+ // 128-bit wide vector types are permitted at any alignment,
+ // because we know at least _one_ instruction can manage that.
+ //
+ // Later on we might find that some of those loads are better
+ // generated as VLDRW.U32 if alignment permits, to take
+ // advantage of the larger immediate range. But for the moment,
+ // all that matters is that if we don't lower the load then
+ // _some_ instruction can handle it.
+ if (Fast)
+ *Fast = true;
+ return true;
+ } else {
+ // In big-endian MVE, those instructions aren't so similar
+ // after all, because they reorder the bytes of the vector
+ // differently. So this time we can only store a particular
+ // kind of vector if its alignment is at least the element
+ // type. And we can't store vectors of i64 or f64 at all
+ // without having to do some postprocessing, because there's
+ // no VSTRD.U64.
+ if (Ty == MVT::v16i8 ||
+ ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) ||
+ ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) {
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
}
+
+ return false;
}
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
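
For reference, here is the decision table implemented above, distilled into a
self-contained C++ sketch. The type tags, the Subtarget fields and the
function name are hypothetical stand-ins for LLVM's MVT/ARMSubtarget
machinery, and the NEON guard is abbreviated (the hunk above truncates that
condition), so treat this as an illustration of the logic rather than the
in-tree hook:

enum class Ty { i8, i16, i32, f64, v2f64, v16i8, v8i16, v8f16,
                v4i32, v4f32, v2i64 };

struct Subtarget {
  bool AllowsUnalignedMem; // SCTLR.A clear: unaligned scalar access is legal
  bool HasV7Ops;           // v7 cores handle unaligned scalars quickly
  bool HasNEON;            // enables the f64/v2f64 vld1.8/vst1.8 trick
  bool HasMVEInteger;      // MVE vector load/store instructions exist
  bool IsLittle;           // little-endian target
};

bool allowsMisaligned(Ty T, unsigned Alignment, const Subtarget &ST,
                      bool *Fast) {
  // Scalar i8/i16/i32: legal whenever SCTLR.A permits it (LDRB/LDRH/LDR).
  if (T == Ty::i8 || T == Ty::i16 || T == Ty::i32) {
    if (ST.AllowsUnalignedMem) {
      if (Fast)
        *Fast = ST.HasV7Ops;
      return true;
    }
  }

  // NEON f64/v2f64: byte-wise vld1.8/vst1.8 works at any alignment. (Guard
  // abbreviated here; the real condition sits outside the visible hunk.)
  if ((T == Ty::f64 || T == Ty::v2f64) && ST.HasNEON && ST.IsLittle) {
    if (Fast)
      *Fast = true;
    return true;
  }

  if (!ST.HasMVEInteger)
    return false;
  bool Is128BitVec = T == Ty::v16i8 || T == Ty::v8i16 || T == Ty::v8f16 ||
                     T == Ty::v4i32 || T == Ty::v4f32 || T == Ty::v2i64 ||
                     T == Ty::v2f64;
  if (!Is128BitVec)
    return false;

  if (ST.IsLittle) {
    // VSTRB.U8/VLDRB.U8 handle any of these at byte alignment.
    if (Fast)
      *Fast = true;
    return true;
  }

  // Big-endian: alignment must reach the element size, and the 64-bit
  // element types have no VSTRD.U64 to fall back on.
  if (T == Ty::v16i8 ||
      ((T == Ty::v8i16 || T == Ty::v8f16) && Alignment >= 2) ||
      ((T == Ty::v4i32 || T == Ty::v4f32) && Alignment >= 4)) {
    if (Fast)
      *Fast = true;
    return true;
  }
  return false;
}
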
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 1fb9e64fbfa..f07a5c51562 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4151,12 +4151,14 @@ multiclass MVE_unpred_vector_store<Instruction RegImmInst, PatFrag StoreKind,
def : MVE_unpred_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>;
def : MVE_unpred_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>;
def : MVE_unpred_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>;
+ def : MVE_unpred_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>;
}
class MVE_unpred_vector_load_typed<ValueType Ty, Instruction RegImmInst,
PatFrag LoadKind, int shift>
: Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
(Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
+
multiclass MVE_unpred_vector_load<Instruction RegImmInst, PatFrag LoadKind,
int shift> {
def : MVE_unpred_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>;
@@ -4165,6 +4167,7 @@ multiclass MVE_unpred_vector_load<Instruction RegImmInst, PatFrag LoadKind,
def : MVE_unpred_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>;
def : MVE_unpred_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>;
def : MVE_unpred_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>;
+ def : MVE_unpred_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>;
}
let Predicates = [HasMVEInt, IsLE] in {
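
The v2f64 patterns added above, instantiated under IsLE here, work because on
a little-endian target VSTRB.U8, VSTRH.U16 and VSTRW.U32 all write the same
16-byte image, so the byte-granular instruction can stand in for every
128-bit vector type, including ones with no dedicated 64-bit-element store.
A small host-side C++ check of that layout property (plain portable code,
assuming it runs on a little-endian host; the VSTR names appear only in
comments):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Sixteen logical vector bytes, q[0] being the lowest-numbered lane.
  uint8_t q[16];
  for (int i = 0; i < 16; ++i)
    q[i] = uint8_t(0x10 + i);

  // VSTRB.U8-style store: one byte per lane, in lane order.
  uint8_t viaBytes[16];
  std::memcpy(viaBytes, q, 16);

  // VSTRH.U16-style store: fuse byte lanes pairwise (low byte first, as
  // MVE numbers its lanes) and write each 16-bit lane in native order.
  uint8_t viaHalves[16];
  for (int i = 0; i < 8; ++i) {
    uint16_t lane = uint16_t(q[2 * i] | (q[2 * i + 1] << 8));
    std::memcpy(viaHalves + 2 * i, &lane, 2);
  }

  // VSTRW.U32-style store: the same idea with four-byte lanes.
  uint8_t viaWords[16];
  for (int i = 0; i < 4; ++i) {
    uint32_t lane = uint32_t(q[4 * i]) | (uint32_t(q[4 * i + 1]) << 8) |
                    (uint32_t(q[4 * i + 2]) << 16) |
                    (uint32_t(q[4 * i + 3]) << 24);
    std::memcpy(viaWords + 4 * i, &lane, 4);
  }

  // On a little-endian host the three images coincide; on big-endian the
  // halfword and word images would come out byte-swapped per lane, which
  // is why these patterns are restricted to IsLE.
  assert(std::memcmp(viaBytes, viaHalves, 16) == 0);
  assert(std::memcmp(viaBytes, viaWords, 16) == 0);
  return 0;
}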