2 files changed, 99 insertions, 11 deletions
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 08759f28036..62bc010902c 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1484,10 +1484,56 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
     auto &MMO = **MI.memoperands_begin();
 
     if (DstTy.getSizeInBits() == MMO.getSize() /* in bytes */ * 8) {
-      // In the case of G_LOAD, this was a non-extending load already and we're
-      // about to lower to the same instruction.
-      if (MI.getOpcode() == TargetOpcode::G_LOAD)
+      if (MI.getOpcode() == TargetOpcode::G_LOAD) {
+        // This load needs splitting into power of 2 sized loads.
+        if (DstTy.isVector())
           return UnableToLegalize;
+        if (isPowerOf2_32(DstTy.getSizeInBits()))
+          return UnableToLegalize; // Don't know what we're being asked to do.
+
+        // Our strategy here is to generate anyextending loads for the smaller
+        // types up to next power-2 result type, and then combine the two larger
+        // result values together, before truncating back down to the non-pow-2
+        // type.
+        // E.g. v1 = i24 load =>
+        // v2 = i32 load (2 byte)
+        // v3 = i32 load (1 byte)
+        // v4 = i32 shl v2, 16
+        // v5 = i32 or v4, v3
+        // v1 = i24 trunc v5
+        // By doing this we generate the correct truncate which should get
+        // combined away as an artifact with a matching extend.
+        uint64_t LargeSplitSize = PowerOf2Floor(DstTy.getSizeInBits());
+        uint64_t SmallSplitSize = DstTy.getSizeInBits() - LargeSplitSize;
+
+        MachineFunction &MF = MIRBuilder.getMF();
+        MachineMemOperand *LargeMMO =
+            MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
+        MachineMemOperand *SmallMMO = MF.getMachineMemOperand(
+            &MMO, LargeSplitSize / 8, SmallSplitSize / 8);
+
+        LLT PtrTy = MRI.getType(PtrReg);
+        unsigned AnyExtSize = NextPowerOf2(DstTy.getSizeInBits());
+        LLT AnyExtTy = LLT::scalar(AnyExtSize);
+        unsigned LargeLdReg = MRI.createGenericVirtualRegister(AnyExtTy);
+        unsigned SmallLdReg = MRI.createGenericVirtualRegister(AnyExtTy);
+        auto LargeLoad =
+            MIRBuilder.buildLoad(LargeLdReg, PtrReg, *LargeMMO);
+
+        auto OffsetCst =
+            MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8);
+        unsigned GEPReg = MRI.createGenericVirtualRegister(PtrTy);
+        auto SmallPtr = MIRBuilder.buildGEP(GEPReg, PtrReg, OffsetCst.getReg(0));
+        auto SmallLoad = MIRBuilder.buildLoad(SmallLdReg, SmallPtr.getReg(0),
+                                              *SmallMMO);
+
+        auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
+        auto Shift = MIRBuilder.buildShl(AnyExtTy, LargeLoad, ShiftAmt);
+        auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, SmallLoad);
+        MIRBuilder.buildTrunc(DstReg, {Or.getReg(0)});
+        MI.eraseFromParent();
+        return Legalized;
+      }
       MIRBuilder.buildLoad(DstReg, PtrReg, MMO);
       MI.eraseFromParent();
       return Legalized;
@@ -1516,6 +1562,51 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
 
     return UnableToLegalize;
   }
+  case TargetOpcode::G_STORE: {
+    // Lower a non-power of 2 store into multiple pow-2 stores.
+    // E.g. split an i24 store into an i16 store + i8 store.
+    // We do this by first extending the stored value to the next largest power
+    // of 2 type, and then using truncating stores to store the components.
+    // By doing this, likewise with G_LOAD, generate an extend that can be
+    // artifact-combined away instead of leaving behind extracts.
+    unsigned SrcReg = MI.getOperand(0).getReg();
+    unsigned PtrReg = MI.getOperand(1).getReg();
+    LLT SrcTy = MRI.getType(SrcReg);
+    MachineMemOperand &MMO = **MI.memoperands_begin();
+    if (SrcTy.getSizeInBits() != MMO.getSize() /* in bytes */ * 8)
+      return UnableToLegalize;
+    if (SrcTy.isVector())
+      return UnableToLegalize;
+    if (isPowerOf2_32(SrcTy.getSizeInBits()))
+      return UnableToLegalize; // Don't know what we're being asked to do.
+
+    // Extend to the next pow-2.
+    const LLT ExtendTy = LLT::scalar(NextPowerOf2(SrcTy.getSizeInBits()));
+    auto ExtVal = MIRBuilder.buildAnyExt(ExtendTy, SrcReg);
+
+    // Obtain the smaller value by shifting away the larger value.
+    uint64_t LargeSplitSize = PowerOf2Floor(SrcTy.getSizeInBits());
+    uint64_t SmallSplitSize = SrcTy.getSizeInBits() - LargeSplitSize;
+    auto ShiftAmt = MIRBuilder.buildConstant(ExtendTy, LargeSplitSize);
+    auto SmallVal = MIRBuilder.buildLShr(ExtendTy, ExtVal, ShiftAmt);
+
+    // Generate the GEP and truncating stores.
+    LLT PtrTy = MRI.getType(PtrReg);
+    auto OffsetCst =
+        MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8);
+    unsigned GEPReg = MRI.createGenericVirtualRegister(PtrTy);
+    auto SmallPtr = MIRBuilder.buildGEP(GEPReg, PtrReg, OffsetCst.getReg(0));
+
+    MachineFunction &MF = MIRBuilder.getMF();
+    MachineMemOperand *LargeMMO =
+        MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
+    MachineMemOperand *SmallMMO =
+        MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
+    MIRBuilder.buildStore(ExtVal.getReg(0), PtrReg, *LargeMMO);
+    MIRBuilder.buildStore(SmallVal.getReg(0), SmallPtr.getReg(0), *SmallMMO);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
   case TargetOpcode::G_CTLZ:
diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 85110b2ec76..8f7a521b559 100644
--- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -234,14 +234,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
       .legalForTypesWithMemDesc({{s32, p0, 8, 8},
                                  {s32, p0, 16, 8}})
       .clampScalar(0, s8, s64)
-      .widenScalarToNextPow2(0)
-      // TODO: We could support sum-of-pow2's but the lowering code doesn't know
-      //       how to do that yet.
-      .unsupportedIfMemSizeNotPow2()
+      .lowerIfMemSizeNotPow2()
       // Lower any any-extending loads left into G_ANYEXT and G_LOAD
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
       })
+      .widenScalarToNextPow2(0)
       .clampMaxNumElements(0, s32, 2)
       .clampMaxNumElements(0, s64, 1)
       .customIf(IsPtrVecPred);
@@ -249,6 +247,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
   getActionDefinitionsBuilder(G_STORE)
       .legalForTypesWithMemDesc({{s8, p0, 8, 8},
                                  {s16, p0, 16, 8},
+                                 {s32, p0, 8, 8},
+                                 {s32, p0, 16, 8},
                                  {s32, p0, 32, 8},
                                  {s64, p0, 64, 8},
                                  {p0, p0, 64, 8},
@@ -259,10 +259,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
                                  {v4s32, p0, 128, 8},
                                  {v2s64, p0, 128, 8}})
       .clampScalar(0, s8, s64)
-      .widenScalarToNextPow2(0)
-      // TODO: We could support sum-of-pow2's but the lowering code doesn't know
-      //       how to do that yet.
-      .unsupportedIfMemSizeNotPow2()
+      .lowerIfMemSizeNotPow2()
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
                Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;