diff options
| -rw-r--r-- | llvm/lib/Target/ARM/ARMParallelDSP.cpp | 67 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll | 159 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll | 58 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll | 12 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll | 4 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll | 36 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll | 18 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll | 16 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll | 4 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/smladx-1.ll | 9 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll | 9 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll | 9 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll | 6 | 
13 files changed, 316 insertions, 91 deletions
diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp index 8bda733c50c..ae5657a0a2c 100644 --- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -18,6 +18,7 @@  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/OrderedBasicBlock.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/NoFolder.h"  #include "llvm/Transforms/Scalar.h" @@ -42,6 +43,10 @@ static cl::opt<bool>  DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),                     cl::desc("Disable the ARM Parallel DSP pass")); +static cl::opt<unsigned> +NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16), +             cl::desc("Limit the number of loads analysed")); +  namespace {    struct MulCandidate;    class Reduction; @@ -346,6 +351,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {    SmallVector<Instruction*, 8> Writes;    LoadPairs.clear();    WideLoads.clear(); +  OrderedBasicBlock OrderedBB(BB);    // Collect loads and instruction that may write to memory. For now we only    // record loads which are simple, sign-extended and have a single user. @@ -360,21 +366,24 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {      Loads.push_back(Ld);    } +  if (Loads.empty() || Loads.size() > NumLoadLimit) +    return false; +    using InstSet = std::set<Instruction*>;    using DepMap = std::map<Instruction*, InstSet>;    DepMap RAWDeps;    // Record any writes that may alias a load.    
const auto Size = LocationSize::unknown(); -  for (auto Read : Loads) { -    for (auto Write : Writes) { +  for (auto Write : Writes) { +    for (auto Read : Loads) {        MemoryLocation ReadLoc =          MemoryLocation(Read->getPointerOperand(), Size);        if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),            ModRefInfo::ModRef)))          continue; -      if (DT->dominates(Write, Read)) +      if (OrderedBB.dominates(Write, Read))          RAWDeps[Read].insert(Write);      }    } @@ -382,8 +391,8 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {    // Check whether there's not a write between the two loads which would    // prevent them from being safely merged.    auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) { -    LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset; -    LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base; +    LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset; +    LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base;      if (RAWDeps.count(Dominated)) {        InstSet &WritesBefore = RAWDeps[Dominated]; @@ -391,7 +400,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {        for (auto Before : WritesBefore) {          // We can't move the second load backward, past a write, to merge          // with the first load. -        if (DT->dominates(Dominator, Before)) +        if (OrderedBB.dominates(Dominator, Before))            return false;        }      } @@ -401,7 +410,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {    // Record base, offset load pairs.    
for (auto *Base : Loads) {      for (auto *Offset : Loads) { -      if (Base == Offset) +      if (Base == Offset || OffsetLoads.count(Offset))          continue;        if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) && @@ -613,7 +622,6 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {    return !R.getMulPairs().empty();  } -  void ARMParallelDSP::InsertParallelMACs(Reduction &R) {    auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1, @@ -633,30 +641,45 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) {          Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);      IRBuilder<NoFolder> Builder(InsertAfter->getParent(), -                                ++BasicBlock::iterator(InsertAfter)); +                                BasicBlock::iterator(InsertAfter));      Instruction *Call = Builder.CreateCall(SMLAD, Args);      NumSMLAD++;      return Call;    }; -  Instruction *InsertAfter = R.getRoot(); +  // Return the instruction after the dominated instruction. +  auto GetInsertPoint = [this](Value *A, Value *B) { +    assert((isa<Instruction>(A) || isa<Instruction>(B)) && +           "expected at least one instruction"); + +    Value *V = nullptr; +    if (!isa<Instruction>(A)) +      V = B; +    else if (!isa<Instruction>(B)) +      V = A; +    else +      V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A; + +    return &*++BasicBlock::iterator(cast<Instruction>(V)); +  }; +    Value *Acc = R.getAccumulator();    // For any muls that were discovered but not paired, accumulate their values    // as before. 
-  IRBuilder<NoFolder> Builder(InsertAfter->getParent(), -                              ++BasicBlock::iterator(InsertAfter)); +  IRBuilder<NoFolder> Builder(R.getRoot()->getParent());    MulCandList &MulCands = R.getMuls();    for (auto &MulCand : MulCands) {      if (MulCand->Paired)        continue; -    Value *Mul = MulCand->Root; +    Instruction *Mul = cast<Instruction>(MulCand->Root);      LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");      if (R.getType() != Mul->getType()) {        assert(R.is64Bit() && "expected 64-bit result"); -      Mul = Builder.CreateSExt(Mul, R.getType()); +      Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul)); +      Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));      }      if (!Acc) { @@ -664,8 +687,11 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) {        continue;      } +    // If Acc is the original incoming value to the reduction, it could be a +    // phi. But the phi will dominate Mul, meaning that Mul will be the +    // insertion point. +    Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));      Acc = Builder.CreateAdd(Mul, Acc); -    InsertAfter = cast<Instruction>(Acc);    }    if (!Acc) { @@ -677,6 +703,14 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) {      Acc = Builder.CreateSExt(Acc, R.getType());    } +  // Roughly sort the mul pairs in their program order. +  OrderedBasicBlock OrderedBB(R.getRoot()->getParent()); +  llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) { +               const Instruction *A = PairA.first->Root; +               const Instruction *B = PairB.first->Root; +               return OrderedBB.dominates(A, B); +             }); +    IntegerType *Ty = IntegerType::get(M->getContext(), 32);    for (auto &Pair : R.getMulPairs()) {      MulCandidate *LHSMul = Pair.first; @@ -688,8 +722,9 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) {      LoadInst *WideRHS = WideLoads.count(BaseRHS) ?        
WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty); +    Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS); +    InsertAfter = GetInsertPoint(InsertAfter, Acc);      Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter); -    InsertAfter = cast<Instruction>(Acc);    }    R.UpdateRoot(cast<Instruction>(Acc));  } diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll index 721f887bfef..adcb81ecab1 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll @@ -134,3 +134,162 @@ bb.1:    ret i32 %res  } +; TODO: Four smlads should be generated here, but mul.0 and mul.3 remain as +; scalars. +; CHECK-LABEL: num_load_limit +; CHECK: call i32 @llvm.arm.smlad +; CHECK: call i32 @llvm.arm.smlad +; CHECK: call i32 @llvm.arm.smlad +; CHECK-NOT: call i32 @llvm.arm.smlad +define i32 @num_load_limit(i16* %a, i16* %b, i32 %acc) { +entry: +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.a.0, %sext.b.0 +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.1 = load i16, i16* %addr.a.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %mul.1 = mul i32 %sext.a.1, %sext.b.1 +  %add.0 = add i32 %mul.0, %mul.1 + +  %addr.a.2 = getelementptr i16, i16* %a, i32 2 +  %addr.b.2 = getelementptr i16, i16* %b, i32 2 +  %ld.a.2 = load i16, i16* %addr.a.2 +  %sext.a.2 = sext i16 %ld.a.2 to i32 +  %ld.b.2 = load i16, i16* %addr.b.2 +  %sext.b.2 = sext i16 %ld.b.2 to i32 +  %mul.2 = mul i32 %sext.a.0, %sext.b.0 +  %addr.a.3 = getelementptr i16, i16* %a, i32 3 +  %addr.b.3 = getelementptr i16, i16* %b, i32 3 +  %ld.a.3 = load i16, i16* %addr.a.3 +  %sext.a.3 = sext i16 %ld.a.3 to i32 +  %ld.b.3 = load i16, i16* %addr.b.3 +  
%sext.b.3 = sext i16 %ld.b.3 to i32 +  %mul.3 = mul i32 %sext.a.1, %sext.b.3 +  %add.3 = add i32 %mul.2, %mul.3 + +  %addr.a.4 = getelementptr i16, i16* %a, i32 4 +  %addr.b.4 = getelementptr i16, i16* %b, i32 4 +  %ld.a.4 = load i16, i16* %addr.a.4 +  %sext.a.4 = sext i16 %ld.a.4 to i32 +  %ld.b.4 = load i16, i16* %addr.b.4 +  %sext.b.4 = sext i16 %ld.b.4 to i32 +  %mul.4 = mul i32 %sext.a.4, %sext.b.4 +  %addr.a.5 = getelementptr i16, i16* %a, i32 5 +  %addr.b.5 = getelementptr i16, i16* %b, i32 5 +  %ld.a.5 = load i16, i16* %addr.a.5 +  %sext.a.5 = sext i16 %ld.a.5 to i32 +  %ld.b.5 = load i16, i16* %addr.b.5 +  %sext.b.5 = sext i16 %ld.b.5 to i32 +  %mul.5 = mul i32 %sext.a.5, %sext.b.5 +  %add.5 = add i32 %mul.4, %mul.5 + +  %addr.a.6 = getelementptr i16, i16* %a, i32 6 +  %addr.b.6 = getelementptr i16, i16* %b, i32 6 +  %ld.a.6 = load i16, i16* %addr.a.6 +  %sext.a.6 = sext i16 %ld.a.6 to i32 +  %ld.b.6 = load i16, i16* %addr.b.6 +  %sext.b.6 = sext i16 %ld.b.6 to i32 +  %mul.6 = mul i32 %sext.a.6, %sext.b.6 +  %addr.a.7 = getelementptr i16, i16* %a, i32 7 +  %addr.b.7 = getelementptr i16, i16* %b, i32 7 +  %ld.a.7 = load i16, i16* %addr.a.7 +  %sext.a.7 = sext i16 %ld.a.7 to i32 +  %ld.b.7 = load i16, i16* %addr.b.7 +  %sext.b.7 = sext i16 %ld.b.7 to i32 +  %mul.7 = mul i32 %sext.a.7, %sext.b.7 +  %add.7 = add i32 %mul.6, %mul.7 + +  %add.10 = add i32 %add.7, %add.5 +  %add.11 = add i32 %add.3, %add.0 +  %add.12 = add i32 %add.10, %add.11 +  %res = add i32 %add.12, %acc +  ret i32 %res +} + +; CHECK-LABEL: too_many_loads +; CHECK-NOT: call i32 @llvm.arm.smlad +define i32 @too_many_loads(i16* %a, i16* %b, i32 %acc) { +entry: +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.a.0, %sext.b.0 +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.1 = load i16, i16* %addr.a.1 +  %sext.a.1 = sext i16 
%ld.a.1 to i32 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %mul.1 = mul i32 %sext.a.1, %sext.b.1 +  %add.0 = add i32 %mul.0, %mul.1 + +  %addr.a.2 = getelementptr i16, i16* %a, i32 2 +  %addr.b.2 = getelementptr i16, i16* %b, i32 2 +  %ld.a.2 = load i16, i16* %addr.a.2 +  %sext.a.2 = sext i16 %ld.a.2 to i32 +  %ld.b.2 = load i16, i16* %addr.b.2 +  %sext.b.2 = sext i16 %ld.b.2 to i32 +  %mul.2 = mul i32 %sext.a.0, %sext.b.0 +  %addr.a.3 = getelementptr i16, i16* %a, i32 3 +  %addr.b.3 = getelementptr i16, i16* %b, i32 3 +  %ld.a.3 = load i16, i16* %addr.a.3 +  %sext.a.3 = sext i16 %ld.a.3 to i32 +  %ld.b.3 = load i16, i16* %addr.b.3 +  %sext.b.3 = sext i16 %ld.b.3 to i32 +  %mul.3 = mul i32 %sext.a.1, %sext.b.3 +  %add.3 = add i32 %mul.2, %mul.3 + +  %addr.a.4 = getelementptr i16, i16* %a, i32 4 +  %addr.b.4 = getelementptr i16, i16* %b, i32 4 +  %ld.a.4 = load i16, i16* %addr.a.4 +  %sext.a.4 = sext i16 %ld.a.4 to i32 +  %ld.b.4 = load i16, i16* %addr.b.4 +  %sext.b.4 = sext i16 %ld.b.4 to i32 +  %mul.4 = mul i32 %sext.a.4, %sext.b.4 +  %addr.a.5 = getelementptr i16, i16* %a, i32 5 +  %addr.b.5 = getelementptr i16, i16* %b, i32 5 +  %ld.a.5 = load i16, i16* %addr.a.5 +  %sext.a.5 = sext i16 %ld.a.5 to i32 +  %ld.b.5 = load i16, i16* %addr.b.5 +  %sext.b.5 = sext i16 %ld.b.5 to i32 +  %mul.5 = mul i32 %sext.a.5, %sext.b.5 +  %add.5 = add i32 %mul.4, %mul.5 + +  %addr.a.6 = getelementptr i16, i16* %a, i32 6 +  %addr.b.6 = getelementptr i16, i16* %b, i32 6 +  %ld.a.6 = load i16, i16* %addr.a.6 +  %sext.a.6 = sext i16 %ld.a.6 to i32 +  %ld.b.6 = load i16, i16* %addr.b.6 +  %sext.b.6 = sext i16 %ld.b.6 to i32 +  %mul.6 = mul i32 %sext.a.6, %sext.b.6 +  %addr.a.7 = getelementptr i16, i16* %a, i32 7 +  %addr.b.7 = getelementptr i16, i16* %b, i32 7 +  %ld.a.7 = load i16, i16* %addr.a.7 +  %sext.a.7 = sext i16 %ld.a.7 to i32 +  %ld.b.7 = load i16, i16* %addr.b.7 +  %sext.b.7 = sext i16 %ld.b.7 to i32 +  %mul.7 = mul i32 %sext.a.7, 
%sext.b.7 +  %add.7 = add i32 %mul.6, %mul.7 + +  %addr.a.8 = getelementptr i16, i16* %a, i32 7 +  %addr.b.8 = getelementptr i16, i16* %b, i32 7 +  %ld.a.8 = load i16, i16* %addr.a.8 +  %sext.a.8 = sext i16 %ld.a.8 to i32 +  %ld.b.8 = load i16, i16* %addr.b.8 +  %sext.b.8 = sext i16 %ld.b.8 to i32 +  %mul.8 = mul i32 %sext.a.8, %sext.b.8 + +  %add.10 = add i32 %add.7, %add.5 +  %add.11 = add i32 %add.3, %add.0 +  %add.12 = add i32 %add.10, %add.11 +  %add.13 = add i32 %add.12, %acc +  %res = add i32 %add.13, %mul.8 +  ret i32 %res +} diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll b/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll index 68702b722c7..fab5aba46f5 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll @@ -1,21 +1,51 @@ -; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m4 -O3 %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m4 -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LLC +; RUN: opt -S -mtriple=armv7-a -arm-parallel-dsp -dce %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OPT  ; TODO: Think we should be able to use smlsdx/smlsldx here.  
; CHECK-LABEL: complex_dot_prod -; CHECK: smulbb -; CHECK: smultt -; CHECK: smlalbb -; CHECK: smultt -; CHECK: smlalbb -; CHECK: smultt -; CHECK: smlalbb -; CHECK: smultt -; CHECK: smlaldx -; CHECK: smlaldx -; CHECK: smlaldx -; CHECK: pop.w	{r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-LLC: smlaldx +; CHECK-LLC: smulbb +; CHECK-LLC: smultt +; CHECK-LLC: smlaldx +; CHECK-LLC: smlalbb +; CHECK-LLC: smultt +; CHECK-LLC: smlalbb +; CHECK-LLC: smultt +; CHECK-LLC: smlaldx +; CHECK-LLC: smlalbb +; CHECK-LLC: smultt +; CHECK-LLC: smlaldx +; CHECK-LLC: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} + +; CHECK-OPT: [[ADDR_A:%[^ ]+]] = bitcast i16* %pSrcA to i32* +; CHECK-OPT: [[A:%[^ ]+]] = load i32, i32* [[ADDR_A]], align 2 +; CHECK-OPT: [[ADDR_A_2:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 2 +; CHECK-OPT: [[ADDR_B:%[^ ]+]] = bitcast i16* %pSrcB to i32* +; CHECK-OPT: [[B:%[^ ]+]] = load i32, i32* [[ADDR_B]], align 2 +; CHECK-OPT: [[ACC0:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A]], i32 [[B]], i64 0) +; CHECK-OPT: [[ADDR_B_2:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 2 +; CHECK-OPT: [[CAST_ADDR_A_2:%[^ ]+]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-OPT: [[A_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_2]], align 2 +; CHECK-OPT: [[ADDR_A_4:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 4 +; CHECK-OPT: [[CAST_ADDR_B_2:%[^ ]+]] = bitcast i16* [[ADDR_B_2]] to i32* +; CHECK-OPT: [[B_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_2]], align 2 +; CHECK-OPT: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A_2]], i32 [[B_2]], i64 [[ACC0]]) +; CHECK-OPT: [[ADDR_B_4:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 4 +; CHECK-OPT: [[CAST_ADDR_A_4:%[^ ]+]] = bitcast i16* [[ADDR_A_4]] to i32* +; CHECK-OPT: [[A_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_4]], align 2 +; CHECK-OPT: [[ADDR_A_6:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 6 +; CHECK-OPT: [[CAST_ADDR_B_4:%[^ ]+]] = bitcast i16* [[ADDR_B_4]] to i32* +; CHECK-OPT: [[B_4:%[^ ]+]] = 
load i32, i32* [[CAST_ADDR_B_4]], align 2 +; CHECK-OPT: [[ACC2:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A_4]], i32 [[B_4]], i64 [[ACC1]]) +; CHECK-OPT: [[ADDR_B_6:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 6 +; CHECK-OPT: [[CAST_ADDR_A_6:%[^ ]+]] = bitcast i16* [[ADDR_A_6]] to i32* +; CHECK-OPT: [[A_6:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_6]], align 2 +; CHECK-OPT: [[CAST_ADDR_B_6:%[^ ]+]] = bitcast i16* [[ADDR_B_6]] to i32* +; CHECK-OPT: [[B_6:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_6]], align 2 +; CHECK-OPT: call i64 @llvm.arm.smlaldx(i32 [[A_6]], i32 [[B_6]], i64 [[ACC2]]) +  define dso_local arm_aapcscc void @complex_dot_prod(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i32* nocapture %realResult, i32* nocapture %imagResult) {  entry:    %incdec.ptr = getelementptr inbounds i16, i16* %pSrcA, i32 1 @@ -107,7 +137,7 @@ entry:    %conv85 = sext i32 %mul84 to i64    %sub86 = sub nsw i64 %add76, %conv85    %mul89 = mul nsw i32 %conv73, %conv82 -  %conv90 = sext i32 %mul89 to i64   +  %conv90 = sext i32 %mul89 to i64    %add81 = add nsw i64 %add67, %conv90    %add91 = add nsw i64 %add81, %conv80    %16 = lshr i64 %sub86, 6 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll index 0e2a21ea7b4..e90284b8613 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll @@ -105,10 +105,10 @@ entry:  ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]  ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*  ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc  ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2  ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*  ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc  ; CHECK: call i32 
@llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]])  define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) {  entry: @@ -144,10 +144,10 @@ entry:  ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]  ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*  ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc  ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2  ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*  ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc  ; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]])  define i64 @exchange_multi_use_64_1(i16* %a, i16* %b, i64 %acc) {  entry: @@ -184,10 +184,10 @@ entry:  ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]  ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*  ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc  ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2  ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*  ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc  ; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]])  define i64 @exchange_multi_use_64_2(i16* %a, i16* %b, i64 %acc) {  entry: @@ -225,10 +225,10 @@ entry:  ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]  ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*  ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc  ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2  ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*  ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: 
[[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc  ; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]])  define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) {  entry: @@ -306,8 +306,8 @@ entry:  ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2  ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*  ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 0) -; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 [[ACC]]) +; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 0) +; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]])  define i64 @exchange_multi_use_64_3(i16* %a, i16* %b, i64 %acc) {  entry:    %addr.a.1 = getelementptr i16, i16* %a, i32 1 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll b/llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll index 052fb51a8dd..a75dd591dfc 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll @@ -11,14 +11,14 @@  ; CHECK: [[BIJ_LD:%[^ ]+]] = load i32, i32* [[BIJ_CAST]], align 2  ; CHECK: [[CIJ_CAST:%[^ ]+]] = bitcast i16* [[CIJ]] to i32*  ; CHECK: [[CIJ_LD:%[^ ]+]] = load i32, i32* [[CIJ_CAST]], align 2 +; CHECK: [[SMLAD0:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 0)  ; CHECK: [[BIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[BIJ]], i32 2  ; CHECK: [[BIJ_2_CAST:%[^ ]+]] = bitcast i16* [[BIJ_2]] to i32*  ; CHECK: [[BIJ_2_LD:%[^ ]+]] = load i32, i32* [[BIJ_2_CAST]], align 2  ; CHECK: [[CIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[CIJ]], i32 2  ; CHECK: [[CIJ_2_CAST:%[^ ]+]] = bitcast i16* [[CIJ_2]] to i32*  ; CHECK: [[CIJ_2_LD:%[^ ]+]] = load i32, i32* [[CIJ_2_CAST]], align 2 -; CHECK: [[SMLAD0:%[^ ]+]] = call i32 
@llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 0) -; CHECK: [[SMLAD1:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 [[SMLAD0]]) +; CHECK: [[SMLAD1:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 [[SMLAD0]])  ; CHECK: store i32 [[SMLAD1]], i32* %arrayidx, align 4  define void @full_unroll(i32* noalias nocapture %a, i16** noalias nocapture readonly %b, i16** noalias nocapture readonly %c, i32 %N) { diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll index 37e39a02417..6949b4a7048 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll @@ -19,9 +19,9 @@ define i32 @add_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture rea  ; CHECK-LE-NEXT:    @ =>This Inner Loop Header: Depth=1  ; CHECK-LE-NEXT:    ldr lr, [r3, #2]!  ; CHECK-LE-NEXT:    ldr r4, [r2, #2]! -; CHECK-LE-NEXT:    sxtah r1, r1, lr  ; CHECK-LE-NEXT:    subs r0, #1  ; CHECK-LE-NEXT:    smlad r12, r4, lr, r12 +; CHECK-LE-NEXT:    sxtah r1, r1, lr  ; CHECK-LE-NEXT:    bne .LBB0_2  ; CHECK-LE-NEXT:  @ %bb.3: @ %for.cond.cleanup  ; CHECK-LE-NEXT:    add.w r0, r12, r1 @@ -210,33 +210,33 @@ for.body:  define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {  ; CHECK-LE-LABEL: mul_top_user:  ; CHECK-LE:       @ %bb.0: @ %entry -; CHECK-LE-NEXT:    .save {r4, r5, r7, lr} -; CHECK-LE-NEXT:    push {r4, r5, r7, lr} +; CHECK-LE-NEXT:    .save {r4, lr} +; CHECK-LE-NEXT:    push {r4, lr}  ; CHECK-LE-NEXT:    cmp r0, #1  ; CHECK-LE-NEXT:    blt .LBB2_4  ; CHECK-LE-NEXT:  @ %bb.1: @ %for.body.preheader -; CHECK-LE-NEXT:    sub.w lr, r2, #2 +; CHECK-LE-NEXT:    subs r2, #2  ; CHECK-LE-NEXT:    subs r3, #2  ; CHECK-LE-NEXT:    mov.w r12, #0  ; CHECK-LE-NEXT:    movs r1, #0  ; CHECK-LE-NEXT:    .p2align 2  ; CHECK-LE-NEXT:  .LBB2_2: @ %for.body  ; 
CHECK-LE-NEXT:    @ =>This Inner Loop Header: Depth=1 -; CHECK-LE-NEXT:    ldr r2, [lr, #2]! -; CHECK-LE-NEXT:    ldr r4, [r3, #2]! -; CHECK-LE-NEXT:    asrs r5, r2, #16 -; CHECK-LE-NEXT:    smlad r12, r2, r4, r12 +; CHECK-LE-NEXT:    ldr lr, [r3, #2]! +; CHECK-LE-NEXT:    ldr r4, [r2, #2]!  ; CHECK-LE-NEXT:    subs r0, #1 -; CHECK-LE-NEXT:    mul r1, r5, r1 +; CHECK-LE-NEXT:    smlad r12, r4, lr, r12 +; CHECK-LE-NEXT:    asr.w r4, r4, #16 +; CHECK-LE-NEXT:    mul r1, r4, r1  ; CHECK-LE-NEXT:    bne .LBB2_2  ; CHECK-LE-NEXT:  @ %bb.3: @ %for.cond.cleanup  ; CHECK-LE-NEXT:    add.w r0, r12, r1 -; CHECK-LE-NEXT:    pop {r4, r5, r7, pc} +; CHECK-LE-NEXT:    pop {r4, pc}  ; CHECK-LE-NEXT:  .LBB2_4:  ; CHECK-LE-NEXT:    mov.w r12, #0  ; CHECK-LE-NEXT:    movs r1, #0  ; CHECK-LE-NEXT:    add.w r0, r12, r1 -; CHECK-LE-NEXT:    pop {r4, r5, r7, pc} +; CHECK-LE-NEXT:    pop {r4, pc}  ;  ; CHECK-BE-LABEL: mul_top_user:  ; CHECK-BE:       @ %bb.0: @ %entry @@ -313,8 +313,8 @@ for.body:  define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {  ; CHECK-LE-LABEL: and_user:  ; CHECK-LE:       @ %bb.0: @ %entry -; CHECK-LE-NEXT:    .save {r4, r5, r7, lr} -; CHECK-LE-NEXT:    push {r4, r5, r7, lr} +; CHECK-LE-NEXT:    .save {r4, lr} +; CHECK-LE-NEXT:    push {r4, lr}  ; CHECK-LE-NEXT:    cmp r0, #1  ; CHECK-LE-NEXT:    blt .LBB3_4  ; CHECK-LE-NEXT:  @ %bb.1: @ %for.body.preheader @@ -327,19 +327,19 @@ define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture rea  ; CHECK-LE-NEXT:    @ =>This Inner Loop Header: Depth=1  ; CHECK-LE-NEXT:    ldr r2, [r3, #2]!  ; CHECK-LE-NEXT:    ldr r4, [lr, #2]! 
-; CHECK-LE-NEXT:    uxth r5, r2 -; CHECK-LE-NEXT:    smlad r12, r4, r2, r12  ; CHECK-LE-NEXT:    subs r0, #1 -; CHECK-LE-NEXT:    mul r1, r5, r1 +; CHECK-LE-NEXT:    smlad r12, r4, r2, r12 +; CHECK-LE-NEXT:    uxth r2, r2 +; CHECK-LE-NEXT:    mul r1, r2, r1  ; CHECK-LE-NEXT:    bne .LBB3_2  ; CHECK-LE-NEXT:  @ %bb.3: @ %for.cond.cleanup  ; CHECK-LE-NEXT:    add.w r0, r12, r1 -; CHECK-LE-NEXT:    pop {r4, r5, r7, pc} +; CHECK-LE-NEXT:    pop {r4, pc}  ; CHECK-LE-NEXT:  .LBB3_4:  ; CHECK-LE-NEXT:    mov.w r12, #0  ; CHECK-LE-NEXT:    movs r1, #0  ; CHECK-LE-NEXT:    add.w r0, r12, r1 -; CHECK-LE-NEXT:    pop {r4, r5, r7, pc} +; CHECK-LE-NEXT:    pop {r4, pc}  ;  ; CHECK-BE-LABEL: and_user:  ; CHECK-BE:       @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll index 1f4b141b7a0..f807149d2c1 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll @@ -7,12 +7,12 @@  ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]  ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*  ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc)  ; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32*  ; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]]  ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32*  ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] -; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 %acc) -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 [[ACC]])  ; CHECK: ret i32 [[RES]]  define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) {  entry: @@ -51,12 +51,12 @@ entry:  ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]  ; CHECK: [[CAST_B:%[^ ]+]] = 
bitcast i16* %b to i32*  ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 %acc)  ; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32*  ; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]]  ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32*  ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] -; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 %acc) -; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 [[ACC]])  ; CHECK: ret i64 [[RES]]  define i64 @overlap_64_1(i16* %a, i16* %b, i64 %acc) {  entry: @@ -133,13 +133,14 @@ entry:  ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]  ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*  ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc)  ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*  ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]  ; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2  ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*  ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) -; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]]) +; CHECK: ret i32 [[RES]]  define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) {  entry:    %addr.a.1 = getelementptr i16, i16* %a, i32 1 @@ -178,13 +179,14 @@ entry:  ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]  ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*  ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: 
[[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc)  ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*  ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]  ; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2  ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*  ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) -; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]]) +; CHECK: ret i32 [[RES]]  define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) {  entry:    %addr.a.1 = getelementptr i16, i16* %a, i32 1 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll b/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll index 5cccc0572ae..7620b64f26a 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll @@ -15,14 +15,14 @@  ; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2  ; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*  ; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ADD0]])  ; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5  ; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*  ; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2  ; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4  ; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*  ; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 -; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 
[[ADD0]]) -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ACC]])  ; CHECK: ret i32 [[RES]]  define i32 @first_mul_invalid(i16* nocapture readonly %in, i16* nocapture readonly %b) {  entry: @@ -88,14 +88,14 @@ entry:  ; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2  ; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*  ; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[MUL0]])  ; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5  ; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*  ; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2  ; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4  ; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*  ; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 -; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[MUL0]]) -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ACC]])  ; CHECK: ret i32 [[RES]]  define i32 @with_no_acc_input(i16* nocapture readonly %in, i16* nocapture readonly %b) {  entry: @@ -157,14 +157,14 @@ entry:  ; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2  ; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32*  ; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_3]], i32 
[[B_PLUS_2]], i64 [[ADD0]])  ; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5  ; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32*  ; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2  ; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4  ; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32*  ; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 -; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i64 [[ADD0]]) -; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i64 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i64 [[ACC]])  ; CHECK: ret i64 [[RES]]  define i64 @with_64bit_acc(i16* nocapture readonly %in, i16* nocapture readonly %b) {  entry: @@ -238,13 +238,13 @@ entry:  ; CHECK: [[Y_1:%[^ ]+]] = load i16, i16* [[ADDR_Y_MINUS_1]], align 2  ; CHECK: [[SEXT_Y_1:%[^ ]+]] = sext i16 [[Y_1]] to i32  ; CHECK: [[UNPAIRED:%[^ ]+]] = mul nsw i32 [[SEXT_Y_1]], [[SEXT_X_1]] +; CHECK: [[SEXT:%[^ ]+]] = sext i32 [[UNPAIRED]] to i64 +; CHECK: [[ACC:%[^ ]+]] = add i64 [[SEXT]], [[ADD_1]]  ; CHECK: [[ADDR_X_PLUS_2:%[^ ]+]] = bitcast i16* [[X_PLUS_2]] to i32*  ; CHECK: [[X_2:%[^ ]+]] = load i32, i32* [[ADDR_X_PLUS_2]], align 2  ; CHECK: [[Y_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %py.8757.unr, i32 -3  ; CHECK: [[ADDR_Y_MINUS_3:%[^ ]+]] = bitcast i16* [[Y_MINUS_3]] to i32*  ; CHECK: [[Y_3:%[^ ]+]] = load i32, i32* [[ADDR_Y_MINUS_3]], align 2 -; CHECK: [[SEXT:%[^ ]+]] = sext i32 [[UNPAIRED]] to i64 -; CHECK: [[ACC:%[^ ]+]] = add i64 [[SEXT]], [[ADD_1]]  ; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[Y_3]], i32 [[X_2]], i64 [[ACC]])  ; CHECK: ret i64 [[RES]]  define i64 @with_64bit_add_acc(i16* nocapture readonly %px.10756.unr, i16* nocapture readonly 
%py.8757.unr, i32 %acc) { diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll index 22744be02b0..2f47c2ce7e1 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll @@ -10,10 +10,10 @@  ; CHECK:  [[V16:%[0-9]+]] = load i32, i32* [[V15]], align 2  ; CHECK:  [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32*  ; CHECK:  [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2 +; CHECK:  [[ACC:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054)  ; CHECK:  [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*  ; CHECK:  [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2 -; CHECK:  [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 %mac1{{\.}}054) -; CHECK:  [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 [[V12]]) +; CHECK:  [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[ACC]])  ;  ; And we don't want to see a 3rd smlad:  ; CHECK-NOT: call i32 @llvm.arm.smlad diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smladx-1.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smladx-1.ll index aa012573606..96e0667db6a 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/smladx-1.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/smladx-1.ll @@ -12,12 +12,13 @@ define i32 @smladx(i16* nocapture readonly %pIn1, i16* nocapture readonly %pIn2,  ; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2  ; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*  ; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC0]]) +  ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*  ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2  ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*  ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN23]], 
i32 [[IN12]], i32 [[ACC0]]) -; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC1]]) +; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN23]], i32 [[IN12]], i32 [[ACC1]])  ; CHECK-NOT: call i32 @llvm.arm.smlad  ; CHECK-UNSUPPORTED-NOT:  call i32 @llvm.arm.smlad @@ -130,6 +131,7 @@ define i32 @smladx_swap(i16* nocapture readonly %pIn1, i16* nocapture readonly %  ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2  ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*  ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC0]])  ; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2  ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32* @@ -138,8 +140,7 @@ define i32 @smladx_swap(i16* nocapture readonly %pIn1, i16* nocapture readonly %  ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*  ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC0]]) -; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC1]]) +; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC1]])  ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4  ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll index e325fe2f2a8..8fcfe66e8c9 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll @@ -11,12 +11,12 @@ define i64 @smlaldx(i16* nocapture readonly %pIn1, i16* nocapture readonly %pIn2  ; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2  ; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*  ; CHECK: [[IN10:%[^ ]+]] = load i32, i32* 
[[PIN10]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC0]])  ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*  ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2  ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*  ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC1]])  ; CHECK-NOT: call i64 @llvm.arm.smlad  ; CHECK-UNSUPPORTED-NOT:  call i64 @llvm.arm.smlad @@ -187,6 +187,7 @@ for.cond.cleanup:  ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2  ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*  ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])  ; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2  ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32* @@ -194,9 +195,7 @@ for.cond.cleanup:  ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*  ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2 - -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC1]])  ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4  ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll index 07cc1b41ed2..683562e54aa 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll +++ 
b/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll @@ -11,12 +11,12 @@ define i64 @smlaldx(i16* nocapture readonly %pIn1, i16* nocapture readonly %pIn2  ; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2  ; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*  ; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC0]])  ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*  ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2  ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*  ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC1]])  ; CHECK-NOT: call i64 @llvm.arm.smlad  ; CHECK-UNSUPPORTED-NOT:  call i64 @llvm.arm.smlad @@ -187,6 +187,7 @@ for.cond.cleanup:  ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2  ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*  ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])  ; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2  ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32* @@ -194,9 +195,7 @@ for.cond.cleanup:  ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*  ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2 - -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC1]])  ; CHECK: [[PIN1_NEXT]] = 
getelementptr i16, i16* [[PIN1]], i32 4  ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll b/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll index 16d0216df7e..1e988fe34bf 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll @@ -1,6 +1,6 @@  ; RUN: llc -O3 -mtriple=thumbv7em -mcpu=cortex-m4 %s -o - | FileCheck %s --check-prefix=CHECK-REG-PRESSURE  ; RUN: llc -O3 -mtriple=thumbv7eb %s -o - | FileCheck %s --check-prefix=CHECK-UNSUPPORTED -; RUN: llc -O3 -mtriple=thumbv8m.main -mattr=+dsp %s -o - | FileCheck %s --check-prefix=CHECK +; RUN: llc -O3 -mtriple=thumbv8m.main -mattr=+dsp -arm-parallel-dsp-load-limit=20 %s -o - | FileCheck %s --check-prefix=CHECK  ; CHECK-UNSUPPORTED-LABEL: unroll_n_jam_smlad  ; CHECK-UNSUPPORTED-NOT: smlad r{{.}} @@ -38,13 +38,13 @@ entry:  ; CHECK-NOT: smlad r{{.*}}  ; CHECK-REG-PRESSURE: .LBB0_1: +; CHECK-REG-PRESSURE-NOT: call i32 @llvm.arm.smlad  ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp  ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp  ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp  ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp  ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp -; CHECK-REG-PRESSURE: ldr{{.*}}, [sp -; CHECK-REG-PRESSURE: ldr{{.*}}, [sp +; CHECK-REG-PRESSURE-NOT: ldr{{.*}}, [sp  ; CHECK-REG-PRESSURE: bne .LBB0_1  for.body:  | 

