diff options
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Target/ARM/ARMParallelDSP.cpp | 8 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll | 4 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll | 329 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll | 161 | 
4 files changed, 497 insertions, 5 deletions
diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp index a4d3fffa75b..6ef03c644d1 100644 --- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -68,7 +68,7 @@ namespace {      }      LoadInst *getBaseLoad() const { -      return cast<LoadInst>(LHS); +      return VecLd.front();      }    }; @@ -696,13 +696,15 @@ LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,    // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.    // TODO: Support big-endian as well.    Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType()); -  BaseSExt->setOperand(0, Bottom); +  Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType()); +  BaseSExt->replaceAllUsesWith(NewBaseSExt);    IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());    Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());    Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);    Value *Trunc = IRB.CreateTrunc(Top, OffsetTy); -  OffsetSExt->setOperand(0, Trunc); +  Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType()); +  OffsetSExt->replaceAllUsesWith(NewOffsetSExt);    WideLoads.emplace(std::make_pair(Base,                                     make_unique<WidenedLoad>(Loads, WideLoad))); diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll index 8d26f61eb6c..d9dbd960974 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll @@ -5,7 +5,7 @@  ; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]  ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*  ; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK  call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc) +; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc)  define i32 @single_block(i16* %a, i16* %b, i32 %acc) {  entry:    %ld.a.0 = load i16, i16* %a @@ -30,7 +30,7 @@ entry:  ; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]  ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*  ; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK  call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0) +; CHECK:  call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)  define i32 @multi_block(i16* %a, i16* %b, i32 %acc) {  entry:    %ld.a.0 = load i16, i16* %a diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll new file mode 100644 index 00000000000..c072df49cdf --- /dev/null +++ b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll @@ -0,0 +1,329 @@ +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s + +; CHECK-LABEL: exchange_1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_1(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.a.0, %sext.b.1 +  %mul.1 = mul i32 %sext.a.1, %sext.b.0 +  %add = add i32 %mul.0, %mul.1 +  %res = add i32 %add, %acc +  ret i32 %res +} + +; CHECK-LABEL: exchange_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_2(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.b.1, %sext.a.0 +  %mul.1 = mul i32 %sext.b.0, %sext.a.1 +  %add = add i32 %mul.0, %mul.1 +  %res = add i32 %add, %acc +  ret i32 %res +} + +; CHECK-LABEL: exchange_3 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] +define i32 @exchange_3(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.a.0, %sext.b.1 +  %mul.1 = mul i32 %sext.a.1, %sext.b.0 +  %add = add i32 %mul.1, %mul.0 +  %res = add i32 %add, %acc +  ret i32 %res +} + +; CHECK-LABEL: exchange_4 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] +define i32 @exchange_4(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.b.1, %sext.a.0 +  %mul.1 = mul i32 %sext.b.0, %sext.a.1 +  %add = add i32 %mul.1, %mul.0 +  %res = add i32 %add, %acc +  ret i32 %res +} + +; CHECK-LABEL: exchange_multi_use_1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]]) +define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.a.0, %sext.b.1 +  %mul.1 = mul i32 %sext.a.1, %sext.b.0 +  %add = add i32 %mul.0, %mul.1 +  %addr.a.2 = getelementptr i16, i16* %a, i32 2 +  %addr.a.3 = getelementptr i16, i16* %a, i32 3 +  %ld.a.2 = load i16, i16* %addr.a.2 +  %ld.a.3 = load i16, i16* %addr.a.3 +  %sext.a.2 = sext i16 %ld.a.2 to i32 +  %sext.a.3 = sext i16 %ld.a.3 to i32 +  %mul.2 = mul i32 %sext.a.3, %sext.b.1 +  %mul.3 = mul i32 %sext.a.2, %sext.b.0 +  %add.1 = add i32 %mul.2, %mul.3 +  %add.2 = add i32 %add, %add.1 +  %res = add i32 %add.2, %acc +  ret i32 %res +} + +; CHECK-LABEL: exchange_multi_use_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]]) +define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.a.0, %sext.b.0 +  %mul.1 = mul i32 %sext.a.1, %sext.b.1 +  %add = add i32 %mul.0, %mul.1 +  %addr.a.2 = getelementptr i16, i16* %a, i32 2 +  %addr.a.3 = getelementptr i16, i16* %a, i32 3 +  %ld.a.2 = load i16, i16* %addr.a.2 +  %ld.a.3 = load i16, i16* %addr.a.3 +  %sext.a.2 = sext i16 %ld.a.2 to i32 +  %sext.a.3 = sext i16 %ld.a.3 to i32 +  %mul.2 = mul i32 %sext.b.0, %sext.a.3 +  %mul.3 = mul i32 %sext.b.1, %sext.a.2 +  %add.1 = add i32 %mul.2, %mul.3 +  %add.2 = add i32 %add, %add.1 +  %res = add i32 %add.2, %acc +  ret i32 %res +} + +; TODO: Why aren't two intrinsics generated? +; CHECK-LABEL: exchange_multi_use_3 +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK-NOT: call i32 @llvm.arm.smlad +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0 +define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %addr.a.2 = getelementptr i16, i16* %a, i32 2 +  %addr.a.3 = getelementptr i16, i16* %a, i32 3 +  %ld.a.2 = load i16, i16* %addr.a.2 +  %ld.a.3 = load i16, i16* %addr.a.3 +  %sext.a.2 = sext i16 %ld.a.2 to i32 +  %sext.a.3 = sext i16 %ld.a.3 to i32 +  %mul.2 = mul i32 %sext.b.0, %sext.a.3 +  %mul.3 = mul i32 %sext.b.1, %sext.a.2 +  %mul.0 = mul i32 %sext.a.0, %sext.b.0 +  %mul.1 = mul i32 %sext.a.1, %sext.b.1 +  %add = add i32 %mul.0, %mul.1 +  %add.1 = add i32 %mul.2, %mul.3 +  %sub = sub i32 %add, %add.1 +  %res = add i32 %acc, %sub +  ret i32 %res +} + +; TODO: Why isn't smladx generated too? +; CHECK-LABEL: exchange_multi_use_4 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0 +; CHECK-NOT: call i32 @llvm.arm.smlad +define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %addr.a.2 = getelementptr i16, i16* %a, i32 2 +  %addr.a.3 = getelementptr i16, i16* %a, i32 3 +  %ld.a.2 = load i16, i16* %addr.a.2 +  %ld.a.3 = load i16, i16* %addr.a.3 +  %sext.a.2 = sext i16 %ld.a.2 to i32 +  %sext.a.3 = sext i16 %ld.a.3 to i32 +  %mul.2 = mul i32 %sext.b.0, %sext.a.3 +  %mul.3 = mul i32 %sext.b.1, %sext.a.2 +  %mul.0 = mul i32 %sext.a.0, %sext.b.0 +  %mul.1 = mul i32 %sext.a.1, %sext.b.1 +  %add.1 = add i32 %mul.2, %mul.3 +  %add = add i32 %mul.0, %mul.1 +  %sub = sub i32 %add, %add.1 +  %res = add i32 %acc, %sub +  ret i32 %res +} + +; CHECK-LABEL: exchange_swap +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] +define i32 @exchange_swap(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.a.1, %sext.b.0 +  %mul.1 = mul i32 %sext.a.0, %sext.b.1 +  %add = add i32 %mul.0, %mul.1 +  %res = add i32 %add, %acc +  ret i32 %res +} + +; CHECK-LABEL: exchange_swap_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_swap_2(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.a.1, %sext.b.0 +  %mul.1 = mul i32 %sext.a.0, %sext.b.1 +  %add = add i32 %mul.1, %mul.0 +  %res = add i32 %add, %acc +  ret i32 %res +} + +; CHECK-LABEL: exchange_swap_3 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_swap_3(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.b.0, %sext.a.1 +  %mul.1 = mul i32 %sext.b.1, %sext.a.0 +  %add = add i32 %mul.1, %mul.0 +  %res = add i32 %add, %acc +  ret i32 %res +} diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll new file mode 100644 index 00000000000..238f1eb0301 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll @@ -0,0 +1,161 @@ +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s + +; CHECK-LABEL: overlap_1 +; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 +; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc +define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.a.0, %sext.b.0 +  %mul.1 = mul i32 %sext.a.1, %sext.b.1 +  %addr.a.2 = getelementptr i16, i16* %a, i32 2 +  %addr.b.2 = getelementptr i16, i16* %b, i32 2 +  %ld.a.2 = load i16, i16* %addr.a.2 +  %ld.b.2 = load i16, i16* %addr.b.2 +  %sext.a.2 = sext i16 %ld.a.2 to i32 +  %sext.b.2 = sext i16 %ld.b.2 to i32 +  %mul.2 = mul i32 %sext.a.2, %sext.b.2 +  %add = add i32 %mul.0, %mul.1 +  %add.1 = add i32 %mul.1, %mul.2 +  %add.2 = add i32 %add.1, %add +  %res = add i32 %add.2, %acc +  ret i32 %res +} + +; CHECK-LABEL: overlap_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc +define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.a.0, %sext.b.0 +  %mul.1 = mul i32 %sext.a.1, %sext.b.1 +  %addr.a.2 = getelementptr i16, i16* %a, i32 2 +  %addr.b.2 = getelementptr i16, i16* %b, i32 2 +  %ld.a.2 = load i16, i16* %addr.a.2 +  %ld.b.2 = load i16, i16* %addr.b.2 +  %sext.a.2 = sext i16 %ld.a.2 to i32 +  %sext.b.2 = sext i16 %ld.b.2 to i32 +  %mul.2 = mul i32 %sext.b.2, %sext.a.2 +  %add = add i32 %mul.0, %mul.1 +  %add.1 = add i32 %mul.1, %mul.2 +  %add.2 = add i32 %add, %add.1 +  %res = add i32 %add.2, %acc +  ret i32 %res +} + +; CHECK-LABEL: overlap_3 +; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.a.0, %sext.b.0 +  %mul.1 = mul i32 %sext.a.1, %sext.b.1 +  %addr.a.2 = getelementptr i16, i16* %a, i32 2 +  %addr.b.2 = getelementptr i16, i16* %b, i32 2 +  %addr.a.3 = getelementptr i16, i16* %a, i32 3 +  %ld.a.2 = load i16, i16* %addr.a.2 +  %ld.b.2 = load i16, i16* %addr.b.2 +  %ld.a.3 = load i16, i16* %addr.a.3 +  %sext.a.2 = sext i16 %ld.a.2 to i32 +  %sext.b.2 = sext i16 %ld.b.2 to i32 +  %sext.a.3 = sext i16 %ld.a.3 to i32 +  %mul.2 = mul i32 %sext.a.2, %sext.b.1 +  %mul.3 = mul i32 %sext.a.3, %sext.b.2 +  %add = add i32 %mul.0, %mul.1 +  %add.1 = add i32 %mul.2, %mul.3 +  %add.2 = add i32 %add.1, %add +  %res = add i32 %add.2, %acc +  ret i32 %res +} + +; CHECK-LABEL: overlap_4 +; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) { +entry: +  %addr.a.1 = getelementptr i16, i16* %a, i32 1 +  %addr.b.1 = getelementptr i16, i16* %b, i32 1 +  %ld.a.0 = load i16, i16* %a +  %sext.a.0 = sext i16 %ld.a.0 to i32 +  %ld.b.0 = load i16, i16* %b +  %ld.a.1 = load i16, i16* %addr.a.1 +  %ld.b.1 = load i16, i16* %addr.b.1 +  %sext.a.1 = sext i16 %ld.a.1 to i32 +  %sext.b.1 = sext i16 %ld.b.1 to i32 +  %sext.b.0 = sext i16 %ld.b.0 to i32 +  %mul.0 = mul i32 %sext.a.0, %sext.b.0 +  %mul.1 = mul i32 %sext.a.1, %sext.b.1 +  %addr.a.2 = getelementptr i16, i16* %a, i32 2 +  %addr.b.2 = getelementptr i16, i16* %b, i32 2 +  %addr.a.3 = getelementptr i16, i16* %a, i32 3 +  %ld.a.2 = load i16, i16* %addr.a.2 +  %ld.b.2 = load i16, i16* %addr.b.2 +  %ld.a.3 = load i16, i16* %addr.a.3 +  %sext.a.2 = sext i16 %ld.a.2 to i32 +  %sext.b.2 = sext i16 %ld.b.2 to i32 +  %sext.a.3 = sext i16 %ld.a.3 to i32 +  %mul.2 = mul i32 %sext.b.2, %sext.a.2 +  %mul.3 = mul i32 %sext.b.1, %sext.a.3 +  %add = add i32 %mul.0, %mul.1 +  %add.1 = add i32 %mul.2, %mul.3 +  %add.2 = add i32 %add.1, %add +  %res = add i32 %add.2, %acc +  ret i32 %res +}  | 

