diff options
author | Sam Parker <sam.parker@arm.com> | 2019-08-28 08:51:13 +0000 |
---|---|---|
committer | Sam Parker <sam.parker@arm.com> | 2019-08-28 08:51:13 +0000 |
commit | a761ba0f2d4bb4e977a3cef52d569e34e721f8bc (patch) | |
tree | aae68ddc84a5b84e798c65a1893b5fdb10bf78ea /llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll | |
parent | 207c65396590cada06e533a012e239a63a9c1aec (diff) | |
download | bcm5719-llvm-a761ba0f2d4bb4e977a3cef52d569e34e721f8bc.tar.gz bcm5719-llvm-a761ba0f2d4bb4e977a3cef52d569e34e721f8bc.zip |
[ARM][ParallelDSP] Change search for muls
rL369567 reverted a couple of recent changes made to ARMParallelDSP
because of a miscompilation error: PR43073.
The issue stemmed from an underlying bug that was caused by adding
muls into a reduction before it was proved that they could be executed
in parallel with another mul.
Most of the changes here are from the previously reverted commits.
The additional changes have been made area:
1) The Search function now doesn't insert any muls into the Reduction
object. That now happens once the search has successfully finished.
2) For any muls added into the reduction but that weren't paired, we
accumulate their values as an input into the smlad.
Differential Revision: https://reviews.llvm.org/D66660
llvm-svn: 370171
Diffstat (limited to 'llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll')
-rw-r--r-- | llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll | 172 |
1 files changed, 172 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll new file mode 100644 index 00000000000..a071ec3e748 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll @@ -0,0 +1,172 @@ +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s + +; CHECK-LABEL: overlap_1 +; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 +; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32* +; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 %acc) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC]]) +; CHECK: ret i32 [[RES]] +define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %mul.2 = mul i32 %sext.a.2, %sext.b.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.1, %mul.2 + %add.2 = add i32 %add.1, %add + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: overlap_2 +; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 +; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[ACC1:%[^ ]+]] = add i32 %mul.1, %acc +; CHECK: [[ACC2:%[^ ]+]] = add i32 %mul.2, [[ACC1]] +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC2]]) +; CHECK: ret i32 [[RES]] +define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %mul.2 = mul i32 %sext.b.2, %sext.a.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.1, %mul.2 + %add.2 = add i32 %add, %add.1 + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: overlap_3 +; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.a.2, %sext.b.1 + %mul.3 = mul i32 %sext.a.3, %sext.b.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add.1, %add + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: overlap_4 +; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.2, %sext.a.2 + %mul.3 = mul i32 %sext.b.1, %sext.a.3 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add.1, %add + %res = add i32 %add.2, %acc + ret i32 %res +} |