[ARM] Disable VLD4 under MVE

Alas, using half the available vector registers in a single instruction is just too much for the register allocator to handle. The mve-vldst4.ll test here fails when these instructions are enabled at present. This patch disables the generation of VLD4 and VST4 by adding a mve-max-interleave-factor option, which we currently default to 2. Differential Revision: https://reviews.llvm.org/D71109
author: David Green <david.green@arm.com> 2019-12-08 09:58:03 +0000
committer: David Green <david.green@arm.com> 2019-12-08 10:37:29 +0000
commit: 3a6eb5f16054e8c0f41a37542a5fc806016502a0 (patch)
tree: f46645123431bee65ab83f055038064fdbd6e9cb /llvm/test/Transforms/LoopVectorize
parent: e8716a6df7abad68b6cf81c437a2e0524e88f3ad (diff)
download: bcm5719-llvm-3a6eb5f16054e8c0f41a37542a5fc806016502a0.tar.gz
bcm5719-llvm-3a6eb5f16054e8c0f41a37542a5fc806016502a0.zip
2 files changed, 109 insertions, 22 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
index d94735f13cf..4cd63b42238 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -700,14 +700,14 @@ entry:
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp2, align 1
 ; VF_8-NEXT:     Found an estimated cost of 288 for VF 8 For instruction: store i8 0, i8* %tmp3, align 1
 ; VF_16-LABEL: Checking a loop in "i8_factor_4"
-; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
+; VF_16:         Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i8, i8* %tmp3, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1
-; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1
+; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i8.4, %i8.4* %data, i64 %i, i32 0
@@ -754,23 +754,23 @@ entry:
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp2, align 2
 ; VF_4-NEXT:     Found an estimated cost of 80 for VF 4 For instruction: store i16 0, i16* %tmp3, align 2
 ; VF_8-LABEL:  Checking a loop in "i16_factor_4"
-; VF_8:          Found an estimated cost of 8 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
+; VF_8:          Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i16, i16* %tmp3, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2
+; VF_8-NEXT:     Found an estimated cost of 288 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2
 ; VF_16-LABEL: Checking a loop in "i16_factor_4"
-; VF_16:         Found an estimated cost of 16 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
+; VF_16:         Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i16, i16* %tmp3, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2
+; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.4, %i16.4* %data, i64 %i, i32 0
@@ -808,32 +808,32 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp2, align 4
 ; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store i32 0, i32* %tmp3, align 4
 ; VF_4-LABEL:  Checking a loop in "i32_factor_4"
-; VF_4:          Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
+; VF_4:          Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i32, i32* %tmp3, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4
+; VF_4-NEXT:     Found an estimated cost of 80 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4
 ; VF_8-LABEL:  Checking a loop in "i32_factor_4"
-; VF_8:          Found an estimated cost of 16 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
+; VF_8:          Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i32, i32* %tmp3, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4
+; VF_8-NEXT:     Found an estimated cost of 288 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4
 ; VF_16-LABEL: Checking a loop in "i32_factor_4"
-; VF_16:         Found an estimated cost of 32 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
+; VF_16:         Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i32, i32* %tmp3, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4
+; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.4, %i32.4* %data, i64 %i, i32 0
@@ -943,23 +943,23 @@ entry:
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2
 ; VF_4-NEXT:     Found an estimated cost of 80 for VF 4 For instruction: store half 0xH0000, half* %tmp3, align 2
 ; VF_8-LABEL:  Checking a loop in "f16_factor_4"
-; VF_8:          Found an estimated cost of 8 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_8:          Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load half, half* %tmp2, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load half, half* %tmp3, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2
-; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VF_8-NEXT:     Found an estimated cost of 288 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2
 ; VF_16-LABEL: Checking a loop in "f16_factor_4"
-; VF_16:         Found an estimated cost of 16 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_16:         Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load half, half* %tmp2, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load half, half* %tmp3, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2
-; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 0
@@ -997,32 +997,32 @@ entry:
 ; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4
 ; VF_2-NEXT:     Found an estimated cost of 24 for VF 2 For instruction: store float 0.000000e+00, float* %tmp3, align 4
 ; VF_4-LABEL:  Checking a loop in "f32_factor_4"
-; VF_4:          Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_4:          Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, float* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, float* %tmp2, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load float, float* %tmp3, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4
-; VF_4-NEXT:     Found an estimated cost of 8 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_4-NEXT:     Found an estimated cost of 80 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4
 ; VF_8-LABEL:  Checking a loop in "f32_factor_4"
-; VF_8:          Found an estimated cost of 16 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_8:          Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load float, float* %tmp2, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load float, float* %tmp3, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4
-; VF_8-NEXT:     Found an estimated cost of 16 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_8-NEXT:     Found an estimated cost of 288 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4
 ; VF_16-LABEL: Checking a loop in "f32_factor_4"
-; VF_16:         Found an estimated cost of 32 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_16:         Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load float, float* %tmp2, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load float, float* %tmp3, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4
-; VF_16-NEXT:    Found an estimated cost of 32 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_16-NEXT:    Found an estimated cost of 1088 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll
new file mode 100644
index 00000000000..cb6e1005db1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll
@@ -0,0 +1,87 @@
+; RUN: opt -loop-vectorize < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-2,CHECK-NO4
+; RUN: opt -loop-vectorize -mve-max-interleave-factor=1 < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO2,CHECK-NO4
+; RUN: opt -loop-vectorize -mve-max-interleave-factor=2 < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-2,CHECK-NO4
+; RUN: opt -loop-vectorize -mve-max-interleave-factor=4 < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-2,CHECK-4
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+; CHECK-LABEL: vld2
+; CHECK-2: vector.body
+; CHECK-NO2-NOT: vector.body
+define void @vld2(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 {
+entry:
+  %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
+  %0 = bitcast i16 %tmp.0.extract.trunc to half
+  %mul = mul i32 %numCols, %numRows
+  %shr = lshr i32 %mul, 2
+  %cmp26 = icmp eq i32 %shr, 0
+  br i1 %cmp26, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %pIn.addr.029 = phi half* [ %add.ptr, %while.body ], [ %pIn, %entry ]
+  %pOut.addr.028 = phi half* [ %add.ptr7, %while.body ], [ %pOut, %entry ]
+  %blkCnt.027 = phi i32 [ %dec, %while.body ], [ %shr, %entry ]
+  %1 = load half, half* %pIn.addr.029, align 2
+  %arrayidx2 = getelementptr inbounds half, half* %pIn.addr.029, i32 1
+  %2 = load half, half* %arrayidx2, align 2
+  %mul3 = fmul half %1, %0
+  %mul4 = fmul half %2, %0
+  store half %mul3, half* %pOut.addr.028, align 2
+  %arrayidx6 = getelementptr inbounds half, half* %pOut.addr.028, i32 1
+  store half %mul4, half* %arrayidx6, align 2
+  %add.ptr = getelementptr inbounds half, half* %pIn.addr.029, i32 2
+  %add.ptr7 = getelementptr inbounds half, half* %pOut.addr.028, i32 2
+  %dec = add nsw i32 %blkCnt.027, -1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+; CHECK-LABEL: vld4
+; CHECK-4: vector.body
+; CHECK-NO4-NOT: vector.body
+define void @vld4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 {
+entry:
+  %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
+  %0 = bitcast i16 %tmp.0.extract.trunc to half
+  %mul = mul i32 %numCols, %numRows
+  %shr = lshr i32 %mul, 2
+  %cmp38 = icmp eq i32 %shr, 0
+  br i1 %cmp38, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %pIn.addr.041 = phi half* [ %add.ptr, %while.body ], [ %pIn, %entry ]
+  %pOut.addr.040 = phi half* [ %add.ptr13, %while.body ], [ %pOut, %entry ]
+  %blkCnt.039 = phi i32 [ %dec, %while.body ], [ %shr, %entry ]
+  %1 = load half, half* %pIn.addr.041, align 2
+  %arrayidx2 = getelementptr inbounds half, half* %pIn.addr.041, i32 1
+  %2 = load half, half* %arrayidx2, align 2
+  %arrayidx3 = getelementptr inbounds half, half* %pIn.addr.041, i32 2
+  %3 = load half, half* %arrayidx3, align 2
+  %arrayidx4 = getelementptr inbounds half, half* %pIn.addr.041, i32 3
+  %4 = load half, half* %arrayidx4, align 2
+  %mul5 = fmul half %1, %0
+  %mul6 = fmul half %2, %0
+  %mul7 = fmul half %3, %0
+  %mul8 = fmul half %4, %0
+  store half %mul5, half* %pOut.addr.040, align 2
+  %arrayidx10 = getelementptr inbounds half, half* %pOut.addr.040, i32 1
+  store half %mul6, half* %arrayidx10, align 2
+  %arrayidx11 = getelementptr inbounds half, half* %pOut.addr.040, i32 2
+  store half %mul7, half* %arrayidx11, align 2
+  %arrayidx12 = getelementptr inbounds half, half* %pOut.addr.040, i32 3
+  store half %mul8, half* %arrayidx12, align 2
+  %add.ptr = getelementptr inbounds half, half* %pIn.addr.041, i32 4
+  %add.ptr13 = getelementptr inbounds half, half* %pOut.addr.040, i32 4
+  %dec = add nsw i32 %blkCnt.039, -1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+attributes #0 = { "target-features"="+armv8.1-m.main,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-crypto,-d32,-fp-armv8,-fp-armv8sp,-neon,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
author	David Green <david.green@arm.com>	2019-12-08 09:58:03 +0000
committer	David Green <david.green@arm.com>	2019-12-08 10:37:29 +0000
commit	3a6eb5f16054e8c0f41a37542a5fc806016502a0 (patch)
tree	f46645123431bee65ab83f055038064fdbd6e9cb /llvm/test/Transforms/LoopVectorize
parent	e8716a6df7abad68b6cf81c437a2e0524e88f3ad (diff)
download	bcm5719-llvm-3a6eb5f16054e8c0f41a37542a5fc806016502a0.tar.gz bcm5719-llvm-3a6eb5f16054e8c0f41a37542a5fc806016502a0.zip