summary | refs | log | tree | commit | diff | stats
path: root/llvm/test/Transforms/LoopVectorize
diff options
context:
space:
mode:
author: David Green <david.green@arm.com> 2019-12-08 09:58:03 +0000
committer: David Green <david.green@arm.com> 2019-12-08 10:37:29 +0000
commit: 3a6eb5f16054e8c0f41a37542a5fc806016502a0 (patch)
tree: f46645123431bee65ab83f055038064fdbd6e9cb /llvm/test/Transforms/LoopVectorize
parent: e8716a6df7abad68b6cf81c437a2e0524e88f3ad (diff)
download: bcm5719-llvm-3a6eb5f16054e8c0f41a37542a5fc806016502a0.tar.gz
download: bcm5719-llvm-3a6eb5f16054e8c0f41a37542a5fc806016502a0.zip
[ARM] Disable VLD4 under MVE
Alas, using half the available vector registers in a single instruction is just too much for the register allocator to handle. The mve-vldst4.ll test here fails when these instructions are enabled at present.

This patch disables the generation of VLD4 and VST4 by adding a mve-max-interleave-factor option, which we currently default to 2.

Differential Revision: https://reviews.llvm.org/D71109
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize')
-rw-r--r-- llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll | 44
-rw-r--r-- llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll | 87
2 files changed, 109 insertions, 22 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
index d94735f13cf..4cd63b42238 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -700,14 +700,14 @@ entry:
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp2, align 1
; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i8 0, i8* %tmp3, align 1
; VF_16-LABEL: Checking a loop in "i8_factor_4"
-; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
+; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i8, i8* %tmp3, align 1
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1
-; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1
+; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i8.4, %i8.4* %data, i64 %i, i32 0
@@ -754,23 +754,23 @@ entry:
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp2, align 2
; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store i16 0, i16* %tmp3, align 2
; VF_8-LABEL: Checking a loop in "i16_factor_4"
-; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
+; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i16, i16* %tmp3, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2
+; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2
; VF_16-LABEL: Checking a loop in "i16_factor_4"
-; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
+; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i16, i16* %tmp3, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2
+; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i16.4, %i16.4* %data, i64 %i, i32 0
@@ -808,32 +808,32 @@ entry:
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp2, align 4
; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i32 0, i32* %tmp3, align 4
; VF_4-LABEL: Checking a loop in "i32_factor_4"
-; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
+; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i32, i32* %tmp3, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4
+; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4
; VF_8-LABEL: Checking a loop in "i32_factor_4"
-; VF_8: Found an estimated cost of 16 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
+; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i32, i32* %tmp3, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4
+; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4
; VF_16-LABEL: Checking a loop in "i32_factor_4"
-; VF_16: Found an estimated cost of 32 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
+; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i32, i32* %tmp3, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4
+; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i32.4, %i32.4* %data, i64 %i, i32 0
@@ -943,23 +943,23 @@ entry:
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2
; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store half 0xH0000, half* %tmp3, align 2
; VF_8-LABEL: Checking a loop in "f16_factor_4"
-; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp1, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load half, half* %tmp2, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load half, half* %tmp3, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2
; VF_16-LABEL: Checking a loop in "f16_factor_4"
-; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp1, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load half, half* %tmp2, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load half, half* %tmp3, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2
+; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 0
@@ -997,32 +997,32 @@ entry:
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4
; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store float 0.000000e+00, float* %tmp3, align 4
; VF_4-LABEL: Checking a loop in "f32_factor_4"
-; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, float* %tmp1, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, float* %tmp2, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load float, float* %tmp3, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4
; VF_8-LABEL: Checking a loop in "f32_factor_4"
-; VF_8: Found an estimated cost of 16 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp1, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load float, float* %tmp2, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load float, float* %tmp3, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4
; VF_16-LABEL: Checking a loop in "f32_factor_4"
-; VF_16: Found an estimated cost of 32 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4
+; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp1, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load float, float* %tmp2, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load float, float* %tmp3, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4
+; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll
new file mode 100644
index 00000000000..cb6e1005db1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll
@@ -0,0 +1,87 @@
+; RUN: opt -loop-vectorize < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-2,CHECK-NO4
+; RUN: opt -loop-vectorize -mve-max-interleave-factor=1 < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO2,CHECK-NO4
+; RUN: opt -loop-vectorize -mve-max-interleave-factor=2 < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-2,CHECK-NO4
+; RUN: opt -loop-vectorize -mve-max-interleave-factor=4 < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-2,CHECK-4
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+; CHECK-LABEL: vld2
+; CHECK-2: vector.body
+; CHECK-NO2-NOT: vector.body
+define void @vld2(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 {
+entry:
+ %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
+ %0 = bitcast i16 %tmp.0.extract.trunc to half
+ %mul = mul i32 %numCols, %numRows
+ %shr = lshr i32 %mul, 2
+ %cmp26 = icmp eq i32 %shr, 0
+ br i1 %cmp26, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %pIn.addr.029 = phi half* [ %add.ptr, %while.body ], [ %pIn, %entry ]
+ %pOut.addr.028 = phi half* [ %add.ptr7, %while.body ], [ %pOut, %entry ]
+ %blkCnt.027 = phi i32 [ %dec, %while.body ], [ %shr, %entry ]
+ %1 = load half, half* %pIn.addr.029, align 2
+ %arrayidx2 = getelementptr inbounds half, half* %pIn.addr.029, i32 1
+ %2 = load half, half* %arrayidx2, align 2
+ %mul3 = fmul half %1, %0
+ %mul4 = fmul half %2, %0
+ store half %mul3, half* %pOut.addr.028, align 2
+ %arrayidx6 = getelementptr inbounds half, half* %pOut.addr.028, i32 1
+ store half %mul4, half* %arrayidx6, align 2
+ %add.ptr = getelementptr inbounds half, half* %pIn.addr.029, i32 2
+ %add.ptr7 = getelementptr inbounds half, half* %pOut.addr.028, i32 2
+ %dec = add nsw i32 %blkCnt.027, -1
+ %cmp = icmp eq i32 %dec, 0
+ br i1 %cmp, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+; CHECK-LABEL: vld4
+; CHECK-4: vector.body
+; CHECK-NO4-NOT: vector.body
+define void @vld4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 {
+entry:
+ %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
+ %0 = bitcast i16 %tmp.0.extract.trunc to half
+ %mul = mul i32 %numCols, %numRows
+ %shr = lshr i32 %mul, 2
+ %cmp38 = icmp eq i32 %shr, 0
+ br i1 %cmp38, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %pIn.addr.041 = phi half* [ %add.ptr, %while.body ], [ %pIn, %entry ]
+ %pOut.addr.040 = phi half* [ %add.ptr13, %while.body ], [ %pOut, %entry ]
+ %blkCnt.039 = phi i32 [ %dec, %while.body ], [ %shr, %entry ]
+ %1 = load half, half* %pIn.addr.041, align 2
+ %arrayidx2 = getelementptr inbounds half, half* %pIn.addr.041, i32 1
+ %2 = load half, half* %arrayidx2, align 2
+ %arrayidx3 = getelementptr inbounds half, half* %pIn.addr.041, i32 2
+ %3 = load half, half* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half* %pIn.addr.041, i32 3
+ %4 = load half, half* %arrayidx4, align 2
+ %mul5 = fmul half %1, %0
+ %mul6 = fmul half %2, %0
+ %mul7 = fmul half %3, %0
+ %mul8 = fmul half %4, %0
+ store half %mul5, half* %pOut.addr.040, align 2
+ %arrayidx10 = getelementptr inbounds half, half* %pOut.addr.040, i32 1
+ store half %mul6, half* %arrayidx10, align 2
+ %arrayidx11 = getelementptr inbounds half, half* %pOut.addr.040, i32 2
+ store half %mul7, half* %arrayidx11, align 2
+ %arrayidx12 = getelementptr inbounds half, half* %pOut.addr.040, i32 3
+ store half %mul8, half* %arrayidx12, align 2
+ %add.ptr = getelementptr inbounds half, half* %pIn.addr.041, i32 4
+ %add.ptr13 = getelementptr inbounds half, half* %pOut.addr.040, i32 4
+ %dec = add nsw i32 %blkCnt.039, -1
+ %cmp = icmp eq i32 %dec, 0
+ br i1 %cmp, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+attributes #0 = { "target-features"="+armv8.1-m.main,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-crypto,-d32,-fp-armv8,-fp-armv8sp,-neon,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }
OpenPOWER on IntegriCloud