diff options
| author | David Green <david.green@arm.com> | 2019-12-08 09:58:03 +0000 |
|---|---|---|
| committer | David Green <david.green@arm.com> | 2019-12-08 10:37:29 +0000 |
| commit | 3a6eb5f16054e8c0f41a37542a5fc806016502a0 (patch) | |
| tree | f46645123431bee65ab83f055038064fdbd6e9cb /llvm/test/Transforms/LoopVectorize | |
| parent | e8716a6df7abad68b6cf81c437a2e0524e88f3ad (diff) | |
| download | bcm5719-llvm-3a6eb5f16054e8c0f41a37542a5fc806016502a0.tar.gz bcm5719-llvm-3a6eb5f16054e8c0f41a37542a5fc806016502a0.zip | |
[ARM] Disable VLD4 under MVE
Alas, using half of the available vector registers in a single instruction
is just too much for the register allocator to handle: the mve-vldst4.ll
test currently fails when these instructions are enabled. This patch
disables the generation of VLD4 and VST4 by adding an
mve-max-interleave-factor option, which currently defaults to 2.
Differential Revision: https://reviews.llvm.org/D71109
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll | 44 |
| -rw-r--r-- | llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll | 87 |
2 files changed, 109 insertions, 22 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll index d94735f13cf..4cd63b42238 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll @@ -700,14 +700,14 @@ entry: ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp2, align 1 ; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i8 0, i8* %tmp3, align 1 ; VF_16-LABEL: Checking a loop in "i8_factor_4" -; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1 +; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, i8* %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, i8* %tmp2, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i8, i8* %tmp3, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1 -; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1 +; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.4, %i8.4* %data, i64 %i, i32 0 @@ -754,23 +754,23 @@ entry: ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp2, align 2 ; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store i16 0, i16* %tmp3, align 2 ; 
VF_8-LABEL: Checking a loop in "i16_factor_4" -; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2 +; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, i16* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, i16* %tmp2, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i16, i16* %tmp3, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2 +; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2 ; VF_16-LABEL: Checking a loop in "i16_factor_4" -; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2 +; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, i16* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, i16* %tmp2, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i16, i16* %tmp3, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2 -; VF_16-NEXT: Found an 
estimated cost of 16 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2 +; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.4, %i16.4* %data, i64 %i, i32 0 @@ -808,32 +808,32 @@ entry: ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp2, align 4 ; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i32 0, i32* %tmp3, align 4 ; VF_4-LABEL: Checking a loop in "i32_factor_4" -; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 +; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, i32* %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i32, i32* %tmp2, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i32, i32* %tmp3, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp2, align 4 -; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4 +; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4 ; VF_8-LABEL: Checking a loop in "i32_factor_4" -; VF_8: Found an estimated cost of 16 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 +; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, i32* %tmp1, align 4 ; 
VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, i32* %tmp2, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i32, i32* %tmp3, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4 +; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4 ; VF_16-LABEL: Checking a loop in "i32_factor_4" -; VF_16: Found an estimated cost of 32 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 +; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, i32* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, i32* %tmp2, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i32, i32* %tmp3, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4 +; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.4, %i32.4* %data, i64 %i, i32 0 @@ -943,23 +943,23 @@ entry: ; VF_4-NEXT: 
Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2 ; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store half 0xH0000, half* %tmp3, align 2 ; VF_8-LABEL: Checking a loop in "f16_factor_4" -; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2 +; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load half, half* %tmp2, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load half, half* %tmp3, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2 +; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2 ; VF_16-LABEL: Checking a loop in "f16_factor_4" -; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2 +; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load half, half* %tmp2, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load half, half* %tmp3, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For 
instruction: store half 0xH0000, half* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2 -; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2 +; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 0 @@ -997,32 +997,32 @@ entry: ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4 ; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store float 0.000000e+00, float* %tmp3, align 4 ; VF_4-LABEL: Checking a loop in "f32_factor_4" -; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4 +; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, float* %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, float* %tmp2, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load float, float* %tmp3, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4 -; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4 +; VF_4-NEXT: Found an estimated cost of 80 for 
VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4 ; VF_8-LABEL: Checking a loop in "f32_factor_4" -; VF_8: Found an estimated cost of 16 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4 +; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load float, float* %tmp2, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load float, float* %tmp3, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4 +; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4 ; VF_16-LABEL: Checking a loop in "f32_factor_4" -; VF_16: Found an estimated cost of 32 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4 +; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load float, float* %tmp2, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load float, float* %tmp3, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4 ; VF_16-NEXT: Found an 
estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4 +; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll new file mode 100644 index 00000000000..cb6e1005db1 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll @@ -0,0 +1,87 @@ +; RUN: opt -loop-vectorize < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-2,CHECK-NO4 +; RUN: opt -loop-vectorize -mve-max-interleave-factor=1 < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO2,CHECK-NO4 +; RUN: opt -loop-vectorize -mve-max-interleave-factor=2 < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-2,CHECK-NO4 +; RUN: opt -loop-vectorize -mve-max-interleave-factor=4 < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-2,CHECK-4 + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-arm-none-eabi" + +; CHECK-LABEL: vld2 +; CHECK-2: vector.body +; CHECK-NO2-NOT: vector.body +define void @vld2(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 { +entry: + %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16 + %0 = bitcast i16 %tmp.0.extract.trunc to half + %mul = mul i32 %numCols, %numRows + %shr = lshr i32 %mul, 2 + %cmp26 = icmp eq i32 %shr, 0 + br i1 %cmp26, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %pIn.addr.029 = phi half* [ %add.ptr, %while.body 
], [ %pIn, %entry ] + %pOut.addr.028 = phi half* [ %add.ptr7, %while.body ], [ %pOut, %entry ] + %blkCnt.027 = phi i32 [ %dec, %while.body ], [ %shr, %entry ] + %1 = load half, half* %pIn.addr.029, align 2 + %arrayidx2 = getelementptr inbounds half, half* %pIn.addr.029, i32 1 + %2 = load half, half* %arrayidx2, align 2 + %mul3 = fmul half %1, %0 + %mul4 = fmul half %2, %0 + store half %mul3, half* %pOut.addr.028, align 2 + %arrayidx6 = getelementptr inbounds half, half* %pOut.addr.028, i32 1 + store half %mul4, half* %arrayidx6, align 2 + %add.ptr = getelementptr inbounds half, half* %pIn.addr.029, i32 2 + %add.ptr7 = getelementptr inbounds half, half* %pOut.addr.028, i32 2 + %dec = add nsw i32 %blkCnt.027, -1 + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret void +} + +; CHECK-LABEL: vld4 +; CHECK-4: vector.body +; CHECK-NO4-NOT: vector.body +define void @vld4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 { +entry: + %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16 + %0 = bitcast i16 %tmp.0.extract.trunc to half + %mul = mul i32 %numCols, %numRows + %shr = lshr i32 %mul, 2 + %cmp38 = icmp eq i32 %shr, 0 + br i1 %cmp38, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %pIn.addr.041 = phi half* [ %add.ptr, %while.body ], [ %pIn, %entry ] + %pOut.addr.040 = phi half* [ %add.ptr13, %while.body ], [ %pOut, %entry ] + %blkCnt.039 = phi i32 [ %dec, %while.body ], [ %shr, %entry ] + %1 = load half, half* %pIn.addr.041, align 2 + %arrayidx2 = getelementptr inbounds half, half* %pIn.addr.041, i32 1 + %2 = load half, half* %arrayidx2, align 2 + %arrayidx3 = getelementptr inbounds half, half* %pIn.addr.041, i32 2 + %3 = load half, half* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half* %pIn.addr.041, i32 3 + %4 = load half, half* %arrayidx4, align 2 + %mul5 = fmul half %1, %0 + %mul6 
= fmul half %2, %0 + %mul7 = fmul half %3, %0 + %mul8 = fmul half %4, %0 + store half %mul5, half* %pOut.addr.040, align 2 + %arrayidx10 = getelementptr inbounds half, half* %pOut.addr.040, i32 1 + store half %mul6, half* %arrayidx10, align 2 + %arrayidx11 = getelementptr inbounds half, half* %pOut.addr.040, i32 2 + store half %mul7, half* %arrayidx11, align 2 + %arrayidx12 = getelementptr inbounds half, half* %pOut.addr.040, i32 3 + store half %mul8, half* %arrayidx12, align 2 + %add.ptr = getelementptr inbounds half, half* %pIn.addr.041, i32 4 + %add.ptr13 = getelementptr inbounds half, half* %pOut.addr.040, i32 4 + %dec = add nsw i32 %blkCnt.039, -1 + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret void +} + +attributes #0 = { "target-features"="+armv8.1-m.main,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-crypto,-d32,-fp-armv8,-fp-armv8sp,-neon,-vfp3,-vfp3sp,-vfp4,-vfp4sp" } |

