| field | value | date |
|---|---|---|
| author | Elena Demikhovsky <elena.demikhovsky@intel.com> | 2016-02-17 19:23:04 +0000 |
| committer | Elena Demikhovsky <elena.demikhovsky@intel.com> | 2016-02-17 19:23:04 +0000 |
| commit | 88e76cad162cc79bd1e76deb0c17a75eb0cee767 | |
| tree | ef05a619c13219586b9f13db57bf7064e40eace3 /llvm/test/Transforms/LoopVectorize | |
| parent | 61a7d629ecc118fb72c1e12b7319ee72c93f1c95 | |
Create masked gather and scatter intrinsics in Loop Vectorizer.
The loop vectorizer now knows how to vectorize GEPs and to create masked gather and scatter intrinsics for random (non-consecutive) memory accesses.
The feature is enabled on the AVX-512 target.
Differential Revision: http://reviews.llvm.org/D15690
llvm-svn: 261140
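For context on what the vectorizer now emits, here is a minimal sketch of a call to the masked gather intrinsic checked by the new test. The declaration matches the `llvm.masked.gather.v8f32` form used at this revision (pointer vector, alignment, mask, pass-through); the wrapper function `@gather_sketch` and the alignment and pass-through operands are illustrative assumptions, not taken from the patch.

```llvm
; Gather 8 floats from 8 arbitrary addresses under a mask.
declare <8 x float> @llvm.masked.gather.v8f32(<8 x float*>, i32, <8 x i1>, <8 x float>)

; Hypothetical wrapper (not part of the patch): masked-off lanes take the
; pass-through value (0.0 here) and their memory is never touched.
define <8 x float> @gather_sketch(<8 x float*> %ptrs, <8 x i1> %mask) {
  %res = call <8 x float> @llvm.masked.gather.v8f32(<8 x float*> %ptrs, i32 4,
                                                    <8 x i1> %mask, <8 x float> zeroinitializer)
  ret <8 x float> %res
}
```

Because masked-off lanes are never accessed, this is what lets the vectorizer handle both the `if (trigger[i] > 0)` guard and the indirect `in[index[i]]` access in the `foo1` test below without speculating loads.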
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize')
| -rw-r--r-- | llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll | 236 |
| -rw-r--r-- | llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll | 3 |
2 files changed, 238 insertions, 1 deletion
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
new file mode 100644
index 00000000000..2f1de54d5f9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -0,0 +1,236 @@
+; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
+
+;AVX1-NOT: llvm.masked
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc_linux"
+
+; The source code:
+;
+;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
+;
+;  for (int i=0; i < SIZE; ++i) {
+;    if (trigger[i] > 0) {
+;      out[i] = in[index[i]] + (float) 0.5;
+;    }
+;  }
+;}
+
+;AVX512-LABEL: @foo1
+;AVX512: llvm.masked.load.v8i32
+;AVX512: llvm.masked.gather.v8f32
+;AVX512: llvm.masked.store.v8f32
+;AVX512: ret void
+
+; Function Attrs: nounwind uwtable
+define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+entry:
+  %in.addr = alloca float*, align 8
+  %out.addr = alloca float*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store float* %in, float** %in.addr, align 8
+  store float* %out, float** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load i32*, i32** %index.addr, align 8
+  %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
+  %6 = load i32, i32* %arrayidx3, align 4
+  %idxprom4 = sext i32 %6 to i64
+  %7 = load float*, float** %in.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4
+  %8 = load float, float* %arrayidx5, align 4
+  %add = fadd float %8, 5.000000e-01
+  %9 = load i32, i32* %i, align 4
+  %idxprom6 = sext i32 %9 to i64
+  %10 = load float*, float** %out.addr, align 8
+  %arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6
+  store float %add, float* %arrayidx7, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %11 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %11, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; The source code
+;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
+;
+;  for (int i=0; i<SIZE; ++i) {
+;    if (trigger[i] > 0) {
+;      out[i] = in[i].b + (float) 0.5;
+;    }
+;  }
+;}
+
+%struct.In = type { float, float }
+
+;AVX512-LABEL: @foo2
+;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
+;AVX512: llvm.masked.gather.v16f32
+;AVX512: llvm.masked.store.v16f32
+;AVX512: ret void
+define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
+entry:
+  %in.addr = alloca %struct.In*, align 8
+  %out.addr = alloca float*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In* %in, %struct.In** %in.addr, align 8
+  store float* %out, float** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In*, %struct.In** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
+  %6 = load float, float* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float*, float** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4
+  store float %add, float* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; The source code
+;struct Out {
+;  float a;
+;  float b;
+;};
+;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
+;
+;  for (int i=0; i<SIZE; ++i) {
+;    if (trigger[i] > 0) {
+;      out[i].b = in[i].b + (float) 0.5;
+;    }
+;  }
+;}
+
+;AVX512-LABEL: @foo3
+;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
+;AVX512: llvm.masked.gather.v16f32
+;AVX512: fadd <16 x float>
+;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %induction, i32 1
+;AVX512: llvm.masked.scatter.v16f32
+;AVX512: ret void
+
+%struct.Out = type { float, float }
+
+define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
+entry:
+  %in.addr = alloca %struct.In*, align 8
+  %out.addr = alloca %struct.Out*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In* %in, %struct.In** %in.addr, align 8
+  store %struct.Out* %out, %struct.Out** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In*, %struct.In** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
+  %6 = load float, float* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load %struct.Out*, %struct.Out** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4
+  %b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1
+  store float %add, float* %b6, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index f810d17e893..268fb61dba3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -278,7 +278,8 @@ for.end:                                          ; preds = %for.cond
 ;AVX: ret void
 
 ;AVX512-LABEL: @foo4
-;AVX512-NOT: llvm.masked
+;AVX512-NOT: llvm.masked.load
+;AVX512: llvm.masked.gather
 ;AVX512: ret void
 
 ; Function Attrs: nounwind uwtable
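To make the @foo2/@foo3 CHECK patterns above easier to read, here is a hand-written sketch, under the same struct layout, of the shape they match: a getelementptr over the vector of induction indices yields one pointer per lane to the `.b` field, which feeds `llvm.masked.gather`, and on the store side `llvm.masked.scatter`. The function `@gather_scatter_sketch`, the mask, and the alignment operands are assumptions for illustration; this is not the vectorizer's actual output.

```llvm
%struct.In = type { float, float }
%struct.Out = type { float, float }

declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>)

; Hypothetical helper (not part of the patch): for every lane whose mask bit
; is set, copy in[induction[lane]].b to out[induction[lane]].b.
define void @gather_scatter_sketch(%struct.In* %in, %struct.Out* %out,
                                   <16 x i64> %induction, <16 x i1> %mask) {
entry:
  ; Vector GEP: one pointer per lane to &in[induction[lane]].b
  %in.ptrs = getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
  %vals = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %in.ptrs, i32 4,
                                                       <16 x i1> %mask, <16 x float> undef)
  ; (the real @foo3 checks also expect a `fadd <16 x float>` between gather and scatter)
  %out.ptrs = getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %induction, i32 1
  call void @llvm.masked.scatter.v16f32(<16 x float> %vals, <16 x float*> %out.ptrs,
                                        i32 4, <16 x i1> %mask)
  ret void
}
```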

