3 files changed, 201 insertions, 11 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index 82f2e064a58..e18159f2462 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -36,7 +36,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80>
 ; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index 2ce357540d0..8ef59613e64 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -17,9 +17,9 @@ target triple = "x86_64-pc_linux"
 ;}
 
 ;AVX512-LABEL: @foo1
-;AVX512: llvm.masked.load.v16i32
-;AVX512: llvm.masked.gather.v16f32
-;AVX512: llvm.masked.store.v16f32
+;AVX512: llvm.masked.load.v16i32.p0v16i32
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
+;AVX512: llvm.masked.store.v16f32.p0v16f32
 ;AVX512: ret void
 
 ; Function Attrs: nounwind uwtable
@@ -96,8 +96,8 @@ for.end:                                          ; preds = %for.cond
 
 ;AVX512-LABEL: @foo2
 ;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.gather.v16f32
-;AVX512: llvm.masked.scatter.v16f32
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
+;AVX512: llvm.masked.scatter.v16f32.v16p0f32
 ;AVX512: ret void
 define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
 entry:
@@ -171,10 +171,10 @@ for.end:                                          ; preds = %for.cond
 
 ;AVX512-LABEL: @foo3
 ;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.gather.v16f32
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
 ;AVX512: fadd <16 x float>
 ;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.scatter.v16f32
+;AVX512: llvm.masked.scatter.v16f32.v16p0f32
 ;AVX512: ret void
 
 %struct.Out = type { float, float }
@@ -233,4 +233,194 @@ for.inc:                                          ; preds = %if.end
 for.end:                                          ; preds = %for.cond
   ret void
 }
-declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
+
+; The same as @foo2 but scatter/gather argument is a vecotr of ptrs with addresspace 1
+
+;AVX512-LABEL: @foo2_addrspace
+;AVX512: getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, <16 x i64> {{.*}}, i32 1
+;AVX512: llvm.masked.gather.v16f32.v16p1f32
+;AVX512: llvm.masked.scatter.v16f32.v16p1f32
+;AVX512: ret void
+define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
+entry:
+  %in.addr = alloca %struct.In addrspace(1)*, align 8
+  %out.addr = alloca float addrspace(1)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
+  store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(1)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
+  store float %add, float addrspace(1)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Same as foo2_addrspace but here only the input has the non-default address space.
+
+;AVX512-LABEL: @foo2_addrspace2
+;AVX512: getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, <16 x i64> {{.*}}, i32 1
+;AVX512: llvm.masked.gather.v16f32.v16p1f32
+;AVX512: llvm.masked.scatter.v16f32.v16p0f32
+;AVX512: ret void
+define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+entry:
+  %in.addr = alloca %struct.In addrspace(1)*, align 8
+  %out.addr = alloca float addrspace(0)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
+  store float addrspace(0)* %out, float addrspace(0)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(1)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(0)*, float addrspace(0)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(0)* %8, i64 %idxprom4
+  store float %add, float addrspace(0)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Same as foo2_addrspace but here only the output has the non-default address space.
+
+;AVX512-LABEL: @foo2_addrspace3
+;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
+;AVX512: llvm.masked.scatter.v16f32.v16p1f32
+;AVX512: ret void
+
+define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+entry:
+  %in.addr = alloca %struct.In addrspace(0)*, align 8
+  %out.addr = alloca float addrspace(1)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(0)* %in, %struct.In addrspace(0)** %in.addr, align 8
+  store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(0)*, %struct.In addrspace(0)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(0)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
+  store float %add, float addrspace(1)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index bda4b2454ee..aff372b562f 100755
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -23,11 +23,11 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
 ; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>