summaryrefslogtreecommitdiffstats
path: root/clang/test
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2018-11-16 19:38:21 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2018-11-16 19:38:21 +0000
commitf2f39be9ed155a6ad8f82bb01cb97202c577d017 (patch)
tree465179cad16c1e0eddba298d99beffcf35fe4141 /clang/test
parente5cb1c35b4ec58c88dc2ef18eec609d67b33ec81 (diff)
downloadbcm5719-llvm-f2f39be9ed155a6ad8f82bb01cb97202c577d017.tar.gz
bcm5719-llvm-f2f39be9ed155a6ad8f82bb01cb97202c577d017.zip
[OPENMP][NVPTX]Emit correct reduction code for teams/parallel
reductions. Fixed previously committed code for the reduction support in teams/parallel constructs taking into account new design of the NVPTX support in the compiler. Teams reduction are not fully functional yet, it is going to be fixed in the following patches. llvm-svn: 347081
Diffstat (limited to 'clang/test')
-rw-r--r--clang/test/OpenMP/nvptx_data_sharing.cpp2
-rw-r--r--clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp2
-rw-r--r--clang/test/OpenMP/nvptx_parallel_codegen.cpp2
-rw-r--r--clang/test/OpenMP/nvptx_parallel_for_codegen.cpp2
-rw-r--r--clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp107
-rw-r--r--clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp2
-rw-r--r--clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp2
-rw-r--r--clang/test/OpenMP/nvptx_teams_codegen.cpp4
-rw-r--r--clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp410
9 files changed, 414 insertions, 119 deletions
diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp
index 7f822aba921..ed3c88b5771 100644
--- a/clang/test/OpenMP/nvptx_data_sharing.cpp
+++ b/clang/test/OpenMP/nvptx_data_sharing.cpp
@@ -27,7 +27,7 @@ void test_ds(){
}
}
// CK1: [[MEM_TY:%.+]] = type { [8 x i8] }
-// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CK1-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8
// CK1-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
index 71a3ad54917..a84962c4d81 100644
--- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
@@ -22,7 +22,7 @@ int main(int argc, char **argv) {
}
// CHECK: [[MEM_TY:%.+]] = type { [84 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 84
// CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1
diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
index a3b1b012bf0..2fd837c92b2 100644
--- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -72,7 +72,7 @@ int bar(int n){
}
// CHECK: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
diff --git a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp
index c292152e658..25a7a15693a 100644
--- a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp
@@ -31,7 +31,7 @@ int bar(int n){
}
// CHECK: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
index 0073db6f288..1687c8ea761 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -9,7 +9,7 @@
#define HEADER
// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
-// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64]
+// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
// Check that the execution mode of all 3 target regions is set to Spmd Mode.
// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
@@ -73,18 +73,16 @@ int bar(int n){
// CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
- // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [
- // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]]
+ // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+ // CHECK: br i1 [[CMP]], label
- // CHECK: [[REDLABEL]]
// CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
// CHECK: [[EV:%.+]] = load double, double* [[E]], align
// CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
// CHECK: store double [[ADD]], double* [[E_IN]], align
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label %[[DEFAULTLABEL]]
+ // CHECK: br label
//
- // CHECK: [[DEFAULTLABEL]]
// CHECK: ret
//
@@ -187,18 +185,23 @@ int bar(int n){
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+ // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
+ // CHECK: br label
+ // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
+ // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
+ // CHECK: br i1 [[DONE_COPY]], label
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+ // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
- // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -215,13 +218,13 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align
+ // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
@@ -229,6 +232,9 @@ int bar(int n){
//
// CHECK: [[READ_CONT]]
// CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
+ // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
+ // CHECK: br label
// CHECK: ret
@@ -268,10 +274,8 @@ int bar(int n){
// CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
- // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [
- // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]]
-
- // CHECK: [[REDLABEL]]
+ // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+ // CHECK: br i1 [[CMP]], label
// CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
// CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
// CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
@@ -284,9 +288,8 @@ int bar(int n){
// CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
// CHECK: store float [[MUL]], float* [[D_IN]], align
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label %[[DEFAULTLABEL]]
+ // CHECK: br label
//
- // CHECK: [[DEFAULTLABEL]]
// CHECK: ret
//
@@ -432,10 +435,10 @@ int bar(int n){
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
- // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -452,11 +455,11 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
@@ -471,12 +474,11 @@ int bar(int n){
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
- // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -493,13 +495,12 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
@@ -560,10 +561,9 @@ int bar(int n){
// CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
- // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [
- // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]]
+ // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+ // CHECK: br i1 [[CMP]], label
- // CHECK: [[REDLABEL]]
// CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
// CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
// CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
@@ -587,9 +587,8 @@ int bar(int n){
// CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
// CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label %[[DEFAULTLABEL]]
+ // CHECK: br label
//
- // CHECK: [[DEFAULTLABEL]]
// CHECK: ret
//
@@ -752,10 +751,9 @@ int bar(int n){
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -772,12 +770,11 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
@@ -794,10 +791,10 @@ int bar(int n){
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -814,12 +811,12 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
index 8b8e0b0bba7..79315717ae6 100644
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
@@ -68,7 +68,7 @@ int bar(int n){
}
// CHECK-DAG: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
index 395c10e6632..2e9ceb129b4 100644
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -63,7 +63,7 @@ int bar(int n){
}
// CHECK-DAG: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp
index 98988a5d4dd..4965a50781c 100644
--- a/clang/test/OpenMP/nvptx_teams_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp
@@ -28,7 +28,7 @@ int main (int argc, char **argv) {
}
// CK1: [[MEM_TY:%.+]] = type { [{{4|8}} x i8] }
-// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CK1-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CK1-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}}
@@ -115,7 +115,7 @@ int main (int argc, char **argv) {
}
// CK2: [[MEM_TY:%.+]] = type { [{{4|8}} x i8] }
-// CK2-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CK2-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CK2-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CK2-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CK2-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}}
diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
index b73b5a8e93e..e2a103ab86d 100644
--- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
@@ -8,13 +8,23 @@
#ifndef HEADER
#define HEADER
+// CHECK: [[MAP_TY:%.+]] = type { [16 x i8] }
+
+// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
+// CHECK-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
+// CHECK-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
+// CHECK-DAG: [[KERNEL_SHARED3:@.+]] = internal unnamed_addr constant i16 1
+// CHECK-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}}
+// CHECK-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16
+// CHECK-DAG: [[KERNEL_SIZE3:@.+]] = internal unnamed_addr constant i{{64|32}} 8
+
// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
-// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64]
+// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
-// Check that the execution mode of all 3 target regions is set to Generic Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 1
+// Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd is in SPMD.
+// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0
template<typename tx>
tx ftemplate(int n) {
@@ -39,6 +49,7 @@ tx ftemplate(int n) {
#pragma omp target
#pragma omp teams reduction(|: a) reduction(max: b)
+ #pragma omp parallel reduction(|: a) reduction(max: b)
{
a |= 1;
b = 99 > b ? 99 : b;
@@ -55,9 +66,9 @@ int bar(int n){
return a;
}
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l27}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l37}}_worker()
- // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l27]](
+ // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l37]](
//
// CHECK: {{call|invoke}} void [[T1]]_worker()
//
@@ -186,18 +197,23 @@ int bar(int n){
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+ // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
+ // CHECK: br label
+ // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
+ // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
+ // CHECK: br i1 [[DONE_COPY]], label
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+ // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
- // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -214,13 +230,13 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align
+ // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
@@ -228,6 +244,9 @@ int bar(int n){
//
// CHECK: [[READ_CONT]]
// CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
+ // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
+ // CHECK: br label
// CHECK: ret
//
@@ -307,9 +326,9 @@ int bar(int n){
// CHECK: [[REDUCE_CONT]]
// CHECK: ret
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
- // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l33]](
+ // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l43]](
//
// CHECK: {{call|invoke}} void [[T2]]_worker()
//
@@ -495,10 +514,10 @@ int bar(int n){
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
- // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -515,11 +534,11 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
@@ -534,12 +553,11 @@ int bar(int n){
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
- // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -556,13 +574,12 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
@@ -689,13 +706,60 @@ int bar(int n){
// CHECK: [[REDUCE_CONT]]
// CHECK: ret
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l40}}_worker()
-
- // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+template.+l40]](
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}(
+ //
+ // CHECK: call void @__kmpc_spmd_kernel_init(
+ // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
+ // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY:%.+]], %{{.+}} addrspace(3)* [[KERNEL_RD:@.+]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} {{8|16}}, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR:@.+]] to i8**))
+ // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
+ // CHECK: [[GLOBAL_REC:%.+]] = bitcast i8* [[PTR]] to [[GLOB_REC_TY:%.+]]*
+ // CHECK-DAG: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 0
+ // CHECK-DAG: [[B_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 1
+ // CHECK: store i32 0, i32* [[A_ADDR]],
+ // CHECK: store i16 -32768, i16* [[B_ADDR]],
+ // CHECK: call void [[OUTLINED:@.+]](i32* {{.+}}, i32* {{.+}}, i32* [[A_ADDR]], i16* [[B_ADDR]])
+ // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A_ADDR]] to i8*
+ // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
+ // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B_ADDR]] to i8*
+ // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
+ // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+ // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
+ // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
+ // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
//
- // CHECK: {{call|invoke}} void [[T3]]_worker()
+ // CHECK: [[IFLABEL]]
+ // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
+ // CHECK: [[AV:%.+]] = load i32, i32* [[A_ADDR]], align
+ // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
+ // CHECK: store i32 [[OR]], i32* [[A_IN]], align
+ // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
+ // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
+ // CHECK: [[BV16:%.+]] = load i16, i16* [[B_ADDR]], align
+ // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+ // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
+ // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
//
- // CHECK: call void @__kmpc_kernel_init(
+ // CHECK: [[DO_MAX]]
+ // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
+ // CHECK: br label {{%?}}[[MAX_CONT:.+]]
+ //
+ // CHECK: [[MAX_ELSE]]
+ // CHECK: [[MAX2:%.+]] = load i16, i16* [[B_ADDR]], align
+ // CHECK: br label {{%?}}[[MAX_CONT]]
+ //
+ // CHECK: [[MAX_CONT]]
+ // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+ // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
+ // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+ // CHECK: br label %[[EXIT]]
+ //
+ // CHECK: [[EXIT]]
+ // call void @__kmpc_restore_team_static_memory(i16 1)
+ // CHECK: call void @__kmpc_spmd_kernel_deinit(
+
+ // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.+}}, i16* dereferenceable{{.+}})
//
// CHECK: store i32 0, i32* [[A:%.+]], align
// CHECK: store i16 -32768, i16* [[B:%.+]], align
@@ -726,7 +790,7 @@ int bar(int n){
// CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
// CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
+ // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]])
// CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
// CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
//
@@ -757,7 +821,243 @@ int bar(int n){
// CHECK: br label %[[EXIT]]
//
// CHECK: [[EXIT]]
- // CHECK: call void @__kmpc_kernel_deinit(
+ // CHECK: ret void
+
+ //
+ // Reduction function
+ // CHECK: define internal void [[PAR_REDUCTION_FUNC:@.+]](i8*, i8*)
+ // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+ // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
+ //
+ // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+ // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
+ //
+ // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
+ // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
+ //
+ // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
+ // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
+ //
+ // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
+ // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
+ // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
+ // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
+ //
+ // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
+ // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
+ // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
+ // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
+ //
+ // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
+ // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+ //
+ // CHECK: [[DO_MAX]]
+ // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
+ // CHECK: br label {{%?}}[[MAX_CONT:.+]]
+ //
+ // CHECK: [[MAX_ELSE]]
+ // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
+ // CHECK: br label {{%?}}[[MAX_CONT]]
+ //
+ // CHECK: [[MAX_CONT]]
+ // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+ // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
+ // CHECK: ret void
+
+ //
+ // Shuffle and reduce function
+ // CHECK: define internal void [[PAR_SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+ // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+ // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
+ // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
+ //
+ // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+ // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+ // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+ //
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+ //
+ // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+ // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+ // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+ //
+ // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
+ // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
+ // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
+ //
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+ //
+ // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
+ // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+ // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+ // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
+ // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
+ //
+ // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
+ // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
+ // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
+ //
+ // Condition to reduce
+ // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+ //
+ // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+ // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+ // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+ //
+ // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+ // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+ // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+ // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+ // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+ // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+ //
+ // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+ // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+ // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+ //
+ // CHECK: [[DO_REDUCE]]
+ // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+ // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+ // CHECK: call void [[PAR_REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+ // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+ //
+ // CHECK: [[REDUCE_ELSE]]
+ // CHECK: br label {{%?}}[[REDUCE_CONT]]
+ //
+ // CHECK: [[REDUCE_CONT]]
+ // Now check if we should just copy over the remote reduction list
+ // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+ // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+ // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+ // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+ //
+ // CHECK: [[DO_COPY]]
+ // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
+ // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
+ //
+ // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
+ // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
+ // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+ //
+ // CHECK: [[COPY_ELSE]]
+ // CHECK: br label {{%?}}[[COPY_CONT]]
+ //
+ // CHECK: [[COPY_CONT]]
+ // CHECK: void
+
+ //
+ // Inter warp copy function
+ // CHECK: define internal void [[PAR_WARP_COPY_FN]](i8*, i32)
+ // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+ // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+ // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+ // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+ // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+ //
+ // [[DO_COPY]]
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ //
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+ //
+ // CHECK: [[COPY_ELSE]]
+ // CHECK: br label {{%?}}[[COPY_CONT]]
+ //
+ // Barrier after copy to shared memory storage medium.
+ // CHECK: [[COPY_CONT]]
+ // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+ // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
+ // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ //
+ // Read into warp 0.
+ // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+ // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+ //
+ // CHECK: [[DO_READ]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
+ // CHECK: br label {{%?}}[[READ_CONT:.+]]
+ //
+ // CHECK: [[READ_ELSE]]
+ // CHECK: br label {{%?}}[[READ_CONT]]
+ //
+ // CHECK: [[READ_CONT]]
+ // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+ // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+ //
+ // [[DO_COPY]]
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ //
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+ // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+ //
+ // CHECK: [[COPY_ELSE]]
+ // CHECK: br label {{%?}}[[COPY_CONT]]
+ //
+ // Barrier after copy to shared memory storage medium.
+ // CHECK: [[COPY_CONT]]
+ // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+ // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
+ // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ //
+ // Read into warp 0.
+ // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+ // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+ //
+ // CHECK: [[DO_READ]]
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
+ // CHECK: br label {{%?}}[[READ_CONT:.+]]
+ //
+ // CHECK: [[READ_ELSE]]
+ // CHECK: br label {{%?}}[[READ_CONT]]
+ //
+ // CHECK: [[READ_CONT]]
+ // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: ret
//
// Reduction function
@@ -919,10 +1219,9 @@ int bar(int n){
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -939,12 +1238,11 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
@@ -961,10 +1259,10 @@ int bar(int n){
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -981,12 +1279,12 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
OpenPOWER on IntegriCloud