author    Alexey Bataev <a.bataev@hotmail.com>  2018-03-20 15:41:05 +0000
committer Alexey Bataev <a.bataev@hotmail.com>  2018-03-20 15:41:05 +0000
commit    173142171ed7ac95b58412f439e611d5d512c6cb (patch)
tree      addaa272734fe25679a15c47a437cc7f2cdfc4e9
parent    c77e5d0dfad745d045c9f382e807a454eed2eff7 (diff)
[OPENMP, NVPTX] Codegen for target distribute parallel combined constructs in generic mode.

Fixed codegen for distribute parallel combined constructs: the shared lower and upper bounds computed in the distribute region have to be passed to, and read back in, the inner parallel region. This patch implements that for generic mode.

llvm-svn: 327990
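For context, a minimal example of the kind of combined construct this patch affects (it mirrors the new regression test added below; in generic mode the distribute bounds computed by the team master must become visible to the workers running the inner parallel for):

    int a;
    int foo(int *a);

    int main(int argc, char **argv) {
    #pragma omp target teams distribute parallel for map(tofrom : a)
      for (int i = 0; i < argc; ++i)
        a = foo(&i) + foo(&a) + foo(&argc);
      return 0;
    }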
-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp                                            | 54
-rw-r--r--  clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp              |  4
-rw-r--r--  clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp | 98
3 files changed, 146 insertions(+), 10 deletions(-)
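At a high level, the generated generic-mode code performs the data-sharing handshake sketched below. This is a conceptual C-like sketch, not the emitted IR; the runtime entry points are the ones that appear in the CHECK lines of the new test, and the local names are illustrative:

    /* Team master, before releasing the workers: */
    void **args;
    __kmpc_begin_sharing_variables(&args, 4);     /* 4 slots: LB, UB, argc, &a */
    args[0] = (void *)(uintptr_t)LB;              /* lower bound, by value     */
    args[1] = (void *)(uintptr_t)UB;              /* upper bound, by value     */
    args[2] = (void *)(uintptr_t)argc;            /* integer capture, inttoptr */
    args[3] = (void *)&a;                         /* pointer capture, cast     */
    /* ... workers execute the parallel region ... */
    __kmpc_end_sharing_variables();

    /* Each worker, inside the parallel wrapper: */
    void **args;
    __kmpc_get_shared_variables(&args);
    size_t lb    = (size_t)args[0];               /* read the shared bounds back */
    size_t ub    = (size_t)args[1];
    int    wargc = (int)(uintptr_t)args[2];
    int   *ap    = (int *)args[3];
    /* the outlined parallel function is then invoked with (lb, ub, wargc, ap)
       appended after the thread-id arguments */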
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index ccffa9cc4a4..8ab890cb4c3 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -1444,7 +1444,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
for (llvm::Value *V : CapturedVars) {
Address Dst = Bld.CreateConstInBoundsGEP(
SharedArgListAddress, Idx, CGF.getPointerSize());
- llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
+ llvm::Value *PtrV;
+ if (V->getType()->isIntegerTy())
+ PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
+ else
+ PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
Ctx.getPointerType(Ctx.VoidPtrTy));
++Idx;
@@ -2963,22 +2967,56 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
// Retrieve the shared variables from the list of references returned
// by the runtime. Pass the variables to the outlined function.
+ Address SharedArgListAddress = Address::invalid();
+ if (CS.capture_size() > 0 ||
+ isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
+ SharedArgListAddress = CGF.EmitLoadOfPointer(
+ GlobalArgs, CGF.getContext()
+ .getPointerType(CGF.getContext().getPointerType(
+ CGF.getContext().VoidPtrTy))
+ .castAs<PointerType>());
+ }
+ unsigned Idx = 0;
+ if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
+ Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
+ CGF.getPointerSize());
+ Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
+ Src, CGF.SizeTy->getPointerTo());
+ llvm::Value *LB = CGF.EmitLoadOfScalar(
+ TypedAddress,
+ /*Volatile=*/false,
+ CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
+ cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
+ Args.emplace_back(LB);
+ ++Idx;
+ Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
+ CGF.getPointerSize());
+ TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
+ Src, CGF.SizeTy->getPointerTo());
+ llvm::Value *UB = CGF.EmitLoadOfScalar(
+ TypedAddress,
+ /*Volatile=*/false,
+ CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
+ cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
+ Args.emplace_back(UB);
+ ++Idx;
+ }
if (CS.capture_size() > 0) {
ASTContext &CGFContext = CGF.getContext();
- Address SharedArgListAddress = CGF.EmitLoadOfPointer(GlobalArgs,
- CGFContext
- .getPointerType(CGFContext.getPointerType(CGFContext.VoidPtrTy))
- .castAs<PointerType>());
for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
QualType ElemTy = CurField->getType();
- Address Src = Bld.CreateConstInBoundsGEP(
- SharedArgListAddress, I, CGF.getPointerSize());
- Address TypedAddress = Bld.CreateBitCast(
+ Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
+ CGF.getPointerSize());
+ Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
/*Volatile=*/false,
CGFContext.getPointerType(ElemTy),
CI->getLocation());
+ if (CI->capturesVariableByCopy()) {
+ Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
+ CI->getLocation());
+ }
Args.emplace_back(Arg);
}
}
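In short, after this change the shared-argument buffer consumed by createParallelDataSharingWrapper has the following layout for loop-bound-sharing directives (a conceptual summary of the code above; slot names are illustrative):

    // args[0]  : distribute lower bound (size_t, stored by value via inttoptr)
    // args[1]  : distribute upper bound (size_t, stored by value via inttoptr)
    // args[2..]: captured variables in capture order; by-copy integer captures
    //            are converted to uintptr_t via castValueToType before the call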
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
index fc3f25355ec..73647c41128 100644
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
@@ -22,7 +22,7 @@ tx ftemplate(int n) {
tx a[N];
short aa[N];
tx b[10];
- tx c[M][M];
+ tx c[M][M];
tx f = n;
tx l;
int k;
@@ -47,7 +47,7 @@ tx ftemplate(int n) {
for(int i = 0; i < M; i++) {
for(int j = 0; j < M; j++) {
k = M;
- c[i][j] = i+j*f+k;
+ c[i][j] = i + j * f + k;
}
}
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
new file mode 100644
index 00000000000..af72f3be8f3
--- /dev/null
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
@@ -0,0 +1,98 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int a;
+
+int foo(int *a);
+
+int main(int argc, char **argv) {
+#pragma omp target teams distribute parallel for map(tofrom:a) if(parallel:argc)
+ for (int i = 0; i < argc; ++i)
+ a = foo(&i) + foo(&a) + foo(&argc);
+ return 0;
+}
+
+// CHECK: define internal void @__omp_offloading_{{.*}}_main_l[[@LINE-6]]_worker()
+// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* @
+// CHECK: call void [[PARALLEL:@.+]]_wrapper(i16 0, i32 [[TID]])
+
+// CHECK: define void @__omp_offloading_{{.*}}_main_l[[@LINE-10]](i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}})
+// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* @
+// CHECK: call void @__kmpc_kernel_init(
+// CHECK: call void @__kmpc_data_sharing_init_stack()
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_kernel_prepare_parallel(
+// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[BUF_PTR_PTR:%[^,]+]], i{{64|32}} 4)
+// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]],
+// CHECK: [[LB:%.+]] = inttoptr i{{64|32}} [[LB_:%.*]] to i8*
+// CHECK: store i8* [[LB]], i8** [[BUF_PTR]],
+// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1
+// CHECK: [[UB:%.+]] = inttoptr i{{64|32}} [[UB_:%.*]] to i8*
+// CHECK: store i8* [[UB]], i8** [[BUF_PTR1]],
+// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2
+// CHECK: [[ARGC:%.+]] = inttoptr i{{64|32}} [[ARGC_:%.*]] to i8*
+// CHECK: store i8* [[ARGC]], i8** [[BUF_PTR2]],
+// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3
+// CHECK: [[A_PTR:%.+]] = bitcast i32* [[A_ADDR:%.*]] to i8*
+// CHECK: store i8* [[A_PTR]], i8** [[BUF_PTR3]],
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @__kmpc_end_sharing_variables()
+// CHECK: br label
+
+// CHECK: call void @__kmpc_serialized_parallel(%ident_t* @
+// CHECK: [[GTID_ADDR:%.*]] = load i32*, i32** %
+// CHECK: call void [[PARALLEL]](i32* [[GTID_ADDR]], i32* %{{.+}}, i{{64|32}} [[LB_]], i{{64|32}} [[UB_]], i{{64|32}} [[ARGC_]], i32* [[A_ADDR]])
+// CHECK: call void @__kmpc_end_serialized_parallel(%ident_t* @
+// CHECK: br label %
+
+
+// CHECK: call void @__kmpc_for_static_fini(%ident_t* @
+
+// CHECK: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK: call void @llvm.nvvm.barrier0()
+
+// CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i32* dereferenceable{{.*}})
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 8, i16 0)
+// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_TY:%.+]]*
+// CHECK: [[I:%.+]] = getelementptr inbounds [[GLOBAL_TY]], [[GLOBAL_TY]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[ARGC_VAL:%.+]] = load i32, i32* %
+// CHECK: [[ARGC:%.+]] = getelementptr inbounds [[GLOBAL_TY]], [[GLOBAL_TY]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32 [[ARGC_VAL]], i32* [[ARGC]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call i32 [[FOO:@.+foo.+]](i32* [[I]])
+// CHECK: call i32 [[FOO]](i32* %{{.+}})
+// CHECK: call i32 [[FOO]](i32* [[ARGC]])
+// CHECK: call void @__kmpc_for_static_fini(
+
+// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[RES]])
+
+// define internal void [[PARALLEL]]_wrapper(i16 zeroext, i32)
+// CHECK: call void @__kmpc_get_shared_variables(i8*** [[BUF_PTR_PTR:%.+]])
+// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]],
+// CHECK: [[BUF_PTR0:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 0
+// CHECK: [[LB_PTR:%.+]] = bitcast i8** [[BUF_PTR0]] to i{{64|32}}*
+// CHECK: [[LB:%.+]] = load i{{64|32}}, i{{64|32}}* [[LB_PTR]],
+// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1
+// CHECK: [[UB_PTR:%.+]] = bitcast i8** [[BUF_PTR1]] to i{{64|32}}*
+// CHECK: [[UB:%.+]] = load i{{64|32}}, i{{64|32}}* [[UB_PTR]],
+// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2
+// CHECK: [[ARGC_ADDR:%.+]] = bitcast i8** [[BUF_PTR2]] to i32*
+// CHECK: [[ARGC:%.+]] = load i32, i32* [[ARGC_ADDR]],
+// CHECK-64: [[ARGC_CAST:%.+]] = zext i32 [[ARGC]] to i64
+// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3
+// CHECK: [[A_ADDR_REF:%.+]] = bitcast i8** [[BUF_PTR3]] to i32**
+// CHECK: [[A_ADDR:%.+]] = load i32*, i32** [[A_ADDR_REF]],
+// CHECK-64: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i64 [[LB]], i64 [[UB]], i64 [[ARGC_CAST]], i32* [[A_ADDR]])
+// CHECK-32: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i32 [[LB]], i32 [[UB]], i32 [[ARGC]], i32* [[A_ADDR]])
+// CHECK: ret void
+
+#endif