summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2018-10-11 18:30:31 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2018-10-11 18:30:31 +0000
commitff23bb66226be8c8017478ce936dc7fc90447259 (patch)
tree21359227794aa27a73730a1eedbfb4fbf052815b
parentf1f2a2a31ad6200f468718dbbc75fbfbea152915 (diff)
downloadbcm5719-llvm-ff23bb66226be8c8017478ce936dc7fc90447259.tar.gz
bcm5719-llvm-ff23bb66226be8c8017478ce936dc7fc90447259.zip
[OPENMP][NVPTX]Reduce memory use for globalized vars in
target/teams/distribute regions. Previously introduced globalization scheme that uses memory coalescing scheme may increase memory usage fr the variables that are devlared in target/teams/distribute contexts. We don't need 32 copies of such variables, just 1. Patch reduces memory use in this case. llvm-svn: 344273
-rw-r--r--clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp22
-rw-r--r--clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp7
2 files changed, 17 insertions, 12 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 04de3956622..944130b1abf 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -203,7 +203,8 @@ static RecordDecl *buildRecordForGlobalizedVars(
std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
stable_sort_comparator);
// Build struct _globalized_locals_ty {
- // /* globalized vars */[32] align (max(decl_align, 128))
+ // /* globalized vars */[WarSize] align (max(decl_align,
+ // GlobalMemoryAlignment))
// /* globalized vars */ for EscapedDeclsForTeams
// };
RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
@@ -370,11 +371,16 @@ class CheckVarsEscapingDeclContext final
}
}
- void buildRecordForGlobalizedVars() {
+ void buildRecordForGlobalizedVars(bool IsInTargetMasterThreadRegion) {
assert(!GlobalizedRD &&
"Record for globalized variables is built already.");
+ ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
+ if (IsInTargetMasterThreadRegion)
+ EscapedDeclsForTeams = EscapedDecls.getArrayRef();
+ else
+ EscapedDeclsForParallel = EscapedDecls.getArrayRef();
GlobalizedRD = ::buildRecordForGlobalizedVars(
- CGF.getContext(), EscapedDecls.getArrayRef(), llvm::None,
+ CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
MappedDeclsFields);
}
@@ -521,9 +527,9 @@ public:
/// Returns the record that handles all the escaped local variables and used
/// instead of their original storage.
- const RecordDecl *getGlobalizedRecord() {
+ const RecordDecl *getGlobalizedRecord(bool IsInTargetMasterThreadRegion) {
if (!GlobalizedRD)
- buildRecordForGlobalizedVars();
+ buildRecordForGlobalizedVars(IsInTargetMasterThreadRegion);
return GlobalizedRD;
}
@@ -4087,7 +4093,8 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
return;
CheckVarsEscapingDeclContext VarChecker(CGF);
VarChecker.Visit(Body);
- const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord();
+ const RecordDecl *GlobalizedVarsRecord =
+ VarChecker.getGlobalizedRecord(IsInTargetMasterThreadRegion);
ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
VarChecker.getEscapedVariableLengthDecls();
if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
@@ -4105,7 +4112,8 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
assert(VD->isCanonicalDecl() && "Expected canonical declaration");
const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
- Data.insert(std::make_pair(VD, MappedVarData(FD)));
+ Data.insert(
+ std::make_pair(VD, MappedVarData(FD, IsInTargetMasterThreadRegion)));
}
if (!NeedToDelayGlobalization) {
emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
index 2a7344f7734..2b18f6d3f9b 100644
--- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
@@ -26,13 +26,10 @@ int main(int argc, char **argv) {
// CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker(
// CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}})
-// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 2688, i16 0)
+// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 84, i16 0)
// CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty*
// CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align
-// CHECK: [[ARGC_ARR_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK: [[LID:%.+]] = and i32 [[TID]], 31
-// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGC_ARR_ADDR]], i32 0, i32 [[LID]]
+// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: store i32 [[ARGC]], i32* [[ARGC_ADDR]],
// CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 2
OpenPOWER on IntegriCloud