6 files changed, 216 insertions, 73 deletions
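The heart of the change is a stricter, recursive check for whether a target region can be emitted as an SPMD kernel: a nested parallel directive only qualifies if it carries no 'if' clause that might evaluate to false at run time. A minimal illustration of the two cases, written against the patched hasNestedSPMDDirective()/hasParallelIfClause() logic; the function and variable names below are placeholders, not part of the patch:

void scale(int n, const int *in, int *out, int use_threads) {
  // Nested parallel worksharing with no 'if' clause: hasNestedSPMDDirective()
  // accepts it, so the whole target region is emitted as an SPMD kernel.
#pragma omp target teams map(to : in[0:n]) map(from : out[0:n])
#pragma omp distribute parallel for
  for (int i = 0; i < n; ++i)
    out[i] = 2 * in[i];

  // The same construct with an 'if' clause that may be false at run time:
  // hasParallelIfClause() flags it, so the kernel stays in generic
  // (non-SPMD) mode and keeps the master/worker scheme.
#pragma omp target teams map(to : in[0:n]) map(from : out[0:n])
#pragma omp distribute parallel for if(parallel : use_threads)
  for (int i = 0; i < n; ++i)
    out[i] = 2 * in[i];
}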
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index cee5ca65fc6..98d8b0ff607 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -140,13 +140,15 @@ public:
 /// to emit optimized code.
 class ExecutionModeRAII {
 private:
-  bool SavedMode;
-  bool &Mode;
+  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
+  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
 
 public:
-  ExecutionModeRAII(bool &Mode, bool NewMode) : Mode(Mode) {
+  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode, bool IsSPMD)
+      : Mode(Mode) {
     SavedMode = Mode;
-    Mode = NewMode;
+    Mode = IsSPMD ? CGOpenMPRuntimeNVPTX::EM_SPMD
+                  : CGOpenMPRuntimeNVPTX::EM_NonSPMD;
   }
   ~ExecutionModeRAII() { Mode = SavedMode; }
 };
@@ -579,8 +581,9 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
   WorkerFn->setDoesNotRecurse();
 }
 
-bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
-  return IsInSPMDExecutionMode;
+CGOpenMPRuntimeNVPTX::ExecutionMode
+CGOpenMPRuntimeNVPTX::getExecutionMode() const {
+  return CurrentExecutionMode;
 }
 
 static CGOpenMPRuntimeNVPTX::DataSharingMode
@@ -589,34 +592,96 @@ getDataSharingMode(CodeGenModule &CGM) {
                                           : CGOpenMPRuntimeNVPTX::Generic;
 }
 
-/// Check for inner (nested) SPMD construct, if any
-static bool hasNestedSPMDDirective(const OMPExecutableDirective &D) {
-  const auto *CS = D.getCapturedStmt(OMPD_target);
-  const auto *Body = CS->getCapturedStmt()->IgnoreContainers();
-  const Stmt *ChildStmt = nullptr;
+/// Checks if the \p Body is the \a CompoundStmt and returns its child statement
+/// iff there is only one.
+static const Stmt *getSingleCompoundChild(const Stmt *Body) {
   if (const auto *C = dyn_cast<CompoundStmt>(Body))
     if (C->size() == 1)
-      ChildStmt = C->body_front();
-  if (!ChildStmt)
-    return false;
+      return C->body_front();
+  return Body;
+}
+
+/// Check if the parallel directive has an 'if' clause with non-constant or
+/// false condition.
+static bool hasParallelIfClause(ASTContext &Ctx,
+                                const OMPExecutableDirective &D) {
+  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
+    OpenMPDirectiveKind NameModifier = C->getNameModifier();
+    if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
+      continue;
+    const Expr *Cond = C->getCondition();
+    bool Result;
+    if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
+      return true;
+  }
+  return false;
+}
+
+/// Check for inner (nested) SPMD construct, if any
+static bool hasNestedSPMDDirective(ASTContext &Ctx,
+                                   const OMPExecutableDirective &D) {
+  const auto *CS = D.getInnermostCapturedStmt();
+  const auto *Body = CS->getCapturedStmt()->IgnoreContainers();
+  const Stmt *ChildStmt = getSingleCompoundChild(Body);
 
   if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
     OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
-    // TODO: add further analysis for inner teams|distribute directives, if any.
     switch (D.getDirectiveKind()) {
     case OMPD_target:
-      return (isOpenMPParallelDirective(DKind) &&
-              !isOpenMPTeamsDirective(DKind) &&
-              !isOpenMPDistributeDirective(DKind)) ||
-             isOpenMPSimdDirective(DKind) ||
-             DKind == OMPD_teams_distribute_parallel_for;
+      if ((isOpenMPParallelDirective(DKind) &&
+           !hasParallelIfClause(Ctx, *NestedDir)) ||
+          isOpenMPSimdDirective(DKind))
+        return true;
+      if (DKind == OMPD_teams || DKind == OMPD_teams_distribute) {
+        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
+        if (!Body)
+          return false;
+        ChildStmt = getSingleCompoundChild(Body);
+        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+          DKind = NND->getDirectiveKind();
+          if ((isOpenMPParallelDirective(DKind) &&
+               !hasParallelIfClause(Ctx, *NND)) ||
+              isOpenMPSimdDirective(DKind))
+            return true;
+          if (DKind == OMPD_distribute) {
+            Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
+            if (!Body)
+              return false;
+            ChildStmt = getSingleCompoundChild(Body);
+            if (!ChildStmt)
+              return false;
+            if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+              DKind = NND->getDirectiveKind();
+              return (isOpenMPParallelDirective(DKind) &&
+                      !hasParallelIfClause(Ctx, *NND)) ||
+                     isOpenMPSimdDirective(DKind);
+            }
+          }
+        }
+      }
+      return false;
     case OMPD_target_teams:
-      return (isOpenMPParallelDirective(DKind) &&
-              !isOpenMPDistributeDirective(DKind)) ||
-             isOpenMPSimdDirective(DKind) ||
-             DKind == OMPD_distribute_parallel_for;
+      if ((isOpenMPParallelDirective(DKind) &&
+           !hasParallelIfClause(Ctx, *NestedDir)) ||
+          isOpenMPSimdDirective(DKind))
+        return true;
+      if (DKind == OMPD_distribute) {
+        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
+        if (!Body)
+          return false;
+        ChildStmt = getSingleCompoundChild(Body);
+        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+          DKind = NND->getDirectiveKind();
+          return (isOpenMPParallelDirective(DKind) &&
+                  !hasParallelIfClause(Ctx, *NND)) ||
+                 isOpenMPSimdDirective(DKind);
+        }
+      }
+      return false;
     case OMPD_target_teams_distribute:
-      return isOpenMPParallelDirective(DKind) || isOpenMPSimdDirective(DKind);
+      return (isOpenMPParallelDirective(DKind) &&
+              !hasParallelIfClause(Ctx, *NestedDir)) ||
+             isOpenMPSimdDirective(DKind);
     case OMPD_target_simd:
     case OMPD_target_parallel:
     case OMPD_target_parallel_for:
@@ -674,20 +739,22 @@ static bool hasNestedSPMDDirective(const OMPExecutableDirective &D) {
   return false;
 }
 
-static bool supportsSPMDExecutionMode(const OMPExecutableDirective &D) {
+static bool supportsSPMDExecutionMode(ASTContext &Ctx,
+                                      const OMPExecutableDirective &D) {
   OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
   switch (DirectiveKind) {
   case OMPD_target:
   case OMPD_target_teams:
   case OMPD_target_teams_distribute:
-    return hasNestedSPMDDirective(D);
-  case OMPD_target_simd:
+    return hasNestedSPMDDirective(Ctx, D);
   case OMPD_target_parallel:
   case OMPD_target_parallel_for:
   case OMPD_target_parallel_for_simd:
-  case OMPD_target_teams_distribute_simd:
   case OMPD_target_teams_distribute_parallel_for:
   case OMPD_target_teams_distribute_parallel_for_simd:
+    return !hasParallelIfClause(Ctx, D);
+  case OMPD_target_simd:
+  case OMPD_target_teams_distribute_simd:
     return true;
   case OMPD_parallel:
   case OMPD_for:
@@ -744,7 +811,7 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                              llvm::Constant *&OutlinedFnID,
                                              bool IsOffloadEntry,
                                              const RegionCodeGenTy &CodeGen) {
-  ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/false);
+  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/false);
   EntryFunctionState EST;
   WorkerFunctionState WST(CGM, D.getLocStart());
   Work.clear();
@@ -858,7 +925,7 @@ void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
-  ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/true);
+  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/true);
   EntryFunctionState EST;
 
   // Emit target region as a standalone region.
@@ -905,11 +972,13 @@ void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
 
   CGF.EmitBlock(ExecuteBB);
 
+  IsInTargetMasterThreadRegion = true;
   emitGenericVarsProlog(CGF, D.getLocStart());
 }
 
 void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
                                                EntryFunctionState &EST) {
+  IsInTargetMasterThreadRegion = false;
   if (!CGF.HaveInsertPoint())
     return;
 
@@ -1380,7 +1449,7 @@ void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
 
   assert(!ParentName.empty() && "Invalid target region parent name!");
 
-  bool Mode = supportsSPMDExecutionMode(D);
+  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
   if (Mode)
     emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                    CodeGen);
@@ -1401,8 +1470,8 @@ void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                               OpenMPProcBindClauseKind ProcBind,
                                               SourceLocation Loc) {
   // Do nothing in case of Spmd mode and L0 parallel.
-  // TODO: If in Spmd mode and L1 parallel emit the clause.
-  if (isInSpmdExecutionMode())
+  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD &&
+      IsInTargetMasterThreadRegion)
     return;
 
   CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
@@ -1412,8 +1481,8 @@ void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                 llvm::Value *NumThreads,
                                                 SourceLocation Loc) {
   // Do nothing in case of Spmd mode and L0 parallel.
-  // TODO: If in Spmd mode and L1 parallel emit the clause.
-  if (isInSpmdExecutionMode())
+  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD &&
+      IsInTargetMasterThreadRegion)
     return;
 
   CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
@@ -1457,7 +1526,8 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
       cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
           D, ThreadIDVar, InnermostKind, CodeGen));
   IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
-  if (!isInSpmdExecutionMode() && !IsInParallelRegion) {
+  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
+      !IsInParallelRegion) {
     llvm::Function *WrapperFun =
         createParallelDataSharingWrapper(OutlinedFun, D);
     WrapperFunctionsMap[OutlinedFun] = WrapperFun;
@@ -1635,7 +1705,7 @@ void CGOpenMPRuntimeNVPTX::emitParallelCall(
   if (!CGF.HaveInsertPoint())
     return;
 
-  if (isInSpmdExecutionMode())
+  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
     emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
   else
     emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
@@ -1759,6 +1829,8 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
       SeqGen(CGF, Action);
     } else if (IsInTargetMasterThreadRegion) {
       L0ParallelGen(CGF, Action);
+    } else if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_NonSPMD) {
+      RCG(CGF);
     } else {
       // Check for master and then parallelism:
       // if (is_master) {
@@ -1770,20 +1842,18 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
       // }
      CGBuilderTy &Bld = CGF.Builder;
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
-      if (!isInSpmdExecutionMode()) {
-        llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
-        llvm::BasicBlock *ParallelCheckBB =
-            CGF.createBasicBlock(".parallelcheck");
-        llvm::Value *IsMaster =
-            Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
-        Bld.CreateCondBr(IsMaster, MasterCheckBB, ParallelCheckBB);
-        CGF.EmitBlock(MasterCheckBB);
-        L0ParallelGen(CGF, Action);
-        CGF.EmitBranch(ExitBB);
-        // There is no need to emit line number for unconditional branch.
-        (void)ApplyDebugLocation::CreateEmpty(CGF);
-        CGF.EmitBlock(ParallelCheckBB);
-      }
+      llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
+      llvm::BasicBlock *ParallelCheckBB =
+          CGF.createBasicBlock(".parallelcheck");
+      llvm::Value *IsMaster =
+          Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
+      Bld.CreateCondBr(IsMaster, MasterCheckBB, ParallelCheckBB);
+      CGF.EmitBlock(MasterCheckBB);
+      L0ParallelGen(CGF, Action);
+      CGF.EmitBranch(ExitBB);
+      // There is no need to emit line number for unconditional branch.
+      (void)ApplyDebugLocation::CreateEmpty(CGF);
+      CGF.EmitBlock(ParallelCheckBB);
      llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
      llvm::Value *ThreadID = getThreadID(CGF, Loc);
      llvm::Value *PL = CGF.EmitRuntimeCall(
@@ -1827,14 +1897,49 @@ void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
   // is added on Spmd target directives.
   llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
-  Address ZeroAddr = CGF.CreateMemTemp(
-      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
-      ".zero.addr");
+  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
+                                           /*DestWidth=*/32, /*Signed=*/1),
+                                       ".zero.addr");
   CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
-  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
-  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
-  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
-  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
+  Address ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
+  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
+                    ThreadIDAddr](CodeGenFunction &CGF,
+                                  PrePostActionTy &Action) {
+    Action.Enter(CGF);
+
+    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
+    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+    emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
+  };
+  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
+                                        PrePostActionTy &) {
+
+    RegionCodeGenTy RCG(CodeGen);
+    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+    llvm::Value *ThreadID = getThreadID(CGF, Loc);
+    llvm::Value *Args[] = {RTLoc, ThreadID};
+
+    NVPTXActionTy Action(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
+        Args,
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
+        Args);
+    RCG.setAction(Action);
+    RCG(CGF);
+  };
+
+  if (IsInTargetMasterThreadRegion) {
+    RegionCodeGenTy RCG(CodeGen);
+    RCG(CGF);
+  } else {
+    // If we are not in the target region, it is definitely L2 parallelism or
+    // more, because in SPMD mode we always have an L1 parallel level, so we
+    // don't need to check for orphaned directives.
+    RegionCodeGenTy RCG(SeqGen);
+    RCG(CGF);
+  }
 }
 
 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index 88420b90497..c7d647bfdb1 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -24,6 +24,16 @@ namespace clang {
 namespace CodeGen {
 
 class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
+public:
+  /// Defines the execution mode.
+  enum ExecutionMode {
+    /// SPMD execution mode (all threads are worker threads).
+    EM_SPMD,
+    /// Non-SPMD execution mode (1 master thread, others are workers).
+    EM_NonSPMD,
+    /// Unknown execution mode (orphaned directive).
+    EM_Unknown,
+  };
 private:
   /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
@@ -44,7 +54,7 @@ private:
     void createWorkerFunction(CodeGenModule &CGM);
   };
 
-  bool isInSpmdExecutionMode() const;
+  ExecutionMode getExecutionMode() const;
 
   /// Emit the worker function for the current target region.
   void emitWorkerFunction(WorkerFunctionState &WST);
@@ -334,7 +344,7 @@ private:
   /// region. The appropriate mode (SPMD/NON-SPMD) is set on entry to the
   /// target region and used by containing directives such as 'parallel'
   /// to emit optimized code.
-  bool IsInSPMDExecutionMode = false;
+  ExecutionMode CurrentExecutionMode = EM_Unknown;
 
   /// true if we're emitting the code for the target region and next parallel
   /// region is L0 for sure.
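The test updates that follow switch several 'if' clauses from the parallel form to the target form. On a combined construct the directive name modifier selects which constituent directive the condition governs, and only the parallel form feeds into hasParallelIfClause(). A hedged sketch of the difference; the function and variable names are illustrative only, not taken from the tests:

void bump(int n, int *a, int run_on_device, int use_threads) {
  // 'if(target: ...)': a false condition only disables offloading and falls
  // back to the host; the device kernel can still be emitted in SPMD mode.
#pragma omp target teams distribute parallel for map(tofrom : a[0:n]) \
    if(target : run_on_device)
  for (int i = 0; i < n; ++i)
    a[i] += 1;

  // 'if(parallel: ...)': a false condition serializes the parallel region,
  // so the patched supportsSPMDExecutionMode() no longer picks SPMD here.
#pragma omp target teams distribute parallel for map(tofrom : a[0:n]) \
    if(parallel : use_threads)
  for (int i = 0; i < n; ++i)
    a[i] += 1;
}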
diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
index 071b08d8ad8..8f496eb80e4 100644
--- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -58,6 +58,7 @@ tx ftemplate(int n) {
 #pragma omp critical
       ++a;
     }
+    ++a;
   }
   return a;
 }
diff --git a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
index eec9507c250..aa054fffc4b 100644
--- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -9,8 +9,9 @@
 #define HEADER
 
 // Check that the execution mode of all 2 target regions is set to Generic Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0
 
 template<typename tx>
 tx ftemplate(int n) {
@@ -33,6 +34,13 @@ tx ftemplate(int n) {
     aa = 1;
   }
 
+  #pragma omp target teams
+  {
+#pragma omp parallel
+#pragma omp parallel
+    aa = 1;
+  }
+
   return a;
 }
 
@@ -44,14 +52,14 @@ int bar(int n){
   return a;
 }
 
-  // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l21}}_worker()
+  // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l22}}_worker()
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l27}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -86,7 +94,7 @@ int bar(int n){
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]] [[A:%[^)]+]])
+  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l27]](i[[SZ:32|64]] [[A:%[^)]+]])
   // CHECK: store i[[SZ]] [[A]], i[[SZ]]* [[A_ADDR:%.+]], align
   // CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i8*
@@ -137,7 +145,7 @@ int bar(int n){
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l32}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -172,7 +180,7 @@ int bar(int n){
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l31]](i[[SZ:32|64]] [[AA:%[^)]+]])
+  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l32]](i[[SZ:32|64]] [[AA:%[^)]+]])
   // CHECK: store i[[SZ]] [[AA]], i[[SZ]]* [[AA_ADDR:%.+]], align
   // CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*
@@ -218,5 +226,24 @@ int bar(int n){
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
+// CHECK: define weak void @__omp_offloading_{{.*}}ftemplate{{.*}}_l37(
+// CHECK: call void @__kmpc_spmd_kernel_init(
+// CHECK: call i8* @__kmpc_data_sharing_push_stack(
+// CHECK-NOT: call void @__kmpc_serialized_parallel(
+// CHECK: call void [[L0:@.+]](i32* %{{.+}}, i32* %{{.+}}, i16* %{{.*}})
+// CHECK-NOT: call void @__kmpc_end_serialized_parallel(
+// CHECK: call void @__kmpc_data_sharing_pop_stack(
+// CHECK: call void @__kmpc_spmd_kernel_deinit()
+// CHECK: ret
+
+// CHECK: define internal void [[L0]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i16* dereferenceable
+// CHECK: call void @__kmpc_serialized_parallel(
+// CHECK: call void [[L1:@.+]](i32* %{{.+}}, i32* %{{.+}}, i16* %{{.+}})
+// CHECK: call void @__kmpc_end_serialized_parallel(
+// CHECK: ret void
+
+// CHECK: define internal void [[L1]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i16* dereferenceable
+// CHECK: store i16 1, i16* %
+// CHECK: ret void
 #endif
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
index b9bd9fe28a0..2e712e2d51d 100644
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
@@ -13,7 +13,7 @@ int a;
 int foo(int *a);
 
 int main(int argc, char **argv) {
-#pragma omp target teams distribute parallel for map(tofrom:a) if(parallel:argc)
+#pragma omp target teams distribute parallel for map(tofrom:a) if(target:argc) schedule(static, a)
   for (int i= 0; i < argc; ++i)
     a = foo(&i) + foo(&a) + foo(&argc);
   return 0;
diff --git a/clang/test/OpenMP/target_parallel_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
index aab9f50d87b..ebd761fb969 100644
--- a/clang/test/OpenMP/target_parallel_debug_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -debug-info-kind=limited | FileCheck %s
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=45
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -debug-info-kind=limited -fopenmp-version=45 | FileCheck %s
 // expected-no-diagnostics
 
 int main() {
@@ -11,7 +11,7 @@ int main() {
   int c[10][10][10];
 #pragma omp target parallel firstprivate(a, b) map(tofrom \
                                                    : c) map(tofrom \
-                                                            : bb) if (a)
+                                                            : bb) if (target:a)
   {
     int &f = c[1][1][1];
     int &g = a;
@@ -54,7 +54,7 @@ int main() {
   return 0;
 }
 
-// CHECK: define internal void @__omp_offloading{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8 addrspace(1)* noalias{{[^,]+}}, i1 {{[^)]+}})
+// CHECK: define internal void @__omp_offloading{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8 addrspace(1)* noalias{{[^,]+}})
 // CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
 // CHECK: call void [[NONDEBUG_WRAPPER:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}})
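To tie the new CHECK lines in nvptx_target_teams_codegen.cpp back to the source: the l37 kernel is the target teams region added to ftemplate, which now runs in SPMD mode (exec_mode i8 0) even though it contains nested parallelism. The sketch below is my reading of those CHECK lines, with a hypothetical map clause added so it stands alone; it is not the test source verbatim:

short spmd_nested(short aa) {
#pragma omp target teams map(tofrom : aa)  // SPMD kernel, exec_mode == 0
  {
#pragma omp parallel  // level-1 parallel: outlined as [[L0]] and called
                      // directly, no __kmpc_serialized_parallel around it
#pragma omp parallel  // level-2 parallel: outlined as [[L1]] and bracketed by
                      // __kmpc_serialized_parallel/__kmpc_end_serialized_parallel,
                      // i.e. executed sequentially by each level-1 thread
    aa = 1;           // [[L1]] is the function that stores i16 1
  }
  return aa;
}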