summaryrefslogtreecommitdiffstats
path: root/clang/test
diff options
context:
space:
mode:
authorCarlo Bertolli <cbertol@us.ibm.com>2017-04-25 17:52:12 +0000
committerCarlo Bertolli <cbertol@us.ibm.com>2017-04-25 17:52:12 +0000
commitb0ff0a69c32487bc109c4d1203bfa239c5aedf0f (patch)
tree6b64cff3f6245c2c164bfe7c68d83fdeecaf2fa2 /clang/test
parent44f6ea881822da3249d27fff1e0e892a8b7afb46 (diff)
downloadbcm5719-llvm-b0ff0a69c32487bc109c4d1203bfa239c5aedf0f.tar.gz
bcm5719-llvm-b0ff0a69c32487bc109c4d1203bfa239c5aedf0f.zip
Recommit of
[OpenMP] Initial implementation of code generation for pragma 'distribute parallel for' on host https://reviews.llvm.org/D29508 This patch makes the following additions: It abstracts away loop bound generation code from procedures associated with pragma 'for' and loops in general, in such a way that the same procedures can be used for 'distribute parallel for' without the need for a full re-implementation. It implements code generation for 'distribute parallel for' and adds regression tests. It includes tests for clauses. It is important to notice that most of the clauses are implemented as part of existing procedures. For instance, firstprivate is already implemented for 'distribute' and 'for' as separate pragmas. As the implementation of 'distribute parallel for' is based on the same procedures, then we automatically obtain implementation for such clauses without the need to add new code. However, this requires regression tests that verify correctness of produced code. llvm-svn: 301340
Diffstat (limited to 'clang/test')
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_codegen.cpp2260
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp619
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp192
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp653
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp121
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp297
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp93
7 files changed, 4235 insertions, 0 deletions
diff --git a/clang/test/OpenMP/distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_codegen.cpp
new file mode 100644
index 00000000000..62c38f18885
--- /dev/null
+++ b/clang/test/OpenMP/distribute_parallel_for_codegen.cpp
@@ -0,0 +1,2260 @@
+// Test host code gen
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+
+template <typename T>
+T tmain() {
+ T *a, *b, *c;
+ int n = 10000;
+ int ch = 100;
+
+ // no schedule clauses
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ }
+
+ // dist_schedule: static no chunk
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for dist_schedule(static)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ }
+
+ // dist_schedule: static chunk
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for dist_schedule(static, ch)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ }
+
+ // schedule: static no chunk
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for schedule(static)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ }
+
+ // schedule: static chunk
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for schedule(static, ch)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ }
+
+ // schedule: dynamic no chunk
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for schedule(dynamic)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ }
+
+ // schedule: dynamic chunk
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for schedule(dynamic, ch)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ }
+
+ return T();
+}
+
+int main() {
+ double *a, *b, *c;
+ int n = 10000;
+ int ch = 100;
+
+#ifdef LAMBDA
+ // LAMBDA-LABEL: @main
+ // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]](
+ [&]() {
+ // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+
+ // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams(
+ // LAMBDA: call void [[OFFLOADING_FUN_1:@.+]](
+
+ // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams(
+ // LAMBDA: call void [[OFFLOADING_FUN_2:@.+]](
+
+ // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams(
+ // LAMBDA: call void [[OFFLOADING_FUN_3:@.+]](
+
+ // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams(
+ // LAMBDA: call void [[OFFLOADING_FUN_4:@.+]](
+
+ // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams(
+ // LAMBDA: call void [[OFFLOADING_FUN_5:@.+]](
+
+ // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams(
+ // LAMBDA: call void [[OFFLOADING_FUN_6:@.+]](
+
+ // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams(
+ // LAMBDA: call void [[OFFLOADING_FUN_7:@.+]](
+
+ // no schedule clauses
+ #pragma omp target
+ #pragma omp teams
+ // LAMBDA: define{{.+}} void [[OFFLOADING_FUN_1]](
+ // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED_1:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // LAMBDA: define{{.+}} void [[OMP_OUTLINED_1]](
+ // LAMBDA-DAG: [[OMP_IV:%.omp.iv]] = alloca
+ // LAMBDA-DAG: [[OMP_LB:%.omp.comb.lb]] = alloca
+ // LAMBDA-DAG: [[OMP_UB:%.omp.comb.ub]] = alloca
+ // LAMBDA-DAG: [[OMP_ST:%.omp.stride]] = alloca
+
+ // LAMBDA: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+
+ // check EUB for distribute
+ // LAMBDA-DAG: [[OMP_UB_VAL_1:%.+]] = load{{.+}} [[OMP_UB]],
+ // LAMBDA: [[NUM_IT_1:%.+]] = load{{.+}},
+ // LAMBDA-DAG: [[CMP_UB_NUM_IT:%.+]] = icmp sgt {{.+}} [[OMP_UB_VAL_1]], [[NUM_IT_1]]
+ // LAMBDA: br {{.+}} [[CMP_UB_NUM_IT]], label %[[EUB_TRUE:.+]], label %[[EUB_FALSE:.+]]
+ // LAMBDA-DAG: [[EUB_TRUE]]:
+ // LAMBDA: [[NUM_IT_2:%.+]] = load{{.+}},
+ // LAMBDA: br label %[[EUB_END:.+]]
+ // LAMBDA-DAG: [[EUB_FALSE]]:
+ // LAMBDA: [[OMP_UB_VAL2:%.+]] = load{{.+}} [[OMP_UB]],
+ // LAMBDA: br label %[[EUB_END]]
+ // LAMBDA-DAG: [[EUB_END]]:
+ // LAMBDA-DAG: [[EUB_RES:%.+]] = phi{{.+}} [ [[NUM_IT_2]], %[[EUB_TRUE]] ], [ [[OMP_UB_VAL2]], %[[EUB_FALSE]] ]
+ // LAMBDA: store{{.+}} [[EUB_RES]], {{.+}}* [[OMP_UB]],
+
+ // initialize omp.iv
+ // LAMBDA: [[OMP_LB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_LB]],
+ // LAMBDA: store {{.+}} [[OMP_LB_VAL_1]], {{.+}}* [[OMP_IV]],
+ // LAMBDA: br label %[[OMP_JUMP_BACK:.+]]
+
+ // check exit condition
+ // LAMBDA: [[OMP_JUMP_BACK]]:
+ // LAMBDA-DAG: [[OMP_IV_VAL_1:%.+]] = load {{.+}} [[OMP_IV]],
+ // LAMBDA-DAG: [[OMP_UB_VAL_3:%.+]] = load {{.+}} [[OMP_UB]],
+ // LAMBDA: [[CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_IV_VAL_1]], [[OMP_UB_VAL_3]]
+ // LAMBDA: br {{.+}} [[CMP_IV_UB]], label %[[DIST_BODY:.+]], label %[[DIST_END:.+]]
+
+ // check that PrevLB and PrevUB are passed to the 'for'
+ // LAMBDA: [[DIST_BODY]]:
+ // LAMBDA-DAG: [[OMP_PREV_LB:%.+]] = load {{.+}}, {{.+}} [[OMP_LB]],
+ // LAMBDA-64-DAG: [[OMP_PREV_LB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_LB]] to
+ // LAMBDA-DAG: [[OMP_PREV_UB:%.+]] = load {{.+}}, {{.+}} [[OMP_UB]],
+ // LAMBDA-64-DAG: [[OMP_PREV_UB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_UB]] to
+ // check that distlb and distub are properly passed to fork_call
+ // LAMBDA-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_1:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB]], i{{[0-9]+}} [[OMP_PREV_UB]], {{.+}})
+ // LAMBDA-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_1:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_EXT]], i{{[0-9]+}} [[OMP_PREV_UB_EXT]], {{.+}})
+ // LAMBDA: br label %[[DIST_INC:.+]]
+
+ // increment by stride (distInc - 'parallel for' executes the whole chunk) and latch
+ // LAMBDA: [[DIST_INC]]:
+ // LAMBDA-DAG: [[OMP_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_IV]],
+ // LAMBDA-DAG: [[OMP_ST_VAL_1:%.+]] = load {{.+}}, {{.+}}* [[OMP_ST]],
+ // LAMBDA: [[OMP_IV_INC:%.+]] = add{{.+}} [[OMP_IV_VAL_2]], [[OMP_ST_VAL_1]]
+ // LAMBDA: store{{.+}} [[OMP_IV_INC]], {{.+}}* [[OMP_IV]],
+ // LAMBDA: br label %[[OMP_JUMP_BACK]]
+
+ // LAMBDA-DAG: call void @__kmpc_for_static_fini(
+ // LAMBDA: ret
+
+ // implementation of 'parallel for'
+ // LAMBDA: define{{.+}} void [[OMP_PARFOR_OUTLINED_1]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}})
+
+ // LAMBDA-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+
+ // initialize lb and ub to PrevLB and PrevUB
+ // LAMBDA-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+ // LAMBDA-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+ // LAMBDA-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+ // LAMBDA-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+ // LAMBDA-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // LAMBDA-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+ // LAMBDA-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]],{{.+}})
+
+ // PrevEUB is only used when 'for' has a chunked schedule, otherwise EUB is used
+ // In this case we use EUB
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL_1:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // LAMBDA: [[PF_NUM_IT_1:%.+]] = load{{.+}},
+ // LAMBDA-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_1]], [[PF_NUM_IT_1]]
+ // LAMBDA: br i1 [[PF_CMP_UB_NUM_IT]], label %[[PF_EUB_TRUE:.+]], label %[[PF_EUB_FALSE:.+]]
+ // LAMBDA: [[PF_EUB_TRUE]]:
+ // LAMBDA: [[PF_NUM_IT_2:%.+]] = load{{.+}},
+ // LAMBDA: br label %[[PF_EUB_END:.+]]
+ // LAMBDA-DAG: [[PF_EUB_FALSE]]:
+ // LAMBDA: [[OMP_PF_UB_VAL2:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // LAMBDA: br label %[[PF_EUB_END]]
+ // LAMBDA-DAG: [[PF_EUB_END]]:
+ // LAMBDA-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_NUM_IT_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL2]], %[[PF_EUB_FALSE]] ]
+ // LAMBDA: store{{.+}} [[PF_EUB_RES]],{{.+}} [[OMP_PF_UB]],
+
+ // initialize omp.iv
+ // LAMBDA: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // LAMBDA: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: br label %[[OMP_PF_JUMP_BACK:.+]]
+
+ // check exit condition
+ // LAMBDA: [[OMP_PF_JUMP_BACK]]:
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_1:%.+]] = load {{.+}} [[OMP_PF_IV]],
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL_3:%.+]] = load {{.+}} [[OMP_PF_UB]],
+ // LAMBDA: [[PF_CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_PF_IV_VAL_1]], [[OMP_PF_UB_VAL_3]]
+ // LAMBDA: br {{.+}} [[PF_CMP_IV_UB]], label %[[PF_BODY:.+]], label %[[PF_END:.+]]
+
+ // check that PrevLB and PrevUB are passed to the 'for'
+ // LAMBDA: [[PF_BODY]]:
+ // LAMBDA-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: br label {{.+}}
+
+ // check stride 1 for 'for' in 'distribute parallel for'
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: [[OMP_PF_IV_INC:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_2]], 1
+ // LAMBDA: store{{.+}} [[OMP_PF_IV_INC]], {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: br label %[[OMP_PF_JUMP_BACK]]
+
+ // LAMBDA-DAG: call void @__kmpc_for_static_fini(
+ // LAMBDA: ret
+
+ [&]() {
+ a[i] = b[i] + c[i];
+ }();
+ }
+
+ // dist_schedule: static no chunk (same sa default - no dist_schedule)
+ #pragma omp target
+ #pragma omp teams
+ // LAMBDA: define{{.+}} void [[OFFLOADING_FUN_2]](
+ // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED_2:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for dist_schedule(static)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // LAMBDA: define{{.+}} void [[OMP_OUTLINED_2]](
+ // LAMBDA-DAG: [[OMP_IV:%.omp.iv]] = alloca
+ // LAMBDA-DAG: [[OMP_LB:%.omp.comb.lb]] = alloca
+ // LAMBDA-DAG: [[OMP_UB:%.omp.comb.ub]] = alloca
+ // LAMBDA-DAG: [[OMP_ST:%.omp.stride]] = alloca
+
+ // LAMBDA: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+
+ // check EUB for distribute
+ // LAMBDA-DAG: [[OMP_UB_VAL_1:%.+]] = load{{.+}} [[OMP_UB]],
+ // LAMBDA: [[NUM_IT_1:%.+]] = load{{.+}},
+ // LAMBDA-DAG: [[CMP_UB_NUM_IT:%.+]] = icmp sgt {{.+}} [[OMP_UB_VAL_1]], [[NUM_IT_1]]
+ // LAMBDA: br {{.+}} [[CMP_UB_NUM_IT]], label %[[EUB_TRUE:.+]], label %[[EUB_FALSE:.+]]
+ // LAMBDA-DAG: [[EUB_TRUE]]:
+ // LAMBDA: [[NUM_IT_2:%.+]] = load{{.+}},
+ // LAMBDA: br label %[[EUB_END:.+]]
+ // LAMBDA-DAG: [[EUB_FALSE]]:
+ // LAMBDA: [[OMP_UB_VAL2:%.+]] = load{{.+}} [[OMP_UB]],
+ // LAMBDA: br label %[[EUB_END]]
+ // LAMBDA-DAG: [[EUB_END]]:
+ // LAMBDA-DAG: [[EUB_RES:%.+]] = phi{{.+}} [ [[NUM_IT_2]], %[[EUB_TRUE]] ], [ [[OMP_UB_VAL2]], %[[EUB_FALSE]] ]
+ // LAMBDA: store{{.+}} [[EUB_RES]], {{.+}}* [[OMP_UB]],
+
+ // initialize omp.iv
+ // LAMBDA: [[OMP_LB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_LB]],
+ // LAMBDA: store {{.+}} [[OMP_LB_VAL_1]], {{.+}}* [[OMP_IV]],
+ // LAMBDA: br label %[[OMP_JUMP_BACK:.+]]
+
+ // check exit condition
+ // LAMBDA: [[OMP_JUMP_BACK]]:
+ // LAMBDA-DAG: [[OMP_IV_VAL_1:%.+]] = load {{.+}} [[OMP_IV]],
+ // LAMBDA-DAG: [[OMP_UB_VAL_3:%.+]] = load {{.+}} [[OMP_UB]],
+ // LAMBDA: [[CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_IV_VAL_1]], [[OMP_UB_VAL_3]]
+ // LAMBDA: br {{.+}} [[CMP_IV_UB]], label %[[DIST_BODY:.+]], label %[[DIST_END:.+]]
+
+ // check that PrevLB and PrevUB are passed to the 'for'
+ // LAMBDA: [[DIST_BODY]]:
+ // LAMBDA-DAG: [[OMP_PREV_LB:%.+]] = load {{.+}}, {{.+}} [[OMP_LB]],
+ // LAMBDA-64-DAG: [[OMP_PREV_LB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_LB]] to
+ // LAMBDA-DAG: [[OMP_PREV_UB:%.+]] = load {{.+}}, {{.+}} [[OMP_UB]],
+ // LAMBDA-64-DAG: [[OMP_PREV_UB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_UB]] to
+ // check that distlb and distub are properly passed to fork_call
+ // LAMBDA-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_2:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_EXT]], i{{[0-9]+}} [[OMP_PREV_UB_EXT]], {{.+}})
+ // LAMBDA-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_2:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB]], i{{[0-9]+}} [[OMP_PREV_UB]], {{.+}})
+ // LAMBDA: br label %[[DIST_INC:.+]]
+
+ // increment by stride (distInc - 'parallel for' executes the whole chunk) and latch
+ // LAMBDA: [[DIST_INC]]:
+ // LAMBDA-DAG: [[OMP_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_IV]],
+ // LAMBDA-DAG: [[OMP_ST_VAL_1:%.+]] = load {{.+}}, {{.+}}* [[OMP_ST]],
+ // LAMBDA: [[OMP_IV_INC:%.+]] = add{{.+}} [[OMP_IV_VAL_2]], [[OMP_ST_VAL_1]]
+ // LAMBDA: store{{.+}} [[OMP_IV_INC]], {{.+}}* [[OMP_IV]],
+ // LAMBDA: br label %[[OMP_JUMP_BACK]]
+
+ // LAMBDA-DAG: call void @__kmpc_for_static_fini(
+ // LAMBDA: ret
+
+ // implementation of 'parallel for'
+ // LAMBDA: define{{.+}} void [[OMP_PARFOR_OUTLINED_2]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}})
+
+ // LAMBDA-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+
+ // initialize lb and ub to PrevLB and PrevUB
+ // LAMBDA-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+ // LAMBDA-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+ // LAMBDA-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+ // LAMBDA-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+ // LAMBDA-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // LAMBDA-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+ // LAMBDA-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]],{{.+}})
+
+ // PrevEUB is only used when 'for' has a chunked schedule, otherwise EUB is used
+ // In this case we use EUB
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL_1:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // LAMBDA: [[PF_NUM_IT_1:%.+]] = load{{.+}},
+ // LAMBDA-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_1]], [[PF_NUM_IT_1]]
+ // LAMBDA: br i1 [[PF_CMP_UB_NUM_IT]], label %[[PF_EUB_TRUE:.+]], label %[[PF_EUB_FALSE:.+]]
+ // LAMBDA: [[PF_EUB_TRUE]]:
+ // LAMBDA: [[PF_NUM_IT_2:%.+]] = load{{.+}},
+ // LAMBDA: br label %[[PF_EUB_END:.+]]
+ // LAMBDA-DAG: [[PF_EUB_FALSE]]:
+ // LAMBDA: [[OMP_PF_UB_VAL2:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // LAMBDA: br label %[[PF_EUB_END]]
+ // LAMBDA-DAG: [[PF_EUB_END]]:
+ // LAMBDA-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_NUM_IT_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL2]], %[[PF_EUB_FALSE]] ]
+ // LAMBDA: store{{.+}} [[PF_EUB_RES]],{{.+}} [[OMP_PF_UB]],
+
+ // initialize omp.iv
+ // LAMBDA: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // LAMBDA: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: br label %[[OMP_PF_JUMP_BACK:.+]]
+
+ // check exit condition
+ // LAMBDA: [[OMP_PF_JUMP_BACK]]:
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_1:%.+]] = load {{.+}} [[OMP_PF_IV]],
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL_3:%.+]] = load {{.+}} [[OMP_PF_UB]],
+ // LAMBDA: [[PF_CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_PF_IV_VAL_1]], [[OMP_PF_UB_VAL_3]]
+ // LAMBDA: br {{.+}} [[PF_CMP_IV_UB]], label %[[PF_BODY:.+]], label %[[PF_END:.+]]
+
+ // check that PrevLB and PrevUB are passed to the 'for'
+ // LAMBDA: [[PF_BODY]]:
+ // LAMBDA-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: br label {{.+}}
+
+ // check stride 1 for 'for' in 'distribute parallel for'
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: [[OMP_PF_IV_INC:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_2]], 1
+ // LAMBDA: store{{.+}} [[OMP_PF_IV_INC]], {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: br label %[[OMP_PF_JUMP_BACK]]
+
+ // LAMBDA-DAG: call void @__kmpc_for_static_fini(
+ // LAMBDA: ret
+ [&]() {
+ a[i] = b[i] + c[i];
+ }();
+ }
+
+ // dist_schedule: static chunk
+ #pragma omp target
+ #pragma omp teams
+ // LAMBDA: define{{.+}} void [[OFFLOADING_FUN_3]](
+ // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}}* [[OMP_OUTLINED_3:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for dist_schedule(static, ch)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // LAMBDA: define{{.+}} void [[OMP_OUTLINED_3]](
+ // LAMBDA-DAG: [[OMP_IV:%.omp.iv]] = alloca
+ // LAMBDA-DAG: [[OMP_LB:%.omp.comb.lb]] = alloca
+ // LAMBDA-DAG: [[OMP_UB:%.omp.comb.ub]] = alloca
+ // LAMBDA-DAG: [[OMP_ST:%.omp.stride]] = alloca
+
+ // unlike the previous tests, in this one we have a outer and inner loop for 'distribute'
+ // LAMBDA: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 91,
+ // LAMBDA: br label %[[DIST_OUTER_LOOP_HEADER:.+]]
+
+ // LAMBDA: [[DIST_OUTER_LOOP_HEADER]]:
+ // check EUB for distribute
+ // LAMBDA-DAG: [[OMP_UB_VAL_1:%.+]] = load{{.+}} [[OMP_UB]],
+ // LAMBDA: [[NUM_IT_1:%.+]] = load{{.+}},
+ // LAMBDA-DAG: [[CMP_UB_NUM_IT:%.+]] = icmp sgt {{.+}} [[OMP_UB_VAL_1]], [[NUM_IT_1]]
+ // LAMBDA: br {{.+}} [[CMP_UB_NUM_IT]], label %[[EUB_TRUE:.+]], label %[[EUB_FALSE:.+]]
+ // LAMBDA-DAG: [[EUB_TRUE]]:
+ // LAMBDA: [[NUM_IT_2:%.+]] = load{{.+}},
+ // LAMBDA: br label %[[EUB_END:.+]]
+ // LAMBDA-DAG: [[EUB_FALSE]]:
+ // LAMBDA: [[OMP_UB_VAL2:%.+]] = load{{.+}} [[OMP_UB]],
+ // LAMBDA: br label %[[EUB_END]]
+ // LAMBDA-DAG: [[EUB_END]]:
+ // LAMBDA-DAG: [[EUB_RES:%.+]] = phi{{.+}} [ [[NUM_IT_2]], %[[EUB_TRUE]] ], [ [[OMP_UB_VAL2]], %[[EUB_FALSE]] ]
+ // LAMBDA: store{{.+}} [[EUB_RES]], {{.+}}* [[OMP_UB]],
+
+ // initialize omp.iv
+ // LAMBDA: [[OMP_LB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_LB]],
+ // LAMBDA: store {{.+}} [[OMP_LB_VAL_1]], {{.+}}* [[OMP_IV]],
+
+ // check exit condition
+ // LAMBDA-DAG: [[OMP_IV_VAL_1:%.+]] = load {{.+}} [[OMP_IV]],
+ // LAMBDA-DAG: [[OMP_UB_VAL_3:%.+]] = load {{.+}} [[OMP_UB]],
+ // LAMBDA: [[CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_IV_VAL_1]], [[OMP_UB_VAL_3]]
+ // LAMBDA: br {{.+}} [[CMP_IV_UB]], label %[[DIST_OUTER_LOOP_BODY:.+]], label %[[DIST_OUTER_LOOP_END:.+]]
+
+ // LAMBDA: [[DIST_OUTER_LOOP_BODY]]:
+ // LAMBDA: br label %[[DIST_INNER_LOOP_HEADER:.+]]
+
+ // LAMBDA: [[DIST_INNER_LOOP_HEADER]]:
+ // LAMBDA-DAG: [[OMP_IV_VAL_2:%.+]] = load {{.+}} [[OMP_IV]],
+ // LAMBDA-DAG: [[OMP_UB_VAL_4:%.+]] = load {{.+}} [[OMP_UB]],
+ // LAMBDA: [[CMP_IV_UB_2:%.+]] = icmp sle {{.+}} [[OMP_IV_VAL_2]], [[OMP_UB_VAL_4]]
+ // LAMBDA: br{{.+}} [[CMP_IV_UB_2]], label %[[DIST_INNER_LOOP_BODY:.+]], label %[[DIST_INNER_LOOP_END:.+]]
+
+ // check that PrevLB and PrevUB are passed to the 'for'
+ // LAMBDA: [[DIST_INNER_LOOP_BODY]]:
+ // LAMBDA-DAG: [[OMP_PREV_LB:%.+]] = load {{.+}}, {{.+}} [[OMP_LB]],
+ // LAMBDA-64-DAG: [[OMP_PREV_LB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_LB]] to {{.+}}
+ // LAMBDA-DAG: [[OMP_PREV_UB:%.+]] = load {{.+}}, {{.+}} [[OMP_UB]],
+ // LAMBDA-64-DAG: [[OMP_PREV_UB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_UB]] to {{.+}}
+ // check that distlb and distub are properly passed to fork_call
+ // LAMBDA-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_3:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_EXT]], i{{[0-9]+}} [[OMP_PREV_UB_EXT]], {{.+}})
+ // LAMBDA-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_3:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB]], i{{[0-9]+}} [[OMP_PREV_UB]], {{.+}})
+ // LAMBDA: br label %[[DIST_INNER_LOOP_INC:.+]]
+
+ // check DistInc
+ // LAMBDA: [[DIST_INNER_LOOP_INC]]:
+ // LAMBDA-DAG: [[OMP_IV_VAL_3:%.+]] = load {{.+}}, {{.+}}* [[OMP_IV]],
+ // LAMBDA-DAG: [[OMP_ST_VAL_1:%.+]] = load {{.+}}, {{.+}}* [[OMP_ST]],
+ // LAMBDA: [[OMP_IV_INC:%.+]] = add{{.+}} [[OMP_IV_VAL_3]], [[OMP_ST_VAL_1]]
+ // LAMBDA: store{{.+}} [[OMP_IV_INC]], {{.+}}* [[OMP_IV]],
+ // LAMBDA: br label %[[DIST_INNER_LOOP_HEADER]]
+
+ // LAMBDA: [[DIST_INNER_LOOP_END]]:
+ // LAMBDA: br label %[[DIST_OUTER_LOOP_INC:.+]]
+
+ // LAMBDA: [[DIST_OUTER_LOOP_INC]]:
+ // check NextLB and NextUB
+ // LAMBDA-DAG: [[OMP_LB_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_LB]],
+ // LAMBDA-DAG: [[OMP_ST_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_ST]],
+ // LAMBDA-DAG: [[OMP_LB_NEXT:%.+]] = add{{.+}} [[OMP_LB_VAL_2]], [[OMP_ST_VAL_2]]
+ // LAMBDA: store{{.+}} [[OMP_LB_NEXT]], {{.+}}* [[OMP_LB]],
+ // LAMBDA-DAG: [[OMP_UB_VAL_5:%.+]] = load{{.+}}, {{.+}} [[OMP_UB]],
+ // LAMBDA-DAG: [[OMP_ST_VAL_3:%.+]] = load{{.+}}, {{.+}} [[OMP_ST]],
+ // LAMBDA-DAG: [[OMP_UB_NEXT:%.+]] = add{{.+}} [[OMP_UB_VAL_5]], [[OMP_ST_VAL_3]]
+ // LAMBDA: store{{.+}} [[OMP_UB_NEXT]], {{.+}}* [[OMP_UB]],
+ // LAMBDA: br label %[[DIST_OUTER_LOOP_HEADER]]
+
+ // outer loop exit
+ // LAMBDA: [[DIST_OUTER_LOOP_END]]:
+ // LAMBDA-DAG: call void @__kmpc_for_static_fini(
+ // LAMBDA: ret
+
+ // skip implementation of 'parallel for': using default scheduling and was tested above
+ [&]() {
+ a[i] = b[i] + c[i];
+ }();
+ }
+
+ // schedule: static no chunk
+ #pragma omp target
+ #pragma omp teams
+ // LAMBDA: define{{.+}} void [[OFFLOADING_FUN_4]](
+ // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED_4:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for schedule(static)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // LAMBDA: define{{.+}} void [[OMP_OUTLINED_4]](
+ // LAMBDA-DAG: [[OMP_IV:%.omp.iv]] = alloca
+ // LAMBDA-DAG: [[OMP_LB:%.omp.comb.lb]] = alloca
+ // LAMBDA-DAG: [[OMP_UB:%.omp.comb.ub]] = alloca
+ // LAMBDA-DAG: [[OMP_ST:%.omp.stride]] = alloca
+
+ // LAMBDA: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+ // LAMBDA: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_4:@.+]] to {{.+}},
+ // skip rest of implementation of 'distribute' as it is tested above for default dist_schedule case
+ // LAMBDA: ret
+
+ // 'parallel for' implementation is the same as the case without schedule clase (static no chunk is the default)
+ // LAMBDA: define{{.+}} void [[OMP_PARFOR_OUTLINED_4]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}})
+
+ // LAMBDA-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+
+ // initialize lb and ub to PrevLB and PrevUB
+ // LAMBDA-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+ // LAMBDA-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+ // LAMBDA-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+ // LAMBDA-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+ // LAMBDA-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // LAMBDA-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+ // LAMBDA-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]],{{.+}})
+
+ // PrevEUB is only used when 'for' has a chunked schedule, otherwise EUB is used
+ // In this case we use EUB
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL_1:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // LAMBDA: [[PF_NUM_IT_1:%.+]] = load{{.+}},
+ // LAMBDA-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_1]], [[PF_NUM_IT_1]]
+ // LAMBDA: br i1 [[PF_CMP_UB_NUM_IT]], label %[[PF_EUB_TRUE:.+]], label %[[PF_EUB_FALSE:.+]]
+ // LAMBDA: [[PF_EUB_TRUE]]:
+ // LAMBDA: [[PF_NUM_IT_2:%.+]] = load{{.+}},
+ // LAMBDA: br label %[[PF_EUB_END:.+]]
+ // LAMBDA-DAG: [[PF_EUB_FALSE]]:
+ // LAMBDA: [[OMP_PF_UB_VAL2:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // LAMBDA: br label %[[PF_EUB_END]]
+ // LAMBDA-DAG: [[PF_EUB_END]]:
+ // LAMBDA-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_NUM_IT_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL2]], %[[PF_EUB_FALSE]] ]
+ // LAMBDA: store{{.+}} [[PF_EUB_RES]],{{.+}} [[OMP_PF_UB]],
+
+ // initialize omp.iv
+ // LAMBDA: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // LAMBDA: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: br label %[[OMP_PF_JUMP_BACK:.+]]
+
+ // check exit condition
+ // LAMBDA: [[OMP_PF_JUMP_BACK]]:
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_1:%.+]] = load {{.+}} [[OMP_PF_IV]],
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL_3:%.+]] = load {{.+}} [[OMP_PF_UB]],
+ // LAMBDA: [[PF_CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_PF_IV_VAL_1]], [[OMP_PF_UB_VAL_3]]
+ // LAMBDA: br {{.+}} [[PF_CMP_IV_UB]], label %[[PF_BODY:.+]], label %[[PF_END:.+]]
+
+ // check that PrevLB and PrevUB are passed to the 'for'
+ // LAMBDA: [[PF_BODY]]:
+ // LAMBDA-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: br label {{.+}}
+
+ // check stride 1 for 'for' in 'distribute parallel for'
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: [[OMP_PF_IV_INC:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_2]], 1
+ // LAMBDA: store{{.+}} [[OMP_PF_IV_INC]], {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: br label %[[OMP_PF_JUMP_BACK]]
+
+ // LAMBDA-DAG: call void @__kmpc_for_static_fini(
+ // LAMBDA: ret
+
+ [&]() {
+ a[i] = b[i] + c[i];
+ }();
+ }
+
+ // schedule: static chunk
+ #pragma omp target
+ #pragma omp teams
+ // LAMBDA: define{{.+}} void [[OFFLOADING_FUN_5]](
+ // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}}* [[OMP_OUTLINED_5:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for schedule(static, ch)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // LAMBDA: define{{.+}} void [[OMP_OUTLINED_5]](
+ // LAMBDA: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+ // LAMBDA: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_5:@.+]] to {{.+}},
+ // skip rest of implementation of 'distribute' as it is tested above for default dist_schedule case
+ // LAMBDA: ret
+
+ // 'parallel for' implementation using outer and inner loops and PrevEUB
+ // LAMBDA: define{{.+}} void [[OMP_PARFOR_OUTLINED_5]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}})
+ // LAMBDA-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_ST:%.omp.stride]] = alloca{{.+}},
+
+ // initialize lb and ub to PrevLB and PrevUB
+ // LAMBDA-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+ // LAMBDA-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+ // LAMBDA-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+ // LAMBDA-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+ // LAMBDA-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // LAMBDA-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+ // LAMBDA-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]],{{.+}})
+ // LAMBDA: br label %[[OMP_PF_OUTER_LOOP_HEADER:.+]]
+
+ // check PrevEUB (using PrevUB instead of NumIt as upper bound)
+ // LAMBDA: [[OMP_PF_OUTER_LOOP_HEADER]]:
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL_1:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // LAMBDA-64-DAG: [[OMP_PF_UB_VAL_CONV:%.+]] = sext{{.+}} [[OMP_PF_UB_VAL_1]] to
+ // LAMBDA: [[PF_PREV_UB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // LAMBDA-64-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_CONV]], [[PF_PREV_UB_VAL_1]]
+ // LAMBDA-32-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_1]], [[PF_PREV_UB_VAL_1]]
+ // LAMBDA: br i1 [[PF_CMP_UB_NUM_IT]], label %[[PF_EUB_TRUE:.+]], label %[[PF_EUB_FALSE:.+]]
+ // LAMBDA: [[PF_EUB_TRUE]]:
+ // LAMBDA: [[PF_PREV_UB_VAL_2:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // LAMBDA: br label %[[PF_EUB_END:.+]]
+ // LAMBDA-DAG: [[PF_EUB_FALSE]]:
+ // LAMBDA: [[OMP_PF_UB_VAL_2:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // LAMBDA-64: [[OMP_PF_UB_VAL_2_CONV:%.+]] = sext{{.+}} [[OMP_PF_UB_VAL_2]] to
+ // LAMBDA: br label %[[PF_EUB_END]]
+ // LAMBDA-DAG: [[PF_EUB_END]]:
+ // LAMBDA-64-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_PREV_UB_VAL_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL_2_CONV]], %[[PF_EUB_FALSE]] ]
+ // LAMBDA-32-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_PREV_UB_VAL_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL_2]], %[[PF_EUB_FALSE]] ]
+ // LAMBDA-64-DAG: [[PF_EUB_RES_CONV:%.+]] = trunc{{.+}} [[PF_EUB_RES]] to
+ // LAMBDA-64: store{{.+}} [[PF_EUB_RES_CONV]],{{.+}} [[OMP_PF_UB]],
+ // LAMBDA-32: store{{.+}} [[PF_EUB_RES]], {{.+}} [[OMP_PF_UB]],
+
+ // initialize omp.iv (IV = LB)
+ // LAMBDA: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // LAMBDA: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+
+ // outer loop: while (IV < UB) {
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL_3:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA: [[PF_CMP_IV_UB_1:%.+]] = icmp{{.+}} [[OMP_PF_IV_VAL_1]], [[OMP_PF_UB_VAL_3]]
+ // LAMBDA: br{{.+}} [[PF_CMP_IV_UB_1]], label %[[OMP_PF_OUTER_LOOP_BODY:.+]], label %[[OMP_PF_OUTER_LOOP_END:.+]]
+
+ // LAMBDA: [[OMP_PF_OUTER_LOOP_BODY]]:
+ // LAMBDA: br label %[[OMP_PF_INNER_FOR_HEADER:.+]]
+
+ // LAMBDA: [[OMP_PF_INNER_FOR_HEADER]]:
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL_4:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA: [[PF_CMP_IV_UB_2:%.+]] = icmp{{.+}} [[OMP_PF_IV_VAL_2]], [[OMP_PF_UB_VAL_4]]
+ // LAMBDA: br{{.+}} [[PF_CMP_IV_UB_2]], label %[[OMP_PF_INNER_LOOP_BODY:.+]], label %[[OMP_PF_INNER_LOOP_END:.+]]
+
+ // LAMBDA: [[OMP_PF_INNER_LOOP_BODY]]:
+ // LAMBDA-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // skip body branch
+ // LAMBDA: br{{.+}}
+ // LAMBDA: br label %[[OMP_PF_INNER_LOOP_INC:.+]]
+
+ // IV = IV + 1 and inner loop latch
+ // LAMBDA: [[OMP_PF_INNER_LOOP_INC]]:
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_3:%.+]] = load{{.+}}, {{.+}}* [[OMP_IV]],
+ // LAMBDA-DAG: [[OMP_PF_NEXT_IV:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_3]], 1
+ // LAMBDA-DAG: store{{.+}} [[OMP_PF_NEXT_IV]], {{.+}}* [[OMP_IV]],
+ // LAMBDA: br label %[[OMP_PF_INNER_FOR_HEADER]]
+
+ // check NextLB and NextUB
+ // LAMBDA: [[OMP_PF_INNER_LOOP_END]]:
+ // LAMBDA: br label %[[OMP_PF_OUTER_LOOP_INC:.+]]
+
+ // LAMBDA: [[OMP_PF_OUTER_LOOP_INC]]:
+ // LAMBDA-DAG: [[OMP_PF_LB_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // LAMBDA-DAG: [[OMP_PF_ST_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_ST]],
+ // LAMBDA-DAG: [[OMP_PF_LB_NEXT:%.+]] = add{{.+}} [[OMP_PF_LB_VAL_2]], [[OMP_PF_ST_VAL_1]]
+ // LAMBDA: store{{.+}} [[OMP_PF_LB_NEXT]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL_5:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_UB]],
+ // LAMBDA-DAG: [[OMP_PF_ST_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_ST]],
+ // LAMBDA-DAG: [[OMP_PF_UB_NEXT:%.+]] = add{{.+}} [[OMP_PF_UB_VAL_5]], [[OMP_PF_ST_VAL_2]]
+ // LAMBDA: store{{.+}} [[OMP_PF_UB_NEXT]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA: br label %[[OMP_PF_OUTER_LOOP_HEADER]]
+
+ // LAMBDA: [[OMP_PF_OUTER_LOOP_END]]:
+ // LAMBDA-DAG: call void @__kmpc_for_static_fini(
+ // LAMBDA: ret
+ [&]() {
+ a[i] = b[i] + c[i];
+ }();
+ }
+
+ // schedule: dynamic no chunk
+ #pragma omp target
+ #pragma omp teams
+ // LAMBDA: define{{.+}} void [[OFFLOADING_FUN_6]](
+ // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED_6:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for schedule(dynamic)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // LAMBDA: define{{.+}} void [[OMP_OUTLINED_6]](
+ // LAMBDA: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+ // LAMBDA: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_6:@.+]] to {{.+}},
+ // skip rest of implementation of 'distribute' as it is tested above for default dist_schedule case
+ // LAMBDA: ret
+
+ // 'parallel for' implementation using outer and inner loops and PrevEUB
+ // LAMBDA: define{{.+}} void [[OMP_PARFOR_OUTLINED_6]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}})
+ // LAMBDA-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_ST:%.omp.stride]] = alloca{{.+}},
+
+ // initialize lb and ub to PrevLB and PrevUB
+ // LAMBDA-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+ // LAMBDA-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+ // LAMBDA-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+ // LAMBDA-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+ // LAMBDA-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // LAMBDA-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+ // LAMBDA-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA-DAG: [[OMP_PF_LB_VAL:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_UB]],
+ // LAMBDA: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, {{.+}} 35, {{.+}} [[OMP_PF_LB_VAL]], {{.+}} [[OMP_PF_UB_VAL]], {{.+}}, {{.+}})
+ // LAMBDA: br label %[[OMP_PF_OUTER_LOOP_HEADER:.+]]
+
+ // LAMBDA: [[OMP_PF_OUTER_LOOP_HEADER]]:
+ // LAMBDA: [[IS_FIN:%.+]] = call{{.+}} @__kmpc_dispatch_next_4({{.+}}, {{.+}}, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]], {{.+}}* [[OMP_PF_ST]])
+ // LAMBDA: [[IS_FIN_CMP:%.+]] = icmp{{.+}} [[IS_FIN]], 0
+ // LAMBDA: br{{.+}} [[IS_FIN_CMP]], label %[[OMP_PF_OUTER_LOOP_BODY:.+]], label %[[OMP_PF_OUTER_LOOP_END:.+]]
+
+ // initialize omp.iv (IV = LB)
+ // LAMBDA: [[OMP_PF_OUTER_LOOP_BODY]]:
+ // LAMBDA-DAG: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // LAMBDA-DAG: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: br label %[[OMP_PF_INNER_LOOP_HEADER:.+]]
+
+ // LAMBDA: [[OMP_PF_INNER_LOOP_HEADER]]:
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL_4:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA: [[PF_CMP_IV_UB_2:%.+]] = icmp{{.+}} [[OMP_PF_IV_VAL_2]], [[OMP_PF_UB_VAL_4]]
+ // LAMBDA: br{{.+}} [[PF_CMP_IV_UB_2]], label %[[OMP_PF_INNER_LOOP_BODY:.+]], label %[[OMP_PF_INNER_LOOP_END:.+]]
+
+ // LAMBDA: [[OMP_PF_INNER_LOOP_BODY]]:
+ // LAMBDA-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // skip body branch
+ // LAMBDA: br{{.+}}
+ // LAMBDA: br label %[[OMP_PF_INNER_LOOP_INC:.+]]
+
+ // IV = IV + 1 and inner loop latch
+ // LAMBDA: [[OMP_PF_INNER_LOOP_INC]]:
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_3:%.+]] = load{{.+}}, {{.+}}* [[OMP_IV]],
+ // LAMBDA-DAG: [[OMP_PF_NEXT_IV:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_3]], 1
+ // LAMBDA-DAG: store{{.+}} [[OMP_PF_NEXT_IV]], {{.+}}* [[OMP_IV]],
+ // LAMBDA: br label %[[OMP_PF_INNER_LOOP_HEADER]]
+
+ // check NextLB and NextUB
+ // LAMBDA: [[OMP_PF_INNER_LOOP_END]]:
+ // LAMBDA: br label %[[OMP_PF_OUTER_LOOP_INC:.+]]
+
+ // LAMBDA: [[OMP_PF_OUTER_LOOP_INC]]:
+ // LAMBDA: br label %[[OMP_PF_OUTER_LOOP_HEADER]]
+
+ // LAMBDA: [[OMP_PF_OUTER_LOOP_END]]:
+ // LAMBDA: ret
+ [&]() {
+ a[i] = b[i] + c[i];
+ }();
+ }
+
+ // schedule: dynamic chunk
+ #pragma omp target
+ #pragma omp teams
+ // LAMBDA: define{{.+}} void [[OFFLOADING_FUN_7]](
+ // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}}* [[OMP_OUTLINED_7:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for schedule(dynamic, ch)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // LAMBDA: define{{.+}} void [[OMP_OUTLINED_7]](
+ // LAMBDA: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+ // LAMBDA: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_7:@.+]] to {{.+}},
+ // skip rest of implementation of 'distribute' as it is tested above for default dist_schedule case
+ // LAMBDA: ret
+
+ // 'parallel for' implementation using outer and inner loops and PrevEUB
+ // LAMBDA: define{{.+}} void [[OMP_PARFOR_OUTLINED_7]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}})
+ // LAMBDA-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+ // LAMBDA-DAG: [[OMP_PF_ST:%.omp.stride]] = alloca{{.+}},
+
+ // initialize lb and ub to PrevLB and PrevUB
+ // LAMBDA-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+ // LAMBDA-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+ // LAMBDA-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+ // LAMBDA-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+ // LAMBDA-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // LAMBDA-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+ // LAMBDA-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+ // LAMBDA-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA-DAG: [[OMP_PF_LB_VAL:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_UB]],
+ // LAMBDA: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, {{.+}} 35, {{.+}} [[OMP_PF_LB_VAL]], {{.+}} [[OMP_PF_UB_VAL]], {{.+}}, {{.+}})
+ // LAMBDA: br label %[[OMP_PF_OUTER_LOOP_HEADER:.+]]
+
+ // LAMBDA: [[OMP_PF_OUTER_LOOP_HEADER]]:
+ // LAMBDA: [[IS_FIN:%.+]] = call{{.+}} @__kmpc_dispatch_next_4({{.+}}, {{.+}}, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]], {{.+}}* [[OMP_PF_ST]])
+ // LAMBDA: [[IS_FIN_CMP:%.+]] = icmp{{.+}} [[IS_FIN]], 0
+ // LAMBDA: br{{.+}} [[IS_FIN_CMP]], label %[[OMP_PF_OUTER_LOOP_BODY:.+]], label %[[OMP_PF_OUTER_LOOP_END:.+]]
+
+ // initialize omp.iv (IV = LB)
+ // LAMBDA: [[OMP_PF_OUTER_LOOP_BODY]]:
+ // LAMBDA-DAG: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // LAMBDA-DAG: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA: br label %[[OMP_PF_INNER_LOOP_HEADER:.+]]
+
+ // LAMBDA: [[OMP_PF_INNER_LOOP_HEADER]]:
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // LAMBDA-DAG: [[OMP_PF_UB_VAL_4:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_UB]],
+ // LAMBDA: [[PF_CMP_IV_UB_2:%.+]] = icmp{{.+}} [[OMP_PF_IV_VAL_2]], [[OMP_PF_UB_VAL_4]]
+ // LAMBDA: br{{.+}} [[PF_CMP_IV_UB_2]], label %[[OMP_PF_INNER_LOOP_BODY:.+]], label %[[OMP_PF_INNER_LOOP_END:.+]]
+
+ // LAMBDA: [[OMP_PF_INNER_LOOP_BODY]]:
+ // LAMBDA-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // skip body branch
+ // LAMBDA: br{{.+}}
+ // LAMBDA: br label %[[OMP_PF_INNER_LOOP_INC:.+]]
+
+ // IV = IV + 1 and inner loop latch
+ // LAMBDA: [[OMP_PF_INNER_LOOP_INC]]:
+ // LAMBDA-DAG: [[OMP_PF_IV_VAL_3:%.+]] = load{{.+}}, {{.+}}* [[OMP_IV]],
+ // LAMBDA-DAG: [[OMP_PF_NEXT_IV:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_3]], 1
+ // LAMBDA-DAG: store{{.+}} [[OMP_PF_NEXT_IV]], {{.+}}* [[OMP_IV]],
+ // LAMBDA: br label %[[OMP_PF_INNER_LOOP_HEADER]]
+
+ // check NextLB and NextUB
+ // LAMBDA: [[OMP_PF_INNER_LOOP_END]]:
+ // LAMBDA: br label %[[OMP_PF_OUTER_LOOP_INC:.+]]
+
+ // LAMBDA: [[OMP_PF_OUTER_LOOP_INC]]:
+ // LAMBDA: br label %[[OMP_PF_OUTER_LOOP_HEADER]]
+
+ // LAMBDA: [[OMP_PF_OUTER_LOOP_END]]:
+ // LAMBDA: ret
+ [&]() {
+ a[i] = b[i] + c[i];
+ }();
+ }
+ }();
+ return 0;
+#else
+ // CHECK-LABEL: @main
+
+ // CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+ // CHECK: call void [[OFFLOADING_FUN_1:@.+]](
+
+ // CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+ // CHECK: call void [[OFFLOADING_FUN_2:@.+]](
+
+ // CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+ // CHECK: call void [[OFFLOADING_FUN_3:@.+]](
+
+ // CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+ // CHECK: call void [[OFFLOADING_FUN_4:@.+]](
+
+ // CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+ // CHECK: call void [[OFFLOADING_FUN_5:@.+]](
+
+ // CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+ // CHECK: call void [[OFFLOADING_FUN_6:@.+]](
+
+ // CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+ // CHECK: call void [[OFFLOADING_FUN_7:@.+]](
+
+ // CHECK: call{{.+}} [[TMAIN:@.+]]()
+
+ // no schedule clauses
+ #pragma omp target
+ #pragma omp teams
+ // CHECK: define internal void [[OFFLOADING_FUN_1]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED_1:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // CHECK: define{{.+}} void [[OMP_OUTLINED_1]](
+ // CHECK-DAG: [[OMP_IV:%.omp.iv]] = alloca
+ // CHECK-DAG: [[OMP_LB:%.omp.comb.lb]] = alloca
+ // CHECK-DAG: [[OMP_UB:%.omp.comb.ub]] = alloca
+ // CHECK-DAG: [[OMP_ST:%.omp.stride]] = alloca
+
+ // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+
+ // check EUB for distribute
+ // CHECK-DAG: [[OMP_UB_VAL_1:%.+]] = load{{.+}} [[OMP_UB]],
+ // CHECK: [[NUM_IT_1:%.+]] = load{{.+}},
+ // CHECK-DAG: [[CMP_UB_NUM_IT:%.+]] = icmp sgt {{.+}} [[OMP_UB_VAL_1]], [[NUM_IT_1]]
+ // CHECK: br {{.+}} [[CMP_UB_NUM_IT]], label %[[EUB_TRUE:.+]], label %[[EUB_FALSE:.+]]
+ // CHECK-DAG: [[EUB_TRUE]]:
+ // CHECK: [[NUM_IT_2:%.+]] = load{{.+}},
+ // CHECK: br label %[[EUB_END:.+]]
+ // CHECK-DAG: [[EUB_FALSE]]:
+ // CHECK: [[OMP_UB_VAL2:%.+]] = load{{.+}} [[OMP_UB]],
+ // CHECK: br label %[[EUB_END]]
+ // CHECK-DAG: [[EUB_END]]:
+ // CHECK-DAG: [[EUB_RES:%.+]] = phi{{.+}} [ [[NUM_IT_2]], %[[EUB_TRUE]] ], [ [[OMP_UB_VAL2]], %[[EUB_FALSE]] ]
+ // CHECK: store{{.+}} [[EUB_RES]], {{.+}}* [[OMP_UB]],
+
+ // initialize omp.iv
+ // CHECK: [[OMP_LB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_LB]],
+ // CHECK: store {{.+}} [[OMP_LB_VAL_1]], {{.+}}* [[OMP_IV]],
+ // CHECK: br label %[[OMP_JUMP_BACK:.+]]
+
+ // check exit condition
+ // CHECK: [[OMP_JUMP_BACK]]:
+ // CHECK-DAG: [[OMP_IV_VAL_1:%.+]] = load {{.+}} [[OMP_IV]],
+ // CHECK-DAG: [[OMP_UB_VAL_3:%.+]] = load {{.+}} [[OMP_UB]],
+ // CHECK: [[CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_IV_VAL_1]], [[OMP_UB_VAL_3]]
+ // CHECK: br {{.+}} [[CMP_IV_UB]], label %[[DIST_BODY:.+]], label %[[DIST_END:.+]]
+
+ // check that PrevLB and PrevUB are passed to the 'for'
+ // CHECK: [[DIST_BODY]]:
+ // CHECK-DAG: [[OMP_PREV_LB:%.+]] = load {{.+}}, {{.+}} [[OMP_LB]],
+ // CHECK-64-DAG: [[OMP_PREV_LB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_LB]] to {{.+}}
+ // CHECK-DAG: [[OMP_PREV_UB:%.+]] = load {{.+}}, {{.+}} [[OMP_UB]],
+ // CHECK-64-DAG: [[OMP_PREV_UB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_UB]] to {{.+}}
+ // check that distlb and distub are properly passed to fork_call
+ // CHECK-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_1:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_EXT]], i{{[0-9]+}} [[OMP_PREV_UB_EXT]], {{.+}})
+ // CHECK-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_1:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB]], i{{[0-9]+}} [[OMP_PREV_UB]], {{.+}})
+ // CHECK: br label %[[DIST_INC:.+]]
+
+ // increment by stride (distInc - 'parallel for' executes the whole chunk) and latch
+ // CHECK: [[DIST_INC]]:
+ // CHECK-DAG: [[OMP_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_IV]],
+ // CHECK-DAG: [[OMP_ST_VAL_1:%.+]] = load {{.+}}, {{.+}}* [[OMP_ST]],
+ // CHECK: [[OMP_IV_INC:%.+]] = add{{.+}} [[OMP_IV_VAL_2]], [[OMP_ST_VAL_1]]
+ // CHECK: store{{.+}} [[OMP_IV_INC]], {{.+}}* [[OMP_IV]],
+ // CHECK: br label %[[OMP_JUMP_BACK]]
+
+ // CHECK-DAG: call void @__kmpc_for_static_fini(
+ // CHECK: ret
+
+ // implementation of 'parallel for'
+ // CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_1]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}})
+
+ // CHECK-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+
+ // initialize lb and ub to PrevLB and PrevUB
+ // CHECK-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+ // CHECK-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+ // CHECK-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+ // CHECK-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+ // CHECK-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // CHECK-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+ // CHECK-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]],{{.+}})
+
+ // PrevEUB is only used when 'for' has a chunked schedule, otherwise EUB is used
+ // In this case we use EUB
+ // CHECK-DAG: [[OMP_PF_UB_VAL_1:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // CHECK: [[PF_NUM_IT_1:%.+]] = load{{.+}},
+ // CHECK-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_1]], [[PF_NUM_IT_1]]
+ // CHECK: br i1 [[PF_CMP_UB_NUM_IT]], label %[[PF_EUB_TRUE:.+]], label %[[PF_EUB_FALSE:.+]]
+ // CHECK: [[PF_EUB_TRUE]]:
+ // CHECK: [[PF_NUM_IT_2:%.+]] = load{{.+}},
+ // CHECK: br label %[[PF_EUB_END:.+]]
+ // CHECK-DAG: [[PF_EUB_FALSE]]:
+ // CHECK: [[OMP_PF_UB_VAL2:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // CHECK: br label %[[PF_EUB_END]]
+ // CHECK-DAG: [[PF_EUB_END]]:
+ // CHECK-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_NUM_IT_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL2]], %[[PF_EUB_FALSE]] ]
+ // CHECK: store{{.+}} [[PF_EUB_RES]],{{.+}} [[OMP_PF_UB]],
+
+ // initialize omp.iv
+ // CHECK: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // CHECK: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+ // CHECK: br label %[[OMP_PF_JUMP_BACK:.+]]
+
+ // check exit condition
+ // CHECK: [[OMP_PF_JUMP_BACK]]:
+ // CHECK-DAG: [[OMP_PF_IV_VAL_1:%.+]] = load {{.+}} [[OMP_PF_IV]],
+ // CHECK-DAG: [[OMP_PF_UB_VAL_3:%.+]] = load {{.+}} [[OMP_PF_UB]],
+ // CHECK: [[PF_CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_PF_IV_VAL_1]], [[OMP_PF_UB_VAL_3]]
+ // CHECK: br {{.+}} [[PF_CMP_IV_UB]], label %[[PF_BODY:.+]], label %[[PF_END:.+]]
+
+ // check that PrevLB and PrevUB are passed to the 'for'
+ // CHECK: [[PF_BODY]]:
+ // CHECK-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // CHECK: br label {{.+}}
+
+ // check stride 1 for 'for' in 'distribute parallel for'
+ // CHECK-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // CHECK: [[OMP_PF_IV_INC:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_2]], 1
+ // CHECK: store{{.+}} [[OMP_PF_IV_INC]], {{.+}}* [[OMP_PF_IV]],
+ // CHECK: br label %[[OMP_PF_JUMP_BACK]]
+
+ // CHECK-DAG: call void @__kmpc_for_static_fini(
+ // CHECK: ret
+ }
+
+ // dist_schedule: static no chunk
+ #pragma omp target
+ #pragma omp teams
+ // CHECK: define{{.+}} void [[OFFLOADING_FUN_2]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED_2:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for dist_schedule(static)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // CHECK: define{{.+}} void [[OMP_OUTLINED_2]](
+ // CHECK-DAG: [[OMP_IV:%.omp.iv]] = alloca
+ // CHECK-DAG: [[OMP_LB:%.omp.comb.lb]] = alloca
+ // CHECK-DAG: [[OMP_UB:%.omp.comb.ub]] = alloca
+ // CHECK-DAG: [[OMP_ST:%.omp.stride]] = alloca
+
+ // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+
+ // check EUB for distribute
+ // CHECK-DAG: [[OMP_UB_VAL_1:%.+]] = load{{.+}} [[OMP_UB]],
+ // CHECK: [[NUM_IT_1:%.+]] = load{{.+}},
+ // CHECK-DAG: [[CMP_UB_NUM_IT:%.+]] = icmp sgt {{.+}} [[OMP_UB_VAL_1]], [[NUM_IT_1]]
+ // CHECK: br {{.+}} [[CMP_UB_NUM_IT]], label %[[EUB_TRUE:.+]], label %[[EUB_FALSE:.+]]
+ // CHECK-DAG: [[EUB_TRUE]]:
+ // CHECK: [[NUM_IT_2:%.+]] = load{{.+}},
+ // CHECK: br label %[[EUB_END:.+]]
+ // CHECK-DAG: [[EUB_FALSE]]:
+ // CHECK: [[OMP_UB_VAL2:%.+]] = load{{.+}} [[OMP_UB]],
+ // CHECK: br label %[[EUB_END]]
+ // CHECK-DAG: [[EUB_END]]:
+ // CHECK-DAG: [[EUB_RES:%.+]] = phi{{.+}} [ [[NUM_IT_2]], %[[EUB_TRUE]] ], [ [[OMP_UB_VAL2]], %[[EUB_FALSE]] ]
+ // CHECK: store{{.+}} [[EUB_RES]], {{.+}}* [[OMP_UB]],
+
+ // initialize omp.iv
+ // CHECK: [[OMP_LB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_LB]],
+ // CHECK: store {{.+}} [[OMP_LB_VAL_1]], {{.+}}* [[OMP_IV]],
+ // CHECK: br label %[[OMP_JUMP_BACK:.+]]
+
+ // check exit condition
+ // CHECK: [[OMP_JUMP_BACK]]:
+ // CHECK-DAG: [[OMP_IV_VAL_1:%.+]] = load {{.+}} [[OMP_IV]],
+ // CHECK-DAG: [[OMP_UB_VAL_3:%.+]] = load {{.+}} [[OMP_UB]],
+ // CHECK: [[CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_IV_VAL_1]], [[OMP_UB_VAL_3]]
+ // CHECK: br {{.+}} [[CMP_IV_UB]], label %[[DIST_BODY:.+]], label %[[DIST_END:.+]]
+
+ // check that PrevLB and PrevUB are passed to the 'for'
+ // CHECK: [[DIST_BODY]]:
+ // CHECK-DAG: [[OMP_PREV_LB:%.+]] = load {{.+}}, {{.+}} [[OMP_LB]],
+ // CHECK-64-DAG: [[OMP_PREV_LB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_LB]] to {{.+}}
+ // CHECK-DAG: [[OMP_PREV_UB:%.+]] = load {{.+}}, {{.+}} [[OMP_UB]],
+ // CHECK-64-DAG: [[OMP_PREV_UB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_UB]] to {{.+}}
+ // check that distlb and distub are properly passed to fork_call
+ // CHECK-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_2:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_EXT]], i{{[0-9]+}} [[OMP_PREV_UB_EXT]], {{.+}})
+ // CHECK-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_2:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB]], i{{[0-9]+}} [[OMP_PREV_UB]], {{.+}})
+ // CHECK: br label %[[DIST_INC:.+]]
+
+ // increment by stride (distInc - 'parallel for' executes the whole chunk) and latch
+ // CHECK: [[DIST_INC]]:
+ // CHECK-DAG: [[OMP_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_IV]],
+ // CHECK-DAG: [[OMP_ST_VAL_1:%.+]] = load {{.+}}, {{.+}}* [[OMP_ST]],
+ // CHECK: [[OMP_IV_INC:%.+]] = add{{.+}} [[OMP_IV_VAL_2]], [[OMP_ST_VAL_1]]
+ // CHECK: store{{.+}} [[OMP_IV_INC]], {{.+}}* [[OMP_IV]],
+ // CHECK: br label %[[OMP_JUMP_BACK]]
+
+ // CHECK-DAG: call void @__kmpc_for_static_fini(
+ // CHECK: ret
+
+ // implementation of 'parallel for'
+ // CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_2]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}})
+
+ // CHECK-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+
+ // initialize lb and ub to PrevLB and PrevUB
+ // CHECK-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+ // CHECK-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+ // CHECK-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+ // CHECK-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+ // CHECK-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // CHECK-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+ // CHECK-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]],{{.+}})
+
+ // PrevEUB is only used when 'for' has a chunked schedule, otherwise EUB is used
+ // In this case we use EUB
+ // CHECK-DAG: [[OMP_PF_UB_VAL_1:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // CHECK: [[PF_NUM_IT_1:%.+]] = load{{.+}},
+ // CHECK-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_1]], [[PF_NUM_IT_1]]
+ // CHECK: br i1 [[PF_CMP_UB_NUM_IT]], label %[[PF_EUB_TRUE:.+]], label %[[PF_EUB_FALSE:.+]]
+ // CHECK: [[PF_EUB_TRUE]]:
+ // CHECK: [[PF_NUM_IT_2:%.+]] = load{{.+}},
+ // CHECK: br label %[[PF_EUB_END:.+]]
+ // CHECK-DAG: [[PF_EUB_FALSE]]:
+ // CHECK: [[OMP_PF_UB_VAL2:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // CHECK: br label %[[PF_EUB_END]]
+ // CHECK-DAG: [[PF_EUB_END]]:
+ // CHECK-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_NUM_IT_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL2]], %[[PF_EUB_FALSE]] ]
+ // CHECK: store{{.+}} [[PF_EUB_RES]],{{.+}} [[OMP_PF_UB]],
+
+ // initialize omp.iv
+ // CHECK: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // CHECK: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+ // CHECK: br label %[[OMP_PF_JUMP_BACK:.+]]
+
+ // check exit condition
+ // CHECK: [[OMP_PF_JUMP_BACK]]:
+ // CHECK-DAG: [[OMP_PF_IV_VAL_1:%.+]] = load {{.+}} [[OMP_PF_IV]],
+ // CHECK-DAG: [[OMP_PF_UB_VAL_3:%.+]] = load {{.+}} [[OMP_PF_UB]],
+ // CHECK: [[PF_CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_PF_IV_VAL_1]], [[OMP_PF_UB_VAL_3]]
+ // CHECK: br {{.+}} [[PF_CMP_IV_UB]], label %[[PF_BODY:.+]], label %[[PF_END:.+]]
+
+ // check that PrevLB and PrevUB are passed to the 'for'
+ // CHECK: [[PF_BODY]]:
+ // CHECK-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // CHECK: br label {{.+}}
+
+ // check stride 1 for 'for' in 'distribute parallel for'
+ // CHECK-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // CHECK: [[OMP_PF_IV_INC:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_2]], 1
+ // CHECK: store{{.+}} [[OMP_PF_IV_INC]], {{.+}}* [[OMP_PF_IV]],
+ // CHECK: br label %[[OMP_PF_JUMP_BACK]]
+
+ // CHECK-DAG: call void @__kmpc_for_static_fini(
+ // CHECK: ret
+ }
+
+ // dist_schedule: static chunk
+ #pragma omp target
+ #pragma omp teams
+ // CHECK: define{{.+}} void [[OFFLOADING_FUN_3]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}}* [[OMP_OUTLINED_3:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for dist_schedule(static, ch)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // CHECK: define{{.+}} void [[OMP_OUTLINED_3]](
+ // CHECK-DAG: [[OMP_IV:%.omp.iv]] = alloca
+ // CHECK-DAG: [[OMP_LB:%.omp.comb.lb]] = alloca
+ // CHECK-DAG: [[OMP_UB:%.omp.comb.ub]] = alloca
+ // CHECK-DAG: [[OMP_ST:%.omp.stride]] = alloca
+
+ // unlike the previous tests, in this one we have a outer and inner loop for 'distribute'
+ // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 91,
+ // CHECK: br label %[[DIST_OUTER_LOOP_HEADER:.+]]
+
+ // CHECK: [[DIST_OUTER_LOOP_HEADER]]:
+ // check EUB for distribute
+ // CHECK-DAG: [[OMP_UB_VAL_1:%.+]] = load{{.+}} [[OMP_UB]],
+ // CHECK: [[NUM_IT_1:%.+]] = load{{.+}},
+ // CHECK-DAG: [[CMP_UB_NUM_IT:%.+]] = icmp sgt {{.+}} [[OMP_UB_VAL_1]], [[NUM_IT_1]]
+ // CHECK: br {{.+}} [[CMP_UB_NUM_IT]], label %[[EUB_TRUE:.+]], label %[[EUB_FALSE:.+]]
+ // CHECK-DAG: [[EUB_TRUE]]:
+ // CHECK: [[NUM_IT_2:%.+]] = load{{.+}},
+ // CHECK: br label %[[EUB_END:.+]]
+ // CHECK-DAG: [[EUB_FALSE]]:
+ // CHECK: [[OMP_UB_VAL2:%.+]] = load{{.+}} [[OMP_UB]],
+ // CHECK: br label %[[EUB_END]]
+ // CHECK-DAG: [[EUB_END]]:
+ // CHECK-DAG: [[EUB_RES:%.+]] = phi{{.+}} [ [[NUM_IT_2]], %[[EUB_TRUE]] ], [ [[OMP_UB_VAL2]], %[[EUB_FALSE]] ]
+ // CHECK: store{{.+}} [[EUB_RES]], {{.+}}* [[OMP_UB]],
+
+ // initialize omp.iv
+ // CHECK: [[OMP_LB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_LB]],
+ // CHECK: store {{.+}} [[OMP_LB_VAL_1]], {{.+}}* [[OMP_IV]],
+
+ // check exit condition
+ // CHECK-DAG: [[OMP_IV_VAL_1:%.+]] = load {{.+}} [[OMP_IV]],
+ // CHECK-DAG: [[OMP_UB_VAL_3:%.+]] = load {{.+}} [[OMP_UB]],
+ // CHECK: [[CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_IV_VAL_1]], [[OMP_UB_VAL_3]]
+ // CHECK: br {{.+}} [[CMP_IV_UB]], label %[[DIST_OUTER_LOOP_BODY:.+]], label %[[DIST_OUTER_LOOP_END:.+]]
+
+ // CHECK: [[DIST_OUTER_LOOP_BODY]]:
+ // CHECK: br label %[[DIST_INNER_LOOP_HEADER:.+]]
+
+ // CHECK: [[DIST_INNER_LOOP_HEADER]]:
+ // CHECK-DAG: [[OMP_IV_VAL_2:%.+]] = load {{.+}} [[OMP_IV]],
+ // CHECK-DAG: [[OMP_UB_VAL_4:%.+]] = load {{.+}} [[OMP_UB]],
+ // CHECK: [[CMP_IV_UB_2:%.+]] = icmp sle {{.+}} [[OMP_IV_VAL_2]], [[OMP_UB_VAL_4]]
+ // CHECK: br{{.+}} [[CMP_IV_UB_2]], label %[[DIST_INNER_LOOP_BODY:.+]], label %[[DIST_INNER_LOOP_END:.+]]
+
+ // check that PrevLB and PrevUB are passed to the 'for'
+ // CHECK: [[DIST_INNER_LOOP_BODY]]:
+ // CHECK-DAG: [[OMP_PREV_LB:%.+]] = load {{.+}}, {{.+}} [[OMP_LB]],
+ // CHECK-64-DAG: [[OMP_PREV_LB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_LB]] to {{.+}}
+ // CHECK-DAG: [[OMP_PREV_UB:%.+]] = load {{.+}}, {{.+}} [[OMP_UB]],
+ // CHECK-64-DAG: [[OMP_PREV_UB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_UB]] to {{.+}}
+ // check that distlb and distub are properly passed to fork_call
+ // CHECK-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_3:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_EXT]], i{{[0-9]+}} [[OMP_PREV_UB_EXT]], {{.+}})
+ // CHECK-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_3:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB]], i{{[0-9]+}} [[OMP_PREV_UB]], {{.+}})
+ // CHECK: br label %[[DIST_INNER_LOOP_INC:.+]]
+
+ // check DistInc
+ // CHECK: [[DIST_INNER_LOOP_INC]]:
+ // CHECK-DAG: [[OMP_IV_VAL_3:%.+]] = load {{.+}}, {{.+}}* [[OMP_IV]],
+ // CHECK-DAG: [[OMP_ST_VAL_1:%.+]] = load {{.+}}, {{.+}}* [[OMP_ST]],
+ // CHECK: [[OMP_IV_INC:%.+]] = add{{.+}} [[OMP_IV_VAL_3]], [[OMP_ST_VAL_1]]
+ // CHECK: store{{.+}} [[OMP_IV_INC]], {{.+}}* [[OMP_IV]],
+ // CHECK: br label %[[DIST_INNER_LOOP_HEADER]]
+
+ // CHECK: [[DIST_INNER_LOOP_END]]:
+ // CHECK: br label %[[DIST_OUTER_LOOP_INC:.+]]
+
+ // CHECK: [[DIST_OUTER_LOOP_INC]]:
+ // check NextLB and NextUB
+ // CHECK-DAG: [[OMP_LB_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_LB]],
+ // CHECK-DAG: [[OMP_ST_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_ST]],
+ // CHECK-DAG: [[OMP_LB_NEXT:%.+]] = add{{.+}} [[OMP_LB_VAL_2]], [[OMP_ST_VAL_2]]
+ // CHECK: store{{.+}} [[OMP_LB_NEXT]], {{.+}}* [[OMP_LB]],
+ // CHECK-DAG: [[OMP_UB_VAL_5:%.+]] = load{{.+}}, {{.+}} [[OMP_UB]],
+ // CHECK-DAG: [[OMP_ST_VAL_3:%.+]] = load{{.+}}, {{.+}} [[OMP_ST]],
+ // CHECK-DAG: [[OMP_UB_NEXT:%.+]] = add{{.+}} [[OMP_UB_VAL_5]], [[OMP_ST_VAL_3]]
+ // CHECK: store{{.+}} [[OMP_UB_NEXT]], {{.+}}* [[OMP_UB]],
+ // CHECK: br label %[[DIST_OUTER_LOOP_HEADER]]
+
+ // outer loop exit
+ // CHECK: [[DIST_OUTER_LOOP_END]]:
+ // CHECK-DAG: call void @__kmpc_for_static_fini(
+ // CHECK: ret
+
+ // skip implementation of 'parallel for': using default scheduling and was tested above
+ }
+
+ // schedule: static no chunk
+ #pragma omp target
+ #pragma omp teams
+ // CHECK: define{{.+}} void [[OFFLOADING_FUN_4]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED_4:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for schedule(static)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // CHECK: define{{.+}} void [[OMP_OUTLINED_4]](
+ // CHECK-DAG: [[OMP_IV:%.omp.iv]] = alloca
+ // CHECK-DAG: [[OMP_LB:%.omp.comb.lb]] = alloca
+ // CHECK-DAG: [[OMP_UB:%.omp.comb.ub]] = alloca
+ // CHECK-DAG: [[OMP_ST:%.omp.stride]] = alloca
+
+ // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+ // CHECK: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_4:@.+]] to {{.+}},
+ // skip rest of implementation of 'distribute' as it is tested above for default dist_schedule case
+ // CHECK: ret
+
+ // 'parallel for' implementation is the same as the case without schedule clase (static no chunk is the default)
+ // CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_4]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}})
+
+ // CHECK-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+
+ // initialize lb and ub to PrevLB and PrevUB
+ // CHECK-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+ // CHECK-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+ // CHECK-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+ // CHECK-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+ // CHECK-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // CHECK-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+ // CHECK-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]],{{.+}})
+
+ // PrevEUB is only used when 'for' has a chunked schedule, otherwise EUB is used
+ // In this case we use EUB
+ // CHECK-DAG: [[OMP_PF_UB_VAL_1:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // CHECK: [[PF_NUM_IT_1:%.+]] = load{{.+}},
+ // CHECK-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_1]], [[PF_NUM_IT_1]]
+ // CHECK: br i1 [[PF_CMP_UB_NUM_IT]], label %[[PF_EUB_TRUE:.+]], label %[[PF_EUB_FALSE:.+]]
+ // CHECK: [[PF_EUB_TRUE]]:
+ // CHECK: [[PF_NUM_IT_2:%.+]] = load{{.+}},
+ // CHECK: br label %[[PF_EUB_END:.+]]
+ // CHECK-DAG: [[PF_EUB_FALSE]]:
+ // CHECK: [[OMP_PF_UB_VAL2:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // CHECK: br label %[[PF_EUB_END]]
+ // CHECK-DAG: [[PF_EUB_END]]:
+ // CHECK-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_NUM_IT_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL2]], %[[PF_EUB_FALSE]] ]
+ // CHECK: store{{.+}} [[PF_EUB_RES]],{{.+}} [[OMP_PF_UB]],
+
+ // initialize omp.iv
+ // CHECK: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // CHECK: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+ // CHECK: br label %[[OMP_PF_JUMP_BACK:.+]]
+
+ // check exit condition
+ // CHECK: [[OMP_PF_JUMP_BACK]]:
+ // CHECK-DAG: [[OMP_PF_IV_VAL_1:%.+]] = load {{.+}} [[OMP_PF_IV]],
+ // CHECK-DAG: [[OMP_PF_UB_VAL_3:%.+]] = load {{.+}} [[OMP_PF_UB]],
+ // CHECK: [[PF_CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_PF_IV_VAL_1]], [[OMP_PF_UB_VAL_3]]
+ // CHECK: br {{.+}} [[PF_CMP_IV_UB]], label %[[PF_BODY:.+]], label %[[PF_END:.+]]
+
+ // check that PrevLB and PrevUB are passed to the 'for'
+ // CHECK: [[PF_BODY]]:
+ // CHECK-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // CHECK: br label {{.+}}
+
+ // check stride 1 for 'for' in 'distribute parallel for'
+ // CHECK-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // CHECK: [[OMP_PF_IV_INC:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_2]], 1
+ // CHECK: store{{.+}} [[OMP_PF_IV_INC]], {{.+}}* [[OMP_PF_IV]],
+ // CHECK: br label %[[OMP_PF_JUMP_BACK]]
+
+ // CHECK-DAG: call void @__kmpc_for_static_fini(
+ // CHECK: ret
+ }
+
+ // schedule: static chunk
+ #pragma omp target
+ #pragma omp teams
+ // CHECK: define{{.+}} void [[OFFLOADING_FUN_5]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}}* [[OMP_OUTLINED_5:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for schedule(static, ch)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // CHECK: define{{.+}} void [[OMP_OUTLINED_5]](
+ // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+ // CHECK: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_5:@.+]] to {{.+}},
+ // skip rest of implementation of 'distribute' as it is tested above for default dist_schedule case
+ // CHECK: ret
+
+ // 'parallel for' implementation using outer and inner loops and PrevEUB
+ // CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_5]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}})
+ // CHECK-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_ST:%.omp.stride]] = alloca{{.+}},
+
+ // initialize lb and ub to PrevLB and PrevUB
+ // CHECK-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+ // CHECK-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+ // CHECK-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+ // CHECK-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+ // CHECK-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // CHECK-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+ // CHECK-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]],{{.+}})
+ // CHECK: br label %[[OMP_PF_OUTER_LOOP_HEADER:.+]]
+
+ // check PrevEUB (using PrevUB instead of NumIt as upper bound)
+ // CHECK: [[OMP_PF_OUTER_LOOP_HEADER]]:
+ // CHECK-DAG: [[OMP_PF_UB_VAL_1:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // CHECK-64-DAG: [[OMP_PF_UB_VAL_CONV:%.+]] = sext{{.+}} [[OMP_PF_UB_VAL_1]] to
+ // CHECK: [[PF_PREV_UB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // CHECK-64-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_CONV]], [[PF_PREV_UB_VAL_1]]
+ // CHECK-32-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_1]], [[PF_PREV_UB_VAL_1]]
+ // CHECK: br i1 [[PF_CMP_UB_NUM_IT]], label %[[PF_EUB_TRUE:.+]], label %[[PF_EUB_FALSE:.+]]
+ // CHECK: [[PF_EUB_TRUE]]:
+ // CHECK: [[PF_PREV_UB_VAL_2:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // CHECK: br label %[[PF_EUB_END:.+]]
+ // CHECK-DAG: [[PF_EUB_FALSE]]:
+ // CHECK: [[OMP_PF_UB_VAL_2:%.+]] = load{{.+}} [[OMP_PF_UB]],
+ // CHECK-64: [[OMP_PF_UB_VAL_2_CONV:%.+]] = sext{{.+}} [[OMP_PF_UB_VAL_2]] to
+ // CHECK: br label %[[PF_EUB_END]]
+ // CHECK-DAG: [[PF_EUB_END]]:
+ // CHECK-64-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_PREV_UB_VAL_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL_2_CONV]], %[[PF_EUB_FALSE]] ]
+ // CHECK-32-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_PREV_UB_VAL_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL_2]], %[[PF_EUB_FALSE]] ]
+ // CHECK-64-DAG: [[PF_EUB_RES_CONV:%.+]] = trunc{{.+}} [[PF_EUB_RES]] to
+ // CHECK-64: store{{.+}} [[PF_EUB_RES_CONV]],{{.+}} [[OMP_PF_UB]],
+ // CHECK-32: store{{.+}} [[PF_EUB_RES]], {{.+}} [[OMP_PF_UB]],
+
+ // initialize omp.iv (IV = LB)
+ // CHECK: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // CHECK: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+
+ // outer loop: while (IV < UB) {
+ // CHECK-DAG: [[OMP_PF_IV_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // CHECK-DAG: [[OMP_PF_UB_VAL_3:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_UB]],
+ // CHECK: [[PF_CMP_IV_UB_1:%.+]] = icmp{{.+}} [[OMP_PF_IV_VAL_1]], [[OMP_PF_UB_VAL_3]]
+ // CHECK: br{{.+}} [[PF_CMP_IV_UB_1]], label %[[OMP_PF_OUTER_LOOP_BODY:.+]], label %[[OMP_PF_OUTER_LOOP_END:.+]]
+
+ // CHECK: [[OMP_PF_OUTER_LOOP_BODY]]:
+ // CHECK: br label %[[OMP_PF_INNER_FOR_HEADER:.+]]
+
+ // CHECK: [[OMP_PF_INNER_FOR_HEADER]]:
+ // CHECK-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // CHECK-DAG: [[OMP_PF_UB_VAL_4:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_UB]],
+ // CHECK: [[PF_CMP_IV_UB_2:%.+]] = icmp{{.+}} [[OMP_PF_IV_VAL_2]], [[OMP_PF_UB_VAL_4]]
+ // CHECK: br{{.+}} [[PF_CMP_IV_UB_2]], label %[[OMP_PF_INNER_LOOP_BODY:.+]], label %[[OMP_PF_INNER_LOOP_END:.+]]
+
+ // CHECK: [[OMP_PF_INNER_LOOP_BODY]]:
+ // CHECK-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // skip body branch
+ // CHECK: br{{.+}}
+ // CHECK: br label %[[OMP_PF_INNER_LOOP_INC:.+]]
+
+ // IV = IV + 1 and inner loop latch
+ // CHECK: [[OMP_PF_INNER_LOOP_INC]]:
+ // CHECK-DAG: [[OMP_PF_IV_VAL_3:%.+]] = load{{.+}}, {{.+}}* [[OMP_IV]],
+ // CHECK-DAG: [[OMP_PF_NEXT_IV:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_3]], 1
+ // CHECK-DAG: store{{.+}} [[OMP_PF_NEXT_IV]], {{.+}}* [[OMP_IV]],
+ // CHECK: br label %[[OMP_PF_INNER_FOR_HEADER]]
+
+ // check NextLB and NextUB
+ // CHECK: [[OMP_PF_INNER_LOOP_END]]:
+ // CHECK: br label %[[OMP_PF_OUTER_LOOP_INC:.+]]
+
+ // CHECK: [[OMP_PF_OUTER_LOOP_INC]]:
+ // CHECK-DAG: [[OMP_PF_LB_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // CHECK-DAG: [[OMP_PF_ST_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_ST]],
+ // CHECK-DAG: [[OMP_PF_LB_NEXT:%.+]] = add{{.+}} [[OMP_PF_LB_VAL_2]], [[OMP_PF_ST_VAL_1]]
+ // CHECK: store{{.+}} [[OMP_PF_LB_NEXT]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-DAG: [[OMP_PF_UB_VAL_5:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_UB]],
+ // CHECK-DAG: [[OMP_PF_ST_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_ST]],
+ // CHECK-DAG: [[OMP_PF_UB_NEXT:%.+]] = add{{.+}} [[OMP_PF_UB_VAL_5]], [[OMP_PF_ST_VAL_2]]
+ // CHECK: store{{.+}} [[OMP_PF_UB_NEXT]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK: br label %[[OMP_PF_OUTER_LOOP_HEADER]]
+
+ // CHECK: [[OMP_PF_OUTER_LOOP_END]]:
+ // CHECK-DAG: call void @__kmpc_for_static_fini(
+ // CHECK: ret
+ }
+
+ // schedule: dynamic no chunk
+ #pragma omp target
+ #pragma omp teams
+ // CHECK: define{{.+}} void [[OFFLOADING_FUN_6]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED_6:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for schedule(dynamic)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // CHECK: define{{.+}} void [[OMP_OUTLINED_6]](
+ // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+ // CHECK: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_6:@.+]] to {{.+}},
+ // skip rest of implementation of 'distribute' as it is tested above for default dist_schedule case
+ // CHECK: ret
+
+ // 'parallel for' implementation using outer and inner loops and PrevEUB
+ // CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_6]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}})
+ // CHECK-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_ST:%.omp.stride]] = alloca{{.+}},
+
+ // initialize lb and ub to PrevLB and PrevUB
+ // CHECK-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+ // CHECK-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+ // CHECK-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+ // CHECK-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+ // CHECK-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // CHECK-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+ // CHECK-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK-DAG: [[OMP_PF_LB_VAL:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // CHECK-DAG: [[OMP_PF_UB_VAL:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_UB]],
+ // CHECK: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, {{.+}} 35, {{.+}} [[OMP_PF_LB_VAL]], {{.+}} [[OMP_PF_UB_VAL]], {{.+}}, {{.+}})
+ // CHECK: br label %[[OMP_PF_OUTER_LOOP_HEADER:.+]]
+
+ // CHECK: [[OMP_PF_OUTER_LOOP_HEADER]]:
+ // CHECK: [[IS_FIN:%.+]] = call{{.+}} @__kmpc_dispatch_next_4({{.+}}, {{.+}}, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]], {{.+}}* [[OMP_PF_ST]])
+ // CHECK: [[IS_FIN_CMP:%.+]] = icmp{{.+}} [[IS_FIN]], 0
+ // CHECK: br{{.+}} [[IS_FIN_CMP]], label %[[OMP_PF_OUTER_LOOP_BODY:.+]], label %[[OMP_PF_OUTER_LOOP_END:.+]]
+
+ // initialize omp.iv (IV = LB)
+ // CHECK: [[OMP_PF_OUTER_LOOP_BODY]]:
+ // CHECK-DAG: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // CHECK-DAG: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+ // CHECK: br label %[[OMP_PF_INNER_LOOP_HEADER:.+]]
+
+ // CHECK: [[OMP_PF_INNER_LOOP_HEADER]]:
+ // CHECK-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // CHECK-DAG: [[OMP_PF_UB_VAL_4:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_UB]],
+ // CHECK: [[PF_CMP_IV_UB_2:%.+]] = icmp{{.+}} [[OMP_PF_IV_VAL_2]], [[OMP_PF_UB_VAL_4]]
+ // CHECK: br{{.+}} [[PF_CMP_IV_UB_2]], label %[[OMP_PF_INNER_LOOP_BODY:.+]], label %[[OMP_PF_INNER_LOOP_END:.+]]
+
+ // CHECK: [[OMP_PF_INNER_LOOP_BODY]]:
+ // CHECK-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // skip body branch
+ // CHECK: br{{.+}}
+ // CHECK: br label %[[OMP_PF_INNER_LOOP_INC:.+]]
+
+ // IV = IV + 1 and inner loop latch
+ // CHECK: [[OMP_PF_INNER_LOOP_INC]]:
+ // CHECK-DAG: [[OMP_PF_IV_VAL_3:%.+]] = load{{.+}}, {{.+}}* [[OMP_IV]],
+ // CHECK-DAG: [[OMP_PF_NEXT_IV:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_3]], 1
+ // CHECK-DAG: store{{.+}} [[OMP_PF_NEXT_IV]], {{.+}}* [[OMP_IV]],
+ // CHECK: br label %[[OMP_PF_INNER_LOOP_HEADER]]
+
+ // check NextLB and NextUB
+ // CHECK: [[OMP_PF_INNER_LOOP_END]]:
+ // CHECK: br label %[[OMP_PF_OUTER_LOOP_INC:.+]]
+
+ // CHECK: [[OMP_PF_OUTER_LOOP_INC]]:
+ // CHECK: br label %[[OMP_PF_OUTER_LOOP_HEADER]]
+
+ // CHECK: [[OMP_PF_OUTER_LOOP_END]]:
+ // CHECK: ret
+ }
+
+ // schedule: dynamic chunk
+ #pragma omp target
+ #pragma omp teams
+ // CHECK: define{{.+}} void [[OFFLOADING_FUN_7]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}}* [[OMP_OUTLINED_7:@.+]] to {{.+}})
+
+ #pragma omp distribute parallel for schedule(dynamic, ch)
+ for (int i = 0; i < n; ++i) {
+ a[i] = b[i] + c[i];
+ // CHECK: define{{.+}} void [[OMP_OUTLINED_7]](
+ // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+ // CHECK: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_7:@.+]] to {{.+}},
+ // skip rest of implementation of 'distribute' as it is tested above for default dist_schedule case
+ // CHECK: ret
+
+ // 'parallel for' implementation using outer and inner loops and PrevEUB
+ // CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_7]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}})
+ // CHECK-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+ // CHECK-DAG: [[OMP_PF_ST:%.omp.stride]] = alloca{{.+}},
+
+ // initialize lb and ub to PrevLB and PrevUB
+ // CHECK-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+ // CHECK-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+ // CHECK-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+ // CHECK-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+ // CHECK-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+ // CHECK-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+ // CHECK-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+ // CHECK-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+ // CHECK-DAG: [[OMP_PF_LB_VAL:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // CHECK-DAG: [[OMP_PF_UB_VAL:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_UB]],
+ // CHECK: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, {{.+}} 35, {{.+}} [[OMP_PF_LB_VAL]], {{.+}} [[OMP_PF_UB_VAL]], {{.+}}, {{.+}})
+ // CHECK: br label %[[OMP_PF_OUTER_LOOP_HEADER:.+]]
+
+ // CHECK: [[OMP_PF_OUTER_LOOP_HEADER]]:
+ // CHECK: [[IS_FIN:%.+]] = call{{.+}} @__kmpc_dispatch_next_4({{.+}}, {{.+}}, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]], {{.+}}* [[OMP_PF_ST]])
+ // CHECK: [[IS_FIN_CMP:%.+]] = icmp{{.+}} [[IS_FIN]], 0
+ // CHECK: br{{.+}} [[IS_FIN_CMP]], label %[[OMP_PF_OUTER_LOOP_BODY:.+]], label %[[OMP_PF_OUTER_LOOP_END:.+]]
+
+ // initialize omp.iv (IV = LB)
+ // CHECK: [[OMP_PF_OUTER_LOOP_BODY]]:
+ // CHECK-DAG: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+ // CHECK-DAG: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+ // CHECK: br label %[[OMP_PF_INNER_LOOP_HEADER:.+]]
+
+ // CHECK: [[OMP_PF_INNER_LOOP_HEADER]]:
+ // CHECK-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // CHECK-DAG: [[OMP_PF_UB_VAL_4:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_UB]],
+ // CHECK: [[PF_CMP_IV_UB_2:%.+]] = icmp{{.+}} [[OMP_PF_IV_VAL_2]], [[OMP_PF_UB_VAL_4]]
+ // CHECK: br{{.+}} [[PF_CMP_IV_UB_2]], label %[[OMP_PF_INNER_LOOP_BODY:.+]], label %[[OMP_PF_INNER_LOOP_END:.+]]
+
+ // CHECK: [[OMP_PF_INNER_LOOP_BODY]]:
+ // CHECK-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+ // skip body branch
+ // CHECK: br{{.+}}
+ // CHECK: br label %[[OMP_PF_INNER_LOOP_INC:.+]]
+
+ // IV = IV + 1 and inner loop latch
+ // CHECK: [[OMP_PF_INNER_LOOP_INC]]:
+ // CHECK-DAG: [[OMP_PF_IV_VAL_3:%.+]] = load{{.+}}, {{.+}}* [[OMP_IV]],
+ // CHECK-DAG: [[OMP_PF_NEXT_IV:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_3]], 1
+ // CHECK-DAG: store{{.+}} [[OMP_PF_NEXT_IV]], {{.+}}* [[OMP_IV]],
+ // CHECK: br label %[[OMP_PF_INNER_LOOP_HEADER]]
+
+ // check NextLB and NextUB
+ // CHECK: [[OMP_PF_INNER_LOOP_END]]:
+ // CHECK: br label %[[OMP_PF_OUTER_LOOP_INC:.+]]
+
+ // CHECK: [[OMP_PF_OUTER_LOOP_INC]]:
+ // CHECK: br label %[[OMP_PF_OUTER_LOOP_HEADER]]
+
+ // CHECK: [[OMP_PF_OUTER_LOOP_END]]:
+ // CHECK: ret
+ }
+
+ return tmain<int>();
+#endif
+}
+
+// check code
+// CHECK: define{{.+}} [[TMAIN]]()
+
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_1:@.+]](
+
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_2:@.+]](
+
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_3:@.+]](
+
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_4:@.+]](
+
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_5:@.+]](
+
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_6:@.+]](
+
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_7:@.+]](
+
+// CHECK: define{{.+}} void [[OFFLOADING_FUN_1]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED_1:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} void [[OMP_OUTLINED_1]](
+// CHECK-DAG: [[OMP_IV:%.omp.iv]] = alloca
+// CHECK-DAG: [[OMP_LB:%.omp.comb.lb]] = alloca
+// CHECK-DAG: [[OMP_UB:%.omp.comb.ub]] = alloca
+// CHECK-DAG: [[OMP_ST:%.omp.stride]] = alloca
+
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+
+// check EUB for distribute
+// CHECK-DAG: [[OMP_UB_VAL_1:%.+]] = load{{.+}} [[OMP_UB]],
+// CHECK: [[NUM_IT_1:%.+]] = load{{.+}},
+// CHECK-DAG: [[CMP_UB_NUM_IT:%.+]] = icmp sgt {{.+}} [[OMP_UB_VAL_1]], [[NUM_IT_1]]
+// CHECK: br {{.+}} [[CMP_UB_NUM_IT]], label %[[EUB_TRUE:.+]], label %[[EUB_FALSE:.+]]
+// CHECK-DAG: [[EUB_TRUE]]:
+// CHECK: [[NUM_IT_2:%.+]] = load{{.+}},
+// CHECK: br label %[[EUB_END:.+]]
+// CHECK-DAG: [[EUB_FALSE]]:
+// CHECK: [[OMP_UB_VAL2:%.+]] = load{{.+}} [[OMP_UB]],
+// CHECK: br label %[[EUB_END]]
+// CHECK-DAG: [[EUB_END]]:
+// CHECK-DAG: [[EUB_RES:%.+]] = phi{{.+}} [ [[NUM_IT_2]], %[[EUB_TRUE]] ], [ [[OMP_UB_VAL2]], %[[EUB_FALSE]] ]
+// CHECK: store{{.+}} [[EUB_RES]], {{.+}}* [[OMP_UB]],
+
+// initialize omp.iv
+// CHECK: [[OMP_LB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_LB]],
+// CHECK: store {{.+}} [[OMP_LB_VAL_1]], {{.+}}* [[OMP_IV]],
+// CHECK: br label %[[OMP_JUMP_BACK:.+]]
+
+// check exit condition
+// CHECK: [[OMP_JUMP_BACK]]:
+// CHECK-DAG: [[OMP_IV_VAL_1:%.+]] = load {{.+}} [[OMP_IV]],
+// CHECK-DAG: [[OMP_UB_VAL_3:%.+]] = load {{.+}} [[OMP_UB]],
+// CHECK: [[CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_IV_VAL_1]], [[OMP_UB_VAL_3]]
+// CHECK: br {{.+}} [[CMP_IV_UB]], label %[[DIST_BODY:.+]], label %[[DIST_END:.+]]
+
+// check that PrevLB and PrevUB are passed to the 'for'
+// CHECK: [[DIST_BODY]]:
+// CHECK-DAG: [[OMP_PREV_LB:%.+]] = load {{.+}}, {{.+}} [[OMP_LB]],
+// CHECK-64-DAG: [[OMP_PREV_LB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_LB]] to {{.+}}
+// CHECK-DAG: [[OMP_PREV_UB:%.+]] = load {{.+}}, {{.+}} [[OMP_UB]],
+// CHECK-64-DAG: [[OMP_PREV_UB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_UB]] to {{.+}}
+// check that distlb and distub are properly passed to fork_call
+// CHECK-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_1:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_EXT]], i{{[0-9]+}} [[OMP_PREV_UB_EXT]], {{.+}})
+// CHECK-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_1:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB]], i{{[0-9]+}} [[OMP_PREV_UB]], {{.+}})
+// CHECK: br label %[[DIST_INC:.+]]
+
+// increment by stride (distInc - 'parallel for' executes the whole chunk) and latch
+// CHECK: [[DIST_INC]]:
+// CHECK-DAG: [[OMP_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_IV]],
+// CHECK-DAG: [[OMP_ST_VAL_1:%.+]] = load {{.+}}, {{.+}}* [[OMP_ST]],
+// CHECK: [[OMP_IV_INC:%.+]] = add{{.+}} [[OMP_IV_VAL_2]], [[OMP_ST_VAL_1]]
+// CHECK: store{{.+}} [[OMP_IV_INC]], {{.+}}* [[OMP_IV]],
+// CHECK: br label %[[OMP_JUMP_BACK]]
+
+// CHECK-DAG: call void @__kmpc_for_static_fini(
+// CHECK: ret
+
+// implementation of 'parallel for'
+// CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_1]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}})
+
+// CHECK-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+
+// initialize lb and ub to PrevLB and PrevUB
+// CHECK-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+// CHECK-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+// CHECK-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+// CHECK-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+// CHECK-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+// CHECK-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+// CHECK-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+// CHECK-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]],{{.+}})
+
+// PrevEUB is only used when 'for' has a chunked schedule, otherwise EUB is used
+// In this case we use EUB
+// CHECK-DAG: [[OMP_PF_UB_VAL_1:%.+]] = load{{.+}} [[OMP_PF_UB]],
+// CHECK: [[PF_NUM_IT_1:%.+]] = load{{.+}},
+// CHECK-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_1]], [[PF_NUM_IT_1]]
+// CHECK: br i1 [[PF_CMP_UB_NUM_IT]], label %[[PF_EUB_TRUE:.+]], label %[[PF_EUB_FALSE:.+]]
+// CHECK: [[PF_EUB_TRUE]]:
+// CHECK: [[PF_NUM_IT_2:%.+]] = load{{.+}},
+// CHECK: br label %[[PF_EUB_END:.+]]
+// CHECK-DAG: [[PF_EUB_FALSE]]:
+// CHECK: [[OMP_PF_UB_VAL2:%.+]] = load{{.+}} [[OMP_PF_UB]],
+// CHECK: br label %[[PF_EUB_END]]
+// CHECK-DAG: [[PF_EUB_END]]:
+// CHECK-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_NUM_IT_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL2]], %[[PF_EUB_FALSE]] ]
+// CHECK: store{{.+}} [[PF_EUB_RES]],{{.+}} [[OMP_PF_UB]],
+
+// initialize omp.iv
+// CHECK: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+// CHECK: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+// CHECK: br label %[[OMP_PF_JUMP_BACK:.+]]
+
+// check exit condition
+// CHECK: [[OMP_PF_JUMP_BACK]]:
+// CHECK-DAG: [[OMP_PF_IV_VAL_1:%.+]] = load {{.+}} [[OMP_PF_IV]],
+// CHECK-DAG: [[OMP_PF_UB_VAL_3:%.+]] = load {{.+}} [[OMP_PF_UB]],
+// CHECK: [[PF_CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_PF_IV_VAL_1]], [[OMP_PF_UB_VAL_3]]
+// CHECK: br {{.+}} [[PF_CMP_IV_UB]], label %[[PF_BODY:.+]], label %[[PF_END:.+]]
+
+// check that PrevLB and PrevUB are passed to the 'for'
+// CHECK: [[PF_BODY]]:
+// CHECK-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+// CHECK: br label {{.+}}
+
+// check stride 1 for 'for' in 'distribute parallel for'
+// CHECK-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_PF_IV]],
+// CHECK: [[OMP_PF_IV_INC:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_2]], 1
+// CHECK: store{{.+}} [[OMP_PF_IV_INC]], {{.+}}* [[OMP_PF_IV]],
+// CHECK: br label %[[OMP_PF_JUMP_BACK]]
+
+// CHECK-DAG: call void @__kmpc_for_static_fini(
+// CHECK: ret
+
+// CHECK: define{{.+}} void [[OFFLOADING_FUN_2]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED_2:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} void [[OMP_OUTLINED_2]](
+// CHECK-DAG: [[OMP_IV:%.omp.iv]] = alloca
+// CHECK-DAG: [[OMP_LB:%.omp.comb.lb]] = alloca
+// CHECK-DAG: [[OMP_UB:%.omp.comb.ub]] = alloca
+// CHECK-DAG: [[OMP_ST:%.omp.stride]] = alloca
+
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+
+// check EUB for distribute
+// CHECK-DAG: [[OMP_UB_VAL_1:%.+]] = load{{.+}} [[OMP_UB]],
+// CHECK: [[NUM_IT_1:%.+]] = load{{.+}},
+// CHECK-DAG: [[CMP_UB_NUM_IT:%.+]] = icmp sgt {{.+}} [[OMP_UB_VAL_1]], [[NUM_IT_1]]
+// CHECK: br {{.+}} [[CMP_UB_NUM_IT]], label %[[EUB_TRUE:.+]], label %[[EUB_FALSE:.+]]
+// CHECK-DAG: [[EUB_TRUE]]:
+// CHECK: [[NUM_IT_2:%.+]] = load{{.+}},
+// CHECK: br label %[[EUB_END:.+]]
+// CHECK-DAG: [[EUB_FALSE]]:
+// CHECK: [[OMP_UB_VAL2:%.+]] = load{{.+}} [[OMP_UB]],
+// CHECK: br label %[[EUB_END]]
+// CHECK-DAG: [[EUB_END]]:
+// CHECK-DAG: [[EUB_RES:%.+]] = phi{{.+}} [ [[NUM_IT_2]], %[[EUB_TRUE]] ], [ [[OMP_UB_VAL2]], %[[EUB_FALSE]] ]
+// CHECK: store{{.+}} [[EUB_RES]], {{.+}}* [[OMP_UB]],
+
+// initialize omp.iv
+// CHECK: [[OMP_LB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_LB]],
+// CHECK: store {{.+}} [[OMP_LB_VAL_1]], {{.+}}* [[OMP_IV]],
+// CHECK: br label %[[OMP_JUMP_BACK:.+]]
+
+// check exit condition
+// CHECK: [[OMP_JUMP_BACK]]:
+// CHECK-DAG: [[OMP_IV_VAL_1:%.+]] = load {{.+}} [[OMP_IV]],
+// CHECK-DAG: [[OMP_UB_VAL_3:%.+]] = load {{.+}} [[OMP_UB]],
+// CHECK: [[CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_IV_VAL_1]], [[OMP_UB_VAL_3]]
+// CHECK: br {{.+}} [[CMP_IV_UB]], label %[[DIST_BODY:.+]], label %[[DIST_END:.+]]
+
+// check that PrevLB and PrevUB are passed to the 'for'
+// CHECK: [[DIST_BODY]]:
+// CHECK-DAG: [[OMP_PREV_LB:%.+]] = load {{.+}}, {{.+}} [[OMP_LB]],
+// CHECK-64-DAG: [[OMP_PREV_LB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_LB]] to {{.+}}
+// CHECK-DAG: [[OMP_PREV_UB:%.+]] = load {{.+}}, {{.+}} [[OMP_UB]],
+// CHECK-64-DAG: [[OMP_PREV_UB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_UB]] to {{.+}}
+// check that distlb and distub are properly passed to fork_call
+// CHECK-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_2:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_EXT]], i{{[0-9]+}} [[OMP_PREV_UB_EXT]], {{.+}})
+// CHECK-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_2:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB]], i{{[0-9]+}} [[OMP_PREV_UB]], {{.+}})
+// CHECK: br label %[[DIST_INC:.+]]
+
+// increment by stride (distInc - 'parallel for' executes the whole chunk) and latch
+// CHECK: [[DIST_INC]]:
+// CHECK-DAG: [[OMP_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_IV]],
+// CHECK-DAG: [[OMP_ST_VAL_1:%.+]] = load {{.+}}, {{.+}}* [[OMP_ST]],
+// CHECK: [[OMP_IV_INC:%.+]] = add{{.+}} [[OMP_IV_VAL_2]], [[OMP_ST_VAL_1]]
+// CHECK: store{{.+}} [[OMP_IV_INC]], {{.+}}* [[OMP_IV]],
+// CHECK: br label %[[OMP_JUMP_BACK]]
+
+// CHECK-DAG: call void @__kmpc_for_static_fini(
+// CHECK: ret
+
+// implementation of 'parallel for'
+// CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_2]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}})
+
+// CHECK-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+
+// initialize lb and ub to PrevLB and PrevUB
+// CHECK-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+// CHECK-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+// CHECK-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+// CHECK-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+// CHECK-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+// CHECK-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+// CHECK-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+// CHECK-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]],{{.+}})
+
+// PrevEUB is only used when 'for' has a chunked schedule, otherwise EUB is used
+// In this case we use EUB
+// CHECK-DAG: [[OMP_PF_UB_VAL_1:%.+]] = load{{.+}} [[OMP_PF_UB]],
+// CHECK: [[PF_NUM_IT_1:%.+]] = load{{.+}},
+// CHECK-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_1]], [[PF_NUM_IT_1]]
+// CHECK: br i1 [[PF_CMP_UB_NUM_IT]], label %[[PF_EUB_TRUE:.+]], label %[[PF_EUB_FALSE:.+]]
+// CHECK: [[PF_EUB_TRUE]]:
+// CHECK: [[PF_NUM_IT_2:%.+]] = load{{.+}},
+// CHECK: br label %[[PF_EUB_END:.+]]
+// CHECK-DAG: [[PF_EUB_FALSE]]:
+// CHECK: [[OMP_PF_UB_VAL2:%.+]] = load{{.+}} [[OMP_PF_UB]],
+// CHECK: br label %[[PF_EUB_END]]
+// CHECK-DAG: [[PF_EUB_END]]:
+// CHECK-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_NUM_IT_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL2]], %[[PF_EUB_FALSE]] ]
+// CHECK: store{{.+}} [[PF_EUB_RES]],{{.+}} [[OMP_PF_UB]],
+
+// initialize omp.iv
+// CHECK: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+// CHECK: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+// CHECK: br label %[[OMP_PF_JUMP_BACK:.+]]
+
+// check exit condition
+// CHECK: [[OMP_PF_JUMP_BACK]]:
+// CHECK-DAG: [[OMP_PF_IV_VAL_1:%.+]] = load {{.+}} [[OMP_PF_IV]],
+// CHECK-DAG: [[OMP_PF_UB_VAL_3:%.+]] = load {{.+}} [[OMP_PF_UB]],
+// CHECK: [[PF_CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_PF_IV_VAL_1]], [[OMP_PF_UB_VAL_3]]
+// CHECK: br {{.+}} [[PF_CMP_IV_UB]], label %[[PF_BODY:.+]], label %[[PF_END:.+]]
+
+// check that PrevLB and PrevUB are passed to the 'for'
+// CHECK: [[PF_BODY]]:
+// CHECK-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+// CHECK: br label {{.+}}
+
+// check stride 1 for 'for' in 'distribute parallel for'
+// CHECK-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_PF_IV]],
+// CHECK: [[OMP_PF_IV_INC:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_2]], 1
+// CHECK: store{{.+}} [[OMP_PF_IV_INC]], {{.+}}* [[OMP_PF_IV]],
+// CHECK: br label %[[OMP_PF_JUMP_BACK]]
+
+// CHECK-DAG: call void @__kmpc_for_static_fini(
+// CHECK: ret
+
+// CHECK: define{{.+}} void [[OFFLOADING_FUN_3]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}}* [[OMP_OUTLINED_3:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} void [[OMP_OUTLINED_3]](
+// CHECK-DAG: [[OMP_IV:%.omp.iv]] = alloca
+// CHECK-DAG: [[OMP_LB:%.omp.comb.lb]] = alloca
+// CHECK-DAG: [[OMP_UB:%.omp.comb.ub]] = alloca
+// CHECK-DAG: [[OMP_ST:%.omp.stride]] = alloca
+
+// unlike the previous tests, in this one we have a outer and inner loop for 'distribute'
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 91,
+// CHECK: br label %[[DIST_OUTER_LOOP_HEADER:.+]]
+
+// CHECK: [[DIST_OUTER_LOOP_HEADER]]:
+// check EUB for distribute
+// CHECK-DAG: [[OMP_UB_VAL_1:%.+]] = load{{.+}} [[OMP_UB]],
+// CHECK: [[NUM_IT_1:%.+]] = load{{.+}},
+// CHECK-DAG: [[CMP_UB_NUM_IT:%.+]] = icmp sgt {{.+}} [[OMP_UB_VAL_1]], [[NUM_IT_1]]
+// CHECK: br {{.+}} [[CMP_UB_NUM_IT]], label %[[EUB_TRUE:.+]], label %[[EUB_FALSE:.+]]
+// CHECK-DAG: [[EUB_TRUE]]:
+// CHECK: [[NUM_IT_2:%.+]] = load{{.+}},
+// CHECK: br label %[[EUB_END:.+]]
+// CHECK-DAG: [[EUB_FALSE]]:
+// CHECK: [[OMP_UB_VAL2:%.+]] = load{{.+}} [[OMP_UB]],
+// CHECK: br label %[[EUB_END]]
+// CHECK-DAG: [[EUB_END]]:
+// CHECK-DAG: [[EUB_RES:%.+]] = phi{{.+}} [ [[NUM_IT_2]], %[[EUB_TRUE]] ], [ [[OMP_UB_VAL2]], %[[EUB_FALSE]] ]
+// CHECK: store{{.+}} [[EUB_RES]], {{.+}}* [[OMP_UB]],
+
+// initialize omp.iv
+// CHECK: [[OMP_LB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_LB]],
+// CHECK: store {{.+}} [[OMP_LB_VAL_1]], {{.+}}* [[OMP_IV]],
+
+// check exit condition
+// CHECK-DAG: [[OMP_IV_VAL_1:%.+]] = load {{.+}} [[OMP_IV]],
+// CHECK-DAG: [[OMP_UB_VAL_3:%.+]] = load {{.+}} [[OMP_UB]],
+// CHECK: [[CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_IV_VAL_1]], [[OMP_UB_VAL_3]]
+// CHECK: br {{.+}} [[CMP_IV_UB]], label %[[DIST_OUTER_LOOP_BODY:.+]], label %[[DIST_OUTER_LOOP_END:.+]]
+
+// CHECK: [[DIST_OUTER_LOOP_BODY]]:
+// CHECK: br label %[[DIST_INNER_LOOP_HEADER:.+]]
+
+// CHECK: [[DIST_INNER_LOOP_HEADER]]:
+// CHECK-DAG: [[OMP_IV_VAL_2:%.+]] = load {{.+}} [[OMP_IV]],
+// CHECK-DAG: [[OMP_UB_VAL_4:%.+]] = load {{.+}} [[OMP_UB]],
+// CHECK: [[CMP_IV_UB_2:%.+]] = icmp sle {{.+}} [[OMP_IV_VAL_2]], [[OMP_UB_VAL_4]]
+// CHECK: br{{.+}} [[CMP_IV_UB_2]], label %[[DIST_INNER_LOOP_BODY:.+]], label %[[DIST_INNER_LOOP_END:.+]]
+
+// check that PrevLB and PrevUB are passed to the 'for'
+// CHECK: [[DIST_INNER_LOOP_BODY]]:
+// CHECK-DAG: [[OMP_PREV_LB:%.+]] = load {{.+}}, {{.+}} [[OMP_LB]],
+// CHECK-64-DAG: [[OMP_PREV_LB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_LB]] to {{.+}}
+// CHECK-DAG: [[OMP_PREV_UB:%.+]] = load {{.+}}, {{.+}} [[OMP_UB]],
+// CHECK-64-DAG: [[OMP_PREV_UB_EXT:%.+]] = zext {{.+}} [[OMP_PREV_UB]] to {{.+}}
+// check that distlb and distub are properly passed to fork_call
+// CHECK-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_3:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_EXT]], i{{[0-9]+}} [[OMP_PREV_UB_EXT]], {{.+}})
+// CHECK-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_3:@.+]] to {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB]], i{{[0-9]+}} [[OMP_PREV_UB]], {{.+}})
+// CHECK: br label %[[DIST_INNER_LOOP_INC:.+]]
+
+// check DistInc
+// CHECK: [[DIST_INNER_LOOP_INC]]:
+// CHECK-DAG: [[OMP_IV_VAL_3:%.+]] = load {{.+}}, {{.+}}* [[OMP_IV]],
+// CHECK-DAG: [[OMP_ST_VAL_1:%.+]] = load {{.+}}, {{.+}}* [[OMP_ST]],
+// CHECK: [[OMP_IV_INC:%.+]] = add{{.+}} [[OMP_IV_VAL_3]], [[OMP_ST_VAL_1]]
+// CHECK: store{{.+}} [[OMP_IV_INC]], {{.+}}* [[OMP_IV]],
+// CHECK: br label %[[DIST_INNER_LOOP_HEADER]]
+
+// CHECK: [[DIST_INNER_LOOP_END]]:
+// CHECK: br label %[[DIST_OUTER_LOOP_INC:.+]]
+
+// CHECK: [[DIST_OUTER_LOOP_INC]]:
+// check NextLB and NextUB
+// CHECK-DAG: [[OMP_LB_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_LB]],
+// CHECK-DAG: [[OMP_ST_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_ST]],
+// CHECK-DAG: [[OMP_LB_NEXT:%.+]] = add{{.+}} [[OMP_LB_VAL_2]], [[OMP_ST_VAL_2]]
+// CHECK: store{{.+}} [[OMP_LB_NEXT]], {{.+}}* [[OMP_LB]],
+// CHECK-DAG: [[OMP_UB_VAL_5:%.+]] = load{{.+}}, {{.+}} [[OMP_UB]],
+// CHECK-DAG: [[OMP_ST_VAL_3:%.+]] = load{{.+}}, {{.+}} [[OMP_ST]],
+// CHECK-DAG: [[OMP_UB_NEXT:%.+]] = add{{.+}} [[OMP_UB_VAL_5]], [[OMP_ST_VAL_3]]
+// CHECK: store{{.+}} [[OMP_UB_NEXT]], {{.+}}* [[OMP_UB]],
+// CHECK: br label %[[DIST_OUTER_LOOP_HEADER]]
+
+// outer loop exit
+// CHECK: [[DIST_OUTER_LOOP_END]]:
+// CHECK-DAG: call void @__kmpc_for_static_fini(
+// CHECK: ret
+
+// skip implementation of 'parallel for': using default scheduling and was tested above
+
+// CHECK: define{{.+}} void [[OFFLOADING_FUN_4]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED_4:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} void [[OMP_OUTLINED_4]](
+// CHECK-DAG: [[OMP_IV:%.omp.iv]] = alloca
+// CHECK-DAG: [[OMP_LB:%.omp.comb.lb]] = alloca
+// CHECK-DAG: [[OMP_UB:%.omp.comb.ub]] = alloca
+// CHECK-DAG: [[OMP_ST:%.omp.stride]] = alloca
+
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+// CHECK: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_4:@.+]] to {{.+}},
+// skip rest of implementation of 'distribute' as it is tested above for default dist_schedule case
+// CHECK: ret
+
+// 'parallel for' implementation is the same as the case without schedule clase (static no chunk is the default)
+// CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_4]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}})
+
+// CHECK-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+
+// initialize lb and ub to PrevLB and PrevUB
+// CHECK-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+// CHECK-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+// CHECK-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+// CHECK-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+// CHECK-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+// CHECK-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+// CHECK-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+// CHECK-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 34, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]],{{.+}})
+
+// PrevEUB is only used when 'for' has a chunked schedule, otherwise EUB is used
+// In this case we use EUB
+// CHECK-DAG: [[OMP_PF_UB_VAL_1:%.+]] = load{{.+}} [[OMP_PF_UB]],
+// CHECK: [[PF_NUM_IT_1:%.+]] = load{{.+}},
+// CHECK-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_1]], [[PF_NUM_IT_1]]
+// CHECK: br i1 [[PF_CMP_UB_NUM_IT]], label %[[PF_EUB_TRUE:.+]], label %[[PF_EUB_FALSE:.+]]
+// CHECK: [[PF_EUB_TRUE]]:
+// CHECK: [[PF_NUM_IT_2:%.+]] = load{{.+}},
+// CHECK: br label %[[PF_EUB_END:.+]]
+// CHECK-DAG: [[PF_EUB_FALSE]]:
+// CHECK: [[OMP_PF_UB_VAL2:%.+]] = load{{.+}} [[OMP_PF_UB]],
+// CHECK: br label %[[PF_EUB_END]]
+// CHECK-DAG: [[PF_EUB_END]]:
+// CHECK-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_NUM_IT_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL2]], %[[PF_EUB_FALSE]] ]
+// CHECK: store{{.+}} [[PF_EUB_RES]],{{.+}} [[OMP_PF_UB]],
+
+// initialize omp.iv
+// CHECK: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+// CHECK: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+// CHECK: br label %[[OMP_PF_JUMP_BACK:.+]]
+
+// check exit condition
+// CHECK: [[OMP_PF_JUMP_BACK]]:
+// CHECK-DAG: [[OMP_PF_IV_VAL_1:%.+]] = load {{.+}} [[OMP_PF_IV]],
+// CHECK-DAG: [[OMP_PF_UB_VAL_3:%.+]] = load {{.+}} [[OMP_PF_UB]],
+// CHECK: [[PF_CMP_IV_UB:%.+]] = icmp sle {{.+}} [[OMP_PF_IV_VAL_1]], [[OMP_PF_UB_VAL_3]]
+// CHECK: br {{.+}} [[PF_CMP_IV_UB]], label %[[PF_BODY:.+]], label %[[PF_END:.+]]
+
+// check that PrevLB and PrevUB are passed to the 'for'
+// CHECK: [[PF_BODY]]:
+// CHECK-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+// CHECK: br label {{.+}}
+
+// check stride 1 for 'for' in 'distribute parallel for'
+// CHECK-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load {{.+}}, {{.+}}* [[OMP_PF_IV]],
+// CHECK: [[OMP_PF_IV_INC:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_2]], 1
+// CHECK: store{{.+}} [[OMP_PF_IV_INC]], {{.+}}* [[OMP_PF_IV]],
+// CHECK: br label %[[OMP_PF_JUMP_BACK]]
+
+// CHECK-DAG: call void @__kmpc_for_static_fini(
+// CHECK: ret
+
+// CHECK: define{{.+}} void [[OFFLOADING_FUN_5]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}}* [[OMP_OUTLINED_5:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} void [[OMP_OUTLINED_5]](
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+// CHECK: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_5:@.+]] to {{.+}},
+// skip rest of implementation of 'distribute' as it is tested above for default dist_schedule case
+// CHECK: ret
+
+// 'parallel for' implementation using outer and inner loops and PrevEUB
+// CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_5]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}})
+// CHECK-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_ST:%.omp.stride]] = alloca{{.+}},
+
+// initialize lb and ub to PrevLB and PrevUB
+// CHECK-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+// CHECK-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+// CHECK-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+// CHECK-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+// CHECK-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+// CHECK-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+// CHECK-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+// CHECK-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]],{{.+}})
+// CHECK: br label %[[OMP_PF_OUTER_LOOP_HEADER:.+]]
+
+// check PrevEUB (using PrevUB instead of NumIt as upper bound)
+// CHECK: [[OMP_PF_OUTER_LOOP_HEADER]]:
+// CHECK-DAG: [[OMP_PF_UB_VAL_1:%.+]] = load{{.+}} [[OMP_PF_UB]],
+// CHECK-64-DAG: [[OMP_PF_UB_VAL_CONV:%.+]] = sext{{.+}} [[OMP_PF_UB_VAL_1]] to
+// CHECK: [[PF_PREV_UB_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+// CHECK-64-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_CONV]], [[PF_PREV_UB_VAL_1]]
+// CHECK-32-DAG: [[PF_CMP_UB_NUM_IT:%.+]] = icmp{{.+}} [[OMP_PF_UB_VAL_1]], [[PF_PREV_UB_VAL_1]]
+// CHECK: br i1 [[PF_CMP_UB_NUM_IT]], label %[[PF_EUB_TRUE:.+]], label %[[PF_EUB_FALSE:.+]]
+// CHECK: [[PF_EUB_TRUE]]:
+// CHECK: [[PF_PREV_UB_VAL_2:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+// CHECK: br label %[[PF_EUB_END:.+]]
+// CHECK-DAG: [[PF_EUB_FALSE]]:
+// CHECK: [[OMP_PF_UB_VAL_2:%.+]] = load{{.+}} [[OMP_PF_UB]],
+// CHECK-64: [[OMP_PF_UB_VAL_2_CONV:%.+]] = sext{{.+}} [[OMP_PF_UB_VAL_2]] to
+// CHECK: br label %[[PF_EUB_END]]
+// CHECK-DAG: [[PF_EUB_END]]:
+// CHECK-64-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_PREV_UB_VAL_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL_2_CONV]], %[[PF_EUB_FALSE]] ]
+// CHECK-32-DAG: [[PF_EUB_RES:%.+]] = phi{{.+}} [ [[PF_PREV_UB_VAL_2]], %[[PF_EUB_TRUE]] ], [ [[OMP_PF_UB_VAL_2]], %[[PF_EUB_FALSE]] ]
+// CHECK-64-DAG: [[PF_EUB_RES_CONV:%.+]] = trunc{{.+}} [[PF_EUB_RES]] to
+// CHECK-64: store{{.+}} [[PF_EUB_RES_CONV]],{{.+}} [[OMP_PF_UB]],
+// CHECK-32: store{{.+}} [[PF_EUB_RES]], {{.+}} [[OMP_PF_UB]],
+
+// initialize omp.iv (IV = LB)
+// CHECK: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+// CHECK: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+
+// outer loop: while (IV < UB) {
+// CHECK-DAG: [[OMP_PF_IV_VAL_1:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+// CHECK-DAG: [[OMP_PF_UB_VAL_3:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_UB]],
+// CHECK: [[PF_CMP_IV_UB_1:%.+]] = icmp{{.+}} [[OMP_PF_IV_VAL_1]], [[OMP_PF_UB_VAL_3]]
+// CHECK: br{{.+}} [[PF_CMP_IV_UB_1]], label %[[OMP_PF_OUTER_LOOP_BODY:.+]], label %[[OMP_PF_OUTER_LOOP_END:.+]]
+
+// CHECK: [[OMP_PF_OUTER_LOOP_BODY]]:
+// CHECK: br label %[[OMP_PF_INNER_FOR_HEADER:.+]]
+
+// CHECK: [[OMP_PF_INNER_FOR_HEADER]]:
+// CHECK-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+// CHECK-DAG: [[OMP_PF_UB_VAL_4:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_UB]],
+// CHECK: [[PF_CMP_IV_UB_2:%.+]] = icmp{{.+}} [[OMP_PF_IV_VAL_2]], [[OMP_PF_UB_VAL_4]]
+// CHECK: br{{.+}} [[PF_CMP_IV_UB_2]], label %[[OMP_PF_INNER_LOOP_BODY:.+]], label %[[OMP_PF_INNER_LOOP_END:.+]]
+
+// CHECK: [[OMP_PF_INNER_LOOP_BODY]]:
+// CHECK-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+// skip body branch
+// CHECK: br{{.+}}
+// CHECK: br label %[[OMP_PF_INNER_LOOP_INC:.+]]
+
+// IV = IV + 1 and inner loop latch
+// CHECK: [[OMP_PF_INNER_LOOP_INC]]:
+// CHECK-DAG: [[OMP_PF_IV_VAL_3:%.+]] = load{{.+}}, {{.+}}* [[OMP_IV]],
+// CHECK-DAG: [[OMP_PF_NEXT_IV:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_3]], 1
+// CHECK-DAG: store{{.+}} [[OMP_PF_NEXT_IV]], {{.+}}* [[OMP_IV]],
+// CHECK: br label %[[OMP_PF_INNER_FOR_HEADER]]
+
+// check NextLB and NextUB
+// CHECK: [[OMP_PF_INNER_LOOP_END]]:
+// CHECK: br label %[[OMP_PF_OUTER_LOOP_INC:.+]]
+
+// CHECK: [[OMP_PF_OUTER_LOOP_INC]]:
+// CHECK-DAG: [[OMP_PF_LB_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+// CHECK-DAG: [[OMP_PF_ST_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_ST]],
+// CHECK-DAG: [[OMP_PF_LB_NEXT:%.+]] = add{{.+}} [[OMP_PF_LB_VAL_2]], [[OMP_PF_ST_VAL_1]]
+// CHECK: store{{.+}} [[OMP_PF_LB_NEXT]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-DAG: [[OMP_PF_UB_VAL_5:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_UB]],
+// CHECK-DAG: [[OMP_PF_ST_VAL_2:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_ST]],
+// CHECK-DAG: [[OMP_PF_UB_NEXT:%.+]] = add{{.+}} [[OMP_PF_UB_VAL_5]], [[OMP_PF_ST_VAL_2]]
+// CHECK: store{{.+}} [[OMP_PF_UB_NEXT]], {{.+}}* [[OMP_PF_UB]],
+// CHECK: br label %[[OMP_PF_OUTER_LOOP_HEADER]]
+
+// CHECK: [[OMP_PF_OUTER_LOOP_END]]:
+// CHECK-DAG: call void @__kmpc_for_static_fini(
+// CHECK: ret
+
+// CHECK: define{{.+}} void [[OFFLOADING_FUN_6]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED_6:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} void [[OMP_OUTLINED_6]](
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+// CHECK: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_6:@.+]] to {{.+}},
+// skip rest of implementation of 'distribute' as it is tested above for default dist_schedule case
+// CHECK: ret
+
+// 'parallel for' implementation using outer and inner loops and PrevEUB
+// CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_6]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}})
+// CHECK-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_ST:%.omp.stride]] = alloca{{.+}},
+
+// initialize lb and ub to PrevLB and PrevUB
+// CHECK-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+// CHECK-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+// CHECK-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+// CHECK-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+// CHECK-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+// CHECK-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+// CHECK-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+// CHECK-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+// CHECK-DAG: [[OMP_PF_LB_VAL:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+// CHECK-DAG: [[OMP_PF_UB_VAL:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_UB]],
+// CHECK: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, {{.+}} 35, {{.+}} [[OMP_PF_LB_VAL]], {{.+}} [[OMP_PF_UB_VAL]], {{.+}}, {{.+}})
+// CHECK: br label %[[OMP_PF_OUTER_LOOP_HEADER:.+]]
+
+// CHECK: [[OMP_PF_OUTER_LOOP_HEADER]]:
+// CHECK: [[IS_FIN:%.+]] = call{{.+}} @__kmpc_dispatch_next_4({{.+}}, {{.+}}, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]], {{.+}}* [[OMP_PF_ST]])
+// CHECK: [[IS_FIN_CMP:%.+]] = icmp{{.+}} [[IS_FIN]], 0
+// CHECK: br{{.+}} [[IS_FIN_CMP]], label %[[OMP_PF_OUTER_LOOP_BODY:.+]], label %[[OMP_PF_OUTER_LOOP_END:.+]]
+
+// initialize omp.iv (IV = LB)
+// CHECK: [[OMP_PF_OUTER_LOOP_BODY]]:
+// CHECK-DAG: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+// CHECK-DAG: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+// CHECK: br label %[[OMP_PF_INNER_LOOP_HEADER:.+]]
+
+// CHECK: [[OMP_PF_INNER_LOOP_HEADER]]:
+// CHECK-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+// CHECK-DAG: [[OMP_PF_UB_VAL_4:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_UB]],
+// CHECK: [[PF_CMP_IV_UB_2:%.+]] = icmp{{.+}} [[OMP_PF_IV_VAL_2]], [[OMP_PF_UB_VAL_4]]
+// CHECK: br{{.+}} [[PF_CMP_IV_UB_2]], label %[[OMP_PF_INNER_LOOP_BODY:.+]], label %[[OMP_PF_INNER_LOOP_END:.+]]
+
+// CHECK: [[OMP_PF_INNER_LOOP_BODY]]:
+// CHECK-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+// skip body branch
+// CHECK: br{{.+}}
+// CHECK: br label %[[OMP_PF_INNER_LOOP_INC:.+]]
+
+// IV = IV + 1 and inner loop latch
+// CHECK: [[OMP_PF_INNER_LOOP_INC]]:
+// CHECK-DAG: [[OMP_PF_IV_VAL_3:%.+]] = load{{.+}}, {{.+}}* [[OMP_IV]],
+// CHECK-DAG: [[OMP_PF_NEXT_IV:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_3]], 1
+// CHECK-DAG: store{{.+}} [[OMP_PF_NEXT_IV]], {{.+}}* [[OMP_IV]],
+// CHECK: br label %[[OMP_PF_INNER_LOOP_HEADER]]
+
+// check NextLB and NextUB
+// CHECK: [[OMP_PF_INNER_LOOP_END]]:
+// CHECK: br label %[[OMP_PF_OUTER_LOOP_INC:.+]]
+
+// CHECK: [[OMP_PF_OUTER_LOOP_INC]]:
+// CHECK: br label %[[OMP_PF_OUTER_LOOP_HEADER]]
+
+// CHECK: [[OMP_PF_OUTER_LOOP_END]]:
+// CHECK: ret
+
+// CHECK: define{{.+}} void [[OFFLOADING_FUN_7]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 5, {{.+}}* [[OMP_OUTLINED_7:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} void [[OMP_OUTLINED_7]](
+// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, i32 92,
+// CHECK: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_7:@.+]] to {{.+}},
+// skip rest of implementation of 'distribute' as it is tested above for default dist_schedule case
+// CHECK: ret
+
+// 'parallel for' implementation using outer and inner loops and PrevEUB
+// CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_7]]({{.+}}, {{.+}}, i{{[0-9]+}} [[OMP_PREV_LB_IN:%.+]], i{{[0-9]+}} [[OMP_PREV_UB_IN:%.+]], {{.+}}, {{.+}}, {{.+}}, {{.+}}, {{.+}})
+// CHECK-DAG: [[OMP_PF_LB:%.omp.lb]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_UB:%.omp.ub]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_IV:%.omp.iv]] = alloca{{.+}},
+// CHECK-DAG: [[OMP_PF_ST:%.omp.stride]] = alloca{{.+}},
+
+// initialize lb and ub to PrevLB and PrevUB
+// CHECK-DAG: store{{.+}} [[OMP_PREV_LB_IN]], {{.+}}* [[PREV_LB_ADDR:%.+]],
+// CHECK-DAG: store{{.+}} [[OMP_PREV_UB_IN]], {{.+}}* [[PREV_UB_ADDR:%.+]],
+// CHECK-DAG: [[OMP_PREV_LB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_LB_ADDR]],
+// CHECK-64-DAG: [[OMP_PREV_LB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_LB_VAL]] to {{.+}}
+// CHECK-DAG: [[OMP_PREV_UB_VAL:%.+]] = load{{.+}}, {{.+}}* [[PREV_UB_ADDR]],
+// CHECK-64-DAG: [[OMP_PREV_UB_TRC:%.+]] = trunc{{.+}} [[OMP_PREV_UB_VAL]] to {{.+}}
+// CHECK-64-DAG: store{{.+}} [[OMP_PREV_LB_TRC]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-64-DAG: store{{.+}} [[OMP_PREV_UB_TRC]], {{.+}}* [[OMP_PF_UB]],
+// CHECK-32-DAG: store{{.+}} [[OMP_PREV_LB_VAL]], {{.+}}* [[OMP_PF_LB]],
+// CHECK-32-DAG: store{{.+}} [[OMP_PREV_UB_VAL]], {{.+}}* [[OMP_PF_UB]],
+// CHECK-DAG: [[OMP_PF_LB_VAL:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+// CHECK-DAG: [[OMP_PF_UB_VAL:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_UB]],
+// CHECK: call void @__kmpc_dispatch_init_4({{.+}}, {{.+}}, {{.+}} 35, {{.+}} [[OMP_PF_LB_VAL]], {{.+}} [[OMP_PF_UB_VAL]], {{.+}}, {{.+}})
+// CHECK: br label %[[OMP_PF_OUTER_LOOP_HEADER:.+]]
+
+// CHECK: [[OMP_PF_OUTER_LOOP_HEADER]]:
+// CHECK: [[IS_FIN:%.+]] = call{{.+}} @__kmpc_dispatch_next_4({{.+}}, {{.+}}, {{.+}}, {{.+}}* [[OMP_PF_LB]], {{.+}}* [[OMP_PF_UB]], {{.+}}* [[OMP_PF_ST]])
+// CHECK: [[IS_FIN_CMP:%.+]] = icmp{{.+}} [[IS_FIN]], 0
+// CHECK: br{{.+}} [[IS_FIN_CMP]], label %[[OMP_PF_OUTER_LOOP_BODY:.+]], label %[[OMP_PF_OUTER_LOOP_END:.+]]
+
+// initialize omp.iv (IV = LB)
+// CHECK: [[OMP_PF_OUTER_LOOP_BODY]]:
+// CHECK-DAG: [[OMP_PF_LB_VAL_1:%.+]] = load{{.+}}, {{.+}} [[OMP_PF_LB]],
+// CHECK-DAG: store {{.+}} [[OMP_PF_LB_VAL_1]], {{.+}}* [[OMP_PF_IV]],
+// CHECK: br label %[[OMP_PF_INNER_LOOP_HEADER:.+]]
+
+// CHECK: [[OMP_PF_INNER_LOOP_HEADER]]:
+// CHECK-DAG: [[OMP_PF_IV_VAL_2:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+// CHECK-DAG: [[OMP_PF_UB_VAL_4:%.+]] = load{{.+}}, {{.+}}* [[OMP_PF_UB]],
+// CHECK: [[PF_CMP_IV_UB_2:%.+]] = icmp{{.+}} [[OMP_PF_IV_VAL_2]], [[OMP_PF_UB_VAL_4]]
+// CHECK: br{{.+}} [[PF_CMP_IV_UB_2]], label %[[OMP_PF_INNER_LOOP_BODY:.+]], label %[[OMP_PF_INNER_LOOP_END:.+]]
+
+// CHECK: [[OMP_PF_INNER_LOOP_BODY]]:
+// CHECK-DAG: {{.+}} = load{{.+}}, {{.+}}* [[OMP_PF_IV]],
+// skip body branch
+// CHECK: br{{.+}}
+// CHECK: br label %[[OMP_PF_INNER_LOOP_INC:.+]]
+
+// IV = IV + 1 and inner loop latch
+// CHECK: [[OMP_PF_INNER_LOOP_INC]]:
+// CHECK-DAG: [[OMP_PF_IV_VAL_3:%.+]] = load{{.+}}, {{.+}}* [[OMP_IV]],
+// CHECK-DAG: [[OMP_PF_NEXT_IV:%.+]] = add{{.+}} [[OMP_PF_IV_VAL_3]], 1
+// CHECK-DAG: store{{.+}} [[OMP_PF_NEXT_IV]], {{.+}}* [[OMP_IV]],
+// CHECK: br label %[[OMP_PF_INNER_LOOP_HEADER]]
+
+// check NextLB and NextUB
+// CHECK: [[OMP_PF_INNER_LOOP_END]]:
+// CHECK: br label %[[OMP_PF_OUTER_LOOP_INC:.+]]
+
+// CHECK: [[OMP_PF_OUTER_LOOP_INC]]:
+// CHECK: br label %[[OMP_PF_OUTER_LOOP_HEADER]]
+
+// CHECK: [[OMP_PF_OUTER_LOOP_END]]:
+// CHECK: ret
+#endif
diff --git a/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
new file mode 100644
index 00000000000..d12c0aa11cf
--- /dev/null
+++ b/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
@@ -0,0 +1,619 @@
+// RxUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+ T f;
+ S(T a) : f(a) {}
+ S() : f() {}
+ operator T() { return T(); }
+ ~S() {}
+};
+
+// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+template <typename T>
+T tmain() {
+ S<T> test;
+ T t_var = T();
+ T vec[] = {1, 2};
+ S<T> s_arr[] = {1, 2};
+ S<T> &var = test;
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for firstprivate(t_var, vec, s_arr, s_arr, var, var)
+ for (int i = 0; i < 2; ++i) {
+ vec[i] = t_var;
+ s_arr[i] = var;
+ }
+ return T();
+}
+
+int main() {
+ static int svar;
+ volatile double g;
+ volatile double &g1 = g;
+
+ #ifdef LAMBDA
+ // LAMBDA-LABEL: @main
+ // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]](
+ [&]() {
+ static float sfvar;
+ // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+ // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams(
+ // LAMBDA: call void [[OFFLOADING_FUN:@.+]](
+
+ // LAMBDA: define{{.+}} void [[OFFLOADING_FUN]](
+ // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED:@.+]] to {{.+}})
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for firstprivate(g, g1, svar, sfvar)
+ for (int i = 0; i < 2; ++i) {
+ // LAMBDA-64: define{{.*}} internal{{.*}} void [[OMP_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i{{[0-9]+}} [[G_IN:%.+]], i{{[0-9]+}} [[G1_IN:%.+]], i{{[0-9]+}} [[SVAR_IN:%.+]], i{{[0-9]+}} [[SFVAR_IN:%.+]])
+ // LAMBDA-32: define{{.*}} internal{{.*}} void [[OMP_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, double* {{.+}} [[G_IN:%.+]], i{{[0-9]+}} [[G1_IN:%.+]], i{{[0-9]+}} [[SVAR_IN:%.+]], i{{[0-9]+}} [[SFVAR_IN:%.+]])
+
+ // addr alloca's
+ // LAMBDA-64: [[G_ADDR:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA-32: [[G_ADDR:%.+]] = alloca double*,
+ // LAMBDA: [[G1_ADDR:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[G1_REF:%.+]] = alloca double*,
+ // LAMBDA: [[TMP:%.+]] = alloca double*,
+
+ // private alloca's
+ // LAMBDA: [[G_PRIV:%.+]] = alloca double,
+ // LAMBDA: [[G1_PRIV:%.+]] = alloca double,
+ // LAMBDA: [[TMP_PRIV:%.+]] = alloca double*,
+ // LAMBDA: [[SVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[SFVAR_PRIV:%.+]] = alloca float,
+
+ // transfer input parameters into addr alloca's
+ // LAMBDA-DAG: store {{.+}} [[G_IN]], {{.+}} [[G_ADDR]],
+ // LAMBDA-DAG: store {{.+}} [[G1_IN]], {{.+}} [[G1_ADDR]],
+ // LAMBDA-DAG: store {{.+}} [[SVAR_IN]], {{.+}} [[SVAR_ADDR]],
+ // LAMBDA-DAG: store {{.+}} [[SFVAR_IN]], {{.+}} [[SFVAR_ADDR]],
+
+ // init private alloca's with addr alloca's
+ // g
+ // LAMBDA-64-DAG: [[G_CONV:%.+]] = bitcast {{.+}}* [[G_ADDR]] to
+ // LAMBDA-32-DAG: [[G_CONV:%.+]] = load {{.+}}*, {{.+}}** [[G_ADDR]]
+ // LAMBDA-DAG: [[G_ADDR_VAL:%.+]] = load {{.+}}, {{.+}}* [[G_CONV]],
+ // LAMBDA-DAG: store {{.+}} [[G_ADDR_VAL]], {{.+}}* [[G_PRIV]],
+
+ // g1
+ // LAMBDA-DAG: [[G1_CONV:%.+]] = bitcast {{.+}}* [[G1_ADDR]] to
+ // LAMBDA-DAG: store {{.+}}* [[G1_CONV]], {{.+}}** [[G1_REF]],
+ // LAMBDA-DAG: [[G1_REF_VAL:%.+]] = load {{.+}}*, {{.+}}** [[G1_REF]],
+ // LAMBDA-DAG: store {{.+}}* [[G1_REF_VAL]], {{.+}}** [[TMP]],
+ // LAMBDA-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}** [[TMP]],
+ // LAMBDA-DAG: [[TMP_VAL:%.+]] = load {{.+}}, {{.+}}* [[TMP_REF]],
+ // LAMBDA-DAG: store {{.+}} [[TMP_VAL]], {{.+}}* [[G1_PRIV]]
+ // LAMBDA-DAG: store {{.+}}* [[G1_PRIV]], {{.+}}** [[TMP_PRIV]],
+
+ // svar
+ // LAMBDA-64-DAG: [[SVAR_CONV:%.+]] = bitcast {{.+}}* [[SVAR_ADDR]] to
+ // LAMBDA-64-DAG: [[SVAR_VAL:%.+]] = load {{.+}}, {{.+}}* [[SVAR_CONV]],
+ // LAMBDA-32-DAG: [[SVAR_VAL:%.+]] = load {{.+}}, {{.+}}* [[SVAR_ADDR]],
+ // LAMBDA-DAG: store {{.+}} [[SVAR_VAL]], {{.+}}* [[SVAR_PRIV]],
+
+ // sfvar
+ // LAMBDA-DAG: [[SFVAR_CONV:%.+]] = bitcast {{.+}}* [[SFVAR_ADDR]] to
+ // LAMBDA-DAG: [[SFVAR_VAL:%.+]] = load {{.+}}, {{.+}}* [[SFVAR_CONV]],
+ // LAMBDA-DAG: store {{.+}} [[SFVAR_VAL]], {{.+}}* [[SFVAR_PRIV]],
+
+ // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
+ // pass firstprivate parameters to parallel outlined function
+ // g
+ // LAMBDA-64-DAG: [[G_PRIV_VAL:%.+]] = load {{.+}}, {{.+}}* [[G_PRIV]],
+ // LAMBDA-64: [[G_CAST_CONV:%.+]] = bitcast {{.+}}* [[G_CAST:%.+]] to
+ // LAMBDA-64-DAG: store {{.+}} [[G_PRIV_VAL]], {{.+}}* [[G_CAST_CONV]],
+ // LAMBDA-64-DAG: [[G_PAR:%.+]] = load {{.+}}, {{.+}}* [[G_CAST]],
+
+ // g1
+ // LAMBDA-DAG: [[TMP_PRIV_VAL:%.+]] = load {{.+}}, {{.+}}* [[TMP_PRIV]],
+ // LAMBDA-DAG: [[G1_PRIV_VAL:%.+]] = load {{.+}}, {{.+}}* [[TMP_PRIV_VAL]],
+ // LAMBDA: [[G1_CAST_CONV:%.+]] = bitcast {{.+}}* [[G1_CAST:%.+]] to
+ // LAMBDA-DAG: store {{.+}} [[G1_PRIV_VAL]], {{.+}}* [[G1_CAST_CONV]],
+ // LAMBDA-DAG: [[G1_PAR:%.+]] = load {{.+}}, {{.+}}* [[G1_CAST]],
+
+ // svar
+ // LAMBDA: [[SVAR_VAL:%.+]] = load {{.+}}, {{.+}}* [[SVAR_PRIV]],
+ // LAMBDA-64-DAG: [[SVAR_CAST_CONV:%.+]] = bitcast {{.+}}* [[SVAR_CAST:%.+]] to
+ // LAMBDA-64-DAG: store {{.+}} [[SVAR_VAL]], {{.+}}* [[SVAR_CAST_CONV]],
+ // LAMBDA-32-DAG: store {{.+}} [[SVAR_VAL]], {{.+}}* [[SVAR_CAST:%.+]],
+ // LAMBDA-DAG: [[SVAR_PAR:%.+]] = load {{.+}}, {{.+}}* [[SVAR_CAST]],
+
+ // sfvar
+ // LAMBDA: [[SFVAR_VAL:%.+]] = load {{.+}}, {{.+}}* [[SFVAR_PRIV]],
+ // LAMBDA-DAG: [[SFVAR_CAST_CONV:%.+]] = bitcast {{.+}}* [[SFVAR_CAST:%.+]] to
+ // LAMBDA-DAG: store {{.+}} [[SFVAR_VAL]], {{.+}}* [[SFVAR_CAST_CONV]],
+ // LAMBDA-DAG: [[SFVAR_PAR:%.+]] = load {{.+}}, {{.+}}* [[SFVAR_CAST]],
+
+ // LAMBDA-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED:@.+]] to void ({{.+}})*), {{.+}}, {{.+}}, {{.+}} [[G_PAR]], {{.+}} [[G1_PAR]], {{.+}} [[SVAR_PAR]], {{.+}} [[SFVAR_PAR]])
+ // LAMBDA-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED:@.+]] to void ({{.+}})*), {{.+}}, {{.+}}, {{.+}} [[G_PRIV]], {{.+}} [[G1_PAR]], {{.+}} [[SVAR_PAR]], {{.+}} [[SFVAR_PAR]])
+ // LAMBDA: call {{.*}}void @__kmpc_for_static_fini(
+ // LAMBDA: ret void
+
+
+ // LAMBDA-64: define{{.+}} void [[OMP_PARFOR_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, {{.+}}, {{.+}}, i{{[0-9]+}} [[G_IN:%.+]], i{{[0-9]+}} [[G1_IN:%.+]], i{{[0-9]+}} [[SVAR_IN:%.+]], i{{[0-9]+}} [[SFVAR_IN:%.+]])
+ // LAMBDA-32: define{{.+}} void [[OMP_PARFOR_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, {{.+}}, {{.+}}, double* {{.+}} [[G_IN:%.+]], i{{[0-9]+}} [[G1_IN:%.+]], i{{[0-9]+}} [[SVAR_IN:%.+]], i{{[0-9]+}} [[SFVAR_IN:%.+]])
+ // skip initial params
+ // LAMBDA: {{.+}} = alloca{{.+}},
+ // LAMBDA: {{.+}} = alloca{{.+}},
+ // LAMBDA: {{.+}} = alloca{{.+}},
+ // LAMBDA: {{.+}} = alloca{{.+}},
+
+ // addr alloca's
+ // LAMBDA-64: [[G_ADDR:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA-32: [[G_ADDR:%.+]] = alloca double*,
+ // LAMBDA: [[G1_ADDR:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[G1_REF:%.+]] = alloca double*,
+
+ // private alloca's (only for 32-bit)
+ // LAMBDA-32: [[G_PRIV:%.+]] = alloca double,
+
+ // transfer input parameters into addr alloca's
+ // LAMBDA-DAG: store {{.+}} [[G_IN]], {{.+}} [[G_ADDR]],
+ // LAMBDA-DAG: store {{.+}} [[G1_IN]], {{.+}} [[G1_ADDR]],
+ // LAMBDA-DAG: store {{.+}} [[SVAR_IN]], {{.+}} [[SVAR_ADDR]],
+ // LAMBDA-DAG: store {{.+}} [[SFVAR_IN]], {{.+}} [[SFVAR_ADDR]],
+
+ // prepare parameters for lambda
+ // g
+ // LAMBDA-64-DAG: [[G_CONV:%.+]] = bitcast {{.+}}* [[G_ADDR]] to
+ // LAMBDA-32-DAG: [[G_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}** [[G_ADDR]]
+ // LAMBDA-32-DAG: [[G_ADDR_VAL:%.+]] = load {{.+}}, {{.+}}* [[G_ADDR_REF]],
+ // LAMBDA-32-DAG: store {{.+}} [[G_ADDR_VAL]], {{.+}}* [[G_PRIV]],
+
+ // g1
+ // LAMBDA-DAG: [[G1_CONV:%.+]] = bitcast {{.+}}* [[G1_ADDR]] to
+ // LAMBDA-DAG: store {{.+}}* [[G1_CONV]], {{.+}}* [[G1_REF]],
+
+ // svar
+ // LAMBDA-64-DAG: [[SVAR_CONV:%.+]] = bitcast {{.+}}* [[SVAR_ADDR]] to
+
+ // sfvar
+ // LAMBDA-DAG: [[SFVAR_CONV:%.+]] = bitcast {{.+}}* [[SFVAR_ADDR]] to
+
+ // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
+ g = 1;
+ g1 = 1;
+ svar = 3;
+ sfvar = 4.0;
+ // LAMBDA-64: store double 1.0{{.+}}, double* [[G_CONV]],
+ // LAMBDA-32: store double 1.0{{.+}}, double* [[G_PRIV]],
+ // LAMBDA: [[G1_REF_REF:%.+]] = load {{.+}}*, {{.+}}** [[G1_REF]],
+ // LAMBDA: store {{.+}} 1.0{{.+}}, {{.+}}* [[G1_REF_REF]],
+ // LAMBDA-64: store {{.+}} 3, {{.+}}* [[SVAR_CONV]],
+ // LAMBDA-32: store {{.+}} 3, {{.+}}* [[SVAR_ADDR]],
+ // LAMBDA: store {{.+}} 4.0{{.+}}, {{.+}}* [[SFVAR_CONV]],
+
+ // pass params to inner lambda
+ // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+ // LAMBDA-64: store double* [[G_CONV]], double** [[G_PRIVATE_ADDR_REF]],
+ // LAMBDA-32: store double* [[G_PRIV]], double** [[G_PRIVATE_ADDR_REF]],
+ // LAMBDA: [[G1_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+ // LAMBDA: [[G1_REF_REF:%.+]] = load double*, double** [[G1_REF]],
+ // LAMBDA: store double* [[G1_REF_REF]], double** [[G1_PRIVATE_ADDR_REF]],
+ // LAMBDA: [[SVAR_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+ // LAMBDA-64: store i{{[0-9]+}}* [[SVAR_CONV]], i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR_REF]]
+ // LAMBDA-32: store i{{[0-9]+}}* [[SVAR_ADDR]], i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR_REF]]
+ // LAMBDA: [[SFVAR_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+ // LAMBDA: store float* [[SFVAR_CONV]], float** [[SFVAR_PRIVATE_ADDR_REF]]
+ // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+ // LAMBDA: call {{.*}}void @__kmpc_for_static_fini(
+ // LAMBDA: ret void
+ [&]() {
+ // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+ // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+ g = 2;
+ g1 = 2;
+ svar = 4;
+ sfvar = 8.0;
+ // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+ // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+ // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+ // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+ // LAMBDA: [[TMP_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+ // LAMBDA: [[G1_REF:%.+]] = load double*, double** [[TMP_PTR_REF]]
+ // LAMBDA: store double 2.0{{.+}}, double* [[G1_REF]],
+ // LAMBDA: [[SVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+ // LAMBDA: [[SVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PTR_REF]]
+ // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SVAR_REF]]
+ // LAMBDA: [[SFVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+ // LAMBDA: [[SFVAR_REF:%.+]] = load float*, float** [[SFVAR_PTR_REF]]
+ // LAMBDA: store float 8.0{{.+}}, float* [[SFVAR_REF]]
+ }();
+ }
+ }();
+ return 0;
+ #else
+ S<float> test;
+ int t_var = 0;
+ int vec[] = {1, 2};
+ S<float> s_arr[] = {1, 2};
+ S<float> &var = test;
+
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for firstprivate(t_var, vec, s_arr, s_arr, var, var, svar)
+ for (int i = 0; i < 2; ++i) {
+ vec[i] = t_var;
+ s_arr[i] = var;
+ }
+ return tmain<int>();
+ #endif
+}
+
+// CHECK-LABEL: define{{.*}} i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOAD_FUN_0:@.+]](
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_DESTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+
+// CHECK: define{{.+}} [[OFFLOAD_FUN_0]](i{{[0-9]+}} [[T_VAR_IN:%.+]], [2 x i{{[0-9]+}}]* {{.+}} [[VEC_IN:%.+]], [2 x [[S_FLOAT_TY]]]* {{.+}} [[S_ARR_IN:%.+]], [[S_FLOAT_TY]]* {{.+}} [[VAR_IN:%.+]], i{{[0-9]+}} [[SVAR_IN:%.+]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i{{[0-9]+}}, [2 x i{{[0-9]+}}]*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, i{{[0-9]+}})* [[OMP_OUTLINED_0:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal void [[OMP_OUTLINED_0]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i{{[0-9]+}} [[T_VAR_IN:%.+]], [2 x i{{[0-9]+}}]* {{.+}} [[VEC_IN:%.+]], [2 x [[S_FLOAT_TY]]]* {{.+}} [[S_ARR_IN:%.+]], [[S_FLOAT_TY]]* {{.+}} [[VAR_IN:%.+]], i{{[0-9]+}} [[SVAR_IN:%.+]])
+
+// addr alloca's
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[TMP:%.+]] = alloca [[S_FLOAT_TY]]*,
+
+// skip loop alloca's
+// CHECK: [[OMP_IV:.omp.iv+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_LB:.omp.comb.lb+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_UB:.omp.comb.ub+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_ST:.omp.stride+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST:.omp.is_last+]] = alloca i{{[0-9]+}},
+
+// private alloca's
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[TMP_PRIV:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+
+// init addr alloca's with input values
+// CHECK-DAG: store {{.+}} [[T_VAR_IN]], {{.+}}* [[T_VAR_ADDR]],
+// CHECK-DAG: store {{.+}} [[VEC_IN]], {{.+}} [[VEC_ADDR]],
+// CHECK-DAG: store {{.+}} [[S_ARR_IN]], {{.+}} [[S_ARR_ADDR]],
+// CHECK-DAG: store {{.+}} [[VAR_IN]], {{.+}} [[VAR_ADDR]],
+// CHECK-DAG: store {{.+}} [[SVAR_IN]], {{.+}} [[SVAR_ADDR]],
+
+// init private alloca's with addr alloca's
+// t-var
+// CHECK-64-DAG: [[T_VAR_CONV:%.+]] = bitcast {{.+}} [[T_VAR_ADDR]] to
+// CHECK-64-DAG: [[T_VAR_ADDR_VAL:%.+]] = load {{.+}}, {{.+}}* [[T_VAR_CONV]],
+// CHECK-32-DAG: [[T_VAR_ADDR_VAL:%.+]] = load {{.+}}, {{.+}}* [[T_VAR_ADDR]],
+// CHECK-DAG: store {{.+}} [[T_VAR_ADDR_VAL]], {{.+}} [[T_VAR_PRIV]],
+
+// vec
+// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
+// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
+// CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+
+// s_arr
+// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
+// CHECK-DAG: [[S_ARR_BGN:%.+]] = getelementptr {{.+}}, {{.+}}* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[S_ARR_ADDR_VAL]] to
+// CHECK-DAG: [[S_ARR_BGN_GEP:%.+]] = getelementptr {{.+}}, {{.+}}* [[S_ARR_BGN]],
+// CHECK-DAG: [[S_ARR_EMPTY:%.+]] = icmp {{.+}} [[S_ARR_BGN]], [[S_ARR_BGN_GEP]]
+// CHECK-DAG: br {{.+}} [[S_ARR_EMPTY]], label %[[CPY_DONE:.+]], label %[[CPY_BODY:.+]]
+// CHECK-DAG: [[CPY_BODY]]:
+// CHECK-DAG: call void @llvm.memcpy{{.+}}(
+// CHECK-DAG: [[CPY_DONE]]:
+
+// var
+// CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
+// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
+// CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_REF_BCAST]],
+// CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
+
+// svar
+// CHECK-64-DAG: [[SVAR_CONV:%.+]] = bitcast {{.+}}* [[SVAR_ADDR]] to
+// CHECK-64-DAG: [[SVAR_CONV_VAL:%.+]] = load {{.+}}, {{.+}}* [[SVAR_CONV]],
+// CHECK-32-DAG: [[SVAR_CONV_VAL:%.+]] = load {{.+}}, {{.+}}* [[SVAR_ADDR]],
+// CHECK-DAG: store {{.+}} [[SVAR_CONV_VAL]], {{.+}}* [[SVAR_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// pass private alloca's to fork
+// CHECK-DAG: [[T_VAR_PRIV_VAL:%.+]] = load {{.+}}, {{.+}}* [[T_VAR_PRIV]],
+// not dag to distinguish with S_VAR_CAST
+// CHECK-64: [[T_VAR_CAST_CONV:%.+]] = bitcast {{.+}}* [[T_VAR_CAST:%.+]] to
+// CHECK-64-DAG: store {{.+}} [[T_VAR_PRIV_VAL]], {{.+}} [[T_VAR_CAST_CONV]],
+// CHECK-32: store {{.+}} [[T_VAR_PRIV_VAL]], {{.+}} [[T_VAR_CAST:%.+]],
+// CHECK-DAG: [[T_VAR_CAST_VAL:%.+]] = load {{.+}}, {{.+}}* [[T_VAR_CAST]],
+// CHECK-DAG: [[TMP_PRIV_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
+// CHECK-DAG: [[SVAR_PRIV_VAL:%.+]] = load {{.+}}, {{.+}}* [[SVAR_PRIV]],
+// CHECK-64-DAG: [[SVAR_CAST_CONV:%.+]] = bitcast {{.+}}* [[SVAR_CAST:%.+]] to
+// CHECK-64-DAG: store {{.+}} [[SVAR_PRIV_VAL]], {{.+}}* [[SVAR_CAST_CONV]],
+// CHECK-32-DAG: store {{.+}} [[SVAR_PRIV_VAL]], {{.+}}* [[SVAR_CAST:%.+]],
+// CHECK-DAG: [[SVAR_CAST_VAL:%.+]] = load {{.+}}, {{.+}}* [[SVAR_CAST]],
+// CHECK: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_0:@.+]] to void ({{.+}})*), {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]* [[VEC_PRIV]], i{{[0-9]+}} [[T_VAR_CAST_VAL]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], [[S_FLOAT_TY]]* [[TMP_PRIV_VAL]], i{{[0-9]+}} [[SVAR_CAST_VAL]])
+// CHECK: call void @__kmpc_for_static_fini(
+
+// call destructors: var..
+// CHECK-DAG: call {{.+}} [[S_FLOAT_TY_DEF_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+
+// ..and s_arr
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_EL_PAST:%.+]] = phi [[S_FLOAT_TY]]*
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = getelementptr {{.+}}, {{.+}} [[S_ARR_EL_PAST]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_DESTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+
+// CHECK: ret void
+
+// By OpenMP specifications, 'firstprivate' applies to both distribute and parallel for.
+// However, the support for 'firstprivate' of 'parallel' is only used when 'parallel'
+// is found alone. Therefore we only have one 'firstprivate' support for 'parallel for'
+// in combination
+// CHECK: define internal void [[OMP_PARFOR_OUTLINED_0]]({{.+}}, {{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]* {{.+}} [[VEC_IN:%.+]], i{{[0-9]+}} [[T_VAR_IN:%.+]], [2 x [[S_FLOAT_TY]]]* {{.+}} [[S_ARR_IN:%.+]], [[S_FLOAT_TY]]* {{.+}} [[VAR_IN:%.+]], i{{[0-9]+}} [[SVAR_IN:%.+]])
+
+// addr alloca's
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+
+// skip loop alloca's
+// CHECK: [[OMP_IV:.omp.iv+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_LB:.omp.lb+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_UB:.omp.ub+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_ST:.omp.stride+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST:.omp.is_last+]] = alloca i{{[0-9]+}},
+
+// private alloca's
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[TMP_PRIV:%.+]] = alloca [[S_FLOAT_TY]]*,
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+
+// init addr alloca's with input values
+// CHECK-DAG: store {{.+}} [[VEC_IN]], {{.+}} [[VEC_ADDR]],
+// CHECK-DAG: store {{.+}} [[T_VAR_IN]], {{.+}}* [[T_VAR_ADDR]],
+// CHECK-DAG: store {{.+}} [[S_ARR_IN]], {{.+}} [[S_ARR_ADDR]],
+// CHECK-DAG: store {{.+}} [[VAR_IN]], {{.+}} [[VAR_ADDR]],
+// CHECK-DAG: store {{.+}} [[SVAR_IN]], {{.+}} [[SVAR_ADDR]],
+
+// init private alloca's with addr alloca's
+// vec
+// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
+// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
+// CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+
+// s_arr
+// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
+// CHECK-DAG: [[S_ARR_BGN:%.+]] = getelementptr {{.+}}, {{.+}}* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[S_ARR_ADDR_VAL]] to
+// CHECK-DAG: [[S_ARR_BGN_GEP:%.+]] = getelementptr {{.+}}, {{.+}}* [[S_ARR_BGN]],
+// CHECK-DAG: [[S_ARR_EMPTY:%.+]] = icmp {{.+}} [[S_ARR_BGN]], [[S_ARR_BGN_GEP]]
+// CHECK-DAG: br {{.+}} [[S_ARR_EMPTY]], label %[[CPY_DONE:.+]], label %[[CPY_BODY:.+]]
+// CHECK-DAG: [[CPY_BODY]]:
+// CHECK-DAG: call void @llvm.memcpy{{.+}}(
+// CHECK-DAG: [[CPY_DONE]]:
+
+// var
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
+// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
+
+// call destructors: var..
+// CHECK-DAG: call {{.+}} [[S_FLOAT_TY_DEF_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+
+// ..and s_arr
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_EL_PAST:%.+]] = phi [[S_FLOAT_TY]]*
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = getelementptr {{.+}}, {{.+}} [[S_ARR_EL_PAST]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_DESTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+
+// CHECK: ret void
+
+// template tmain with S_INT_TY
+// CHECK-LABEL: define{{.*}} i{{[0-9]+}} @{{.+}}tmain{{.+}}()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOAD_FUN_0:@.+]](
+// CHECK: call {{.*}} [[S_INT_TY_DEF_DESTR:@.+]]([[S_INT_TY]]* [[TEST]])
+
+// CHECK: define{{.+}} [[OFFLOAD_FUN_0]](i{{[0-9]+}} [[T_VAR_IN:%.+]], [2 x i{{[0-9]+}}]* {{.+}} [[VEC_IN:%.+]], [2 x [[S_INT_TY]]]* {{.+}} [[S_ARR_IN:%.+]], [[S_INT_TY]]* {{.+}} [[VAR_IN:%.+]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i{{[0-9]+}}, [2 x i{{[0-9]+}}]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[OMP_OUTLINED_0:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal void [[OMP_OUTLINED_0]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i{{[0-9]+}} [[T_VAR_IN:%.+]], [2 x i{{[0-9]+}}]* {{.+}} [[VEC_IN:%.+]], [2 x [[S_INT_TY]]]* {{.+}} [[S_ARR_IN:%.+]], [[S_INT_TY]]* {{.+}} [[VAR_IN:%.+]])
+
+// addr alloca's
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+
+// skip loop alloca's
+// CHECK: [[OMP_IV:.omp.iv+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_LB:.omp.comb.lb+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_UB:.omp.comb.ub+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_ST:.omp.stride+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST:.omp.is_last+]] = alloca i{{[0-9]+}},
+
+// private alloca's
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[TMP_PRIV:%.+]] = alloca [[S_INT_TY]]*,
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+
+// init addr alloca's with input values
+// CHECK-DAG: store {{.+}} [[T_VAR_IN]], {{.+}}* [[T_VAR_ADDR]],
+// CHECK-DAG: store {{.+}} [[VEC_IN]], {{.+}} [[VEC_ADDR]],
+// CHECK-DAG: store {{.+}} [[S_ARR_IN]], {{.+}} [[S_ARR_ADDR]],
+// CHECK-DAG: store {{.+}} [[VAR_IN]], {{.+}} [[VAR_ADDR]],
+
+// init private alloca's with addr alloca's
+// t-var
+// CHECK-64-DAG: [[T_VAR_CONV:%.+]] = bitcast {{.+}} [[T_VAR_ADDR]] to
+// CHECK-64-DAG: [[T_VAR_ADDR_VAL:%.+]] = load {{.+}}, {{.+}}* [[T_VAR_CONV]],
+// CHECK-32-DAG: [[T_VAR_ADDR_VAL:%.+]] = load {{.+}}, {{.+}}* [[T_VAR_ADDR]],
+// CHECK-DAG: store {{.+}} [[T_VAR_ADDR_VAL]], {{.+}} [[T_VAR_PRIV]],
+
+// vec
+// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
+// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
+// CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+
+// s_arr
+// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
+// CHECK-DAG: [[S_ARR_BGN:%.+]] = getelementptr {{.+}}, {{.+}}* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[S_ARR_ADDR_VAL]] to
+// CHECK-DAG: [[S_ARR_BGN_GEP:%.+]] = getelementptr {{.+}}, {{.+}}* [[S_ARR_BGN]],
+// CHECK-DAG: [[S_ARR_EMPTY:%.+]] = icmp {{.+}} [[S_ARR_BGN]], [[S_ARR_BGN_GEP]]
+// CHECK-DAG: br {{.+}} [[S_ARR_EMPTY]], label %[[CPY_DONE:.+]], label %[[CPY_BODY:.+]]
+// CHECK-DAG: [[CPY_BODY]]:
+// CHECK-DAG: call void @llvm.memcpy{{.+}}(
+// CHECK-DAG: [[CPY_DONE]]:
+
+// var
+// CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
+// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
+// CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_REF_BCAST]],
+// CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// pass private alloca's to fork
+// CHECK-DAG: [[T_VAR_PRIV_VAL:%.+]] = load {{.+}}, {{.+}}* [[T_VAR_PRIV]],
+// not dag to distinguish with S_VAR_CAST
+// CHECK-64: [[T_VAR_CAST_CONV:%.+]] = bitcast {{.+}}* [[T_VAR_CAST:%.+]] to
+// CHECK-64-DAG: store {{.+}} [[T_VAR_PRIV_VAL]], {{.+}} [[T_VAR_CAST_CONV]],
+// CHECK-32: store {{.+}} [[T_VAR_PRIV_VAL]], {{.+}} [[T_VAR_CAST:%.+]],
+// CHECK-DAG: [[T_VAR_CAST_VAL:%.+]] = load {{.+}}, {{.+}}* [[T_VAR_CAST]],
+// CHECK-DAG: [[TMP_PRIV_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
+// CHECK: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_0:@.+]] to void ({{.+}})*), {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]* [[VEC_PRIV]], i{{[0-9]+}} [[T_VAR_CAST_VAL]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], [[S_INT_TY]]* [[TMP_PRIV_VAL]])
+// CHECK: call void @__kmpc_for_static_fini(
+
+// call destructors: var..
+// CHECK-DAG: call {{.+}} [[S_INT_TY_DEF_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+
+// ..and s_arr
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_EL_PAST:%.+]] = phi [[S_INT_TY]]*
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = getelementptr {{.+}}, {{.+}} [[S_ARR_EL_PAST]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_DESTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+
+// CHECK: ret void
+
+// By OpenMP specifications, 'firstprivate' applies to both distribute and parallel for.
+// However, the support for 'firstprivate' of 'parallel' is only used when 'parallel'
+// is found alone. Therefore we only have one 'firstprivate' support for 'parallel for'
+// in combination
+// CHECK: define internal void [[OMP_PARFOR_OUTLINED_0]]({{.+}}, {{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]* {{.+}} [[VEC_IN:%.+]], i{{[0-9]+}} [[T_VAR_IN:%.+]], [2 x [[S_INT_TY]]]* {{.+}} [[S_ARR_IN:%.+]], [[S_INT_TY]]* {{.+}} [[VAR_IN:%.+]])
+
+// addr alloca's
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+
+// skip loop alloca's
+// CHECK: [[OMP_IV:.omp.iv+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_LB:.omp.lb+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_UB:.omp.ub+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_ST:.omp.stride+]] = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST:.omp.is_last+]] = alloca i{{[0-9]+}},
+
+// private alloca's
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[TMP_PRIV:%.+]] = alloca [[S_INT_TY]]*,
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+
+// init addr alloca's with input values
+// CHECK-DAG: store {{.+}} [[VEC_IN]], {{.+}} [[VEC_ADDR]],
+// CHECK-DAG: store {{.+}} [[T_VAR_IN]], {{.+}}* [[T_VAR_ADDR]],
+// CHECK-DAG: store {{.+}} [[S_ARR_IN]], {{.+}} [[S_ARR_ADDR]],
+// CHECK-DAG: store {{.+}} [[VAR_IN]], {{.+}} [[VAR_ADDR]],
+
+// init private alloca's with addr alloca's
+// vec
+// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
+// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
+// CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+
+// s_arr
+// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
+// CHECK-DAG: [[S_ARR_BGN:%.+]] = getelementptr {{.+}}, {{.+}}* [[S_ARR_PRIV]],
+// CHECK-DAG: [[S_ARR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[S_ARR_ADDR_VAL]] to
+// CHECK-DAG: [[S_ARR_BGN_GEP:%.+]] = getelementptr {{.+}}, {{.+}}* [[S_ARR_BGN]],
+// CHECK-DAG: [[S_ARR_EMPTY:%.+]] = icmp {{.+}} [[S_ARR_BGN]], [[S_ARR_BGN_GEP]]
+// CHECK-DAG: br {{.+}} [[S_ARR_EMPTY]], label %[[CPY_DONE:.+]], label %[[CPY_BODY:.+]]
+// CHECK-DAG: [[CPY_BODY]]:
+// CHECK-DAG: call void @llvm.memcpy{{.+}}(
+// CHECK-DAG: [[CPY_DONE]]:
+
+// var
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
+// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
+
+// call destructors: var..
+// CHECK-DAG: call {{.+}} [[S_INT_TY_DEF_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+
+// ..and s_arr
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_EL_PAST:%.+]] = phi [[S_INT_TY]]*
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = getelementptr {{.+}}, {{.+}} [[S_ARR_EL_PAST]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_DESTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+
+// CHECK: ret void
+
+#endif
diff --git a/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp
new file mode 100644
index 00000000000..e6cd7215f58
--- /dev/null
+++ b/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp
@@ -0,0 +1,192 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=CHECK %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+void fn1();
+void fn2();
+void fn3();
+void fn4();
+void fn5();
+void fn6();
+
+int Arg;
+
+// CHECK-LABEL: define {{.*}}void @{{.+}}gtid_test
+void gtid_test() {
+#pragma omp target
+#pragma omp teams
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_0:@.+]](
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_1:@.+]](
+#pragma omp distribute parallel for
+ for(int i = 0 ; i < 100; i++) {}
+ // CHECK: define internal void [[OFFLOADING_FUN_0]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}})
+ // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_0]](
+ // CHECK: call void @__kmpc_for_static_init_4(
+ // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_0:@.+]] to void
+ // CHECK: call void @__kmpc_for_static_fini(
+
+ // CHECK: define{{.+}} void [[OMP_OUTLINED_0]](
+ // CHECK: call void @__kmpc_for_static_init_4(
+ // CHECK: call void @__kmpc_for_static_fini(
+ // CHECK: ret
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (parallel: false)
+ for(int i = 0 ; i < 100; i++) {
+ // CHECK: define internal void [[OFFLOADING_FUN_1]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_1:@.+]] to {{.+}})
+ // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_1]](
+ // CHECK: call void @__kmpc_for_static_init_4(
+ // CHECK: call void @__kmpc_serialized_parallel(
+ // CHECK: call void [[OMP_OUTLINED_1:@.+]](
+ // CHECK: call void @__kmpc_end_serialized_parallel(
+ // CHECK: call void @__kmpc_for_static_fini(
+ // CHECK: define{{.+}} void [[OMP_OUTLINED_1]](
+ // CHECK: call void @__kmpc_for_static_init_4(
+ // CHECK: call void @{{.+}}gtid_test
+ // CHECK: call void @__kmpc_for_static_fini(
+ // CHECK: ret
+ gtid_test();
+ }
+}
+
+
+template <typename T>
+int tmain(T Arg) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (true)
+ for(int i = 0 ; i < 100; i++) {
+ fn1();
+ }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (false)
+ for(int i = 0 ; i < 100; i++) {
+ fn2();
+ }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (parallel: Arg)
+ for(int i = 0 ; i < 100; i++) {
+ fn3();
+ }
+ return 0;
+}
+
+// CHECK-LABEL: define {{.*}}i{{[0-9]+}} @main()
+int main() {
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_0:@.+]](
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_1:@.+]](
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_2:@.+]](
+// CHECK: = call {{.*}}i{{.+}} @{{.+}}tmain
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (true)
+ for(int i = 0 ; i < 100; i++) {
+ // CHECK: define internal void [[OFFLOADING_FUN_0]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}})
+ // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_0]](
+
+ // CHECK: call void @__kmpc_for_static_init_4(
+ // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_2:@.+]] to void
+ // CHECK: call void @__kmpc_for_static_fini(
+ // CHECK: define{{.+}} void [[OMP_OUTLINED_2]](
+ // CHECK: call void @__kmpc_for_static_init_4(
+ // CHECK: call {{.*}}void @{{.+}}fn4
+ // CHECK: call void @__kmpc_for_static_fini(
+
+ fn4();
+ }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (false)
+ for(int i = 0 ; i < 100; i++) {
+ // CHECK: define internal void [[OFFLOADING_FUN_1]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_1:@.+]] to {{.+}})
+ // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_1]](
+
+ // CHECK: call void @__kmpc_for_static_init_4(
+ // CHECK: call void @__kmpc_serialized_parallel(
+ // CHECK: call void [[OMP_OUTLINED_3:@.+]](
+ // CHECK: call void @__kmpc_end_serialized_parallel(
+ // CHECK: call void @__kmpc_for_static_fini(
+
+ // CHECK: define{{.+}} void [[OMP_OUTLINED_3]](
+ // CHECK: call void @__kmpc_for_static_init_4(
+ // CHECK: call {{.*}}void @{{.+}}fn5
+ // CHECK: call void @__kmpc_for_static_fini(
+ fn5();
+ }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (Arg)
+ for(int i = 0 ; i < 100; i++) {
+ // CHECK: define internal void [[OFFLOADING_FUN_2]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}}* [[OMP_TEAMS_OUTLINED_2:@.+]] to {{.+}})
+ // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_2]](
+
+ // CHECK: call void @__kmpc_for_static_init_4(
+ // CHECK: call void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, {{.+}}* [[OMP_OUTLINED_4:@.+]] to void
+ // CHECK: call void @__kmpc_serialized_parallel(
+ // CHECK: call void [[OMP_OUTLINED_4:@.+]](
+ // CHECK: call void @__kmpc_end_serialized_parallel(
+ // CHECK: call void @__kmpc_for_static_fini(
+
+ // CHECK: define{{.+}} void [[OMP_OUTLINED_4]](
+ // CHECK: call void @__kmpc_for_static_init_4(
+ // CHECK: call {{.*}}void @{{.+}}fn6
+ // CHECK: call void @__kmpc_for_static_fini(
+ fn6();
+ }
+
+ return tmain(Arg);
+}
+
+// CHECK-LABEL: define {{.+}} @{{.+}}tmain
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, void {{.+}}* [[T_OUTLINE_FUN_1:@.+]] to void
+// CHECK: call void @__kmpc_for_static_fini(
+
+// CHECK: define internal {{.*}}void [[T_OUTLINE_FUN_1]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void @{{.+}}fn1
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void @__kmpc_serialized_parallel(
+// CHECK: call void [[T_OUTLINE_FUN_2:@.+]](
+// CHECK: call {{.*}}void @__kmpc_end_serialized_parallel(
+// CHECK: call void @__kmpc_for_static_fini(
+
+// CHECK: define internal {{.*}}void [[T_OUTLINE_FUN_2]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void @{{.+}}fn2
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{.+}} 2, void {{.+}}* [[T_OUTLINE_FUN_3:@.+]] to void
+// CHECK: call {{.*}}void @__kmpc_serialized_parallel(
+// call void [[T_OUTLINE_FUN_3:@.+]](
+// CHECK: call {{.*}}void @__kmpc_end_serialized_parallel(
+
+// CHECK: define internal {{.*}}void [[T_OUTLINE_FUN_3]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call {{.*}}void @{{.+}}fn3
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+#endif
diff --git a/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
new file mode 100644
index 00000000000..2e8da79c03b
--- /dev/null
+++ b/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
@@ -0,0 +1,653 @@
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+ T f;
+ S(T a) : f(a) {}
+ S() : f() {}
+ operator T() { return T(); }
+ ~S() {}
+};
+
+// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+template <typename T>
+T tmain() {
+ S<T> test;
+ T t_var = T();
+ T vec[] = {1, 2};
+ S<T> s_arr[] = {1, 2};
+ S<T> &var = test;
+ #pragma omp target
+ #pragma omp teams
+#pragma omp distribute parallel for lastprivate(t_var, vec, s_arr, s_arr, var, var)
+ for (int i = 0; i < 2; ++i) {
+ vec[i] = t_var;
+ s_arr[i] = var;
+ }
+ return T();
+}
+
+int main() {
+ static int svar;
+ volatile double g;
+ volatile double &g1 = g;
+
+ #ifdef LAMBDA
+ // LAMBDA-LABEL: @main
+ // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]](
+ [&]() {
+ static float sfvar;
+ // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+ // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams(
+ // LAMBDA: call void [[OFFLOADING_FUN:@.+]](
+
+ // LAMBDA: define{{.+}} void [[OFFLOADING_FUN]](
+ // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}}* [[OMP_OUTLINED:@.+]] to {{.+}})
+ #pragma omp target
+ #pragma omp teams
+#pragma omp distribute parallel for lastprivate(g, g1, svar, sfvar)
+ for (int i = 0; i < 2; ++i) {
+ // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, double*{{.+}} [[G_IN:%.+]], double*{{.+}} [[G1_IN:%.+]], i{{[0-9]+}}*{{.+}} [[SVAR_IN:%.+]], float*{{.+}} [[SFVAR_IN:%.+]])
+ // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca double*,
+ // LAMBDA: [[G1_PRIVATE_ADDR:%.+]] = alloca double*,
+ // LAMBDA: [[SVAR_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}}*,
+ // LAMBDA: [[SFVAR_PRIVATE_ADDR:%.+]] = alloca float*,
+ // LAMBDA: [[TMP_G1:%.+]] = alloca double*,
+ // loop variables
+ // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+ // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+ // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+ // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+ // LAMBDA: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[G_PRIVATE:%.+]] = alloca double,
+ // LAMBDA: [[G1_PRIVATE:%.+]] = alloca double,
+ // LAMBDA: [[TMP_G1_PRIVATE:%.+]] = alloca double*,
+ // LAMBDA: [[SVAR_PRIVATE:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[SFVAR_PRIVATE:%.+]] = alloca float,
+
+ // init addr alloca's
+ // LAMBDA: store double* [[G_IN]], double** [[G_PRIVATE_ADDR]],
+ // LAMBDA: store double* [[G1_IN]], double** [[G1_PRIVATE_ADDR]],
+ // LAMBDA: store i{{[0-9]+}}* [[SVAR_IN]], i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
+ // LAMBDA: store float* [[SFVAR_IN]], float** [[SFVAR_PRIVATE_ADDR]],
+
+ // init private variables
+ // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+ // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
+ // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
+ // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
+ // LAMBDA: store double* [[G1_IN_REF]], double** [[TMP_G1]],
+ // LAMBDA: [[TMP_G1_VAL:%.+]] = load double*, double** [[TMP_G1]],
+ // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
+
+ // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
+ // LAMBDA: [[G1_PAR:%.+]] = load{{.+}}, {{.+}} [[TMP_G1_PRIVATE]],
+ // LAMBDA-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED:@.+]] to void ({{.+}})*), {{.+}}, {{.+}}, {{.+}} [[G_PRIVATE]], {{.+}} [[G1_PAR]], {{.+}} [[SVAR_PRIVATE]], {{.+}} [[SFVAR_PRIVATE]])
+ // LAMBDA-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED:@.+]] to void ({{.+}})*), {{.+}}, {{.+}}, {{.+}} [[G_PRIVATE]], {{.+}} [[G1_PAR]], {{.+}} [[SVAR_PRIVATE]], {{.+}} [[SFVAR_PRIVATE]])
+ // LAMBDA: call {{.*}}void @__kmpc_for_static_fini(
+
+ // lastprivate
+ // LAMBDA: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+ // LAMBDA: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+ // LAMBDA: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+ // LAMBDA: [[OMP_LASTPRIV_BLOCK]]:
+ // LAMBDA: [[G_PRIV_VAL:%.+]] = load double, double* [[G_PRIVATE]],
+ // LAMBDA: store{{.*}} double [[G_PRIV_VAL]], double* [[G_IN_REF]],
+ // LAMBDA: [[TMP_G1_PRIV_REF:%.+]] = load double*, double** [[TMP_G1_PRIVATE]],
+ // LAMBDA: [[TMP_G1_PRIV_VAL:%.+]] = load double, double* [[TMP_G1_PRIV_REF]],
+ // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* [[TMP_G1_VAL]],
+
+ // LAMBDA: [[SVAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_PRIVATE]],
+ // LAMBDA: store i{{[0-9]+}} [[SVAR_PRIV_VAL]], i{{[0-9]+}}* [[SVAR_IN_REF]],
+ // LAMBDA: [[SFVAR_PRIV_VAL:%.+]] = load float, float* [[SFVAR_PRIVATE]],
+ // LAMBDA: store float [[SFVAR_PRIV_VAL]], float* [[SFVAR_IN_REF]],
+ // LAMBDA: br label %[[OMP_LASTPRIV_DONE]]
+ // LAMBDA: [[OMP_LASTPRIV_DONE]]:
+ // LAMBDA: ret
+
+ g = 1;
+ g1 = 1;
+ svar = 3;
+ sfvar = 4.0;
+ // outlined function for 'parallel for'
+ // LAMBDA-64: define{{.+}} void [[OMP_PARFOR_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, {{.+}}, {{.+}}, {{.+}} [[G_IN:%.+]], {{.+}} [[G1_IN:%.+]], {{.+}} [[SVAR_IN:%.+]], {{.+}} [[SFVAR_IN:%.+]])
+ // LAMBDA-32: define{{.+}} void [[OMP_PARFOR_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, {{.+}}, {{.+}}, {{.+}} [[G_IN:%.+]], {{.+}} [[G1_IN:%.+]], {{.+}} [[SVAR_IN:%.+]], {{.+}} [[SFVAR_IN:%.+]])
+
+ // addr alloca's
+ // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca double*,
+ // LAMBDA: [[G1_PRIVATE_ADDR:%.+]] = alloca double*,
+ // LAMBDA: [[SVAR_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}}*,
+ // LAMBDA: [[SFVAR_PRIVATE_ADDR:%.+]] = alloca float*,
+
+ // loop variables
+ // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+ // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+ // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+ // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
+
+ // private alloca's
+ // LAMBDA: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[G_PRIVATE:%.+]] = alloca double,
+ // LAMBDA: [[G1_PRIVATE:%.+]] = alloca double,
+ // LAMBDA: [[TMP_G1_PRIVATE:%.+]] = alloca double*,
+ // LAMBDA: [[SVAR_PRIVATE:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[SFVAR_PRIVATE:%.+]] = alloca float,
+
+ // init addr alloca's
+ // LAMBDA: store double* [[G_IN]], double** [[G_PRIVATE_ADDR]],
+ // LAMBDA: store double* [[G1_IN]], double** [[G1_PRIVATE_ADDR]],
+ // LAMBDA: store i{{[0-9]+}}* [[SVAR_IN]], i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
+ // LAMBDA: store float* [[SFVAR_IN]], float** [[SFVAR_PRIVATE_ADDR]],
+
+ // init private variables
+ // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+ // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
+ // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
+
+ // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
+ // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1]],
+
+ // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
+
+ // loop body
+ // LAMBDA: store double 1.0{{.+}}, double* [[G_PRIVATE]],
+ // LAMBDA: [[TMP_G1_REF:%.+]] = load double*, double** [[TMP_G1_PRIVATE]],
+ // LAMBDA: store{{.+}} double 1.0{{.+}}, double* [[TMP_G1_REF]],
+ // LAMBDA: store i{{[0-9]+}} 3, i{{[0-9]+}}* [[SVAR_PRIVATE]],
+ // LAMBDA: store float 4.0{{.+}}, float* [[SFVAR_PRIVATE]],
+ // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+ // LAMBDA: store double* [[G_PRIVATE]], double** [[G_PRIVATE_ADDR_REF]],
+ // LAMBDA: [[TMP_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+ // LAMBDA: [[G1_PRIVATE_ADDR_FROM_TMP:%.+]] = load double*, double** [[TMP_G1_PRIVATE]],
+ // LAMBDA: store double* [[G1_PRIVATE_ADDR_FROM_TMP]], double** [[TMP_PRIVATE_ADDR_REF]],
+ // LAMBDA: [[SVAR_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+ // LAMBDA: store i{{[0-9]+}}* [[SVAR_PRIVATE]], i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR_REF]]
+ // LAMBDA: [[SFVAR_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+ // LAMBDA: store float* [[SFVAR_PRIVATE]], float** [[SFVAR_PRIVATE_ADDR_REF]]
+ // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+
+ // LAMBDA: call {{.*}}void @__kmpc_for_static_fini(
+
+ // lastprivate
+ // LAMBDA: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+ // LAMBDA: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+ // LAMBDA: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+ // LAMBDA: [[OMP_LASTPRIV_BLOCK]]:
+ // LAMBDA: [[G_PRIV_VAL:%.+]] = load double, double* [[G_PRIVATE]],
+ // LAMBDA: store{{.*}} double [[G_PRIV_VAL]], double* [[G_IN_REF]],
+ // LAMBDA: [[TMP_G1_PRIV_REF:%.+]] = load double*, double** [[TMP_G1_PRIVATE]],
+ // LAMBDA: [[TMP_G1_PRIV_VAL:%.+]] = load double, double* [[TMP_G1_PRIV_REF]],
+ // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* [[G1_IN_REF]],
+ // LAMBDA: [[SVAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_PRIVATE]],
+ // LAMBDA: store i{{[0-9]+}} [[SVAR_PRIV_VAL]], i{{[0-9]+}}* [[SVAR_IN_REF]],
+ // LAMBDA: [[SFVAR_PRIV_VAL:%.+]] = load float, float* [[SFVAR_PRIVATE]],
+ // LAMBDA: store float [[SFVAR_PRIV_VAL]], float* [[SFVAR_IN_REF]],
+ // LAMBDA: br label %[[OMP_LASTPRIV_DONE]]
+ // LAMBDA: [[OMP_LASTPRIV_DONE]]:
+ // LAMBDA: ret
+
+ [&]() {
+ // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+ // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+ g = 2;
+ g1 = 2;
+ svar = 4;
+ sfvar = 8.0;
+ // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+ // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+ // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+ // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+ // LAMBDA: [[TMP_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+ // LAMBDA: [[G1_REF:%.+]] = load double*, double** [[TMP_PTR_REF]]
+ // LAMBDA: store double 2.0{{.+}}, double* [[G1_REF]],
+ // LAMBDA: [[SVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+ // LAMBDA: [[SVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PTR_REF]]
+ // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SVAR_REF]]
+ // LAMBDA: [[SFVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+ // LAMBDA: [[SFVAR_REF:%.+]] = load float*, float** [[SFVAR_PTR_REF]]
+ // LAMBDA: store float 8.0{{.+}}, float* [[SFVAR_REF]]
+ }();
+ }
+ }();
+ return 0;
+ #else
+ S<float> test;
+ int t_var = 0;
+ int vec[] = {1, 2};
+ S<float> s_arr[] = {1, 2};
+ S<float> &var = test;
+
+ #pragma omp target
+ #pragma omp teams
+#pragma omp distribute parallel for lastprivate(t_var, vec, s_arr, s_arr, var, var, svar)
+ for (int i = 0; i < 2; ++i) {
+ vec[i] = t_var;
+ s_arr[i] = var;
+ }
+ int i;
+
+ return tmain<int>();
+ #endif
+}
+
+// CHECK: define{{.*}} i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOAD_FUN:@.+]](i{{[0-9]+}} {{.+}}, [2 x i{{[0-9]+}}]* {{.+}}, [2 x [[S_FLOAT_TY]]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}}, i{{[0-9]+}} {{.+}})
+// CHECK: ret
+
+// CHECK: define{{.+}} [[OFFLOAD_FUN]](i{{[0-9]+}} {{.+}}, [2 x i{{[0-9]+}}]*{{.+}} {{.+}}, [2 x [[S_FLOAT_TY]]]*{{.+}} {{.+}}, [[S_FLOAT_TY]]*{{.+}} {{.+}}, i{{[0-9]+}} {{.+}})
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(
+// CHECK: ret
+//
+// CHECK: define internal void [[OMP_OUTLINED:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i{{[0-9]+}}*{{.+}} [[T_VAR_IN:%.+]], [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN:%.+]], [2 x [[S_FLOAT_TY]]]*{{.+}} [[S_ARR_IN:%.+]], [[S_FLOAT_TY]]*{{.+}} [[VAR_IN:%.+]], i{{[0-9]+}}*{{.*}} [[S_VAR_IN:%.+]])
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}}*,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}}*,
+// CHECK: [[TMP:%.*]] = alloca [[S_FLOAT_TY]]*,
+// skip loop variables
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[TMP_PRIV:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[S_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+
+// copy from parameters to local address variables
+// CHECK: store i{{[0-9]+}}* [[T_VAR_IN]], i{{[0-9]+}}** [[T_VAR_ADDR]],
+// CHECK: store [2 x i{{[0-9]+}}]* [[VEC_IN]], [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[S_ARR_IN]], [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store [[S_FLOAT_TY]]* [[VAR_IN]], [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: store i{{[0-9]+}}* [[S_VAR_IN]], i{{[0-9]+}}** [[SVAR_ADDR]],
+
+// load content of local address variables
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
+// CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
+// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: store [[S_FLOAT_TY]]* [[VAR_ADDR_REF]], [[S_FLOAT_TY]]** [[TMP]],
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
+
+// call constructor for s_arr
+// CHECK: [[S_ARR_BGN:%.+]] = getelementptr{{.+}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
+// CHECK: [[S_ARR_END:%.+]] = getelementptr {{.+}} [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BGN]],
+// CHECK: br label %[[S_ARR_CST_LOOP:.+]]
+// CHECK: [[S_ARR_CST_LOOP]]:
+// CHECK: [[S_ARR_CTOR:%.+]] = phi {{.+}}
+// CHECK: call void [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[S_ARR_CTOR]])
+// CHECK: [[S_ARR_NEXT:%.+]] = getelementptr {{.+}} [[S_ARR_CTOR]],
+// CHECK: [[S_ARR_DONE:%.+]] = icmp {{.+}} [[S_ARR_NEXT]], [[S_ARR_END]]
+// CHECK: br i1 [[S_ARR_DONE]], label %[[S_ARR_CST_END:.+]], label %[[S_ARR_CST_LOOP]]
+// CHECK: [[S_ARR_CST_END]]:
+// CHECK: [[TMP_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
+// CHECK: call void [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK: store [[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]** [[TMP_PRIV]],
+
+// the distribute loop
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: [[TMP_PRIV_VAL:%.+]] = load {{.+}}, {{.+}} [[TMP_PRIV]],
+// CHECK-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED:@.+]] to void ({{.+}})*), {{.+}}, {{.+}}, {{.+}} [[VEC_PRIV]], {{.+}} [[T_VAR_PRIV]], {{.+}} [[S_ARR_PRIV]], {{.+}} [[TMP_PRIV_VAL]], {{.+}} [[S_VAR_PRIV]])
+// CHECK-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED:@.+]] to void ({{.+}})*), {{.+}}, {{.+}}, {{.+}} [[VEC_PRIV]], {{.+}} [[T_VAR_PRIV]], {{.+}} [[S_ARR_PRIV]], {{.+}} [[TMP_PRIV_VAL]], {{.+}} [[S_VAR_PRIV]])
+
+// CHECK: call void @__kmpc_for_static_fini(
+
+// lastprivates
+// CHECK: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+// CHECK: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+// CHECK: [[OMP_LASTPRIV_BLOCK]]:
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
+// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
+// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
+// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[S_ARR_IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], [[S_ARR_BEGIN_GEP]]
+// CHECK: br i1 [[S_ARR_IS_EMPTY]], label %[[S_ARR_COPY_DONE:.+]], label %[[S_ARR_COPY_BLOCK:.+]]
+// CHECK: [[S_ARR_COPY_BLOCK]]:
+// CHECK: [[S_ARR_SRC_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
+// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
+// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
+// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
+// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
+// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
+// CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
+// CHECK: [[S_ARR_COPY_DONE]]:
+// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_REF]] to i8*
+// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
+// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
+// CHECK: ret void
+
+// outlined function for 'parallel for'
+// CHECK-64: define{{.+}} void [[OMP_PARFOR_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, {{.+}}, {{.+}}, {{.+}} [[VEC_IN:%.+]], {{.+}} [[T_VAR_IN:%.+]], {{.+}} [[S_ARR_IN:%.+]], {{.+}} [[VAR_IN:%.+]], {{.+}} [[SVAR_IN:%.+]])
+// CHECK-32: define{{.+}} void [[OMP_PARFOR_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, {{.+}}, {{.+}}, {{.+}} [[VEC_IN:%.+]], {{.+}} [[T_VAR_IN:%.+]], {{.+}} [[S_ARR_IN:%.+]], {{.+}} [[VAR_IN:%.+]], {{.+}} [[SVAR_IN:%.+]])
+
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}}*,
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}}*,
+// skip loop variables
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[TMP_PRIV:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[S_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+
+// copy from parameters to local address variables
+// CHECK: store [2 x i{{[0-9]+}}]* [[VEC_IN]], [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: store i{{[0-9]+}}* [[T_VAR_IN]], i{{[0-9]+}}** [[T_VAR_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[S_ARR_IN]], [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store [[S_FLOAT_TY]]* [[VAR_IN]], [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: store i{{[0-9]+}}* [[S_VAR_IN]], i{{[0-9]+}}** [[SVAR_ADDR]],
+
+// load content of local address variables
+// CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
+
+// call constructor for s_arr
+// CHECK: [[S_ARR_BGN:%.+]] = getelementptr{{.+}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
+// CHECK: [[S_ARR_END:%.+]] = getelementptr {{.+}} [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BGN]],
+// CHECK: br label %[[S_ARR_CST_LOOP:.+]]
+// CHECK: [[S_ARR_CST_LOOP]]:
+// CHECK: [[S_ARR_CTOR:%.+]] = phi {{.+}}
+// CHECK: call void [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[S_ARR_CTOR]])
+// CHECK: [[S_ARR_NEXT:%.+]] = getelementptr {{.+}} [[S_ARR_CTOR]],
+// CHECK: [[S_ARR_DONE:%.+]] = icmp {{.+}} [[S_ARR_NEXT]], [[S_ARR_END]]
+// CHECK: br i1 [[S_ARR_DONE]], label %[[S_ARR_CST_END:.+]], label %[[S_ARR_CST_LOOP]]
+// CHECK: [[S_ARR_CST_END]]:
+// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: call void [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK: store [[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]** [[TMP_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+
+// loop body
+// assignment: vec[i] = t_var;
+// CHECK: [[T_VAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK: [[VEC_PTR:%.+]] = getelementptr inbounds [2 x i{{[0-9]+}}], [2 x i{{[0-9]+}}]* [[VEC_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} {{.+}}
+// CHECK: store i{{[0-9]+}} [[T_VAR_PRIV_VAL]], i{{[0-9]+}}* [[VEC_PTR]],
+
+// assignment: s_arr[i] = var;
+// CHECK-DAG: [[S_ARR_PTR:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
+// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+
+// CHECK: call void @__kmpc_for_static_fini(
+
+// lastprivates
+// CHECK: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+// CHECK: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+// CHECK: [[OMP_LASTPRIV_BLOCK]]:
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
+// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
+// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
+// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[S_ARR_IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], [[S_ARR_BEGIN_GEP]]
+// CHECK: br i1 [[S_ARR_IS_EMPTY]], label %[[S_ARR_COPY_DONE:.+]], label %[[S_ARR_COPY_BLOCK:.+]]
+// CHECK: [[S_ARR_COPY_BLOCK]]:
+// CHECK: [[S_ARR_SRC_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
+// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
+// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
+// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
+// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
+// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
+// CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
+// CHECK: [[S_ARR_COPY_DONE]]:
+// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
+// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
+// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
+// CHECK: ret void
+
+// template tmain
+// CHECK: define{{.*}} i{{[0-9]+}} [[TMAIN_INT:@.+]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOAD_FUN_1:@.+]](i{{[0-9]+}} {{.+}}, [2 x i{{[0-9]+}}]* {{.+}}, [2 x [[S_INT_TY]]]* {{.+}}, [[S_INT_TY]]* {{.+}})
+// CHECK: ret
+
+// CHECK: define internal void [[OFFLOAD_FUN_1]](
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4,
+// CHECK: ret
+
+// CHECK: define internal void [[OMP_OUTLINED_1:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i{{[0-9]+}}*{{.+}} [[T_VAR_IN:%.+]], [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN:%.+]], [2 x [[S_INT_TY]]]*{{.+}} [[S_ARR_IN:%.+]], [[S_INT_TY]]*{{.+}} [[VAR_IN:%.+]])
+// skip alloca of global_tid and bound_tid
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}}*,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+// skip loop variables
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[TMP_PRIV:%.+]] = alloca [[S_INT_TY]]*,
+
+// skip init of bound and global tid
+// CHECK: store i{{[0-9]+}}* {{.*}},
+// CHECK: store i{{[0-9]+}}* {{.*}},
+// copy from parameters to local address variables
+// CHECK: store i{{[0-9]+}}* [[T_VAR_IN]], i{{[0-9]+}}** [[T_VAR_ADDR]],
+// CHECK: store [2 x i{{[0-9]+}}]* [[VEC_IN]], [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_IN]], [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store [[S_INT_TY]]* [[VAR_IN]], [[S_INT_TY]]** [[VAR_ADDR]],
+
+// load content of local address variables
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
+// CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR]],
+// CHECK-DAG: store [[S_INT_TY]]* [[VAR_ADDR_REF]], [[S_INT_TY]]** [[TMP]],
+// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
+// CHECK-DAG: [[TMP_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: [[TMP_PRIV_VAL:%.+]] = load {{.+}}, {{.+}} [[TMP_PRIV]],
+// CHECK-64: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED:@.+]] to void ({{.+}})*), {{.+}}, {{.+}}, {{.+}} [[VEC_PRIV]], {{.+}} [[T_VAR_PRIV]], {{.+}} [[S_ARR_PRIV]], {{.+}} [[TMP_PRIV_VAL]])
+// CHECK-32: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED:@.+]] to void ({{.+}})*), {{.+}}, {{.+}}, {{.+}} [[VEC_PRIV]], {{.+}} [[T_VAR_PRIV]], {{.+}} [[S_ARR_PRIV]], {{.+}} [[TMP_PRIV_VAL]])
+
+// CHECK: call void @__kmpc_for_static_fini(
+
+// lastprivates
+// CHECK: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+// CHECK: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+// CHECK: [[OMP_LASTPRIV_BLOCK]]:
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
+// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
+// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]] to [[S_INT_TY]]*
+// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[S_ARR_IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_BEGIN]], [[S_ARR_BEGIN_GEP]]
+// CHECK: br i1 [[S_ARR_IS_EMPTY]], label %[[S_ARR_COPY_DONE:.+]], label %[[S_ARR_COPY_BLOCK:.+]]
+// CHECK: [[S_ARR_COPY_BLOCK]]:
+// CHECK: [[S_ARR_SRC_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
+// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
+// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
+// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
+// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
+// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
+// CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
+// CHECK: [[S_ARR_COPY_DONE]]:
+// CHECK: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_REF]] to i8*
+// CHECK: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL_BCAST]],{{.+}})
+// CHECK: ret void
+
+// outlined function for 'parallel for'
+// CHECK-64: define{{.+}} void [[OMP_PARFOR_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, {{.+}}, {{.+}}, {{.+}} [[VEC_IN:%.+]], {{.+}} [[T_VAR_IN:%.+]], {{.+}} [[S_ARR_IN:%.+]], {{.+}} [[VAR_IN:%.+]])
+// CHECK-32: define{{.+}} void [[OMP_PARFOR_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, {{.+}}, {{.+}}, {{.+}} [[VEC_IN:%.+]], {{.+}} [[T_VAR_IN:%.+]], {{.+}} [[S_ARR_IN:%.+]], {{.+}} [[VAR_IN:%.+]])
+
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: {{.+}} = alloca i{{[0-9]+}}*,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i{{[0-9]+}}]*,
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}}*,
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// skip loop variables
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: {{.+}} = alloca i{{[0-9]+}},
+// CHECK: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[TMP_PRIV:%.+]] = alloca [[S_INT_TY]]*,
+
+// copy from parameters to local address variables
+// CHECK: store [2 x i{{[0-9]+}}]* [[VEC_IN]], [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: store i{{[0-9]+}}* [[T_VAR_IN]], i{{[0-9]+}}** [[T_VAR_ADDR]],
+// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_IN]], [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store [[S_INT_TY]]* [[VAR_IN]], [[S_INT_TY]]** [[VAR_ADDR]],
+
+// load content of local address variables
+// CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
+
+// call constructor for s_arr
+// CHECK: [[S_ARR_BGN:%.+]] = getelementptr{{.+}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK: [[S_ARR_END:%.+]] = getelementptr {{.+}} [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BGN]],
+// CHECK: br label %[[S_ARR_CST_LOOP:.+]]
+// CHECK: [[S_ARR_CST_LOOP]]:
+// CHECK: [[S_ARR_CTOR:%.+]] = phi {{.+}}
+// CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_CTOR]])
+// CHECK: [[S_ARR_NEXT:%.+]] = getelementptr {{.+}} [[S_ARR_CTOR]],
+// CHECK: [[S_ARR_DONE:%.+]] = icmp {{.+}} [[S_ARR_NEXT]], [[S_ARR_END]]
+// CHECK: br i1 [[S_ARR_DONE]], label %[[S_ARR_CST_END:.+]], label %[[S_ARR_CST_LOOP]]
+// CHECK: [[S_ARR_CST_END]]:
+// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR]],
+// CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP_PRIV]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+
+// assignment: vec[i] = t_var;
+// CHECK: [[IV_VAL:%.+]] =
+// CHECK: [[T_VAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK: [[VEC_PTR:%.+]] = getelementptr inbounds [2 x i{{[0-9]+}}], [2 x i{{[0-9]+}}]* [[VEC_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} {{.+}}
+// CHECK: store i{{[0-9]+}} [[T_VAR_PRIV_VAL]], i{{[0-9]+}}* [[VEC_PTR]],
+
+// assignment: s_arr[i] = var;
+// CHECK-DAG: [[S_ARR_PTR:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
+// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
+// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR]] to i8*
+// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to i8*
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+
+// CHECK: call void @__kmpc_for_static_fini(
+
+// lastprivates
+// CHECK: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
+// CHECK: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
+// CHECK: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
+
+// CHECK: [[OMP_LASTPRIV_BLOCK]]:
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
+// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
+// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]] to [[S_INT_TY]]*
+// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[S_ARR_IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_BEGIN]], [[S_ARR_BEGIN_GEP]]
+// CHECK: br i1 [[S_ARR_IS_EMPTY]], label %[[S_ARR_COPY_DONE:.+]], label %[[S_ARR_COPY_BLOCK:.+]]
+// CHECK: [[S_ARR_COPY_BLOCK]]:
+// CHECK: [[S_ARR_SRC_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
+// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
+// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
+// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
+// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
+// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
+// CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
+// CHECK: [[S_ARR_COPY_DONE]]:
+// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR_REF]] to i8*
+// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: ret void
+
+#endif
diff --git a/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
new file mode 100644
index 00000000000..20698ce3466
--- /dev/null
+++ b/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
@@ -0,0 +1,121 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+typedef __INTPTR_TYPE__ intptr_t;
+
+// CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG: [[S_TY:%.+]] = type { [[INTPTR_T_TY:i[0-9]+]], [[INTPTR_T_TY]], [[INTPTR_T_TY]] }
+// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+
+void foo();
+
+struct S {
+ intptr_t a, b, c;
+ S(intptr_t a) : a(a) {}
+ operator char() { return a; }
+ ~S() {}
+};
+
+template <typename T, int C>
+int tmain() {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads(C)
+ for (int i = 0; i < 100; i++)
+ foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads(T(23))
+ for (int i = 0; i < 100; i++)
+ foo();
+ return 0;
+}
+
+int main() {
+ S s(0);
+ char a = s;
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_0:@.+]](
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOADING_FUN_1:@.+]](
+// CHECK: invoke{{.+}} [[TMAIN_5:@.+]]()
+// CHECK: invoke{{.+}} [[TMAIN_1:@.+]]()
+#pragma omp target
+#pragma omp teams
+ // CHECK: define internal void [[OFFLOADING_FUN_0]](
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}})
+#pragma omp distribute parallel for num_threads(2)
+ for (int i = 0; i < 100; i++) {
+ // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_0]](
+ // CHECK: call {{.*}}void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 {{.+}}, i32 2)
+ // CHECK: call {{.*}}void {{.*}} @__kmpc_fork_call(
+ foo();
+ }
+#pragma omp target
+#pragma omp teams
+ // CHECK: define internal void [[OFFLOADING_FUN_1]](
+
+ // CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 1, {{.+}}* [[OMP_TEAMS_OUTLINED_1:@.+]] to {{.+}})
+#pragma omp distribute parallel for num_threads(a)
+ for (int i = 0; i < 100; i++) {
+ // CHECK: define{{.+}} void [[OMP_TEAMS_OUTLINED_1]](
+ // CHECK-DAG: [[A_ADDR:%.+]] = alloca i8*,
+ // CHECK-DAG: [[A_REF:%.+]] = load i8*, i8** [[A_ADDR]],
+ // CHECK-DAG: [[A_VAL:%.+]] = load i8, i8* [[A_REF]],
+ // CHECK-DAG: [[A_EXT:%.+]] = sext i8 [[A_VAL]] to {{.+}}
+ // CHECK: call {{.*}}void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 {{.+}}, i32 [[A_EXT]])
+ // CHECK: call {{.*}}void {{.*}} @__kmpc_fork_call(
+ foo();
+ }
+ return a + tmain<char, 5>() + tmain<S, 1>();
+}
+
+// tmain 5
+// CHECK-DAG: define {{.*}}i{{[0-9]+}} [[TMAIN_5]]()
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[T_OFFLOADING_FUN_0:@.+]](
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[T_OFFLOADING_FUN_1:@.+]](
+
+// tmain 1
+// CHECK-DAG: define {{.*}}i{{[0-9]+}} [[TMAIN_1]]()
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[T_OFFLOADING_FUN_2:@.+]](
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[T_OFFLOADING_FUN_3:@.+]](
+
+// CHECK: define internal void [[T_OFFLOADING_FUN_0]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[T_OMP_TEAMS_OUTLINED_0:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} void [[T_OMP_TEAMS_OUTLINED_0]](
+// CHECK: call {{.*}}void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 {{.+}}, i32 5)
+// CHECK: call {{.*}}void {{.*}} @__kmpc_fork_call(
+
+// CHECK: define internal void [[T_OFFLOADING_FUN_1]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[T_OMP_TEAMS_OUTLINED_1:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} void [[T_OMP_TEAMS_OUTLINED_1]](
+// CHECK: call {{.*}}void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 {{.+}}, i32 23)
+// CHECK: call {{.*}}void {{.*}} @__kmpc_fork_call(
+
+// CHECK: define internal void [[T_OFFLOADING_FUN_2]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[T_OMP_TEAMS_OUTLINED_2:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} void [[T_OMP_TEAMS_OUTLINED_2]](
+// CHECK: call {{.*}}void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 {{.+}}, i32 1)
+// CHECK: call {{.*}}void {{.*}} @__kmpc_fork_call(
+
+// CHECK: define internal void [[T_OFFLOADING_FUN_3]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[T_OMP_TEAMS_OUTLINED_3:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} void [[T_OMP_TEAMS_OUTLINED_3]](
+// CHECK-DAG: [[CALL_RES:%.+]] = invoke{{.+}} i8 [[S_TY_CHAR_OP:@.+]]([[S_TY]]* {{.+}})
+// CHECK-DAG: [[CALL_RES_SEXT:%.+]] = sext i8 [[CALL_RES]] to {{.+}}
+// CHECK: call {{.*}}void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 {{.+}}, i32 [[CALL_RES_SEXT]])
+// CHECK: call {{.*}}void {{.*}} @__kmpc_fork_call(
+#endif
diff --git a/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp
new file mode 100644
index 00000000000..d8d8a9a1ede
--- /dev/null
+++ b/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp
@@ -0,0 +1,297 @@
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+ T f;
+ S(T a) : f(a) {}
+ S() : f() {}
+ operator T() { return T(); }
+ ~S() {}
+};
+
+// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+template <typename T>
+T tmain() {
+ S<T> test;
+ T t_var = T();
+ T vec[] = {1, 2};
+ S<T> s_arr[] = {1, 2};
+ S<T> &var = test;
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for private(t_var, vec, s_arr, s_arr, var, var)
+ for (int i = 0; i < 2; ++i) {
+ vec[i] = t_var;
+ s_arr[i] = var;
+ }
+ return T();
+}
+
+int main() {
+ static int svar;
+ volatile double g;
+ volatile double &g1 = g;
+
+ #ifdef LAMBDA
+ // LAMBDA-LABEL: @main
+ // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]](
+ [&]() {
+ static float sfvar;
+ // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+ // LAMBDA: call i{{[0-9]+}} @__tgt_target_teams(
+ // LAMBDA: call void [[OFFLOADING_FUN:@.+]](
+
+ // LAMBDA: define{{.+}} void [[OFFLOADING_FUN]]()
+ // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 0, {{.+}}* [[OMP_OUTLINED:@.+]] to {{.+}})
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for private(g, g1, svar, sfvar)
+ for (int i = 0; i < 2; ++i) {
+ // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}})
+ // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca double,
+ // LAMBDA: [[G1_PRIVATE_ADDR:%.+]] = alloca double,
+ // LAMBDA: [[TMP_PRIVATE_ADDR:%.+]] = alloca double*,
+ // LAMBDA: [[SVAR_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[SFVAR_PRIVATE_ADDR:%.+]] = alloca float,
+ // LAMBDA: store double* [[G1_PRIVATE_ADDR]], double** [[TMP_PRIVATE_ADDR]],
+ // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
+ // LAMBDA: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED:@.+]] to {{.+}},
+ // LAMBDA: call {{.*}}void @__kmpc_for_static_fini(
+ // LAMBDA: ret void
+
+ // LAMBDA: define{{.+}} void [[OMP_PARFOR_OUTLINED]](
+ // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca double,
+ // LAMBDA: [[G1_PRIVATE_ADDR:%.+]] = alloca double,
+ // LAMBDA: [[TMP_PRIVATE_ADDR:%.+]] = alloca double*,
+ // LAMBDA: [[SVAR_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+ // LAMBDA: [[SFVAR_PRIVATE_ADDR:%.+]] = alloca float,
+
+ g = 1;
+ g1 = 1;
+ svar = 3;
+ sfvar = 4.0;
+ // LAMBDA: store double* [[G1_PRIVATE_ADDR]], double** [[TMP_PRIVATE_ADDR]],
+ // LAMBDA: store double 1.0{{.+}}, double* [[G_PRIVATE_ADDR]],
+ // LAMBDA: store i{{[0-9]+}} 3, i{{[0-9]+}}* [[SVAR_PRIVATE_ADDR]],
+ // LAMBDA: store float 4.0{{.+}}, float* [[SFVAR_PRIVATE_ADDR]],
+ // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+ // LAMBDA: store double* [[G_PRIVATE_ADDR]], double** [[G_PRIVATE_ADDR_REF]],
+ // LAMBDA: [[TMP_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+ // LAMBDA: [[G1_PRIVATE_ADDR_FROM_TMP:%.+]] = load double*, double** [[TMP_PRIVATE_ADDR]],
+ // LAMBDA: store double* [[G1_PRIVATE_ADDR_FROM_TMP]], double** [[TMP_PRIVATE_ADDR_REF]],
+ // LAMBDA: [[SVAR_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+ // LAMBDA: store i{{[0-9]+}}* [[SVAR_PRIVATE_ADDR]], i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR_REF]]
+ // LAMBDA: [[SFVAR_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+ // LAMBDA: store float* [[SFVAR_PRIVATE_ADDR]], float** [[SFVAR_PRIVATE_ADDR_REF]]
+ // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+ // LAMBDA: call {{.*}}void @__kmpc_for_static_fini(
+ // LAMBDA: ret void
+ [&]() {
+ // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+ // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+ g = 2;
+ g1 = 2;
+ svar = 4;
+ sfvar = 8.0;
+ // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+ // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+ // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+ // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+ // LAMBDA: [[TMP_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+ // LAMBDA: [[G1_REF:%.+]] = load double*, double** [[TMP_PTR_REF]]
+ // LAMBDA: store double 2.0{{.+}}, double* [[G1_REF]],
+ // LAMBDA: [[SVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+ // LAMBDA: [[SVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PTR_REF]]
+ // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SVAR_REF]]
+ // LAMBDA: [[SFVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+ // LAMBDA: [[SFVAR_REF:%.+]] = load float*, float** [[SFVAR_PTR_REF]]
+ // LAMBDA: store float 8.0{{.+}}, float* [[SFVAR_REF]]
+ }();
+ }
+ }();
+ return 0;
+ #else
+ S<float> test;
+ int t_var = 0;
+ int vec[] = {1, 2};
+ S<float> s_arr[] = {1, 2};
+ S<float> &var = test;
+
+ #pragma omp target
+ #pragma omp teams
+ #pragma omp distribute parallel for private(t_var, vec, s_arr, s_arr, var, var, svar)
+ for (int i = 0; i < 2; ++i) {
+ vec[i] = t_var;
+ s_arr[i] = var;
+ }
+ return tmain<int>();
+ #endif
+}
+
+// CHECK: define{{.*}} i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOAD_FUN_0:@.+]](
+
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_DESTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: ret
+
+// CHECK: define{{.+}} [[OFFLOAD_FUN_0]]()
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[OMP_OUTLINED_0:@.+]] to void
+// CHECK: ret
+//
+// CHECK: define internal void [[OMP_OUTLINED_0]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK-NOT: alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK-NOT: alloca [[S_FLOAT_TY]],
+// CHECK: [[S_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// this is the ctor loop
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_FLOAT_TY]]*
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_0:@.+]] to {{.+}},
+// CHECK: call void @__kmpc_for_static_fini(
+
+// call destructors: var..
+// CHECK-DAG: call {{.+}} [[S_FLOAT_TY_DEF_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+
+// ..and s_arr
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_EL_PAST:%.+]] = phi [[S_FLOAT_TY]]*
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = getelementptr {{.+}}, {{.+}} [[S_ARR_EL_PAST]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_DESTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+
+// CHECK: ret void
+
+// By OpenMP specifications, private applies to both distribute and parallel for.
+// However, the support for 'private' of 'parallel' is only used when 'parallel'
+// is found alone. Therefore we only have one 'private' support for 'parallel for'
+// in combination
+// CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_0]](
+// CHECK: [[T_VAR_PRIV:%t_var+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%vec+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%s_arr+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK-NOT: alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%var+]] = alloca [[S_FLOAT_TY]],
+// CHECK-NOT: alloca [[S_FLOAT_TY]],
+// CHECK: [[S_VAR_PRIV:%svar+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// this is the ctor loop
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_FLOAT_TY]]*
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
+
+// call destructors: var..
+// CHECK-DAG: call {{.+}} [[S_FLOAT_TY_DEF_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+
+// ..and s_arr
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_EL_PAST:%.+]] = phi [[S_FLOAT_TY]]*
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = getelementptr {{.+}}, {{.+}} [[S_ARR_EL_PAST]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_DESTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+
+// CHECK: ret void
+
+// template tmain with S_INT_TY
+// CHECK: define{{.*}} i{{[0-9]+}} [[TMAIN_INT:@.+]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
+// CHECK: call void [[OFFLOAD_FUN_1:@.+]](
+// CHECK: call {{.*}} [[S_INT_TY_DEF_DESTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: ret
+
+// CHECK: ret
+
+// CHECK: define internal void [[OFFLOAD_FUN_1]]()
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[OMP_OUTLINED_1:@.+]] to void
+// CHECK: ret
+//
+// CHECK: define internal void [[OMP_OUTLINED_1]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK-NOT: alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK-NOT: alloca [[S_INT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call{{.+}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}}[[OMP_PARFOR_OUTLINED_1:@.+]] to {{.+}},
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: ret void
+
+// CHECK: define{{.+}} void [[OMP_PARFOR_OUTLINED_1]](
+// CHECK: [[T_VAR_PRIV:%t_var+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%vec+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%s_arr+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK-NOT: alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_PRIV:%var+]] = alloca [[S_INT_TY]],
+// CHECK-NOT: alloca [[S_INT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// this is the ctor loop
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_for_static_fini(
+
+// call destructors: var..
+// CHECK-DAG: call {{.+}} [[S_INT_TY_DEF_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+
+// ..and s_arr
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_EL_PAST:%.+]] = phi [[S_INT_TY]]*
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = getelementptr {{.+}}, {{.+}} [[S_ARR_EL_PAST]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_DESTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+
+// CHECK: ret void
+
+#endif
diff --git a/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp
new file mode 100644
index 00000000000..958b4e782f1
--- /dev/null
+++ b/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp
@@ -0,0 +1,93 @@
+// add -fopenmp-targets
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+typedef __INTPTR_TYPE__ intptr_t;
+
+// CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+
+void foo();
+
+struct S {
+ intptr_t a, b, c;
+ S(intptr_t a) : a(a) {}
+ operator char() { return a; }
+ ~S() {}
+};
+
+template <typename T>
+T tmain() {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(master)
+ for(int i = 0; i < 1000; i++) {}
+ return T();
+}
+
+int main() {
+ // CHECK-LABEL: @main
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(spread)
+ for(int i = 0; i < 1000; i++) {}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(close)
+ for(int i = 0; i < 1000; i++) {}
+ return tmain<int>();
+}
+
+// CHECK: call {{.*}}@__tgt_target_teams({{.+}})
+// CHECK: call void [[OFFL1:@.+]]()
+// CHECK: call {{.*}}@__tgt_target_teams({{.+}})
+// CHECK: call void [[OFFL2:@.+]]()
+// CHECK: [[CALL_RET:%.+]] = call{{.+}} i32 [[TMAIN:@.+]]()
+// CHECK: ret i32 [[CALL_RET]]
+
+// CHECK: define{{.+}} void [[OFFL1]](
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, {{.+}}, {{.+}}* [[OMP_OUTLINED_1:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} [[OMP_OUTLINED_1]](i32* {{.+}} [[GTID_IN:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = alloca i32*,
+// CHECK: store i32* [[GTID_IN]], i32** [[GTID_ADDR]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_ADDR]],
+// CHECK: [[GTID_VAL:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.*}}void @__kmpc_push_proc_bind([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID_VAL]], i32 4)
+// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+// CHECK: ret void
+
+// CHECK: define{{.+}} [[OFFL2]]()
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, {{.+}}, {{.+}}* [[OMP_OUTLINED_1:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} [[OMP_OUTLINED_1]](i32* {{.+}} [[GTID_IN:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = alloca i32*,
+// CHECK: store i32* [[GTID_IN]], i32** [[GTID_ADDR]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_ADDR]],
+// CHECK: [[GTID_VAL:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.*}}void @__kmpc_push_proc_bind([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID_VAL]], i32 3)
+// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+// CHECK: ret void
+
+// CHECK: define{{.+}} [[TMAIN]]()
+// CHECK: call {{.*}}@__tgt_target_teams({{.+}})
+// CHECK: call void [[OFFL3:@.+]]()
+
+// CHECK: define{{.+}} [[OFFL3]]()
+// CHECK: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, {{.+}}, {{.+}}* [[OMP_OUTLINED_3:@.+]] to {{.+}})
+
+// CHECK: define{{.+}} [[OMP_OUTLINED_3]](i32* {{.+}} [[GTID_IN:%.+]],
+// CHECK: [[GTID_ADDR:%.+]] = alloca i32*,
+// CHECK: store i32* [[GTID_IN]], i32** [[GTID_ADDR]],
+// CHECK: [[GTID_REF:%.+]] = load i32*, i32** [[GTID_ADDR]],
+// CHECK: [[GTID_VAL:%.+]] = load i32, i32* [[GTID_REF]],
+// CHECK: call {{.*}}void @__kmpc_push_proc_bind([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID_VAL]], i32 2)
+// CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+// CHECK: ret void
+#endif
OpenPOWER on IntegriCloud