diff options
Diffstat (limited to 'polly/lib/External/ppcg/cuda.c')
-rw-r--r-- | polly/lib/External/ppcg/cuda.c | 704 |
1 files changed, 704 insertions, 0 deletions
diff --git a/polly/lib/External/ppcg/cuda.c b/polly/lib/External/ppcg/cuda.c new file mode 100644 index 00000000000..3063f6df6a5 --- /dev/null +++ b/polly/lib/External/ppcg/cuda.c @@ -0,0 +1,704 @@ +/* + * Copyright 2012 Ecole Normale Superieure + * + * Use of this software is governed by the MIT license + * + * Written by Sven Verdoolaege, + * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France + */ + +#include <isl/aff.h> +#include <isl/ast.h> + +#include "cuda_common.h" +#include "cuda.h" +#include "gpu.h" +#include "gpu_print.h" +#include "print.h" +#include "util.h" + +static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p) +{ + const char *macros = + "#define cudaCheckReturn(ret) \\\n" + " do { \\\n" + " cudaError_t cudaCheckReturn_e = (ret); \\\n" + " if (cudaCheckReturn_e != cudaSuccess) { \\\n" + " fprintf(stderr, \"CUDA error: %s\\n\", " + "cudaGetErrorString(cudaCheckReturn_e)); \\\n" + " fflush(stderr); \\\n" + " } \\\n" + " assert(cudaCheckReturn_e == cudaSuccess); \\\n" + " } while(0)\n" + "#define cudaCheckKernel() \\\n" + " do { \\\n" + " cudaCheckReturn(cudaGetLastError()); \\\n" + " } while(0)\n\n"; + + p = isl_printer_print_str(p, macros); + return p; +} + +/* Print a declaration for the device array corresponding to "array" on "p". + */ +static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p, + struct gpu_array_info *array) +{ + int i; + + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, array->type); + p = isl_printer_print_str(p, " "); + if (!array->linearize && array->n_index > 1) + p = isl_printer_print_str(p, "("); + p = isl_printer_print_str(p, "*dev_"); + p = isl_printer_print_str(p, array->name); + if (!array->linearize && array->n_index > 1) { + p = isl_printer_print_str(p, ")"); + for (i = 1; i < array->n_index; i++) { + p = isl_printer_print_str(p, "["); + p = isl_printer_print_pw_aff(p, array->bound[i]); + p = isl_printer_print_str(p, "]"); + } + } + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); + + return p; +} + +static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p, + struct gpu_prog *prog) +{ + int i; + + for (i = 0; i < prog->n_array; ++i) { + if (!gpu_array_requires_device_allocation(&prog->array[i])) + continue; + + p = declare_device_array(p, &prog->array[i]); + } + p = isl_printer_start_line(p); + p = isl_printer_end_line(p); + return p; +} + +static __isl_give isl_printer *allocate_device_arrays( + __isl_take isl_printer *p, struct gpu_prog *prog) +{ + int i; + + for (i = 0; i < prog->n_array; ++i) { + if (!gpu_array_requires_device_allocation(&prog->array[i])) + continue; + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, + "cudaCheckReturn(cudaMalloc((void **) &dev_"); + p = isl_printer_print_str(p, prog->array[i].name); + p = isl_printer_print_str(p, ", "); + p = gpu_array_info_print_size(p, &prog->array[i]); + p = isl_printer_print_str(p, "));"); + p = isl_printer_end_line(p); + } + p = isl_printer_start_line(p); + p = isl_printer_end_line(p); + return p; +} + +/* Print code to "p" for copying "array" from the host to the device + * in its entirety. The bounds on the extent of "array" have + * been precomputed in extract_array_info and are used in + * gpu_array_info_print_size. + */ +static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p, + struct gpu_array_info *array) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_"); + p = isl_printer_print_str(p, array->name); + p = isl_printer_print_str(p, ", "); + + if (gpu_array_is_scalar(array)) + p = isl_printer_print_str(p, "&"); + p = isl_printer_print_str(p, array->name); + p = isl_printer_print_str(p, ", "); + + p = gpu_array_info_print_size(p, array); + p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));"); + p = isl_printer_end_line(p); + + return p; +} + +/* Print code to "p" for copying "array" back from the device to the host + * in its entirety. The bounds on the extent of "array" have + * been precomputed in extract_array_info and are used in + * gpu_array_info_print_size. + */ +static __isl_give isl_printer *copy_array_from_device( + __isl_take isl_printer *p, struct gpu_array_info *array) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy("); + if (gpu_array_is_scalar(array)) + p = isl_printer_print_str(p, "&"); + p = isl_printer_print_str(p, array->name); + p = isl_printer_print_str(p, ", dev_"); + p = isl_printer_print_str(p, array->name); + p = isl_printer_print_str(p, ", "); + p = gpu_array_info_print_size(p, array); + p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));"); + p = isl_printer_end_line(p); + + return p; +} + +static void print_reverse_list(FILE *out, int len, int *list) +{ + int i; + + if (len == 0) + return; + + fprintf(out, "("); + for (i = 0; i < len; ++i) { + if (i) + fprintf(out, ", "); + fprintf(out, "%d", list[len - 1 - i]); + } + fprintf(out, ")"); +} + +/* Print the effective grid size as a list of the sizes in each + * dimension, from innermost to outermost. + */ +static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p, + struct ppcg_kernel *kernel) +{ + int i; + int dim; + + dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set); + if (dim == 0) + return p; + + p = isl_printer_print_str(p, "("); + for (i = dim - 1; i >= 0; --i) { + isl_pw_aff *bound; + + bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i); + p = isl_printer_print_pw_aff(p, bound); + isl_pw_aff_free(bound); + + if (i > 0) + p = isl_printer_print_str(p, ", "); + } + + p = isl_printer_print_str(p, ")"); + + return p; +} + +/* Print the grid definition. + */ +static __isl_give isl_printer *print_grid(__isl_take isl_printer *p, + struct ppcg_kernel *kernel) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "dim3 k"); + p = isl_printer_print_int(p, kernel->id); + p = isl_printer_print_str(p, "_dimGrid"); + p = print_grid_size(p, kernel); + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); + + return p; +} + +/* Print the arguments to a kernel declaration or call. If "types" is set, + * then print a declaration (including the types of the arguments). + * + * The arguments are printed in the following order + * - the arrays accessed by the kernel + * - the parameters + * - the host loop iterators + */ +static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p, + struct gpu_prog *prog, struct ppcg_kernel *kernel, int types) +{ + int i, n; + int first = 1; + unsigned nparam; + isl_space *space; + const char *type; + + for (i = 0; i < prog->n_array; ++i) { + int required; + + required = ppcg_kernel_requires_array_argument(kernel, i); + if (required < 0) + return isl_printer_free(p); + if (!required) + continue; + + if (!first) + p = isl_printer_print_str(p, ", "); + + if (types) + p = gpu_array_info_print_declaration_argument(p, + &prog->array[i], NULL); + else + p = gpu_array_info_print_call_argument(p, + &prog->array[i]); + + first = 0; + } + + space = isl_union_set_get_space(kernel->arrays); + nparam = isl_space_dim(space, isl_dim_param); + for (i = 0; i < nparam; ++i) { + const char *name; + + name = isl_space_get_dim_name(space, isl_dim_param, i); + + if (!first) + p = isl_printer_print_str(p, ", "); + if (types) + p = isl_printer_print_str(p, "int "); + p = isl_printer_print_str(p, name); + + first = 0; + } + isl_space_free(space); + + n = isl_space_dim(kernel->space, isl_dim_set); + type = isl_options_get_ast_iterator_type(prog->ctx); + for (i = 0; i < n; ++i) { + const char *name; + + if (!first) + p = isl_printer_print_str(p, ", "); + name = isl_space_get_dim_name(kernel->space, isl_dim_set, i); + if (types) { + p = isl_printer_print_str(p, type); + p = isl_printer_print_str(p, " "); + } + p = isl_printer_print_str(p, name); + + first = 0; + } + + return p; +} + +/* Print the header of the given kernel. + */ +static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p, + struct gpu_prog *prog, struct ppcg_kernel *kernel) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "__global__ void kernel"); + p = isl_printer_print_int(p, kernel->id); + p = isl_printer_print_str(p, "("); + p = print_kernel_arguments(p, prog, kernel, 1); + p = isl_printer_print_str(p, ")"); + + return p; +} + +/* Print the header of the given kernel to both gen->cuda.kernel_h + * and gen->cuda.kernel_c. + */ +static void print_kernel_headers(struct gpu_prog *prog, + struct ppcg_kernel *kernel, struct cuda_info *cuda) +{ + isl_printer *p; + + p = isl_printer_to_file(prog->ctx, cuda->kernel_h); + p = isl_printer_set_output_format(p, ISL_FORMAT_C); + p = print_kernel_header(p, prog, kernel); + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); + isl_printer_free(p); + + p = isl_printer_to_file(prog->ctx, cuda->kernel_c); + p = isl_printer_set_output_format(p, ISL_FORMAT_C); + p = print_kernel_header(p, prog, kernel); + p = isl_printer_end_line(p); + isl_printer_free(p); +} + +static void print_indent(FILE *dst, int indent) +{ + fprintf(dst, "%*s", indent, ""); +} + +/* Print a list of iterators of type "type" with names "ids" to "out". + * Each iterator is assigned one of the cuda identifiers in cuda_dims. + * In particular, the last iterator is assigned the x identifier + * (the first in the list of cuda identifiers). + */ +static void print_iterators(FILE *out, const char *type, + __isl_keep isl_id_list *ids, const char *cuda_dims[]) +{ + int i, n; + + n = isl_id_list_n_id(ids); + if (n <= 0) + return; + print_indent(out, 4); + fprintf(out, "%s ", type); + for (i = 0; i < n; ++i) { + isl_id *id; + + if (i) + fprintf(out, ", "); + id = isl_id_list_get_id(ids, i); + fprintf(out, "%s = %s", isl_id_get_name(id), + cuda_dims[n - 1 - i]); + isl_id_free(id); + } + fprintf(out, ";\n"); +} + +static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel) +{ + isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree); + const char *type; + const char *block_dims[] = { "blockIdx.x", "blockIdx.y" }; + const char *thread_dims[] = { "threadIdx.x", "threadIdx.y", + "threadIdx.z" }; + + type = isl_options_get_ast_iterator_type(ctx); + + print_iterators(out, type, kernel->block_ids, block_dims); + print_iterators(out, type, kernel->thread_ids, thread_dims); +} + +static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p, + struct ppcg_kernel_var *var) +{ + int j; + + p = isl_printer_start_line(p); + if (var->type == ppcg_access_shared) + p = isl_printer_print_str(p, "__shared__ "); + p = isl_printer_print_str(p, var->array->type); + p = isl_printer_print_str(p, " "); + p = isl_printer_print_str(p, var->name); + for (j = 0; j < var->array->n_index; ++j) { + isl_val *v; + + p = isl_printer_print_str(p, "["); + v = isl_vec_get_element_val(var->size, j); + p = isl_printer_print_val(p, v); + isl_val_free(v); + p = isl_printer_print_str(p, "]"); + } + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); + + return p; +} + +static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p, + struct ppcg_kernel *kernel) +{ + int i; + + for (i = 0; i < kernel->n_var; ++i) + p = print_kernel_var(p, &kernel->var[i]); + + return p; +} + +/* Print a sync statement. + */ +static __isl_give isl_printer *print_sync(__isl_take isl_printer *p, + struct ppcg_kernel_stmt *stmt) +{ + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "__syncthreads();"); + p = isl_printer_end_line(p); + + return p; +} + +/* This function is called for each user statement in the AST, + * i.e., for each kernel body statement, copy statement or sync statement. + */ +static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p, + __isl_take isl_ast_print_options *print_options, + __isl_keep isl_ast_node *node, void *user) +{ + isl_id *id; + struct ppcg_kernel_stmt *stmt; + + id = isl_ast_node_get_annotation(node); + stmt = isl_id_get_user(id); + isl_id_free(id); + + isl_ast_print_options_free(print_options); + + switch (stmt->type) { + case ppcg_kernel_copy: + return ppcg_kernel_print_copy(p, stmt); + case ppcg_kernel_sync: + return print_sync(p, stmt); + case ppcg_kernel_domain: + return ppcg_kernel_print_domain(p, stmt); + } + + return p; +} + +static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel, + struct cuda_info *cuda) +{ + isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree); + isl_ast_print_options *print_options; + isl_printer *p; + + print_kernel_headers(prog, kernel, cuda); + fprintf(cuda->kernel_c, "{\n"); + print_kernel_iterators(cuda->kernel_c, kernel); + + p = isl_printer_to_file(ctx, cuda->kernel_c); + p = isl_printer_set_output_format(p, ISL_FORMAT_C); + p = isl_printer_indent(p, 4); + + p = print_kernel_vars(p, kernel); + p = isl_printer_end_line(p); + p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p); + p = ppcg_print_macros(p, kernel->tree); + + print_options = isl_ast_print_options_alloc(ctx); + print_options = isl_ast_print_options_set_print_user(print_options, + &print_kernel_stmt, NULL); + p = isl_ast_node_print(kernel->tree, p, print_options); + isl_printer_free(p); + + fprintf(cuda->kernel_c, "}\n"); +} + +/* Print a statement for copying an array to or from the device. + * The statement identifier is called "to_device_<array name>" or + * "from_device_<array name>" and its user pointer points + * to the gpu_array_info of the array that needs to be copied. + * + * Extract the array from the identifier and call + * copy_array_to_device or copy_array_from_device. + */ +static __isl_give isl_printer *print_to_from_device(__isl_take isl_printer *p, + __isl_keep isl_ast_node *node, struct gpu_prog *prog) +{ + isl_ast_expr *expr, *arg; + isl_id *id; + const char *name; + struct gpu_array_info *array; + + expr = isl_ast_node_user_get_expr(node); + arg = isl_ast_expr_get_op_arg(expr, 0); + id = isl_ast_expr_get_id(arg); + name = isl_id_get_name(id); + array = isl_id_get_user(id); + isl_id_free(id); + isl_ast_expr_free(arg); + isl_ast_expr_free(expr); + + if (!name) + array = NULL; + if (!array) + return isl_printer_free(p); + + if (!prefixcmp(name, "to_device")) + return copy_array_to_device(p, array); + else + return copy_array_from_device(p, array); +} + +struct print_host_user_data { + struct cuda_info *cuda; + struct gpu_prog *prog; +}; + +/* Print the user statement of the host code to "p". + * + * The host code may contain original user statements, kernel launches and + * statements that copy data to/from the device. + * The original user statements and the kernel launches have + * an associated annotation, while the data copy statements do not. + * The latter are handled by print_to_from_device. + * The annotation on the user statements is called "user". + * + * In case of a kernel launch, print a block of statements that + * defines the grid and the block and then launches the kernel. + */ +static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p, + __isl_take isl_ast_print_options *print_options, + __isl_keep isl_ast_node *node, void *user) +{ + isl_id *id; + int is_user; + struct ppcg_kernel *kernel; + struct ppcg_kernel_stmt *stmt; + struct print_host_user_data *data; + + isl_ast_print_options_free(print_options); + + data = (struct print_host_user_data *) user; + + id = isl_ast_node_get_annotation(node); + if (!id) + return print_to_from_device(p, node, data->prog); + + is_user = !strcmp(isl_id_get_name(id), "user"); + kernel = is_user ? NULL : isl_id_get_user(id); + stmt = is_user ? isl_id_get_user(id) : NULL; + isl_id_free(id); + + if (is_user) + return ppcg_kernel_print_domain(p, stmt); + + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "{"); + p = isl_printer_end_line(p); + p = isl_printer_indent(p, 2); + + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "dim3 k"); + p = isl_printer_print_int(p, kernel->id); + p = isl_printer_print_str(p, "_dimBlock"); + print_reverse_list(isl_printer_get_file(p), + kernel->n_block, kernel->block_dim); + p = isl_printer_print_str(p, ";"); + p = isl_printer_end_line(p); + + p = print_grid(p, kernel); + + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "kernel"); + p = isl_printer_print_int(p, kernel->id); + p = isl_printer_print_str(p, " <<<k"); + p = isl_printer_print_int(p, kernel->id); + p = isl_printer_print_str(p, "_dimGrid, k"); + p = isl_printer_print_int(p, kernel->id); + p = isl_printer_print_str(p, "_dimBlock>>> ("); + p = print_kernel_arguments(p, data->prog, kernel, 0); + p = isl_printer_print_str(p, ");"); + p = isl_printer_end_line(p); + + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cudaCheckKernel();"); + p = isl_printer_end_line(p); + + p = isl_printer_indent(p, -2); + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "}"); + p = isl_printer_end_line(p); + + p = isl_printer_start_line(p); + p = isl_printer_end_line(p); + + print_kernel(data->prog, kernel, data->cuda); + + return p; +} + +static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p, + struct gpu_prog *prog, __isl_keep isl_ast_node *tree, + struct cuda_info *cuda) +{ + isl_ast_print_options *print_options; + isl_ctx *ctx = isl_ast_node_get_ctx(tree); + struct print_host_user_data data = { cuda, prog }; + + print_options = isl_ast_print_options_alloc(ctx); + print_options = isl_ast_print_options_set_print_user(print_options, + &print_host_user, &data); + + p = ppcg_print_macros(p, tree); + p = isl_ast_node_print(tree, p, print_options); + + return p; +} + +static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p, + struct gpu_prog *prog) +{ + int i; + + for (i = 0; i < prog->n_array; ++i) { + if (!gpu_array_requires_device_allocation(&prog->array[i])) + continue; + p = isl_printer_start_line(p); + p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_"); + p = isl_printer_print_str(p, prog->array[i].name); + p = isl_printer_print_str(p, "));"); + p = isl_printer_end_line(p); + } + + return p; +} + +/* Given a gpu_prog "prog" and the corresponding transformed AST + * "tree", print the entire CUDA code to "p". + * "types" collects the types for which a definition has already + * been printed. + */ +static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p, + struct gpu_prog *prog, __isl_keep isl_ast_node *tree, + struct gpu_types *types, void *user) +{ + struct cuda_info *cuda = user; + isl_printer *kernel; + + kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c); + kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C); + kernel = gpu_print_types(kernel, types, prog); + isl_printer_free(kernel); + + if (!kernel) + return isl_printer_free(p); + + p = ppcg_start_block(p); + + p = print_cuda_macros(p); + + p = gpu_print_local_declarations(p, prog); + p = declare_device_arrays(p, prog); + p = allocate_device_arrays(p, prog); + + p = print_host_code(p, prog, tree, cuda); + + p = free_device_arrays(p, prog); + + p = ppcg_end_block(p); + + return p; +} + +/* Transform the code in the file called "input" by replacing + * all scops by corresponding CUDA code. + * The names of the output files are derived from "input". + * + * We let generate_gpu do all the hard work and then let it call + * us back for printing the AST in print_cuda. + * + * To prepare for this printing, we first open the output files + * and we close them after generate_gpu has finished. + */ +int generate_cuda(isl_ctx *ctx, struct ppcg_options *options, + const char *input) +{ + struct cuda_info cuda; + int r; + + cuda_open_files(&cuda, input); + + r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda); + + cuda_close_files(&cuda); + + return r; +} |