//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.txt for details. // //===----------------------------------------------------------------------===// // Forward declaration as the following 2 functions are declared as friend in offload_engine.h // CLANG does not like static to been after friend declaration. static void __offload_init_library_once(void); static void __offload_fini_library(void); #include "offload_host.h" #ifdef MYO_SUPPORT #include "offload_myo_host.h" #endif #include #ifndef TARGET_WINNT #include #include #endif // TARGET_WINNT #include #include #include #include #include #include #include #include #include #if defined(HOST_WINNT) #define PATH_SEPARATOR ";" #else #define PATH_SEPARATOR ":" #endif #define GET_OFFLOAD_NUMBER(timer_data) \ timer_data? timer_data->offload_number : 0 #ifdef TARGET_WINNT // Small subset of ELF declarations for Windows which is needed to compile // this file. ELF header is used to understand what binary type is contained // in the target image - shared library or executable. 
typedef uint16_t Elf64_Half; typedef uint32_t Elf64_Word; typedef uint64_t Elf64_Addr; typedef uint64_t Elf64_Off; #define EI_NIDENT 16 #define ET_EXEC 2 #define ET_DYN 3 typedef struct { unsigned char e_ident[EI_NIDENT]; Elf64_Half e_type; Elf64_Half e_machine; Elf64_Word e_version; Elf64_Addr e_entry; Elf64_Off e_phoff; Elf64_Off e_shoff; Elf64_Word e_flags; Elf64_Half e_ehsize; Elf64_Half e_phentsize; Elf64_Half e_phnum; Elf64_Half e_shentsize; Elf64_Half e_shnum; Elf64_Half e_shstrndx; } Elf64_Ehdr; #endif // TARGET_WINNT // Host console and file logging const char *prefix; int console_enabled = 0; int offload_number = 0; static const char *htrace_envname = "H_TRACE"; static const char *offload_report_envname = "OFFLOAD_REPORT"; static char *timer_envname = "H_TIME"; // Trace information static const char* vardesc_direction_as_string[] = { "NOCOPY", "IN", "OUT", "INOUT" }; static const char* vardesc_type_as_string[] = { "unknown", "data", "data_ptr", "func_ptr", "void_ptr", "string_ptr", "dv", "dv_data", "dv_data_slice", "dv_ptr", "dv_ptr_data", "dv_ptr_data_slice", "cean_var", "cean_var_ptr", "c_data_ptr_array", "c_func_ptr_array", "c_void_ptr_array", "c_string_ptr_array" }; Engine* mic_engines = 0; uint32_t mic_engines_total = 0; pthread_key_t mic_thread_key; MicEnvVar mic_env_vars; uint64_t cpu_frequency = 0; // MIC_STACKSIZE uint32_t mic_stack_size = 12 * 1024 * 1024; // MIC_BUFFERSIZE uint64_t mic_buffer_size = 0; // MIC_LD_LIBRARY_PATH char* mic_library_path = 0; // MIC_PROXY_IO bool mic_proxy_io = true; // MIC_PROXY_FS_ROOT char* mic_proxy_fs_root = 0; // Threshold for creating buffers with large pages. Buffer is created // with large pages hint if its size exceeds the threshold value. // By default large pages are disabled right now (by setting default // value for threshold to MAX) due to HSD 4114629. 
uint64_t __offload_use_2mb_buffers = 0xffffffffffffffffULL; static const char *mic_use_2mb_buffers_envname = "MIC_USE_2MB_BUFFERS"; static uint64_t __offload_use_async_buffer_write = 2 * 1024 * 1024; static const char *mic_use_async_buffer_write_envname = "MIC_USE_ASYNC_BUFFER_WRITE"; static uint64_t __offload_use_async_buffer_read = 2 * 1024 * 1024; static const char *mic_use_async_buffer_read_envname = "MIC_USE_ASYNC_BUFFER_READ"; // device initialization type OffloadInitType __offload_init_type = c_init_on_offload_all; static const char *offload_init_envname = "OFFLOAD_INIT"; // active wait static bool __offload_active_wait = true; static const char *offload_active_wait_envname = "OFFLOAD_ACTIVE_WAIT"; // OMP_DEFAULT_DEVICE int __omp_device_num = 0; static const char *omp_device_num_envname = "OMP_DEFAULT_DEVICE"; // The list of pending target libraries static bool __target_libs; static TargetImageList __target_libs_list; static mutex_t __target_libs_lock; static mutex_t stack_alloc_lock; // Target executable TargetImage* __target_exe; static char * offload_get_src_base(void * ptr, uint8_t type) { char *base; if (VAR_TYPE_IS_PTR(type)) { base = *static_cast(ptr); } else if (VAR_TYPE_IS_SCALAR(type)) { base = static_cast(ptr); } else if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) { ArrDesc *dvp; if (VAR_TYPE_IS_DV_DATA_SLICE(type)) { const arr_desc *ap = static_cast(ptr); dvp = (type == c_dv_data_slice) ? reinterpret_cast(ap->base) : *reinterpret_cast(ap->base); } else { dvp = (type == c_dv_data) ? 
static_cast(ptr) : *static_cast(ptr); } base = reinterpret_cast(dvp->Base); } else { base = NULL; } return base; } void OffloadDescriptor::report_coi_error(error_types msg, COIRESULT res) { // special case for the 'process died' error if (res == COI_PROCESS_DIED) { m_device.fini_process(true); } else { switch (msg) { case c_buf_create: if (res == COI_OUT_OF_MEMORY) { msg = c_buf_create_out_of_mem; } /* fallthru */ case c_buf_create_from_mem: case c_buf_get_address: case c_pipeline_create: case c_pipeline_run_func: LIBOFFLOAD_ERROR(msg, m_device.get_logical_index(), res); break; case c_buf_read: case c_buf_write: case c_buf_copy: case c_buf_map: case c_buf_unmap: case c_buf_destroy: case c_buf_set_state: LIBOFFLOAD_ERROR(msg, res); break; default: break; } } exit(1); } _Offload_result OffloadDescriptor::translate_coi_error(COIRESULT res) const { switch (res) { case COI_SUCCESS: return OFFLOAD_SUCCESS; case COI_PROCESS_DIED: return OFFLOAD_PROCESS_DIED; case COI_OUT_OF_MEMORY: return OFFLOAD_OUT_OF_MEMORY; default: return OFFLOAD_ERROR; } } bool OffloadDescriptor::alloc_ptr_data( PtrData* &ptr_data, void *base, int64_t disp, int64_t size, int64_t alloc_disp, int align ) { // total length of base int64_t length = disp + size; bool is_new; OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n", base, length); // add new entry ptr_data = m_device.insert_ptr_data(base, length, is_new); if (is_new) { OFFLOAD_TRACE(3, "Added new association\n"); if (length > 0) { OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers); COIRESULT res; // align should be a power of 2 if (align > 0 && (align & (align - 1)) == 0) { // offset within mic_buffer. Can do offset optimization // only when source address alignment satisfies requested // alignment on the target (cq172736). 
if ((reinterpret_cast(base) & (align - 1)) == 0) { ptr_data->mic_offset = reinterpret_cast(base) & 4095; } } // buffer size and flags uint64_t buffer_size = length + ptr_data->mic_offset; uint32_t buffer_flags = 0; // create buffer with large pages if data length exceeds // large page threshold if (length >= __offload_use_2mb_buffers) { buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE; } // create CPU buffer OFFLOAD_DEBUG_TRACE_1(3, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_create_buf_host, "Creating buffer from source memory %p, " "length %lld\n", base, length); // result is not checked because we can continue without cpu // buffer. In this case we will use COIBufferRead/Write instead // of COIBufferCopy. COI::BufferCreateFromMemory(length, COI_BUFFER_NORMAL, 0, base, 1, &m_device.get_process(), &ptr_data->cpu_buf); OFFLOAD_DEBUG_TRACE_1(3, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_create_buf_mic, "Creating buffer for sink: size %lld, offset %d, " "flags =0x%x\n", buffer_size - alloc_disp, ptr_data->mic_offset, buffer_flags); // create MIC buffer res = COI::BufferCreate(buffer_size - alloc_disp, COI_BUFFER_NORMAL, buffer_flags, 0, 1, &m_device.get_process(), &ptr_data->mic_buf); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); } else if (m_is_mandatory) { report_coi_error(c_buf_create, res); } ptr_data->alloc_ptr_data_lock.unlock(); return false; } // make buffer valid on the device. 
res = COI::BufferSetState(ptr_data->mic_buf, m_device.get_process(), COI_BUFFER_VALID, COI_BUFFER_NO_MOVE, 0, 0, 0); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); } else if (m_is_mandatory) { report_coi_error(c_buf_set_state, res); } ptr_data->alloc_ptr_data_lock.unlock(); return false; } res = COI::BufferSetState(ptr_data->mic_buf, COI_PROCESS_SOURCE, COI_BUFFER_INVALID, COI_BUFFER_NO_MOVE, 0, 0, 0); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); } else if (m_is_mandatory) { report_coi_error(c_buf_set_state, res); } ptr_data->alloc_ptr_data_lock.unlock(); return false; } } ptr_data->alloc_disp = alloc_disp; ptr_data->alloc_ptr_data_lock.unlock(); } else { mutex_locker_t locker(ptr_data->alloc_ptr_data_lock); OFFLOAD_TRACE(3, "Found existing association: addr %p, length %lld, " "is_static %d\n", ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(), ptr_data->is_static); // This is not a new entry. Make sure that provided address range fits // into existing one. MemRange addr_range(base, length - ptr_data->alloc_disp); if (!ptr_data->cpu_addr.contains(addr_range)) { LIBOFFLOAD_ERROR(c_bad_ptr_mem_range); exit(1); } // if the entry is associated with static data it may not have buffers // created because they are created on demand. 
if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) { return false; } } return true; } bool OffloadDescriptor::find_ptr_data( PtrData* &ptr_data, void *base, int64_t disp, int64_t size, bool report_error ) { // total length of base int64_t length = disp + size; OFFLOAD_TRACE(3, "Looking for association for data: addr %p, " "length %lld\n", base, length); // find existing association in pointer table ptr_data = m_device.find_ptr_data(base); if (ptr_data == 0) { if (report_error) { LIBOFFLOAD_ERROR(c_no_ptr_data, base); exit(1); } OFFLOAD_TRACE(3, "Association does not exist\n"); return true; } OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n", ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(), ptr_data->is_static); // make sure that provided address range fits into existing one MemRange addr_range(base, length); if (!ptr_data->cpu_addr.contains(addr_range)) { if (report_error) { LIBOFFLOAD_ERROR(c_bad_ptr_mem_range); exit(1); } OFFLOAD_TRACE(3, "Existing association partially overlaps with " "data address range\n"); ptr_data = 0; return true; } // if the entry is associated with static data it may not have buffers // created because they are created on demand. 
if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) { return false; } return true; } bool OffloadDescriptor::init_static_ptr_data(PtrData *ptr_data) { OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers); if (ptr_data->cpu_buf == 0) { OFFLOAD_TRACE(3, "Creating buffer from source memory %llx\n", ptr_data->cpu_addr.start()); COIRESULT res = COI::BufferCreateFromMemory( ptr_data->cpu_addr.length(), COI_BUFFER_NORMAL, 0, const_cast(ptr_data->cpu_addr.start()), 1, &m_device.get_process(), &ptr_data->cpu_buf); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_create_from_mem, res); } } if (ptr_data->mic_buf == 0) { OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n", ptr_data->mic_addr); COIRESULT res = COI::BufferCreateFromMemory( ptr_data->cpu_addr.length(), COI_BUFFER_NORMAL, COI_SINK_MEMORY, reinterpret_cast(ptr_data->mic_addr), 1, &m_device.get_process(), &ptr_data->mic_buf); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_create_from_mem, res); } } return true; } bool OffloadDescriptor::init_mic_address(PtrData *ptr_data) { if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) { COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf, &ptr_data->mic_addr); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); } else if (m_is_mandatory) { report_coi_error(c_buf_get_address, res); } return false; } } return true; } bool OffloadDescriptor::nullify_target_stack( COIBUFFER targ_buf, uint64_t size ) { char * ptr = (char*)malloc(size); COIRESULT res; memset(ptr, 0, size); res = COI::BufferWrite( targ_buf, 0, ptr, size, COI_COPY_UNSPECIFIED, 0, 0, 0); free(ptr); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_write, res); } return true; } bool 
OffloadDescriptor::offload_stack_memory_manager( const void * stack_begin, int routine_id, int buf_size, int align, bool *is_new) { mutex_locker_t locker(stack_alloc_lock); PersistData * new_el; PersistDataList::iterator it_begin = m_device.m_persist_list.begin(); PersistDataList::iterator it_end; int erase = 0; *is_new = false; for (PersistDataList::iterator it = m_device.m_persist_list.begin(); it != m_device.m_persist_list.end(); it++) { PersistData cur_el = *it; if (stack_begin > it->stack_cpu_addr) { // this stack data must be destroyed m_destroy_stack.push_front(cur_el.stack_ptr_data); it_end = it; erase++; } else if (stack_begin == it->stack_cpu_addr) { if (routine_id != it-> routine_id) { // this stack data must be destroyed m_destroy_stack.push_front(cur_el.stack_ptr_data); it_end = it; erase++; break; } else { // stack data is reused m_stack_ptr_data = it->stack_ptr_data; if (erase > 0) { // all obsolete stack sections must be erased from the list m_device.m_persist_list.erase(it_begin, ++it_end); m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr); } OFFLOAD_TRACE(3, "Reuse of stack buffer with addr %p\n", m_stack_ptr_data->mic_addr); return true; } } else if (stack_begin < it->stack_cpu_addr) { break; } } if (erase > 0) { // all obsolete stack sections must be erased from the list m_device.m_persist_list.erase(it_begin, ++it_end); m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr); } // new stack table is created new_el = new PersistData(stack_begin, routine_id, buf_size); // create MIC buffer COIRESULT res; uint32_t buffer_flags = 0; // create buffer with large pages if data length exceeds // large page threshold if (buf_size >= __offload_use_2mb_buffers) { buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE; } res = COI::BufferCreate(buf_size, COI_BUFFER_NORMAL, buffer_flags, 0, 1, &m_device.get_process(), &new_el->stack_ptr_data->mic_buf); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); } 
else if (m_is_mandatory) { report_coi_error(c_buf_create, res); } return false; } // make buffer valid on the device. res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf, m_device.get_process(), COI_BUFFER_VALID, COI_BUFFER_NO_MOVE, 0, 0, 0); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); } else if (m_is_mandatory) { report_coi_error(c_buf_set_state, res); } return false; } res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf, COI_PROCESS_SOURCE, COI_BUFFER_INVALID, COI_BUFFER_NO_MOVE, 0, 0, 0); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); } else if (m_is_mandatory) { report_coi_error(c_buf_set_state, res); } return false; } // persistence algorithm requires target stack initialy to be nullified if (!nullify_target_stack(new_el->stack_ptr_data->mic_buf, buf_size)) { return false; } m_stack_ptr_data = new_el->stack_ptr_data; init_mic_address(m_stack_ptr_data); OFFLOAD_TRACE(3, "Allocating stack buffer with addr %p\n", m_stack_ptr_data->mic_addr); m_device.m_persist_list.push_front(*new_el); init_mic_address(new_el->stack_ptr_data); *is_new = true; return true; } bool OffloadDescriptor::setup_descriptors( VarDesc *vars, VarDesc2 *vars2, int vars_total, int entry_id, const void *stack_addr ) { COIRESULT res; OffloadTimer timer(get_timer_data(), c_offload_host_setup_buffers); // make a copy of variable descriptors m_vars_total = vars_total; if (vars_total > 0) { m_vars = (VarDesc*) malloc(m_vars_total * sizeof(VarDesc)); memcpy(m_vars, vars, m_vars_total * sizeof(VarDesc)); m_vars_extra = (VarExtra*) malloc(m_vars_total * sizeof(VarExtra)); } // dependencies m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * (m_vars_total + 1)); if (m_vars_total > 0) { m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_vars_total); } // copyin/copyout data length m_in_datalen = 0; m_out_datalen = 0; // First pass over variable descriptors // - Calculate size of the input and output 
non-pointer data // - Allocate buffers for input and output pointers for (int i = 0; i < m_vars_total; i++) { void* alloc_base = NULL; int64_t alloc_disp = 0; int64_t alloc_size; bool src_is_for_mic = (m_vars[i].direction.out || m_vars[i].into == NULL); const char *var_sname = ""; if (vars2 != NULL && i < vars_total) { if (vars2[i].sname != NULL) { var_sname = vars2[i].sname; } } OFFLOAD_TRACE(2, " VarDesc %d, var=%s, %s, %s\n", i, var_sname, vardesc_direction_as_string[m_vars[i].direction.bits], vardesc_type_as_string[m_vars[i].type.src]); if (vars2 != NULL && i < vars_total && vars2[i].dname != NULL) { OFFLOAD_TRACE(2, " into=%s, %s\n", vars2[i].dname, vardesc_type_as_string[m_vars[i].type.dst]); } OFFLOAD_TRACE(2, " type_src=%d, type_dstn=%d, direction=%d, " "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, " "offset=%lld, size=%lld, count/disp=%lld, ptr=%p, into=%p\n", m_vars[i].type.src, m_vars[i].type.dst, m_vars[i].direction.bits, m_vars[i].alloc_if, m_vars[i].free_if, m_vars[i].align, m_vars[i].mic_offset, m_vars[i].flags.bits, m_vars[i].offset, m_vars[i].size, m_vars[i].count, m_vars[i].ptr, m_vars[i].into); if (m_vars[i].alloc != NULL) { // array descriptor const arr_desc *ap = static_cast(m_vars[i].alloc); // debug dump __arr_desc_dump(" ", "ALLOC", ap, 0); __arr_data_offset_and_length(ap, alloc_disp, alloc_size); alloc_base = reinterpret_cast(ap->base); } m_vars_extra[i].cpu_disp = 0; m_vars_extra[i].cpu_offset = 0; m_vars_extra[i].src_data = 0; m_vars_extra[i].read_rng_src = 0; m_vars_extra[i].read_rng_dst = 0; // flag is_arr_ptr_el is 1 only for var_descs generated // for c_data_ptr_array type if (i < vars_total) { m_vars_extra[i].is_arr_ptr_el = 0; } switch (m_vars[i].type.src) { case c_data_ptr_array: { const arr_desc *ap; const VarDesc3 *vd3 = static_cast(m_vars[i].ptr); int flags = vd3->array_fields; OFFLOAD_TRACE(2, " pointer array flags = %04x\n", flags); OFFLOAD_TRACE(2, " pointer array type is %s\n", vardesc_type_as_string[flags & 
0x3f]); ap = static_cast(vd3->ptr_array); __arr_desc_dump(" ", "ptr array", ap, 0); if (m_vars[i].into) { ap = static_cast(m_vars[i].into); __arr_desc_dump( " ", "into array", ap, 0); } if ((flags & (1<(vd3->align_array); __arr_desc_dump( " ", "align array", ap, 0); } if ((flags & (1<(vd3->alloc_if_array); __arr_desc_dump( " ", "alloc_if array", ap, 0); } if ((flags & (1<(vd3->free_if_array); __arr_desc_dump( " ", "free_if array", ap, 0); } if ((flags & (1<(vd3->extent_start); __arr_desc_dump( " ", "extent_start array", ap, 0); } else if ((flags & (1<extent_start); } if ((flags & (1< (vd3->extent_elements); __arr_desc_dump( " ", "extent_elements array", ap, 0); } else if ((flags & (1<extent_elements); } if ((flags & (1<(vd3->into_start); __arr_desc_dump( " ", "into_start array", ap, 0); } else if ((flags & (1<into_start); } if ((flags & (1<(vd3->into_elements); __arr_desc_dump( " ", "into_elements array", ap, 0); } else if ((flags & (1<into_elements); } if ((flags & (1<(vd3->alloc_start); __arr_desc_dump( " ", "alloc_start array", ap, 0); } else if ((flags & (1<alloc_start); } if ((flags & (1<(vd3->alloc_elements); __arr_desc_dump( " ", "alloc_elements array", ap, 0); } else if ((flags & (1<alloc_elements); } } if (!gen_var_descs_for_pointer_array(i)) { return false; } break; case c_data: case c_void_ptr: case c_cean_var: // In all uses later // VarDesc.size will have the length of the data to be // transferred // VarDesc.disp will have an offset from base if (m_vars[i].type.src == c_cean_var) { // array descriptor const arr_desc *ap = static_cast(m_vars[i].ptr); // debug dump __arr_desc_dump("", "IN/OUT", ap, 0); // offset and length are derived from the array descriptor __arr_data_offset_and_length(ap, m_vars[i].disp, m_vars[i].size); if (!is_arr_desc_contiguous(ap)) { m_vars[i].flags.is_noncont_src = 1; m_vars_extra[i].read_rng_src = init_read_ranges_arr_desc(ap); } // all necessary information about length and offset is // transferred in var descriptor. 
There is no need to send // array descriptor to the target side. m_vars[i].ptr = reinterpret_cast(ap->base); } else { m_vars[i].size *= m_vars[i].count; m_vars[i].disp = 0; } if (m_vars[i].direction.bits) { // make sure that transfer size > 0 if (m_vars[i].size <= 0) { LIBOFFLOAD_ERROR(c_zero_or_neg_transfer_size); exit(1); } if (m_vars[i].flags.is_static) { PtrData *ptr_data; // find data associated with variable if (!find_ptr_data(ptr_data, m_vars[i].ptr, m_vars[i].disp, m_vars[i].size, false)) { return false; } if (ptr_data != 0) { // offset to base from the beginning of the buffer // memory m_vars[i].offset = (char*) m_vars[i].ptr - (char*) ptr_data->cpu_addr.start(); } else { m_vars[i].flags.is_static = false; if (m_vars[i].into == NULL) { m_vars[i].flags.is_static_dstn = false; } } m_vars_extra[i].src_data = ptr_data; } if (m_is_openmp) { if (m_vars[i].flags.is_static) { // Static data is transferred only by omp target // update construct which passes zeros for // alloc_if and free_if. if (m_vars[i].alloc_if || m_vars[i].free_if) { m_vars[i].direction.bits = c_parameter_nocopy; } } else { AutoData *auto_data; if (m_vars[i].alloc_if) { auto_data = m_device.insert_auto_data( m_vars[i].ptr, m_vars[i].size); auto_data->add_reference(); } else { // TODO: what should be done if var is not in // the table? 
auto_data = m_device.find_auto_data( m_vars[i].ptr); } // For automatic variables data is transferred // only if alloc_if == 0 && free_if == 0 // or reference count is 1 if ((m_vars[i].alloc_if || m_vars[i].free_if) && auto_data != 0 && auto_data->get_reference() != 1) { m_vars[i].direction.bits = c_parameter_nocopy; } // save data for later use m_vars_extra[i].auto_data = auto_data; } } if (m_vars[i].direction.in && !m_vars[i].flags.is_static) { m_in_datalen += m_vars[i].size; // for non-static target destination defined as CEAN // expression we pass to target its size and dist if (m_vars[i].into == NULL && m_vars[i].type.src == c_cean_var) { m_in_datalen += 2 * sizeof(uint64_t); } m_need_runfunction = true; } if (m_vars[i].direction.out && !m_vars[i].flags.is_static) { m_out_datalen += m_vars[i].size; m_need_runfunction = true; } } break; case c_dv: if (m_vars[i].direction.bits || m_vars[i].alloc_if || m_vars[i].free_if) { ArrDesc *dvp = static_cast(m_vars[i].ptr); // debug dump __dv_desc_dump("IN/OUT", dvp); // send dope vector contents excluding base m_in_datalen += m_vars[i].size - sizeof(uint64_t); m_need_runfunction = true; } break; case c_string_ptr: if ((m_vars[i].direction.bits || m_vars[i].alloc_if || m_vars[i].free_if) && m_vars[i].size == 0) { m_vars[i].size = 1; m_vars[i].count = strlen(*static_cast(m_vars[i].ptr)) + 1; } /* fallthru */ case c_data_ptr: if (m_vars[i].flags.is_stack_buf && !m_vars[i].direction.bits && m_vars[i].alloc_if) { // this var_desc is for stack buffer bool is_new; if (!offload_stack_memory_manager( stack_addr, entry_id, m_vars[i].count, m_vars[i].align, &is_new)) { return false; } if (is_new) { m_compute_buffers.push_back( m_stack_ptr_data->mic_buf); m_device.m_persist_list.front().cpu_stack_addr = static_cast(m_vars[i].ptr); } else { m_vars[i].flags.sink_addr = 1; m_in_datalen += sizeof(m_stack_ptr_data->mic_addr); } m_vars[i].size = m_destroy_stack.size(); m_vars_extra[i].src_data = m_stack_ptr_data; // need to add reference 
for buffer m_need_runfunction = true; break; } /* fallthru */ case c_cean_var_ptr: case c_dv_ptr: if (m_vars[i].type.src == c_cean_var_ptr) { // array descriptor const arr_desc *ap = static_cast(m_vars[i].ptr); // debug dump __arr_desc_dump("", "IN/OUT", ap, 1); // offset and length are derived from the array descriptor __arr_data_offset_and_length(ap, m_vars[i].disp, m_vars[i].size); if (!is_arr_desc_contiguous(ap)) { m_vars[i].flags.is_noncont_src = 1; m_vars_extra[i].read_rng_src = init_read_ranges_arr_desc(ap); } // all necessary information about length and offset is // transferred in var descriptor. There is no need to send // array descriptor to the target side. m_vars[i].ptr = reinterpret_cast(ap->base); } else if (m_vars[i].type.src == c_dv_ptr) { // need to send DV to the device unless it is 'nocopy' if (m_vars[i].direction.bits || m_vars[i].alloc_if || m_vars[i].free_if) { ArrDesc *dvp = *static_cast(m_vars[i].ptr); // debug dump __dv_desc_dump("IN/OUT", dvp); m_vars[i].direction.bits = c_parameter_in; } // no displacement m_vars[i].disp = 0; } else { // c_data_ptr or c_string_ptr m_vars[i].size *= m_vars[i].count; m_vars[i].disp = 0; } if (m_vars[i].direction.bits || m_vars[i].alloc_if || m_vars[i].free_if) { PtrData *ptr_data; // check that buffer length >= 0 if (m_vars[i].alloc_if && m_vars[i].disp + m_vars[i].size < 0) { LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len); exit(1); } // base address void *base = *static_cast(m_vars[i].ptr); // allocate buffer if we have no INTO and don't need // allocation for the ptr at target if (src_is_for_mic) { if (m_vars[i].flags.is_stack_buf) { // for stack persistent objects ptr data is created // by var_desc with number 0. // Its ptr_data is stored at m_stack_ptr_data ptr_data = m_stack_ptr_data; m_vars[i].flags.sink_addr = 1; } else if (m_vars[i].alloc_if) { // add new entry if (!alloc_ptr_data( ptr_data, base, (alloc_base != NULL) ? alloc_disp : m_vars[i].disp, (alloc_base != NULL) ? 
alloc_size : m_vars[i].size, alloc_disp, (alloc_base != NULL) ? 0 : m_vars[i].align)) { return false; } if (ptr_data->add_reference() == 0 && ptr_data->mic_buf != 0) { // add buffer to the list of buffers that // are passed to dispatch call m_compute_buffers.push_back( ptr_data->mic_buf); } else { // will send buffer address to device m_vars[i].flags.sink_addr = 1; } if (!ptr_data->is_static) { // need to add reference for buffer m_need_runfunction = true; } } else { bool error_if_not_found = true; if (m_is_openmp) { // For omp target update variable is ignored // if it does not exist. if (!m_vars[i].alloc_if && !m_vars[i].free_if) { error_if_not_found = false; } } // use existing association from pointer table if (!find_ptr_data(ptr_data, base, m_vars[i].disp, m_vars[i].size, error_if_not_found)) { return false; } if (m_is_openmp) { // make var nocopy if it does not exist if (ptr_data == 0) { m_vars[i].direction.bits = c_parameter_nocopy; } } if (ptr_data != 0) { m_vars[i].flags.sink_addr = 1; } } if (ptr_data != 0) { if (m_is_openmp) { // data is transferred only if // alloc_if == 0 && free_if == 0 // or reference count is 1 if ((m_vars[i].alloc_if || m_vars[i].free_if) && ptr_data->get_reference() != 1) { m_vars[i].direction.bits = c_parameter_nocopy; } } if (ptr_data->alloc_disp != 0) { m_vars[i].flags.alloc_disp = 1; m_in_datalen += sizeof(alloc_disp); } if (m_vars[i].flags.sink_addr) { // get buffers's address on the sink if (!init_mic_address(ptr_data)) { return false; } m_in_datalen += sizeof(ptr_data->mic_addr); } if (!ptr_data->is_static && m_vars[i].free_if) { // need to decrement buffer reference on target m_need_runfunction = true; } // offset to base from the beginning of the buffer // memory m_vars[i].offset = (char*) base - (char*) ptr_data->cpu_addr.start(); // copy other pointer properties to var descriptor m_vars[i].mic_offset = ptr_data->mic_offset; m_vars[i].flags.is_static = ptr_data->is_static; } } else { if (!find_ptr_data(ptr_data, base, 
m_vars[i].disp, m_vars[i].size, false)) { return false; } if (ptr_data) { m_vars[i].offset = (char*) base - (char*) ptr_data->cpu_addr.start(); } } // save pointer data m_vars_extra[i].src_data = ptr_data; } break; case c_func_ptr: if (m_vars[i].direction.in) { m_in_datalen += __offload_funcs.max_name_length(); } if (m_vars[i].direction.out) { m_out_datalen += __offload_funcs.max_name_length(); } m_need_runfunction = true; break; case c_dv_data: case c_dv_ptr_data: case c_dv_data_slice: case c_dv_ptr_data_slice: ArrDesc *dvp; if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) { const arr_desc *ap; ap = static_cast(m_vars[i].ptr); dvp = (m_vars[i].type.src == c_dv_data_slice) ? reinterpret_cast(ap->base) : *reinterpret_cast(ap->base); } else { dvp = (m_vars[i].type.src == c_dv_data) ? static_cast(m_vars[i].ptr) : *static_cast(m_vars[i].ptr); } // if allocatable dope vector isn't allocated don't // transfer its data if (!__dv_is_allocated(dvp)) { m_vars[i].direction.bits = c_parameter_nocopy; m_vars[i].alloc_if = 0; m_vars[i].free_if = 0; } if (m_vars[i].direction.bits || m_vars[i].alloc_if || m_vars[i].free_if) { const arr_desc *ap; if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) { ap = static_cast(m_vars[i].ptr); // debug dump __arr_desc_dump("", "IN/OUT", ap, 0); } if (!__dv_is_contiguous(dvp)) { m_vars[i].flags.is_noncont_src = 1; m_vars_extra[i].read_rng_src = init_read_ranges_dv(dvp); } // size and displacement if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) { // offset and length are derived from the // array descriptor __arr_data_offset_and_length(ap, m_vars[i].disp, m_vars[i].size); if (m_vars[i].direction.bits) { if (!is_arr_desc_contiguous(ap)) { if (m_vars[i].flags.is_noncont_src) { LIBOFFLOAD_ERROR(c_slice_of_noncont_array); return false; } m_vars[i].flags.is_noncont_src = 1; m_vars_extra[i].read_rng_src = init_read_ranges_arr_desc(ap); } } } else { if (m_vars[i].flags.has_length) { m_vars[i].size = __dv_data_length(dvp, m_vars[i].count); } else { 
m_vars[i].size = __dv_data_length(dvp); } m_vars[i].disp = 0; } // check that length >= 0 if (m_vars[i].alloc_if && (m_vars[i].disp + m_vars[i].size < 0)) { LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len); exit(1); } // base address void *base = reinterpret_cast(dvp->Base); PtrData *ptr_data; // allocate buffer if we have no INTO and don't need // allocation for the ptr at target if (src_is_for_mic) { if (m_vars[i].alloc_if) { // add new entry if (!alloc_ptr_data( ptr_data, base, (alloc_base != NULL) ? alloc_disp : m_vars[i].disp, (alloc_base != NULL) ? alloc_size : m_vars[i].size, alloc_disp, (alloc_base != NULL) ? 0 : m_vars[i].align)) { return false; } if (ptr_data->add_reference() == 0 && ptr_data->mic_buf != 0) { // add buffer to the list of buffers // that are passed to dispatch call m_compute_buffers.push_back( ptr_data->mic_buf); } else { // will send buffer address to device m_vars[i].flags.sink_addr = 1; } if (!ptr_data->is_static) { // need to add reference for buffer m_need_runfunction = true; } } else { bool error_if_not_found = true; if (m_is_openmp) { // For omp target update variable is ignored // if it does not exist. 
if (!m_vars[i].alloc_if && !m_vars[i].free_if) { error_if_not_found = false; } } // use existing association from pointer table if (!find_ptr_data(ptr_data, base, m_vars[i].disp, m_vars[i].size, error_if_not_found)) { return false; } if (m_is_openmp) { // make var nocopy if it does not exist if (ptr_data == 0) { m_vars[i].direction.bits = c_parameter_nocopy; } } if (ptr_data != 0) { // need to update base in dope vector on device m_vars[i].flags.sink_addr = 1; } } if (ptr_data != 0) { if (m_is_openmp) { // data is transferred only if // alloc_if == 0 && free_if == 0 // or reference count is 1 if ((m_vars[i].alloc_if || m_vars[i].free_if) && ptr_data->get_reference() != 1) { m_vars[i].direction.bits = c_parameter_nocopy; } } if (ptr_data->alloc_disp != 0) { m_vars[i].flags.alloc_disp = 1; m_in_datalen += sizeof(alloc_disp); } if (m_vars[i].flags.sink_addr) { // get buffers's address on the sink if (!init_mic_address(ptr_data)) { return false; } m_in_datalen += sizeof(ptr_data->mic_addr); } if (!ptr_data->is_static && m_vars[i].free_if) { // need to decrement buffer reference on target m_need_runfunction = true; } // offset to base from the beginning of the buffer // memory m_vars[i].offset = (char*) base - (char*) ptr_data->cpu_addr.start(); // copy other pointer properties to var descriptor m_vars[i].mic_offset = ptr_data->mic_offset; m_vars[i].flags.is_static = ptr_data->is_static; } } else { // !src_is_for_mic if (!find_ptr_data(ptr_data, base, m_vars[i].disp, m_vars[i].size, false)) { return false; } m_vars[i].offset = !ptr_data ? 
0 : (char*) base - (char*) ptr_data->cpu_addr.start(); } // save pointer data m_vars_extra[i].src_data = ptr_data; } break; default: LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.src); LIBOFFLOAD_ABORT; } if (m_vars[i].type.src == c_data_ptr_array) { continue; } if (src_is_for_mic && m_vars[i].flags.is_stack_buf) { m_vars[i].offset = static_cast(m_vars[i].ptr) - m_device.m_persist_list.front().cpu_stack_addr; } // if source is used at CPU save its offset and disp if (m_vars[i].into == NULL || m_vars[i].direction.in) { m_vars_extra[i].cpu_offset = m_vars[i].offset; m_vars_extra[i].cpu_disp = m_vars[i].disp; } // If "into" is define we need to do the similar work for it if (!m_vars[i].into) { continue; } int64_t into_disp =0, into_offset = 0; switch (m_vars[i].type.dst) { case c_data_ptr_array: break; case c_data: case c_void_ptr: case c_cean_var: { int64_t size = m_vars[i].size; if (m_vars[i].type.dst == c_cean_var) { // array descriptor const arr_desc *ap = static_cast(m_vars[i].into); // debug dump __arr_desc_dump(" ", "INTO", ap, 0); // offset and length are derived from the array descriptor __arr_data_offset_and_length(ap, into_disp, size); if (!is_arr_desc_contiguous(ap)) { m_vars[i].flags.is_noncont_dst = 1; m_vars_extra[i].read_rng_dst = init_read_ranges_arr_desc(ap); if (!cean_ranges_match( m_vars_extra[i].read_rng_src, m_vars_extra[i].read_rng_dst)) { LIBOFFLOAD_ERROR(c_ranges_dont_match); exit(1); } } m_vars[i].into = reinterpret_cast(ap->base); } int64_t size_src = m_vars_extra[i].read_rng_src ? cean_get_transf_size(m_vars_extra[i].read_rng_src) : m_vars[i].size; int64_t size_dst = m_vars_extra[i].read_rng_dst ? 
cean_get_transf_size(m_vars_extra[i].read_rng_dst) : size; // It's supposed that "into" size must be not less // than src size if (size_src > size_dst) { LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes, size_src, size_dst); exit(1); } if (m_vars[i].direction.bits) { if (m_vars[i].flags.is_static_dstn) { PtrData *ptr_data; // find data associated with variable if (!find_ptr_data(ptr_data, m_vars[i].into, into_disp, size, false)) { return false; } if (ptr_data != 0) { // offset to base from the beginning of the buffer // memory into_offset = (char*) m_vars[i].into - (char*) ptr_data->cpu_addr.start(); } else { m_vars[i].flags.is_static_dstn = false; } m_vars_extra[i].dst_data = ptr_data; } } if (m_vars[i].direction.in && !m_vars[i].flags.is_static_dstn) { m_in_datalen += m_vars[i].size; // for non-static target destination defined as CEAN // expression we pass to target its size and dist if (m_vars[i].type.dst == c_cean_var) { m_in_datalen += 2 * sizeof(uint64_t); } m_need_runfunction = true; } break; } case c_dv: if (m_vars[i].direction.bits || m_vars[i].alloc_if || m_vars[i].free_if) { ArrDesc *dvp = static_cast(m_vars[i].into); // debug dump __dv_desc_dump("INTO", dvp); // send dope vector contents excluding base m_in_datalen += m_vars[i].size - sizeof(uint64_t); m_need_runfunction = true; } break; case c_string_ptr: case c_data_ptr: case c_cean_var_ptr: case c_dv_ptr: { int64_t size = m_vars[i].size; if (m_vars[i].type.dst == c_cean_var_ptr) { // array descriptor const arr_desc *ap = static_cast(m_vars[i].into); // debug dump __arr_desc_dump(" ", "INTO", ap, 1); // offset and length are derived from the array descriptor __arr_data_offset_and_length(ap, into_disp, size); if (!is_arr_desc_contiguous(ap)) { m_vars[i].flags.is_noncont_src = 1; m_vars_extra[i].read_rng_dst = init_read_ranges_arr_desc(ap); if (!cean_ranges_match( m_vars_extra[i].read_rng_src, m_vars_extra[i].read_rng_dst)) { LIBOFFLOAD_ERROR(c_ranges_dont_match); } } m_vars[i].into = 
reinterpret_cast(ap->base); } else if (m_vars[i].type.dst == c_dv_ptr) { // need to send DV to the device unless it is 'nocopy' if (m_vars[i].direction.bits || m_vars[i].alloc_if || m_vars[i].free_if) { ArrDesc *dvp = *static_cast(m_vars[i].into); // debug dump __dv_desc_dump("INTO", dvp); m_vars[i].direction.bits = c_parameter_in; } } int64_t size_src = m_vars_extra[i].read_rng_src ? cean_get_transf_size(m_vars_extra[i].read_rng_src) : m_vars[i].size; int64_t size_dst = m_vars_extra[i].read_rng_dst ? cean_get_transf_size(m_vars_extra[i].read_rng_dst) : size; // It's supposed that "into" size must be not less than // src size if (size_src > size_dst) { LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes, size_src, size_dst); exit(1); } if (m_vars[i].direction.bits) { PtrData *ptr_data; // base address void *base = *static_cast(m_vars[i].into); if (m_vars[i].direction.in) { // allocate buffer if (m_vars[i].flags.is_stack_buf) { // for stack persistent objects ptr data is created // by var_desc with number 0. // Its ptr_data is stored at m_stack_ptr_data ptr_data = m_stack_ptr_data; m_vars[i].flags.sink_addr = 1; } else if (m_vars[i].alloc_if) { // add new entry if (!alloc_ptr_data( ptr_data, base, (alloc_base != NULL) ? alloc_disp : into_disp, (alloc_base != NULL) ? alloc_size : size, alloc_disp, (alloc_base != NULL) ? 
0 : m_vars[i].align)) { return false; } if (ptr_data->add_reference() == 0 && ptr_data->mic_buf != 0) { // add buffer to the list of buffers that // are passed to dispatch call m_compute_buffers.push_back( ptr_data->mic_buf); } else { // will send buffer address to device m_vars[i].flags.sink_addr = 1; } if (!ptr_data->is_static) { // need to add reference for buffer m_need_runfunction = true; } } else { // use existing association from pointer table if (!find_ptr_data(ptr_data, base, into_disp, size)) { return false; } m_vars[i].flags.sink_addr = 1; } if (ptr_data->alloc_disp != 0) { m_vars[i].flags.alloc_disp = 1; m_in_datalen += sizeof(alloc_disp); } if (m_vars[i].flags.sink_addr) { // get buffers's address on the sink if (!init_mic_address(ptr_data)) { return false; } m_in_datalen += sizeof(ptr_data->mic_addr); } if (!ptr_data->is_static && m_vars[i].free_if) { // need to decrement buffer reference on target m_need_runfunction = true; } // copy other pointer properties to var descriptor m_vars[i].mic_offset = ptr_data->mic_offset; m_vars[i].flags.is_static_dstn = ptr_data->is_static; } else { if (!find_ptr_data(ptr_data, base, into_disp, m_vars[i].size, false)) { return false; } } if (ptr_data) { into_offset = ptr_data ? (char*) base - (char*) ptr_data->cpu_addr.start() : 0; } // save pointer data m_vars_extra[i].dst_data = ptr_data; } break; } case c_func_ptr: break; case c_dv_data: case c_dv_ptr_data: case c_dv_data_slice: case c_dv_ptr_data_slice: if (m_vars[i].direction.bits || m_vars[i].alloc_if || m_vars[i].free_if) { const arr_desc *ap; ArrDesc *dvp; PtrData *ptr_data; int64_t disp; int64_t size; if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) { ap = static_cast(m_vars[i].into); // debug dump __arr_desc_dump(" ", "INTO", ap, 0); dvp = (m_vars[i].type.dst == c_dv_data_slice) ? reinterpret_cast(ap->base) : *reinterpret_cast(ap->base); } else { dvp = (m_vars[i].type.dst == c_dv_data) ? 
static_cast(m_vars[i].into) : *static_cast(m_vars[i].into); } if (!__dv_is_contiguous(dvp)) { m_vars[i].flags.is_noncont_dst = 1; m_vars_extra[i].read_rng_dst = init_read_ranges_dv(dvp); } // size and displacement if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) { // offset and length are derived from the array // descriptor __arr_data_offset_and_length(ap, into_disp, size); if (m_vars[i].direction.bits) { if (!is_arr_desc_contiguous(ap)) { if (m_vars[i].flags.is_noncont_dst) { LIBOFFLOAD_ERROR(c_slice_of_noncont_array); return false; } m_vars[i].flags.is_noncont_dst = 1; m_vars_extra[i].read_rng_dst = init_read_ranges_arr_desc(ap); if (!cean_ranges_match( m_vars_extra[i].read_rng_src, m_vars_extra[i].read_rng_dst)) { LIBOFFLOAD_ERROR(c_ranges_dont_match); } } } } else { if (m_vars[i].flags.has_length) { size = __dv_data_length(dvp, m_vars[i].count); } else { size = __dv_data_length(dvp); } disp = 0; } int64_t size_src = m_vars_extra[i].read_rng_src ? cean_get_transf_size(m_vars_extra[i].read_rng_src) : m_vars[i].size; int64_t size_dst = m_vars_extra[i].read_rng_dst ? cean_get_transf_size(m_vars_extra[i].read_rng_dst) : size; // It's supposed that "into" size must be not less // than src size if (size_src > size_dst) { LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes, size_src, size_dst); exit(1); } // base address void *base = reinterpret_cast(dvp->Base); // allocate buffer if (m_vars[i].direction.in) { if (m_vars[i].alloc_if) { // add new entry if (!alloc_ptr_data( ptr_data, base, (alloc_base != NULL) ? alloc_disp : into_disp, (alloc_base != NULL) ? alloc_size : size, alloc_disp, (alloc_base != NULL) ? 
0 : m_vars[i].align)) { return false; } if (ptr_data->add_reference() == 0 && ptr_data->mic_buf !=0) { // add buffer to the list of buffers // that are passed to dispatch call m_compute_buffers.push_back( ptr_data->mic_buf); } else { // will send buffer address to device m_vars[i].flags.sink_addr = 1; } if (!ptr_data->is_static) { // need to add reference for buffer m_need_runfunction = true; } } else { // use existing association from pointer table if (!find_ptr_data(ptr_data, base, into_disp, size)) { return false; } // need to update base in dope vector on device m_vars[i].flags.sink_addr = 1; } if (ptr_data->alloc_disp != 0) { m_vars[i].flags.alloc_disp = 1; m_in_datalen += sizeof(alloc_disp); } if (m_vars[i].flags.sink_addr) { // get buffers's address on the sink if (!init_mic_address(ptr_data)) { return false; } m_in_datalen += sizeof(ptr_data->mic_addr); } if (!ptr_data->is_static && m_vars[i].free_if) { // need to decrement buffer reference on target m_need_runfunction = true; } // offset to base from the beginning of the buffer // memory into_offset = (char*) base - (char*) ptr_data->cpu_addr.start(); // copy other pointer properties to var descriptor m_vars[i].mic_offset = ptr_data->mic_offset; m_vars[i].flags.is_static_dstn = ptr_data->is_static; } else { // src_is_for_mic if (!find_ptr_data(ptr_data, base, into_disp, size, false)) { return false; } into_offset = !ptr_data ? 
0 : (char*) base - (char*) ptr_data->cpu_addr.start(); } // save pointer data m_vars_extra[i].dst_data = ptr_data; } break; default: LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.src); LIBOFFLOAD_ABORT; } // if into is used at CPU save its offset and disp if (m_vars[i].direction.out) { m_vars_extra[i].cpu_offset = into_offset; m_vars_extra[i].cpu_disp = into_disp; } else { if (m_vars[i].flags.is_stack_buf) { into_offset = static_cast(m_vars[i].into) - m_device.m_persist_list.front().cpu_stack_addr; } m_vars[i].offset = into_offset; m_vars[i].disp = into_disp; } } return true; } bool OffloadDescriptor::setup_misc_data(const char *name) { OffloadTimer timer(get_timer_data(), c_offload_host_setup_misc_data); // we can skip run functon call together with wait if offloaded // region is empty and there is no user defined non-pointer IN/OUT data if (m_need_runfunction) { // variable descriptors are sent as input data m_in_datalen += m_vars_total * sizeof(VarDesc); // timer data is sent as a part of the output data m_out_datalen += OFFLOAD_TIMER_DATALEN(); // max from input data and output data length uint64_t data_len = m_in_datalen > m_out_datalen ? m_in_datalen : m_out_datalen; // Misc data has the following layout // // // (optional) // // We can transfer copyin/copyout data in misc/return data which can // be passed to run function call if its size does not exceed // COI_PIPELINE_MAX_IN_MISC_DATA_LEN. Otherwise we have to allocate // buffer for it. 
m_func_desc_size = sizeof(FunctionDescriptor) + strlen(name) + 1; m_func_desc_size = (m_func_desc_size + 7) & ~7; int misc_data_offset = 0; int misc_data_size = 0; if (data_len > 0) { if (m_func_desc_size + m_in_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN && m_out_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN) { // use misc/return data for copyin/copyout misc_data_offset = m_func_desc_size; misc_data_size = data_len; } else { OffloadTimer timer_buf(get_timer_data(), c_offload_host_alloc_data_buffer); // send/receive data using buffer COIRESULT res = COI::BufferCreate(data_len, COI_BUFFER_NORMAL, 0, 0, 1, &m_device.get_process(), &m_inout_buf); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_create, res); } m_compute_buffers.push_back(m_inout_buf); m_destroy_buffers.push_back(m_inout_buf); } } // initialize function descriptor m_func_desc = (FunctionDescriptor*) malloc(m_func_desc_size + misc_data_size); m_func_desc->console_enabled = console_enabled; m_func_desc->timer_enabled = timer_enabled || (offload_report_level && offload_report_enabled); m_func_desc->offload_report_level = offload_report_level; m_func_desc->offload_number = GET_OFFLOAD_NUMBER(get_timer_data()); m_func_desc->in_datalen = m_in_datalen; m_func_desc->out_datalen = m_out_datalen; m_func_desc->vars_num = m_vars_total; m_func_desc->data_offset = misc_data_offset; // append entry name strcpy(m_func_desc->data, name); } return true; } bool OffloadDescriptor::wait_dependencies( const void **waits, int num_waits ) { OffloadTimer timer(get_timer_data(), c_offload_host_wait_deps); bool ret = true; for (int i = 0; i < num_waits; i++) { OffloadDescriptor *task = m_device.find_signal(waits[i], true); if (task == 0) { LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(), waits[i]); LIBOFFLOAD_ABORT; } if (!task->offload_finish()) { ret = false; } task->cleanup(); delete task; } return ret; } bool 
OffloadDescriptor::offload( const char *name, bool is_empty, VarDesc *vars, VarDesc2 *vars2, int vars_total, const void **waits, int num_waits, const void **signal, int entry_id, const void *stack_addr ) { if (signal == 0) { OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_init_func, "Offload function %s, is_empty=%d, #varDescs=%d, " "#waits=%d, signal=none\n", name, is_empty, vars_total, num_waits); OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_sent_pointer_data, "#Wait : %d \n", num_waits); OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_signal, "none %d\n", 0); } else { OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_init_func, "Offload function %s, is_empty=%d, #varDescs=%d, " "#waits=%d, signal=%p\n", name, is_empty, vars_total, num_waits, *signal); OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_signal, "%d\n", signal); } OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_wait, "#Wait : %d %p\n", num_waits, waits); if (m_status != 0) { m_status->result = OFFLOAD_SUCCESS; m_status->device_number = m_device.get_logical_index(); } m_need_runfunction = !is_empty; // wait for dependencies to finish if (!wait_dependencies(waits, num_waits)) { cleanup(); return false; } // setup buffers if (!setup_descriptors(vars, vars2, vars_total, entry_id, stack_addr)) { cleanup(); return false; } // initiate send for pointers. Want to do it as early as possible. if (!send_pointer_data(signal != 0)) { cleanup(); return false; } // setup misc data for run function if (!setup_misc_data(name)) { cleanup(); return false; } // gather copyin data into buffer if (!gather_copyin_data()) { cleanup(); return false; } // Start the computation if (!compute()) { cleanup(); return false; } // initiate receive for pointers if (!receive_pointer_data(signal != 0)) { cleanup(); return false; } // if there is a signal save descriptor for the later use. 
if (signal != 0) { m_device.add_signal(*signal, this); return true; } // wait for the offload to finish. if (!offload_finish()) { cleanup(); return false; } cleanup(); return true; } bool OffloadDescriptor::offload_finish() { COIRESULT res; // wait for compute dependencies to become signaled if (m_in_deps_total > 0) { OffloadTimer timer(get_timer_data(), c_offload_host_wait_compute); if (__offload_active_wait) { // keep CPU busy do { res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0); } while (res == COI_TIME_OUT_REACHED); } else { res = COI::EventWait(m_in_deps_total, m_in_deps, -1, 1, 0, 0); } if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_event_wait, res); } } // scatter copyout data received from target if (!scatter_copyout_data()) { return false; } // wait for receive dependencies to become signaled if (m_out_deps_total > 0) { OffloadTimer timer(get_timer_data(), c_offload_host_wait_buffers_reads); if (__offload_active_wait) { // keep CPU busy do { res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0); } while (res == COI_TIME_OUT_REACHED); } else { res = COI::EventWait(m_out_deps_total, m_out_deps, -1, 1, 0, 0); } if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_event_wait, res); } } // destroy buffers { OffloadTimer timer(get_timer_data(), c_offload_host_destroy_buffers); for (BufferList::const_iterator it = m_destroy_buffers.begin(); it != m_destroy_buffers.end(); it++) { res = COI::BufferDestroy(*it); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_destroy, res); } } } return true; } void OffloadDescriptor::cleanup() { // release device in orsl ORSL::release(m_device.get_logical_index()); OFFLOAD_TIMER_STOP(get_timer_data(), c_offload_host_total_offload); // report stuff 
Offload_Report_Epilog(get_timer_data()); } bool OffloadDescriptor::is_signaled() { bool signaled = true; COIRESULT res; // check compute and receive dependencies if (m_in_deps_total > 0) { res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0); signaled = signaled && (res == COI_SUCCESS); } if (m_out_deps_total > 0) { res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0); signaled = signaled && (res == COI_SUCCESS); } return signaled; } // Send pointer data if source or destination or both of them are // noncontiguous. There is guarantee that length of destination enough for // transferred data. bool OffloadDescriptor::send_noncontiguous_pointer_data( int i, PtrData* src_data, PtrData* dst_data, COIEVENT *event ) { int64_t offset_src, offset_dst; int64_t length_src, length_dst; int64_t length_src_cur, length_dst_cur; int64_t send_size, data_sent = 0; COIRESULT res; bool dst_is_empty = true; bool src_is_empty = true; // Set length_src and length_dst length_src = (m_vars_extra[i].read_rng_src) ? m_vars_extra[i].read_rng_src->range_size : m_vars[i].size; length_dst = !m_vars[i].into ? length_src : (m_vars_extra[i].read_rng_dst) ? m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size; send_size = (length_src < length_dst) ? 
length_src : length_dst; // consequently get contiguous ranges, // define corresponded destination offset and send data do { if (src_is_empty) { if (m_vars_extra[i].read_rng_src) { if (!get_next_range(m_vars_extra[i].read_rng_src, &offset_src)) { // source ranges are over - nothing to send break; } } else if (data_sent == 0) { offset_src = m_vars_extra[i].cpu_disp; } else { break; } length_src_cur = length_src; } else { // if source is contiguous or its contiguous range is greater // than destination one offset_src += send_size; } length_src_cur -= send_size; src_is_empty = length_src_cur == 0; if (dst_is_empty) { if (m_vars[i].into) { if (m_vars_extra[i].read_rng_dst) { if (!get_next_range(m_vars_extra[i].read_rng_dst, &offset_dst)) { // destination ranges are over LIBOFFLOAD_ERROR(c_destination_is_over); return false; } } // into is contiguous. else { offset_dst = m_vars[i].disp; } length_dst_cur = length_dst; } // same as source else { offset_dst = offset_src; length_dst_cur = length_src; } } else { // if destination is contiguous or its contiguous range is greater // than source one offset_dst += send_size; } length_dst_cur -= send_size; dst_is_empty = length_dst_cur == 0; if (src_data != 0 && src_data->cpu_buf != 0) { res = COI::BufferCopy( dst_data->mic_buf, src_data->cpu_buf, m_vars[i].mic_offset - dst_data->alloc_disp + m_vars[i].offset + offset_dst, m_vars_extra[i].cpu_offset + offset_src, send_size, COI_COPY_UNSPECIFIED, 0, 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_copy, res); } } else { char *base = offload_get_src_base(m_vars[i].ptr, m_vars[i].type.src); res = COI::BufferWrite( dst_data->mic_buf, m_vars[i].mic_offset - dst_data->alloc_disp + m_vars[i].offset + offset_dst, base + offset_src, send_size, COI_COPY_UNSPECIFIED, 0, 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } 
report_coi_error(c_buf_write, res); } } data_sent += length_src; } while (true); return true; } bool OffloadDescriptor::send_pointer_data(bool is_async) { OffloadTimer timer(get_timer_data(), c_offload_host_send_pointers); uint64_t ptr_sent = 0; COIRESULT res; // Initiate send for pointer data for (int i = 0; i < m_vars_total; i++) { switch (m_vars[i].type.dst) { case c_data_ptr_array: break; case c_data: case c_void_ptr: case c_cean_var: if (m_vars[i].direction.in && m_vars[i].flags.is_static_dstn) { COIEVENT *event = (is_async || m_vars[i].size >= __offload_use_async_buffer_write) ? &m_in_deps[m_in_deps_total++] : 0; PtrData* dst_data = m_vars[i].into ? m_vars_extra[i].dst_data : m_vars_extra[i].src_data; PtrData* src_data = VAR_TYPE_IS_PTR(m_vars[i].type.src) || VAR_TYPE_IS_SCALAR(m_vars[i].type.src) && m_vars[i].flags.is_static ? m_vars_extra[i].src_data : 0; if (m_vars[i].flags.is_noncont_src || m_vars[i].flags.is_noncont_dst) { if (!send_noncontiguous_pointer_data( i, src_data, dst_data, event)) { return false; } } else if (src_data != 0 && src_data->cpu_buf != 0) { res = COI::BufferCopy( dst_data->mic_buf, src_data->cpu_buf, m_vars[i].mic_offset - dst_data->alloc_disp + m_vars[i].offset + m_vars[i].disp, m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, 0, 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_copy, res); } } else { char *base = offload_get_src_base(m_vars[i].ptr, m_vars[i].type.src); res = COI::BufferWrite( dst_data->mic_buf, m_vars[i].mic_offset - dst_data->alloc_disp + m_vars[i].offset + m_vars[i].disp, base + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, 0, 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_write, res); } } ptr_sent += m_vars[i].size; } break; case c_string_ptr: case c_data_ptr: case 
c_cean_var_ptr: case c_dv_ptr: if (m_vars[i].direction.in && m_vars[i].size > 0) { COIEVENT *event = (is_async || m_vars[i].size >= __offload_use_async_buffer_write) ? &m_in_deps[m_in_deps_total++] : 0; PtrData* dst_data = m_vars[i].into ? m_vars_extra[i].dst_data : m_vars_extra[i].src_data; PtrData* src_data = VAR_TYPE_IS_PTR(m_vars[i].type.src) || VAR_TYPE_IS_SCALAR(m_vars[i].type.src) && m_vars[i].flags.is_static ? m_vars_extra[i].src_data : 0; if (m_vars[i].flags.is_noncont_src || m_vars[i].flags.is_noncont_dst) { send_noncontiguous_pointer_data( i, src_data, dst_data, event); } else if (src_data != 0 && src_data->cpu_buf != 0) { res = COI::BufferCopy( dst_data->mic_buf, src_data->cpu_buf, m_vars[i].mic_offset - dst_data->alloc_disp + m_vars[i].offset + m_vars[i].disp, m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, 0, 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_copy, res); } } else { char *base = offload_get_src_base(m_vars[i].ptr, m_vars[i].type.src); res = COI::BufferWrite( dst_data->mic_buf, m_vars[i].mic_offset - dst_data->alloc_disp + m_vars[i].offset + m_vars[i].disp, base + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, 0, 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_write, res); } } ptr_sent += m_vars[i].size; } break; case c_dv_data: case c_dv_ptr_data: if (m_vars[i].direction.in && m_vars[i].size > 0) { PtrData *ptr_data = m_vars[i].into ? m_vars_extra[i].dst_data : m_vars_extra[i].src_data; PtrData* src_data = m_vars_extra[i].src_data; COIEVENT *event = (is_async || m_vars[i].size >= __offload_use_async_buffer_write) ? 
&m_in_deps[m_in_deps_total++] : 0; if (m_vars[i].flags.is_noncont_src || m_vars[i].flags.is_noncont_dst) { send_noncontiguous_pointer_data( i, src_data, ptr_data, event); } else if (src_data && src_data->cpu_buf != 0) { res = COI::BufferCopy( ptr_data->mic_buf, src_data->cpu_buf, m_vars[i].offset + ptr_data->mic_offset - ptr_data->alloc_disp + m_vars[i].disp, m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, 0, 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_copy, res); } } else { char *base = offload_get_src_base(m_vars[i].ptr, m_vars[i].type.src); res = COI::BufferWrite( ptr_data->mic_buf, ptr_data->mic_offset - ptr_data->alloc_disp + m_vars[i].offset + m_vars[i].disp, base + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, 0, 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_write, res); } } ptr_sent += m_vars[i].size; } break; case c_dv_data_slice: case c_dv_ptr_data_slice: if (m_vars[i].direction.in && m_vars[i].size > 0) { PtrData *dst_data = m_vars[i].into ? m_vars_extra[i].dst_data : m_vars_extra[i].src_data; PtrData* src_data = (VAR_TYPE_IS_PTR(m_vars[i].type.src) || VAR_TYPE_IS_DV_DATA(m_vars[i].type.src) || VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src) || VAR_TYPE_IS_SCALAR(m_vars[i].type.src) && m_vars[i].flags.is_static) ? m_vars_extra[i].src_data : 0; COIEVENT *event = (is_async || m_vars[i].size >= __offload_use_async_buffer_write) ? 
&m_in_deps[m_in_deps_total++] : 0; if (m_vars[i].flags.is_noncont_src || m_vars[i].flags.is_noncont_dst) { send_noncontiguous_pointer_data( i, src_data, dst_data, event); } else if (src_data && src_data->cpu_buf != 0) { res = COI::BufferCopy( dst_data->mic_buf, src_data->cpu_buf, m_vars[i].offset - dst_data->alloc_disp + dst_data->mic_offset + m_vars[i].disp, m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, 0, 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_copy, res); } } else { char *base = offload_get_src_base(m_vars[i].ptr, m_vars[i].type.src); res = COI::BufferWrite( dst_data->mic_buf, dst_data->mic_offset - dst_data->alloc_disp + m_vars[i].offset + m_vars[i].disp, base + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, 0, 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_write, res); } } ptr_sent += m_vars[i].size; } break; default: break; } // alloc field isn't used at target. // We can reuse it for offset of array pointers. 
if (m_vars_extra[i].is_arr_ptr_el) { m_vars[i].ptr_arr_offset = m_vars_extra[i].ptr_arr_offset; } } if (m_status) { m_status->data_sent += ptr_sent; } OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), ptr_sent); OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_sent_pointer_data, "Total pointer data sent to target: [%lld] bytes\n", ptr_sent); return true; } bool OffloadDescriptor::gather_copyin_data() { OffloadTimer timer(get_timer_data(), c_offload_host_gather_inputs); if (m_need_runfunction && m_in_datalen > 0) { COIMAPINSTANCE map_inst; char *data; // init marshaller if (m_inout_buf != 0) { OffloadTimer timer_map(get_timer_data(), c_offload_host_map_in_data_buffer); COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_in_datalen, COI_MAP_WRITE_ENTIRE_BUFFER, 0, 0, 0, &map_inst, reinterpret_cast(&data)); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_map, res); } } else { data = (char*) m_func_desc + m_func_desc->data_offset; } // send variable descriptors memcpy(data, m_vars, m_vars_total * sizeof(VarDesc)); data += m_vars_total * sizeof(VarDesc); // init marshaller m_in.init_buffer(data, m_in_datalen); // Gather copy data into buffer for (int i = 0; i < m_vars_total; i++) { bool src_is_for_mic = (m_vars[i].direction.out || m_vars[i].into == NULL); PtrData* ptr_data = src_is_for_mic ? 
m_vars_extra[i].src_data : m_vars_extra[i].dst_data; if (m_vars[i].flags.alloc_disp) { m_in.send_data(&ptr_data->alloc_disp, sizeof(ptr_data->alloc_disp)); } // send sink address to the target if (m_vars[i].flags.sink_addr) { m_in.send_data(&ptr_data->mic_addr, sizeof(ptr_data->mic_addr)); } switch (m_vars[i].type.dst) { case c_data_ptr_array: break; case c_data: case c_void_ptr: case c_cean_var: if (m_vars[i].direction.in && !m_vars[i].flags.is_static_dstn) { char *ptr = offload_get_src_base(m_vars[i].ptr, m_vars[i].type.src); if (m_vars[i].type.dst == c_cean_var) { // offset and length are derived from the array // descriptor int64_t size = m_vars[i].size; int64_t disp = m_vars[i].disp; m_in.send_data(reinterpret_cast(&size), sizeof(int64_t)); m_in.send_data(reinterpret_cast(&disp), sizeof(int64_t)); } m_in.send_data(ptr + m_vars_extra[i].cpu_disp, m_vars[i].size); } break; case c_dv: if (m_vars[i].direction.bits || m_vars[i].alloc_if || m_vars[i].free_if) { // send dope vector excluding base char *ptr = static_cast(m_vars[i].ptr); m_in.send_data(ptr + sizeof(uint64_t), m_vars[i].size - sizeof(uint64_t)); } break; case c_data_ptr: // send to target addresses of obsolete // stacks to be released if (m_vars[i].flags.is_stack_buf && !m_vars[i].direction.bits && m_vars[i].alloc_if && m_vars[i].size != 0) { for (PtrDataList::iterator it = m_destroy_stack.begin(); it != m_destroy_stack.end(); it++) { PtrData * ptr_data = *it; m_in.send_data(&(ptr_data->mic_addr), sizeof(ptr_data->mic_addr)); } } break; case c_func_ptr: if (m_vars[i].direction.in) { m_in.send_func_ptr(*((const void**) m_vars[i].ptr)); } break; default: break; } } if (m_status) { m_status->data_sent += m_in.get_tfr_size(); } if (m_func_desc->data_offset == 0) { OffloadTimer timer_unmap(get_timer_data(), c_offload_host_unmap_in_data_buffer); COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } 
report_coi_error(c_buf_unmap, res); } } } OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), m_in.get_tfr_size()); OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_copyin_data, "Total copyin data sent to target: [%lld] bytes\n", m_in.get_tfr_size()); return true; } bool OffloadDescriptor::compute() { OffloadTimer timer(get_timer_data(), c_offload_host_start_compute); if (m_need_runfunction) { OFFLOAD_DEBUG_TRACE_1(2, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_compute, "Compute task on MIC\n"); void* misc = m_func_desc; int misc_len = m_func_desc_size; void* ret = 0; int ret_len = 0; if (m_func_desc->data_offset != 0) { misc_len += m_in_datalen; if (m_out_datalen > 0) { ret = (char*) m_func_desc + m_func_desc->data_offset; ret_len = m_out_datalen; } } // dispatch task COIRESULT res; COIEVENT event; res = m_device.compute(m_compute_buffers, misc, misc_len, ret, ret_len, m_in_deps_total, m_in_deps_total > 0 ? m_in_deps : 0, &event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_pipeline_run_func, res); } m_in_deps_total = 1; m_in_deps[0] = event; } return true; } // receive pointer data if source or destination or both of them are // noncontiguous. There is guarantee that length of destination enough for // transferred data. bool OffloadDescriptor::receive_noncontiguous_pointer_data( int i, char* base, COIBUFFER dst_buf, COIEVENT *event ) { int64_t offset_src, offset_dst; int64_t length_src, length_dst; int64_t length_src_cur, length_dst_cur; int64_t receive_size, data_received = 0; COIRESULT res; bool dst_is_empty = true; bool src_is_empty = true; // Set length_src and length_dst length_src = (m_vars_extra[i].read_rng_src) ? m_vars_extra[i].read_rng_src->range_size : m_vars[i].size; length_dst = !m_vars[i].into ? length_src : (m_vars_extra[i].read_rng_dst) ? m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size; receive_size = (length_src < length_dst) ? 
length_src : length_dst; // consequently get contiguous ranges, // define corresponded destination offset and receive data do { // get sorce offset if (src_is_empty) { if (m_vars_extra[i].read_rng_src) { if (!get_next_range(m_vars_extra[i].read_rng_src, &offset_src)) { // source ranges are over - nothing to send break; } } else if (data_received == 0) { offset_src = 0; } else { break; } length_src_cur = length_src; } else { // if source is contiguous or its contiguous range is greater // than destination one offset_src += receive_size; } length_src_cur -= receive_size; src_is_empty = length_src_cur == 0; // get destination offset if (dst_is_empty) { if (m_vars[i].into) { if (m_vars_extra[i].read_rng_dst) { if (!get_next_range(m_vars_extra[i].read_rng_dst, &offset_dst)) { // destination ranges are over LIBOFFLOAD_ERROR(c_destination_is_over); return false; } } // destination is contiguous. else { offset_dst = m_vars_extra[i].cpu_disp; } length_dst_cur = length_dst; } // same as source else { offset_dst = offset_src; length_dst_cur = length_src; } } else { // if destination is contiguous or its contiguous range is greater // than source one offset_dst += receive_size; } length_dst_cur -= receive_size; dst_is_empty = length_dst_cur == 0; if (dst_buf != 0) { res = COI::BufferCopy( dst_buf, m_vars_extra[i].src_data->mic_buf, m_vars_extra[i].cpu_offset + offset_dst, m_vars[i].offset + offset_src + m_vars[i].mic_offset - m_vars_extra[i].src_data->alloc_disp, receive_size, COI_COPY_UNSPECIFIED, m_in_deps_total, m_in_deps_total > 0 ? m_in_deps : 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_copy, res); } } else { res = COI::BufferRead( m_vars_extra[i].src_data->mic_buf, m_vars[i].offset + offset_src + m_vars[i].mic_offset - m_vars_extra[i].src_data->alloc_disp, base + offset_dst, receive_size, COI_COPY_UNSPECIFIED, m_in_deps_total, m_in_deps_total > 0 ? 
m_in_deps : 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_read, res); } } data_received += receive_size; } while (true); return true; } bool OffloadDescriptor::receive_pointer_data(bool is_async) { OffloadTimer timer(get_timer_data(), c_offload_host_start_buffers_reads); uint64_t ptr_received = 0; COIRESULT res; for (int i = 0; i < m_vars_total; i++) { switch (m_vars[i].type.src) { case c_data_ptr_array: break; case c_data: case c_void_ptr: case c_cean_var: if (m_vars[i].direction.out && m_vars[i].flags.is_static) { COIEVENT *event = (is_async || m_in_deps_total > 0 || m_vars[i].size >= __offload_use_async_buffer_read) ? &m_out_deps[m_out_deps_total++] : 0; PtrData *ptr_data = NULL; COIBUFFER dst_buf = NULL; // buffer at host char *base; if (VAR_TYPE_IS_PTR(m_vars[i].type.dst)) { ptr_data = m_vars[i].into ? m_vars_extra[i].dst_data : m_vars_extra[i].src_data; } else if (VAR_TYPE_IS_SCALAR(m_vars[i].type.dst)) { if (m_vars[i].flags.is_static_dstn) { ptr_data = m_vars[i].into ? m_vars_extra[i].dst_data : m_vars_extra[i].src_data; } } dst_buf = ptr_data ? ptr_data->cpu_buf : NULL; if (dst_buf == NULL) { base = offload_get_src_base( m_vars[i].into ? static_cast(m_vars[i].into) : static_cast(m_vars[i].ptr), m_vars[i].type.dst); } if (m_vars[i].flags.is_noncont_src || m_vars[i].flags.is_noncont_dst) { receive_noncontiguous_pointer_data( i, base, dst_buf, event); } else if (dst_buf != 0) { res = COI::BufferCopy( dst_buf, m_vars_extra[i].src_data->mic_buf, m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp, m_vars[i].offset + m_vars[i].disp, m_vars[i].size, COI_COPY_UNSPECIFIED, m_in_deps_total, m_in_deps_total > 0 ? 
m_in_deps : 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_copy, res); } } else { res = COI::BufferRead( m_vars_extra[i].src_data->mic_buf, m_vars[i].offset + m_vars[i].disp, base + m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, m_in_deps_total, m_in_deps_total > 0 ? m_in_deps : 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_read, res); } } ptr_received += m_vars[i].size; } break; case c_string_ptr: case c_data_ptr: case c_cean_var_ptr: case c_dv_data: case c_dv_ptr_data: case c_dv_data_slice: case c_dv_ptr_data_slice: case c_dv_ptr: { COIBUFFER dst_buf = NULL; // buffer on host if (m_vars[i].direction.out && m_vars[i].size > 0) { COIEVENT *event = (is_async || m_in_deps_total > 0 || m_vars[i].size >= __offload_use_async_buffer_read) ? &m_out_deps[m_out_deps_total++] : 0; uint64_t dst_offset = 0; char *base = static_cast(m_vars[i].ptr); if (VAR_TYPE_IS_PTR(m_vars[i].type.dst)) { PtrData *ptr_data = m_vars[i].into ? m_vars_extra[i].dst_data : m_vars_extra[i].src_data; dst_buf = ptr_data ? ptr_data->cpu_buf : NULL; if (dst_buf == NULL) { base = m_vars[i].into ? *static_cast(m_vars[i].into) : *static_cast(m_vars[i].ptr); } dst_offset = m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp; } else if (VAR_TYPE_IS_SCALAR(m_vars[i].type.dst)) { if (m_vars[i].flags.is_static_dstn) { dst_buf = m_vars[i].into ? m_vars_extra[i].dst_data->cpu_buf : m_vars_extra[i].src_data->cpu_buf; } if (dst_buf == NULL) { base = offload_get_src_base( m_vars[i].into ? static_cast(m_vars[i].into) : static_cast(m_vars[i].ptr), m_vars[i].type.dst); } dst_offset = m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp; } else if (VAR_TYPE_IS_DV_DATA(m_vars[i].type.dst) || VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) { PtrData *ptr_data = m_vars[i].into != 0 ? 
m_vars_extra[i].dst_data : m_vars_extra[i].src_data; dst_buf = ptr_data != 0 ? ptr_data->cpu_buf : 0; if (dst_buf == NULL) { base = offload_get_src_base( m_vars[i].into ? static_cast(m_vars[i].into) : static_cast(m_vars[i].ptr), m_vars[i].type.dst); } dst_offset = m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp; } if (m_vars[i].flags.is_noncont_src || m_vars[i].flags.is_noncont_dst) { receive_noncontiguous_pointer_data( i, base, dst_buf, event); } else if (dst_buf != 0) { res = COI::BufferCopy( dst_buf, m_vars_extra[i].src_data->mic_buf, dst_offset, m_vars[i].offset + m_vars[i].disp + m_vars[i].mic_offset - m_vars_extra[i].src_data->alloc_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, m_in_deps_total, m_in_deps_total > 0 ? m_in_deps : 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_copy, res); } } else { res = COI::BufferRead( m_vars_extra[i].src_data->mic_buf, m_vars[i].offset + m_vars[i].disp + m_vars[i].mic_offset - m_vars_extra[i].src_data->alloc_disp, base + dst_offset, m_vars[i].size, COI_COPY_UNSPECIFIED, m_in_deps_total, m_in_deps_total > 0 ? 
m_in_deps : 0, event); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_read, res); } } ptr_received += m_vars[i].size; } break; } default: break; } // destroy buffers for obsolete stacks if (m_destroy_stack.size() != 0) { for (PtrDataList::iterator it = m_destroy_stack.begin(); it != m_destroy_stack.end(); it++) { PtrData *ptr_data = *it; m_destroy_buffers.push_back(ptr_data->mic_buf); OFFLOAD_TRACE(3, "Removing stack buffer with addr %p\n", ptr_data->mic_addr); } m_destroy_stack.clear(); } if (m_vars[i].free_if) { // remove association for automatic variables if (m_is_openmp && !m_vars[i].flags.is_static && (m_vars[i].type.src == c_data || m_vars[i].type.src == c_void_ptr || m_vars[i].type.src == c_cean_var)) { AutoData *auto_data = m_vars_extra[i].auto_data; if (auto_data != 0 && auto_data->remove_reference() == 0) { m_device.remove_auto_data(auto_data->cpu_addr.start()); } } // destroy buffers if (m_vars[i].direction.out || m_vars[i].into == NULL) { if (!VAR_TYPE_IS_PTR(m_vars[i].type.src) && !VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src) && !VAR_TYPE_IS_DV_DATA(m_vars[i].type.src)) { continue; } PtrData *ptr_data = m_vars_extra[i].src_data; if (ptr_data->remove_reference() == 0) { // destroy buffers if (ptr_data->cpu_buf != 0) { m_destroy_buffers.push_back(ptr_data->cpu_buf); } if (ptr_data->mic_buf != 0) { m_destroy_buffers.push_back(ptr_data->mic_buf); } OFFLOAD_TRACE(3, "Removing association for addr %p\n", ptr_data->cpu_addr.start()); // remove association from map m_device.remove_ptr_data(ptr_data->cpu_addr.start()); } } else if (VAR_TYPE_IS_PTR(m_vars[i].type.dst) || VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst) || VAR_TYPE_IS_DV_DATA(m_vars[i].type.dst)) { PtrData *ptr_data = m_vars_extra[i].dst_data; if (ptr_data->remove_reference() == 0) { // destroy buffers if (ptr_data->cpu_buf != 0) { m_destroy_buffers.push_back(ptr_data->cpu_buf); } if (ptr_data->mic_buf != 0) { 
m_destroy_buffers.push_back(ptr_data->mic_buf); } OFFLOAD_TRACE(3, "Removing association for addr %p\n", ptr_data->cpu_addr.start()); // remove association from map m_device.remove_ptr_data(ptr_data->cpu_addr.start()); } } } } if (m_status) { m_status->data_received += ptr_received; } OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), ptr_received); OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_received_pointer_data, "Total pointer data received from target: [%lld] bytes\n", ptr_received); return true; } bool OffloadDescriptor::scatter_copyout_data() { OffloadTimer timer(get_timer_data(), c_offload_host_scatter_outputs); if (m_need_runfunction && m_out_datalen > 0) { // total size that need to be transferred from target to host COIMAPINSTANCE map_inst; COIRESULT res; char *data; // output data buffer if (m_func_desc->data_offset == 0) { OffloadTimer timer_map(get_timer_data(), c_offload_host_map_out_data_buffer); COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_out_datalen, COI_MAP_READ_ONLY, 0, 0, 0, &map_inst, reinterpret_cast(&data)); if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); return false; } report_coi_error(c_buf_map, res); } } else { data = (char*) m_func_desc + m_func_desc->data_offset; } // get timing data OFFLOAD_TIMER_TARGET_DATA(get_timer_data(), data); data += OFFLOAD_TIMER_DATALEN(); // initialize output marshaller m_out.init_buffer(data, m_out_datalen); for (int i = 0; i < m_vars_total; i++) { switch (m_vars[i].type.src) { case c_data_ptr_array: break; case c_data: case c_void_ptr: case c_cean_var: if (m_vars[i].direction.out && !m_vars[i].flags.is_static) { if (m_vars[i].into) { char *ptr = offload_get_src_base( static_cast(m_vars[i].into), m_vars[i].type.dst); m_out.receive_data(ptr + m_vars_extra[i].cpu_disp, m_vars[i].size); } else { m_out.receive_data( static_cast(m_vars[i].ptr) + m_vars_extra[i].cpu_disp, m_vars[i].size); } } break; case c_func_ptr: if (m_vars[i].direction.out) 
// Body of the c_func_ptr / direction.out branch whose `if` header ends the
// previous physical line, followed by the remainder of scatter_copyout_data.
                    {
                        m_out.receive_func_ptr((const void**) m_vars[i].ptr);
                    }
                    break;

                default:
                    break;
            }
        }

        if (m_status) {
            m_status->data_received += m_out.get_tfr_size();
        }

        // the output area was mapped only when it is not part of the
        // function descriptor, so unmap it under the same condition
        if (m_func_desc->data_offset == 0) {
            OffloadTimer timer_unmap(get_timer_data(),
                                     c_offload_host_unmap_out_data_buffer);

            COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_unmap, res);
            }
        }
    }

    OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), m_out.get_tfr_size());
    OFFLOAD_TRACE(1, "Total copyout data received from target: [%lld] bytes\n",
                  m_out.get_tfr_size());

    return true;
}

// Describe the layout of the data referenced by array descriptor `ap`.
//
//   ap          array descriptor to analyse
//   el_size     size of one element in bytes
//   offset      [out] set by __arr_data_offset_and_length (contiguous case
//               only; left untouched otherwise)
//   size        [out] contiguous: length from __arr_data_offset_and_length;
//               noncontiguous: size of one contiguous range
//   el_number   [out] number of elements: size / el_size when contiguous,
//               otherwise elements per range times range_max_number
//   ptr_ranges  [out] NULL when contiguous, otherwise the ranges object
//               created by init_read_ranges_arr_desc
//
// NOTE(review): el_size is used as a divisor and is not checked for zero.
void get_arr_desc_numbers(
    const arr_desc *ap,
    int64_t el_size,
    int64_t &offset,
    int64_t &size,
    int     &el_number,
    CeanReadRanges* &ptr_ranges
)
{
    if (is_arr_desc_contiguous(ap)) {
        ptr_ranges = NULL;
        __arr_data_offset_and_length(ap, offset, size);
        el_number = size / el_size;
    }
    else {
        ptr_ranges = init_read_ranges_arr_desc(ap);
        el_number = (ptr_ranges->range_size / el_size) *
                    ptr_ranges->range_max_number;
        size = ptr_ranges->range_size;
    }
}

// Build a rank-1 array descriptor on the heap (malloc) for the section
// [extent_start_val, extent_start_val + extent_elements_val - 1] with
// stride 1 and element size `size`, based at ptr_val.
// The descriptor is returned to the caller; nothing in this function frees it.
//
// NOTE(review): the malloc result is dereferenced without a NULL check.
// NOTE(review): the target type of the reinterpret_cast below was lost when
// this text was extracted - restore it from the original source.
arr_desc * make_arr_desc(
    void*   ptr_val,
    int64_t extent_start_val,
    int64_t extent_elements_val,
    int64_t size
)
{
    arr_desc *res;
    res = (arr_desc *)malloc(sizeof(arr_desc));
    res->base = reinterpret_cast(ptr_val);
    res->rank = 1;
    res->dim[0].size = size;
    res->dim[0].lindex = 0;
    res->dim[0].lower = extent_start_val;
    res->dim[0].upper = extent_elements_val + extent_start_val - 1;
    res->dim[0].stride = 1;
    return res;
}

// Expand variable descriptor `i`, which describes an array of pointers, into
// one new descriptor per pointer appended at the end of m_vars.
// Returns false when one of the per-pointer attribute arrays holds fewer
// elements than the pointer array itself.
//
// NOTE(review): throughout this function the text between '<' and '>' was
// lost in extraction: template arguments of casts and of ReadArrElements, and
// the flag tests of the form `(flags & (1<...`. The damaged statements are
// reproduced exactly as found and must be restored from the original source
// before this code can compile.
bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
{
    int             pointers_number;
    int             tmp_val;
    int             new_index = m_vars_total;   // first slot for the new descriptors
    const arr_desc *ap;
    const VarDesc3 *vd3 = static_cast(m_vars[i].ptr);
    int             flags = vd3->array_fields;
    bool            src_is_for_mic = (m_vars[i].direction.out ||
                                      m_vars[i].into == NULL);

    // one reader per clause component
    ReadArrElements ptr;
    ReadArrElements into;
    ReadArrElements ext_start;
    ReadArrElements ext_elements;
    ReadArrElements align;
    ReadArrElements alloc_if;
    ReadArrElements free_if;
    ReadArrElements into_start;
    ReadArrElements into_elem;
    ReadArrElements alloc_start;
    ReadArrElements alloc_elem;

    ap = static_cast(vd3->ptr_array);

    // "pointers_number" for total number of transferred pointers.
    // For each of them we create new var_desc and put it at the bottom
    // of the var_desc's array
    get_arr_desc_numbers(ap, sizeof(void *), ptr.offset, ptr.size,
        pointers_number, ptr.ranges);
    ptr.base = reinterpret_cast(ap->base);

    // 2. prepare memory for new var_descs
    // NOTE(review): none of the realloc results below is checked for NULL.
    m_vars_total += pointers_number;
    m_vars       = (VarDesc*)realloc(m_vars, m_vars_total * sizeof(VarDesc));
    m_vars_extra =
        (VarExtra*)realloc(m_vars_extra, m_vars_total * sizeof(VarExtra));
    m_in_deps    =
        (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * (m_vars_total + 1));
    m_out_deps   =
        (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_vars_total);

    // 3. Prepare for reading new var_desc's fields
    //    EXTENT START
    if ((flags & (1<(vd3->extent_start);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, ext_start.offset,
            ext_start.size, tmp_val, ext_start.ranges);
        ext_start.base = reinterpret_cast(ap->base);
        ext_start.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
            return false;
        }
    }
    else if ((flags & (1<extent_start;
    }
    else {
        ext_start.val = 0;
    }

    //    EXTENT ELEMENTS NUMBER
    if ((flags & (1<(vd3->extent_elements);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
            ext_elements.offset, ext_elements.size,
            tmp_val, ext_elements.ranges);

        ext_elements.base = reinterpret_cast(ap->base);
        ext_elements.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
            return false;
        }
    }
    else if ((flags & (1<extent_elements;
    }
    else {
        ext_elements.val = m_vars[i].count;
    }

    //    ALLOC_IF
    if ((flags & (1<(vd3->alloc_if_array);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_if.offset,
            alloc_if.size, tmp_val, alloc_if.ranges);
        alloc_if.base = reinterpret_cast(ap->base);
// Continuation of gen_var_descs_for_pointer_array, inside the ALLOC_IF array
// case.
// NOTE(review): in the rest of this function the text between '<' and '>' was
// lost in extraction (flag tests `(flags & (1<...`, cast template arguments,
// and a run of statements in the per-pointer loop). The damaged statements are
// kept exactly as found and must be restored from the original source.
        alloc_if.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
            return false;
        }
    }
    else {
        alloc_if.val = m_vars[i].count;
    }

    //    FREE_IF
    if ((flags & (1<(vd3->free_if_array);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, free_if.offset,
            free_if.size, tmp_val, free_if.ranges);
        free_if.base = reinterpret_cast(ap->base);
        free_if.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
            return false;
        }
    }
    else {
        free_if.val = m_vars[i].count;
    }

    //    ALIGN
    if ((flags & (1<(vd3->align_array);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, align.offset,
            align.size, tmp_val, align.ranges);
        align.base = reinterpret_cast(ap->base);
        align.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
            return false;
        }
    }
    else {
        align.val = m_vars[i].align;
    }

    // 3.1 INTO
    if (m_vars[i].into) {
        ap = static_cast(m_vars[i].into);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into.offset,
            into.size, tmp_val, into.ranges);
        into.base = reinterpret_cast(ap->base);

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
            return false;
        }
    }

    // 3.2 INTO_START
    if ((flags & (1<(vd3->into_start);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_start.offset,
            into_start.size, tmp_val, into_start.ranges);
        into_start.base = reinterpret_cast(ap->base);
        into_start.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
            return false;
        }
    }
    else if ((flags & (1<into_start;
    }
    else {
        into_start.val = 0;
    }

    // 3.3 INTO_ELEMENTS
    if ((flags & (1<(vd3->into_elements);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_elem.offset,
            into_elem.size, tmp_val, into_elem.ranges);
        into_elem.base = reinterpret_cast(ap->base);
        into_elem.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
            return false;
        }
    }
    else if ((flags & (1<into_elements;
    }
    else {
        into_elem.val = m_vars[i].count;
    }

    //    alloc_start
    if ((flags & (1<(vd3->alloc_start);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
            alloc_start.offset, alloc_start.size, tmp_val,
            alloc_start.ranges);
        alloc_start.base = reinterpret_cast(ap->base);
        alloc_start.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
            return false;
        }
    }
    else if ((flags & (1<alloc_start;
    }
    else {
        alloc_start.val = 0;
    }

    //    alloc_elem
    if ((flags & (1<(vd3->alloc_elements);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_elem.offset,
            alloc_elem.size, tmp_val, alloc_elem.ranges);
        alloc_elem.base = reinterpret_cast(ap->base);
        alloc_elem.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch,
                             "alloc_extent elements");
            return false;
        }
    }
    else if ((flags & (1<alloc_elements;
    }
    else {
        alloc_elem.val = 0;
    }

    // create one descriptor per pointer of the array
    for (int k = 0; k < pointers_number; k++) {
        // the low six bits of the flags word carry the array type
        int type = flags & 0x3f;
        int type_src, type_dst;
        //  Get new values
        // type_src, type_dst
        type_src = type_dst = (type == c_data_ptr_array) ?
                              c_data_ptr   : (type == c_func_ptr_array) ?
                              c_func_ptr   : (type == c_void_ptr_array) ?
                              c_void_ptr   : (type == c_string_ptr_array) ?
                              c_string_ptr : 0;

        // Get ptr val
        if (!ptr.read_next(true)) {
            break;
        }
        else {
            ptr.val = (void*)(ptr.base + ptr.offset);
        }

        // !!! If we got error at phase of reading - it's an internal
        // !!! error, as we must detect mismatch before

        // Get into val
        if (m_vars[i].into) {
            if (!into.read_next(true)) {
                LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
                LIBOFFLOAD_ABORT;
            }
            else {
                into.val = (void*)(into.base + into.offset);
            }
        }

        // Get other components of the clause
        // NOTE(review): a run of statements was lost in extraction between
        // `read_next(flags & (1<` and `(make_arr_desc(` below.
        // NOTE(review): in the type_src expression the second operand is the
        // bare enumerator `c_string_ptr`, not `type_src == c_string_ptr`; as
        // written it is true for any non-zero enumerator, so every type other
        // than c_data_ptr also becomes c_cean_var_ptr - confirm the intent.
        if (!ext_start.read_next(flags & (1<(make_arr_desc(
                ptr.val, ext_start.val, ext_elements.val, m_vars[i].size));

            type_src = type_src == c_data_ptr ? c_cean_var_ptr :
                                   c_string_ptr ? c_cean_var_ptr :
                                   type_src;
            if (!m_vars[i].into) {
                type_dst = type_src;
            }
        }

        if (m_vars[i].into && into_elem.val != 0) {
            m_vars[new_index + k].into =
                static_cast(make_arr_desc(
                    into.val, into_start.val, into_elem.val, m_vars[i].size));
            type_dst = (type == c_data_ptr_array) ? c_cean_var_ptr :
                       (type == c_string_ptr_array) ? c_cean_var_ptr :
                        type_src;
        }
        else {
            m_vars[new_index + k].into = NULL;
        }

        if (alloc_elem.val != 0) {
            m_vars[new_index + k].alloc =
                static_cast(make_arr_desc(
                    ptr.val, alloc_start.val, alloc_elem.val, m_vars[i].size));
        }
        else {
            m_vars[new_index + k].alloc = NULL;
        }

        m_vars[new_index + k].type.src = type_src;
        m_vars[new_index + k].type.dst = type_dst;

        m_vars_extra[new_index + k].is_arr_ptr_el = 1;
        m_vars_extra[new_index + k].ptr_arr_offset =
            src_is_for_mic ? ptr.offset : into.offset;
    }
    // count and alloc fields are useless at target. They can be reused
    // for pointer arrays.
    m_vars[i].count = pointers_number;
    m_vars[i].ptr_arr_offset = new_index;
    return true;
}

// Release what __offload_init_library_once set up: the engine array, the
// strdup'ed path strings and the thread key (only when at least one engine
// was configured), then shut COI down if it is available.
static void __offload_fini_library(void)
{
    OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n");
    if (mic_engines_total > 0) {
        delete[] mic_engines;

        if (mic_proxy_fs_root != 0) {
            free(mic_proxy_fs_root);
            mic_proxy_fs_root = 0;
        }

        if (mic_library_path != 0) {
            free(mic_library_path);
            mic_library_path = 0;
        }

        // destroy thread key
        thread_key_delete(mic_thread_key);
    }

    // unload COI library
    if (COI::is_available) {
        COI::fini();
    }

    OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ... done\n");
}

// One-time initialisation of the host offload library: read the tracing and
// reporting environment variables, start COI, select the devices to use and
// read the MIC_* tuning variables. Returns early (leaving mic_engines_total
// at 0) when COI cannot be initialised, the device count cannot be queried,
// or no device is selected.
//
// NOTE(review): the size argument of std::bitset was lost in extraction.
static void __offload_init_library_once(void)
{
    COIRESULT       res;
    uint32_t        num_devices;
    std::bitset devices;

    prefix = report_get_message_str(c_report_host);

    // initialize trace
    const char *env_var = getenv(htrace_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            // only the low four bits are kept
            console_enabled = new_val & 0x0f;
        }
    }

    env_var = getenv(offload_report_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t env_val;
        if (__offload_parse_int_string(env_var, env_val)) {
            if (env_val == OFFLOAD_REPORT_1 ||
                env_val == OFFLOAD_REPORT_2 ||
                env_val == OFFLOAD_REPORT_3) {
                offload_report_level = env_val;
            }
            else {
                LIBOFFLOAD_ERROR(c_invalid_env_report_value,
                                 offload_report_envname);
            }
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
                             offload_report_envname);
        }
    }
    else if (!offload_report_level) {
        // the timer variable is consulted only when reporting is off
        env_var = getenv(timer_envname);
        if (env_var != 0 && *env_var != '\0') {
            timer_enabled = atoi(env_var);
        }
    }

    // initialize COI
    if (!COI::init()) {
        return;
    }

    // get number of devices installed in the system
    res = COI::EngineGetCount(COI_ISA_KNC, &num_devices);
    if (res != COI_SUCCESS) {
        return;
    }

    if (num_devices > MIC_ENGINES_MAX) {
        num_devices = MIC_ENGINES_MAX;
    }

    // fill in the list of devices that can be used for offloading
    env_var = getenv("OFFLOAD_DEVICES");
    if (env_var != 0) {
        // "none" (case-insensitive) leaves the device set empty
        if (strcasecmp(env_var, "none") != 0) {
            // value is composed of comma separated physical device indexes
            // NOTE(review): the strdup result is not checked for NULL.
            char *buf = strdup(env_var);
            char *str, *ptr;
            for (str = strtok_r(buf, ",", &ptr); str != 0;
                 str = strtok_r(0, ",", &ptr)) {
                // convert string to an int
                int64_t num;
                if (!__offload_parse_int_string(str, num)) {
                    LIBOFFLOAD_ERROR(c_mic_init5);

                    // fallback to using all installed devices
                    devices.reset();
                    for (int i = 0; i < num_devices; i++) {
                        devices.set(i);
                    }
                    break;
                }
                // out-of-range indexes are reported and skipped
                if (num < 0 || num >= num_devices) {
                    LIBOFFLOAD_ERROR(c_mic_init6, num);
                    continue;
                }
                devices.set(num);
            }
            free(buf);
        }
    }
    else {
        // use all available devices
        for (int i = 0; i < num_devices; i++) {
            COIENGINE engine;
            res = COI::EngineGetHandle(COI_ISA_KNC, i, &engine);
            if (res == COI_SUCCESS) {
                devices.set(i);
            }
        }
    }

    mic_engines_total = devices.count();

    // no need to continue if there are no devices to offload to
    if (mic_engines_total <= 0) {
        return;
    }

    // initialize indexes for available devices
    // (l_idx is the logical index, p_idx the physical one)
    mic_engines = new Engine[mic_engines_total];
    for (int p_idx = 0, l_idx = 0; p_idx < num_devices; p_idx++) {
        if (devices[p_idx]) {
            mic_engines[l_idx].set_indexes(l_idx, p_idx);
            l_idx++;
        }
    }

    // library search path for device binaries
    env_var = getenv("MIC_LD_LIBRARY_PATH");
    if (env_var != 0) {
        mic_library_path = strdup(env_var);
    }

    // memory size reserved for COI buffers
    env_var = getenv("MIC_BUFFERSIZE");
    if (env_var != 0) {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            mic_buffer_size = new_size;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_BUFFERSIZE");
        }
    }

    // determine stacksize for the pipeline on the device
    // (accepted only if at least 16384 and a multiple of 4096)
    env_var = getenv("MIC_STACKSIZE");
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size) &&
            (new_size >= 16384) && ((new_size & 4095) == 0)) {
            mic_stack_size = new_size;
        }
        else {
            LIBOFFLOAD_ERROR(c_mic_init3);
        }
    }

    // proxy I/O
    env_var = getenv("MIC_PROXY_IO");
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            mic_proxy_io = new_val;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value, "MIC_PROXY_IO");
        }
    }
    env_var = getenv("MIC_PROXY_FS_ROOT");
    if (env_var != 0 && *env_var != '\0') {
        mic_proxy_fs_root = strdup(env_var);
    }

    // Prepare environment for the target process using the following
    // rules
    // - If MIC_ENV_PREFIX is set then any environment variable on the
    //   host which has that prefix are copied to the device without
    //   the prefix.
    //   All other host environment variables are ignored.
    // - If MIC_ENV_PREFIX is not set or if MIC_ENV_PREFIX="" then host
    //   environment is duplicated.
// Continuation of __offload_init_library_once: handle MIC_ENV_PREFIX.
// Every host variable that starts with the prefix is passed to
// mic_env_vars.analyze_env_var(), except those starting with
// "MIC_LD_LIBRARY_PATH" and the one whose name is exactly the prefix
// (character right after the prefix is '=').
    env_var = getenv("MIC_ENV_PREFIX");
    if (env_var != 0 && *env_var != '\0') {
        mic_env_vars.set_prefix(env_var);

        int len = strlen(env_var);
        for (int i = 0; environ[i] != 0; i++) {
            if (strncmp(environ[i], env_var, len) == 0 &&
                strncmp(environ[i], "MIC_LD_LIBRARY_PATH", 19) != 0 &&
                environ[i][len] != '=') {
                mic_env_vars.analyze_env_var(environ[i]);
            }
        }
    }

    // create key for thread data
    if (thread_key_create(&mic_thread_key, Engine::destroy_thread_data)) {
        LIBOFFLOAD_ERROR(c_mic_init4, errno);
        return;
    }

    // cpu frequency
    cpu_frequency = COI::PerfGetCycleFrequency();

    // size thresholds: an unparsable 2MB-buffers value is reported, while
    // unparsable async read/write thresholds are silently ignored
    env_var = getenv(mic_use_2mb_buffers_envname);
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            __offload_use_2mb_buffers = new_size;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
                             mic_use_2mb_buffers_envname);
        }
    }

    env_var = getenv(mic_use_async_buffer_write_envname);
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            __offload_use_async_buffer_write = new_size;
        }
    }

    env_var = getenv(mic_use_async_buffer_read_envname);
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            __offload_use_async_buffer_read = new_size;
        }
    }

    // mic initialization type
    // ("on_start" is accepted only when TARGET_WINNT is not defined)
    env_var = getenv(offload_init_envname);
    if (env_var != 0 && *env_var != '\0') {
        if (strcmp(env_var, "on_offload") == 0) {
            __offload_init_type = c_init_on_offload;
        }
        else if (strcmp(env_var, "on_offload_all") == 0) {
            __offload_init_type = c_init_on_offload_all;
        }
#ifndef TARGET_WINNT
        else if (strcmp(env_var, "on_start") == 0) {
            __offload_init_type = c_init_on_start;
        }
#endif // TARGET_WINNT
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value, offload_init_envname);
        }
    }

    // active wait
    env_var = getenv(offload_active_wait_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            __offload_active_wait = new_val;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
                             offload_active_wait_envname);
        }
    }

    // omp device num (negative values are rejected)
    env_var = getenv(omp_device_num_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
            __omp_device_num = new_val;
        }
        else {
            LIBOFFLOAD_ERROR(c_omp_invalid_device_num_env,
                             omp_device_num_envname);
        }
    }

    // init ORSL
    ORSL::init();
}

// Run the one-time initialisation (through __offload_run_once) and register
// any target libraries queued by __offload_register_image.
// Returns non-zero when COI is available and at least one engine is
// configured.
extern int __offload_init_library(void)
{
    // do one time initialization
    static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
    __offload_run_once(&ctrl, __offload_init_library_once);

    // offload is available if COI is available and the number of devices > 0
    bool is_available = COI::is_available && (mic_engines_total > 0);

    // register pending libraries if there are any
    // (__target_libs is tested before the lock is taken)
    if (is_available && __target_libs) {
        mutex_locker_t locker(__target_libs_lock);

        for (TargetImageList::iterator it = __target_libs_list.begin();
             it != __target_libs_list.end(); it++) {
            // Register library in COI
            COI::ProcessRegisterLibraries(1, &it->data, &it->size,
                                          &it->origin, &it->offset);

            // add lib to all engines
            for (int i = 0; i < mic_engines_total; i++) {
                mic_engines[i].add_lib(*it);
            }
        }

        __target_libs = false;
        __target_libs_list.clear();
    }

    return is_available;
}

// Register a target image with the runtime. The image starts with its size,
// followed by a null-terminated name and the binary itself; the ELF header
// type of the binary (executable or shared library) selects the action.
//
// NOTE(review): the target types of the two static_casts below were lost in
// extraction - restore them from the original source.
extern "C" void __offload_register_image(const void *target_image)
{
    const struct Image *image = static_cast(target_image);

    // decode image
    const char *name = image->data;
    const void *data = image->data + strlen(image->data) + 1;
    uint64_t    size = image->size;
    const char *origin = 0;
    uint64_t    offset = 0;

    // our actions depend on the image type
    const Elf64_Ehdr *hdr = static_cast(data);
    switch (hdr->e_type) {
        case ET_EXEC:
            // Each offload application is supposed to have only one target
            // image representing target executable.
            // No thread synchronization is required here as the initialization
            // code is always executed in a single thread.
// Continuation of the ET_EXEC case of __offload_register_image: a second
// executable image is an error that terminates the process.
            if (__target_exe != 0) {
                LIBOFFLOAD_ERROR(c_multiple_target_exes);
                exit(1);
            }
            __target_exe = new TargetImage(name, data, size, origin, offset);

            // Registration code for execs is always called from the context
            // of main and thus we can safely call any function here,
            // including LoadLibrary API on windows. This is the place where
            // we do the offload library initialization.
            if (__offload_init_library()) {
                // initialize engine if init_type is on_start
                if (__offload_init_type == c_init_on_start) {
                    for (int i = 0; i < mic_engines_total; i++) {
                        mic_engines[i].init();
                    }
                }
            }
            break;

        case ET_DYN:
            // Registration code for libraries is called from the DllMain
            // context (on windows) and thus we cannot do anything useful
            // here. So we just add it to the list of pending libraries for
            // the later use.
            // NOTE(review): lock()/unlock() are called by hand; if push_back
            // throws, the lock stays held.
            __target_libs_lock.lock();
            __target_libs = true;
            __target_libs_list.push_back(TargetImage(name, data, size,
                                                     origin, offset));
            __target_libs_lock.unlock();
            break;

        default:
            // something is definitely wrong, issue an error and exit
            LIBOFFLOAD_ERROR(c_unknown_binary_type);
            exit(1);
    }
}

// Counterpart of __offload_register_image. Only the executable image
// triggers work: the timer report is printed when timers are enabled, MYO is
// finalised when built with MYO_SUPPORT, and the library is cleaned up.
//
// NOTE(review): the target types of the two static_casts below were lost in
// extraction - restore them from the original source.
// NOTE(review): `name` is computed but never used in this function.
extern "C" void __offload_unregister_image(const void *target_image)
{
    // Target image is packed as follows:
    //      8 bytes                - size of the target binary
    //      null-terminated string - binary name
    //      (size) bytes           - binary contents
    const struct Image {
         int64_t size;
         char data[];
    } *image = static_cast(target_image);

    // decode image
    const char *name = image->data;
    const void *data = image->data + strlen(image->data) + 1;

    // our actions depend on the image type
    const Elf64_Ehdr *hdr = static_cast(data);
    if (hdr->e_type == ET_EXEC) {
        // We are executing exec's destructors.
        // It is time to do a library cleanup.
        if (timer_enabled) {
            Offload_Timer_Print();
        }

#ifdef MYO_SUPPORT
        __offload_myoFini();
#endif // MYO_SUPPORT

        __offload_fini_library();
    }
}

// Runtime trace interface for user programs

// Set the console trace level used by the host runtime.
void __offload_console_trace(int level)
{
    console_enabled = level;
}

// User-visible offload API

// Number of engines configured for offload; initialises the library first.
int _Offload_number_of_devices(void)
{
    __offload_init_library();
    return mic_engines_total;
}

// This implementation always returns -1.
int _Offload_get_device_number(void)
{
    return -1;
}

// This implementation always returns -1.
int _Offload_get_physical_device_number(void)
{
    return -1;
}

// Query the state of the asynchronous offload tagged with `signal` on the
// engine selected by `index` (taken modulo the number of engines).
// A negative index, the absence of engines, or an unknown signal is reported
// and followed by LIBOFFLOAD_ABORT.
int _Offload_signaled(int index, void *signal)
{
    __offload_init_library();

    // check index value
    if (index < 0 || mic_engines_total <= 0) {
        LIBOFFLOAD_ERROR(c_offload_signaled1, index);
        LIBOFFLOAD_ABORT;
    }

    // find associated async task
    OffloadDescriptor *task =
        mic_engines[index % mic_engines_total].find_signal(signal, false);
    if (task == 0) {
        LIBOFFLOAD_ERROR(c_offload_signaled2, signal);
        LIBOFFLOAD_ABORT;
    }

    return task->is_signaled();
}

// Switch offload reporting on or off; any other value is ignored.
void _Offload_report(int val)
{
    if (val == OFFLOAD_REPORT_ON ||
        val == OFFLOAD_REPORT_OFF) {
        offload_report_enabled = val;
    }
}

// IDB support
int   __dbg_is_attached = 0;
int   __dbg_target_id = -1;
pid_t __dbg_target_so_pid = -1;
char  __dbg_target_exe_name[MAX_TARGET_NAME] = {0};
const int __dbg_api_major_version = 1;
const int __dbg_api_minor_version = 0;

// Intentionally empty functions.
// NOTE(review): presumably hooks for the debugger named in the "IDB support"
// comment above - confirm.
void __dbg_target_so_loaded()
{
}
void __dbg_target_so_unloaded()
{
}