//===----------------------------------------------------------------------===// // // The LLVM Compiler Infrastructure // // This file is dual licensed under the MIT and the University of Illinois Open // Source Licenses. See LICENSE.txt for details. // //===----------------------------------------------------------------------===// #include "offload_target.h" #include #include #ifdef SEP_SUPPORT #include #include #endif // SEP_SUPPORT #include #include // typedef offload_func_with_parms. // Pointer to function that represents an offloaded entry point. // The parameters are a temporary fix for parameters on the stack. typedef void (*offload_func_with_parms)(void *); // Target console and file logging const char *prefix; int console_enabled = 0; int offload_report_level = 0; // Trace information static const char* vardesc_direction_as_string[] = { "NOCOPY", "IN", "OUT", "INOUT" }; static const char* vardesc_type_as_string[] = { "unknown", "data", "data_ptr", "func_ptr", "void_ptr", "string_ptr", "dv", "dv_data", "dv_data_slice", "dv_ptr", "dv_ptr_data", "dv_ptr_data_slice", "cean_var", "cean_var_ptr", "c_data_ptr_array" }; int mic_index = -1; int mic_engines_total = -1; uint64_t mic_frequency = 0; int offload_number = 0; static std::map ref_data; static mutex_t add_ref_lock; #ifdef SEP_SUPPORT static const char* sep_monitor_env = "SEP_MONITOR"; static bool sep_monitor = false; static const char* sep_device_env = "SEP_DEVICE"; static const char* sep_device = "/dev/sep3.8/c"; static int sep_counter = 0; #define SEP_API_IOC_MAGIC 99 #define SEP_IOCTL_PAUSE _IO (SEP_API_IOC_MAGIC, 31) #define SEP_IOCTL_RESUME _IO (SEP_API_IOC_MAGIC, 32) static void add_ref_count(void * buf, bool created) { mutex_locker_t locker(add_ref_lock); RefInfo * info = ref_data[buf]; if (info) { info->count++; } else { info = new RefInfo((int)created,(long)1); } info->is_added |= created; ref_data[buf] = info; } static void BufReleaseRef(void * buf) { mutex_locker_t locker(add_ref_lock); RefInfo * info = ref_data[buf]; if (info) { --info->count; if (info->count == 0 && info->is_added) { BufferReleaseRef(buf); info->is_added = 0; } } } static int VTPauseSampling(void) { int ret = -1; int handle = open(sep_device, O_RDWR); if (handle > 0) { ret = ioctl(handle, SEP_IOCTL_PAUSE); close(handle); } return ret; } static int VTResumeSampling(void) { int ret = -1; int handle = open(sep_device, O_RDWR); if (handle > 0) { ret = ioctl(handle, SEP_IOCTL_RESUME); close(handle); } return ret; } #endif // SEP_SUPPORT void OffloadDescriptor::offload( uint32_t buffer_count, void** buffers, void* misc_data, uint16_t misc_data_len, void* return_data, uint16_t return_data_len ) { FunctionDescriptor *func = (FunctionDescriptor*) misc_data; const char *name = func->data; OffloadDescriptor ofld; char *in_data = 0; char *out_data = 0; char *timer_data = 0; console_enabled = func->console_enabled; timer_enabled = func->timer_enabled; offload_report_level = func->offload_report_level; offload_number = func->offload_number; ofld.set_offload_number(func->offload_number); #ifdef SEP_SUPPORT if (sep_monitor) { if (__sync_fetch_and_add(&sep_counter, 1) == 0) { OFFLOAD_DEBUG_TRACE(2, "VTResumeSampling\n"); VTResumeSampling(); } } #endif // SEP_SUPPORT OFFLOAD_DEBUG_TRACE_1(2, ofld.get_offload_number(), c_offload_start_target_func, "Offload \"%s\" started\n", name); // initialize timer data OFFLOAD_TIMER_INIT(); OFFLOAD_TIMER_START(c_offload_target_total_time); OFFLOAD_TIMER_START(c_offload_target_descriptor_setup); // get input/output buffer addresses if (func->in_datalen > 0 || func->out_datalen > 0) { if (func->data_offset != 0) { in_data = (char*) misc_data + func->data_offset; out_data = (char*) return_data; } else { char *inout_buf = (char*) buffers[--buffer_count]; in_data = inout_buf; out_data = inout_buf; } } // assign variable descriptors ofld.m_vars_total = func->vars_num; if (ofld.m_vars_total > 0) { uint64_t var_data_len = ofld.m_vars_total * sizeof(VarDesc); ofld.m_vars = (VarDesc*) malloc(var_data_len); memcpy(ofld.m_vars, in_data, var_data_len); in_data += var_data_len; func->in_datalen -= var_data_len; } // timer data if (func->timer_enabled) { uint64_t timer_data_len = OFFLOAD_TIMER_DATALEN(); timer_data = out_data; out_data += timer_data_len; func->out_datalen -= timer_data_len; } // init Marshallers ofld.m_in.init_buffer(in_data, func->in_datalen); ofld.m_out.init_buffer(out_data, func->out_datalen); // copy buffers to offload descriptor std::copy(buffers, buffers + buffer_count, std::back_inserter(ofld.m_buffers)); OFFLOAD_TIMER_STOP(c_offload_target_descriptor_setup); // find offload entry address OFFLOAD_TIMER_START(c_offload_target_func_lookup); offload_func_with_parms entry = (offload_func_with_parms) __offload_entries.find_addr(name); if (entry == NULL) { #if OFFLOAD_DEBUG > 0 if (console_enabled > 2) { __offload_entries.dump(); } #endif LIBOFFLOAD_ERROR(c_offload_descriptor_offload, name); exit(1); } OFFLOAD_TIMER_STOP(c_offload_target_func_lookup); OFFLOAD_TIMER_START(c_offload_target_func_time); // execute offload entry entry(&ofld); OFFLOAD_TIMER_STOP(c_offload_target_func_time); OFFLOAD_TIMER_STOP(c_offload_target_total_time); // copy timer data to the buffer OFFLOAD_TIMER_TARGET_DATA(timer_data); OFFLOAD_DEBUG_TRACE(2, "Offload \"%s\" finished\n", name); #ifdef SEP_SUPPORT if (sep_monitor) { if (__sync_sub_and_fetch(&sep_counter, 1) == 0) { OFFLOAD_DEBUG_TRACE(2, "VTPauseSampling\n"); VTPauseSampling(); } } #endif // SEP_SUPPORT } void OffloadDescriptor::merge_var_descs( VarDesc *vars, VarDesc2 *vars2, int vars_total ) { // number of variable descriptors received from host and generated // locally should match if (m_vars_total < vars_total) { LIBOFFLOAD_ERROR(c_merge_var_descs1); exit(1); } for (int i = 0; i < m_vars_total; i++) { if (i < vars_total) { // variable type must match if (m_vars[i].type.bits != vars[i].type.bits) { LIBOFFLOAD_ERROR(c_merge_var_descs2); exit(1); } m_vars[i].ptr = vars[i].ptr; m_vars[i].into = vars[i].into; const char *var_sname = ""; if (vars2 != NULL) { if (vars2[i].sname != NULL) { var_sname = vars2[i].sname; } } OFFLOAD_DEBUG_TRACE_1(2, get_offload_number(), c_offload_var, " VarDesc %d, var=%s, %s, %s\n", i, var_sname, vardesc_direction_as_string[m_vars[i].direction.bits], vardesc_type_as_string[m_vars[i].type.src]); if (vars2 != NULL && vars2[i].dname != NULL) { OFFLOAD_TRACE(2, " into=%s, %s\n", vars2[i].dname, vardesc_type_as_string[m_vars[i].type.dst]); } } OFFLOAD_TRACE(2, " type_src=%d, type_dstn=%d, direction=%d, " "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, " "offset=%lld, size=%lld, count/disp=%lld, ptr=%p into=%p\n", m_vars[i].type.src, m_vars[i].type.dst, m_vars[i].direction.bits, m_vars[i].alloc_if, m_vars[i].free_if, m_vars[i].align, m_vars[i].mic_offset, m_vars[i].flags.bits, m_vars[i].offset, m_vars[i].size, m_vars[i].count, m_vars[i].ptr, m_vars[i].into); } } void OffloadDescriptor::scatter_copyin_data() { OFFLOAD_TIMER_START(c_offload_target_scatter_inputs); OFFLOAD_DEBUG_TRACE(2, "IN buffer @ %p size %lld\n", m_in.get_buffer_start(), m_in.get_buffer_size()); OFFLOAD_DEBUG_DUMP_BYTES(2, m_in.get_buffer_start(), m_in.get_buffer_size()); // receive data for (int i = 0; i < m_vars_total; i++) { bool src_is_for_mic = (m_vars[i].direction.out || m_vars[i].into == NULL); void** ptr_addr = src_is_for_mic ? static_cast(m_vars[i].ptr) : static_cast(m_vars[i].into); int type = src_is_for_mic ? m_vars[i].type.src : m_vars[i].type.dst; bool is_static = src_is_for_mic ? m_vars[i].flags.is_static : m_vars[i].flags.is_static_dstn; void *ptr = NULL; if (m_vars[i].flags.alloc_disp) { int64_t offset = 0; m_in.receive_data(&offset, sizeof(offset)); m_vars[i].offset = -offset; } if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) { ArrDesc *dvp = (type == c_dv_data_slice || type == c_dv_data)? reinterpret_cast(ptr_addr) : *reinterpret_cast(ptr_addr); ptr_addr = reinterpret_cast(&dvp->Base); } // Set pointer values switch (type) { case c_data_ptr_array: { int j = m_vars[i].ptr_arr_offset; int max_el = j + m_vars[i].count; char *dst_arr_ptr = (src_is_for_mic)? *(reinterpret_cast(m_vars[i].ptr)) : reinterpret_cast(m_vars[i].into); for (; j < max_el; j++) { if (src_is_for_mic) { m_vars[j].ptr = dst_arr_ptr + m_vars[j].ptr_arr_offset; } else { m_vars[j].into = dst_arr_ptr + m_vars[j].ptr_arr_offset; } } } break; case c_data: case c_void_ptr: case c_cean_var: case c_dv: break; case c_string_ptr: case c_data_ptr: case c_cean_var_ptr: case c_dv_ptr: if (m_vars[i].alloc_if) { void *buf; if (m_vars[i].flags.sink_addr) { m_in.receive_data(&buf, sizeof(buf)); } else { buf = m_buffers.front(); m_buffers.pop_front(); } if (buf) { if (!is_static) { if (!m_vars[i].flags.sink_addr) { // increment buffer reference OFFLOAD_TIMER_START(c_offload_target_add_buffer_refs); BufferAddRef(buf); OFFLOAD_TIMER_STOP(c_offload_target_add_buffer_refs); } add_ref_count(buf, 0 == m_vars[i].flags.sink_addr); } ptr = static_cast(buf) + m_vars[i].mic_offset + (m_vars[i].flags.is_stack_buf ? 0 : m_vars[i].offset); } *ptr_addr = ptr; } else if (m_vars[i].flags.sink_addr) { void *buf; m_in.receive_data(&buf, sizeof(buf)); void *ptr = static_cast(buf) + m_vars[i].mic_offset + (m_vars[i].flags.is_stack_buf ? 0 : m_vars[i].offset); *ptr_addr = ptr; } break; case c_func_ptr: break; case c_dv_data: case c_dv_ptr_data: case c_dv_data_slice: case c_dv_ptr_data_slice: if (m_vars[i].alloc_if) { void *buf; if (m_vars[i].flags.sink_addr) { m_in.receive_data(&buf, sizeof(buf)); } else { buf = m_buffers.front(); m_buffers.pop_front(); } if (buf) { if (!is_static) { if (!m_vars[i].flags.sink_addr) { // increment buffer reference OFFLOAD_TIMER_START(c_offload_target_add_buffer_refs); BufferAddRef(buf); OFFLOAD_TIMER_STOP(c_offload_target_add_buffer_refs); } add_ref_count(buf, 0 == m_vars[i].flags.sink_addr); } ptr = static_cast(buf) + m_vars[i].mic_offset + m_vars[i].offset; } *ptr_addr = ptr; } else if (m_vars[i].flags.sink_addr) { void *buf; m_in.receive_data(&buf, sizeof(buf)); ptr = static_cast(buf) + m_vars[i].mic_offset + m_vars[i].offset; *ptr_addr = ptr; } break; default: LIBOFFLOAD_ERROR(c_unknown_var_type, type); abort(); } // Release obsolete buffers for stack of persistent objects if (type = c_data_ptr && m_vars[i].flags.is_stack_buf && !m_vars[i].direction.bits && m_vars[i].alloc_if && m_vars[i].size != 0) { for (int j=0; j < m_vars[i].size; j++) { void *buf; m_in.receive_data(&buf, sizeof(buf)); BufferReleaseRef(buf); ref_data.erase(buf); } } // Do copyin switch (m_vars[i].type.dst) { case c_data_ptr_array: break; case c_data: case c_void_ptr: case c_cean_var: if (m_vars[i].direction.in && !m_vars[i].flags.is_static_dstn) { int64_t size; int64_t disp; char* ptr = m_vars[i].into ? static_cast(m_vars[i].into) : static_cast(m_vars[i].ptr); if (m_vars[i].type.dst == c_cean_var) { m_in.receive_data((&size), sizeof(int64_t)); m_in.receive_data((&disp), sizeof(int64_t)); } else { size = m_vars[i].size; disp = 0; } m_in.receive_data(ptr + disp, size); } break; case c_dv: if (m_vars[i].direction.bits || m_vars[i].alloc_if || m_vars[i].free_if) { char* ptr = m_vars[i].into ? static_cast(m_vars[i].into) : static_cast(m_vars[i].ptr); m_in.receive_data(ptr + sizeof(uint64_t), m_vars[i].size - sizeof(uint64_t)); } break; case c_string_ptr: case c_data_ptr: case c_cean_var_ptr: case c_dv_ptr: case c_dv_data: case c_dv_ptr_data: case c_dv_data_slice: case c_dv_ptr_data_slice: break; case c_func_ptr: if (m_vars[i].direction.in) { m_in.receive_func_ptr((const void**) m_vars[i].ptr); } break; default: LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.dst); abort(); } } OFFLOAD_TRACE(1, "Total copyin data received from host: [%lld] bytes\n", m_in.get_tfr_size()); OFFLOAD_TIMER_STOP(c_offload_target_scatter_inputs); OFFLOAD_TIMER_START(c_offload_target_compute); } void OffloadDescriptor::gather_copyout_data() { OFFLOAD_TIMER_STOP(c_offload_target_compute); OFFLOAD_TIMER_START(c_offload_target_gather_outputs); for (int i = 0; i < m_vars_total; i++) { bool src_is_for_mic = (m_vars[i].direction.out || m_vars[i].into == NULL); switch (m_vars[i].type.src) { case c_data_ptr_array: break; case c_data: case c_void_ptr: case c_cean_var: if (m_vars[i].direction.out && !m_vars[i].flags.is_static) { m_out.send_data( static_cast(m_vars[i].ptr) + m_vars[i].disp, m_vars[i].size); } break; case c_dv: break; case c_string_ptr: case c_data_ptr: case c_cean_var_ptr: case c_dv_ptr: if (m_vars[i].free_if && src_is_for_mic && !m_vars[i].flags.is_static) { void *buf = *static_cast(m_vars[i].ptr) - m_vars[i].mic_offset - (m_vars[i].flags.is_stack_buf? 0 : m_vars[i].offset); if (buf == NULL) { break; } // decrement buffer reference count OFFLOAD_TIMER_START(c_offload_target_release_buffer_refs); BufReleaseRef(buf); OFFLOAD_TIMER_STOP(c_offload_target_release_buffer_refs); } break; case c_func_ptr: if (m_vars[i].direction.out) { m_out.send_func_ptr(*((void**) m_vars[i].ptr)); } break; case c_dv_data: case c_dv_ptr_data: case c_dv_data_slice: case c_dv_ptr_data_slice: if (src_is_for_mic && m_vars[i].free_if && !m_vars[i].flags.is_static) { ArrDesc *dvp = (m_vars[i].type.src == c_dv_data || m_vars[i].type.src == c_dv_data_slice) ? static_cast(m_vars[i].ptr) : *static_cast(m_vars[i].ptr); void *buf = reinterpret_cast(dvp->Base) - m_vars[i].mic_offset - m_vars[i].offset; if (buf == NULL) { break; } // decrement buffer reference count OFFLOAD_TIMER_START(c_offload_target_release_buffer_refs); BufReleaseRef(buf); OFFLOAD_TIMER_STOP(c_offload_target_release_buffer_refs); } break; default: LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.dst); abort(); } if (m_vars[i].into) { switch (m_vars[i].type.dst) { case c_data_ptr_array: break; case c_data: case c_void_ptr: case c_cean_var: case c_dv: break; case c_string_ptr: case c_data_ptr: case c_cean_var_ptr: case c_dv_ptr: if (m_vars[i].direction.in && m_vars[i].free_if && !m_vars[i].flags.is_static_dstn) { void *buf = *static_cast(m_vars[i].into) - m_vars[i].mic_offset - (m_vars[i].flags.is_stack_buf? 0 : m_vars[i].offset); if (buf == NULL) { break; } // decrement buffer reference count OFFLOAD_TIMER_START( c_offload_target_release_buffer_refs); BufReleaseRef(buf); OFFLOAD_TIMER_STOP( c_offload_target_release_buffer_refs); } break; case c_func_ptr: break; case c_dv_data: case c_dv_ptr_data: case c_dv_data_slice: case c_dv_ptr_data_slice: if (m_vars[i].free_if && m_vars[i].direction.in && !m_vars[i].flags.is_static_dstn) { ArrDesc *dvp = (m_vars[i].type.dst == c_dv_data_slice || m_vars[i].type.dst == c_dv_data) ? static_cast(m_vars[i].into) : *static_cast(m_vars[i].into); void *buf = reinterpret_cast(dvp->Base) - m_vars[i].mic_offset - m_vars[i].offset; if (buf == NULL) { break; } // decrement buffer reference count OFFLOAD_TIMER_START( c_offload_target_release_buffer_refs); BufReleaseRef(buf); OFFLOAD_TIMER_STOP( c_offload_target_release_buffer_refs); } break; default: LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.dst); abort(); } } } OFFLOAD_DEBUG_TRACE(2, "OUT buffer @ p %p size %lld\n", m_out.get_buffer_start(), m_out.get_buffer_size()); OFFLOAD_DEBUG_DUMP_BYTES(2, m_out.get_buffer_start(), m_out.get_buffer_size()); OFFLOAD_DEBUG_TRACE_1(1, get_offload_number(), c_offload_copyout_data, "Total copyout data sent to host: [%lld] bytes\n", m_out.get_tfr_size()); OFFLOAD_TIMER_STOP(c_offload_target_gather_outputs); } void __offload_target_init(void) { #ifdef SEP_SUPPORT const char* env_var = getenv(sep_monitor_env); if (env_var != 0 && *env_var != '\0') { sep_monitor = atoi(env_var); } env_var = getenv(sep_device_env); if (env_var != 0 && *env_var != '\0') { sep_device = env_var; } #endif // SEP_SUPPORT prefix = report_get_message_str(c_report_mic); // init frequency mic_frequency = COIPerfGetCycleFrequency(); } // User-visible offload API int _Offload_number_of_devices(void) { return mic_engines_total; } int _Offload_get_device_number(void) { return mic_index; } int _Offload_get_physical_device_number(void) { uint32_t index; EngineGetIndex(&index); return index; }