27 files changed, 642 insertions, 0 deletions
diff --git a/libclc/amdgpu/lib/OVERRIDES b/libclc/amdgpu/lib/OVERRIDES
new file mode 100644
index 00000000000..3f941d890be
--- /dev/null
+++ b/libclc/amdgpu/lib/OVERRIDES
@@ -0,0 +1,2 @@
+workitem/get_group_id.cl
+workitem/get_global_size.cl
diff --git a/libclc/amdgpu/lib/SOURCES b/libclc/amdgpu/lib/SOURCES
new file mode 100644
index 00000000000..7505f3f9692
--- /dev/null
+++ b/libclc/amdgpu/lib/SOURCES
@@ -0,0 +1,25 @@
+atomic/atomic.cl
+math/ldexp.cl
+math/nextafter.cl
+math/sqrt.cl
+workitem/get_num_groups.ll
+workitem/get_group_id.ll
+workitem/get_local_size.ll
+workitem/get_local_id.ll
+workitem/get_global_size.ll
+workitem/get_work_dim.ll
+synchronization/barrier.cl
+image/get_image_width.cl
+image/get_image_height.cl
+image/get_image_depth.cl
+image/get_image_channel_data_type.cl
+image/get_image_channel_order.cl
+image/get_image_attributes_impl.ll
+image/read_imagef.cl
+image/read_imagei.cl
+image/read_imageui.cl
+image/read_image_impl.ll
+image/write_imagef.cl
+image/write_imagei.cl
+image/write_imageui.cl
+image/write_image_impl.ll
diff --git a/libclc/amdgpu/lib/atomic/atomic.cl b/libclc/amdgpu/lib/atomic/atomic.cl
new file mode 100644
index 00000000000..5bfe07b94bf
--- /dev/null
+++ b/libclc/amdgpu/lib/atomic/atomic.cl
@@ -0,0 +1,65 @@
+#include <clc/clc.h>
+/* Defines the OpenCL overload CL_FUNCTION(p, val): casts both arguments to ARG_SIGN TYPE, calls __clc_<CLC_FUNCTION>_addr<LLVM_ADDRSPACE>, and casts the result back to RET_SIGN TYPE. */
+#define ATOMIC_FUNC_DEFINE(RET_SIGN, ARG_SIGN, TYPE, CL_FUNCTION, CLC_FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
+_CLC_OVERLOAD _CLC_DEF RET_SIGN TYPE CL_FUNCTION (volatile CL_ADDRSPACE RET_SIGN TYPE *p, RET_SIGN TYPE val) { \
+	return (RET_SIGN TYPE)__clc_##CLC_FUNCTION##_addr##LLVM_ADDRSPACE((volatile CL_ADDRSPACE ARG_SIGN TYPE*)p, (ARG_SIGN TYPE)val); \
+}
+
+/* For atomic functions that don't need different bitcode depending on argument signedness: declares the signed helper once and defines both the signed and the unsigned overload on top of it. */
+#define ATOMIC_FUNC_SIGN(TYPE, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
+	_CLC_DECL signed TYPE __clc_##FUNCTION##_addr##LLVM_ADDRSPACE(volatile CL_ADDRSPACE signed TYPE*, signed TYPE); \
+	ATOMIC_FUNC_DEFINE(signed, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
+	ATOMIC_FUNC_DEFINE(unsigned, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE)
+/* Instantiates FUNCTION for the global (helper suffix _addr1) and local (helper suffix _addr3) address spaces. */
+#define ATOMIC_FUNC_ADDRSPACE(TYPE, FUNCTION) \
+	ATOMIC_FUNC_SIGN(TYPE, FUNCTION, global, 1) \
+	ATOMIC_FUNC_SIGN(TYPE, FUNCTION, local, 3)
+/* Instantiates the two-argument atomic FUNCTION for int, the only element type generated here. */
+#define ATOMIC_FUNC(FUNCTION) \
+	ATOMIC_FUNC_ADDRSPACE(int, FUNCTION)
+/* Three-argument counterpart of ATOMIC_FUNC_DEFINE: defines CL_FUNCTION(p, cmp, val) as a cast-and-forward call to __clc_<CLC_FUNCTION>_addr<LLVM_ADDRSPACE>. */
+#define ATOMIC_FUNC_DEFINE_3_ARG(RET_SIGN, ARG_SIGN, TYPE, CL_FUNCTION, CLC_FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
+_CLC_OVERLOAD _CLC_DEF RET_SIGN TYPE CL_FUNCTION (volatile CL_ADDRSPACE RET_SIGN TYPE *p, RET_SIGN TYPE cmp, RET_SIGN TYPE val) { \
+	return (RET_SIGN TYPE)__clc_##CLC_FUNCTION##_addr##LLVM_ADDRSPACE((volatile CL_ADDRSPACE ARG_SIGN TYPE*)p, (ARG_SIGN TYPE)cmp, (ARG_SIGN TYPE)val); \
+}
+
+/* For three-argument atomic functions that don't need different bitcode depending on argument signedness: one signed helper backs both the signed and the unsigned overload. */
+#define ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
+	_CLC_DECL signed TYPE __clc_##FUNCTION##_addr##LLVM_ADDRSPACE(volatile CL_ADDRSPACE signed TYPE*, signed TYPE, signed TYPE); \
+	ATOMIC_FUNC_DEFINE_3_ARG(signed, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
+	ATOMIC_FUNC_DEFINE_3_ARG(unsigned, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE)
+/* Instantiates the three-argument FUNCTION for the global (_addr1) and local (_addr3) address spaces. */
+#define ATOMIC_FUNC_ADDRSPACE_3_ARG(TYPE, FUNCTION) \
+	ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, global, 1) \
+	ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, local, 3)
+/* Instantiates the three-argument atomic FUNCTION for int, the only element type generated here. */
+#define ATOMIC_FUNC_3_ARG(FUNCTION) \
+	ATOMIC_FUNC_ADDRSPACE_3_ARG(int, FUNCTION)
+/* Sign-agnostic atomics: each expands to signed and unsigned int overloads for global and local pointers. */
+ATOMIC_FUNC(atomic_add)
+ATOMIC_FUNC(atomic_and)
+ATOMIC_FUNC(atomic_or)
+ATOMIC_FUNC(atomic_sub)
+ATOMIC_FUNC(atomic_xchg)
+ATOMIC_FUNC(atomic_xor)
+ATOMIC_FUNC_3_ARG(atomic_cmpxchg)
+/* atomic_max is sign-sensitive: signed overloads call the __clc_atomic_max helpers, unsigned overloads the __clc_atomic_umax helpers. */
+_CLC_DECL signed int __clc_atomic_max_addr1(volatile global signed int*, signed int);
+_CLC_DECL signed int __clc_atomic_max_addr3(volatile local signed int*, signed int);
+_CLC_DECL uint __clc_atomic_umax_addr1(volatile global uint*, uint);
+_CLC_DECL uint __clc_atomic_umax_addr3(volatile local uint*, uint);
+
+ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_max, atomic_max, global, 1)
+ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_max, atomic_max, local, 3)
+ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_max, atomic_umax, global, 1)
+ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_max, atomic_umax, local, 3)
+/* atomic_min is sign-sensitive: signed overloads call the __clc_atomic_min helpers, unsigned overloads the __clc_atomic_umin helpers. */
+_CLC_DECL signed int __clc_atomic_min_addr1(volatile global signed int*, signed int);
+_CLC_DECL signed int __clc_atomic_min_addr3(volatile local signed int*, signed int);
+_CLC_DECL uint __clc_atomic_umin_addr1(volatile global uint*, uint);
+_CLC_DECL uint __clc_atomic_umin_addr3(volatile local uint*, uint);
+
+ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_min, atomic_min, global, 1)
+ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_min, atomic_min, local, 3)
+ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_min, atomic_umin, global, 1)
+ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_min, atomic_umin, local, 3)
diff --git a/libclc/amdgpu/lib/image/get_image_attributes_impl.ll b/libclc/amdgpu/lib/image/get_image_attributes_impl.ll
new file mode 100644
index 00000000000..7f1965de760
--- /dev/null
+++ b/libclc/amdgpu/lib/image/get_image_attributes_impl.ll
@@ -0,0 +1,87 @@
+%opencl.image2d_t = type opaque
+%opencl.image3d_t = type opaque
+
+declare i32 @llvm.OpenCL.image.get.resource.id.2d(
+  %opencl.image2d_t addrspace(1)*) nounwind readnone
+declare i32 @llvm.OpenCL.image.get.resource.id.3d(
+  %opencl.image3d_t addrspace(1)*) nounwind readnone
+
+declare [3 x i32] @llvm.OpenCL.image.get.size.2d(
+  %opencl.image2d_t addrspace(1)*) nounwind readnone
+declare [3 x i32] @llvm.OpenCL.image.get.size.3d(
+  %opencl.image3d_t addrspace(1)*) nounwind readnone
+
+declare [2 x i32] @llvm.OpenCL.image.get.format.2d(
+  %opencl.image2d_t addrspace(1)*) nounwind readnone
+declare [2 x i32] @llvm.OpenCL.image.get.format.3d(
+  %opencl.image3d_t addrspace(1)*) nounwind readnone
+
+; Width of a 2D image: element 0 of the [3 x i32] value returned by the size intrinsic.
+define i32 @__clc_get_image_width_2d(
+                          %opencl.image2d_t addrspace(1)* nocapture %img) #0 {
+  %size = tail call [3 x i32] @llvm.OpenCL.image.get.size.2d(%opencl.image2d_t addrspace(1)* %img)
+  %width = extractvalue [3 x i32] %size, 0
+  ret i32 %width
+}
+; Width of a 3D image: element 0 of the [3 x i32] value returned by the size intrinsic.
+define i32 @__clc_get_image_width_3d(
+                          %opencl.image3d_t addrspace(1)* nocapture %img) #0 {
+  %size = tail call [3 x i32] @llvm.OpenCL.image.get.size.3d(%opencl.image3d_t addrspace(1)* %img)
+  %width = extractvalue [3 x i32] %size, 0
+  ret i32 %width
+}
+
+; Height of a 2D image: element 1 of the [3 x i32] value returned by the size intrinsic.
+define i32 @__clc_get_image_height_2d(
+                          %opencl.image2d_t addrspace(1)* nocapture %img) #0 {
+  %size = tail call [3 x i32] @llvm.OpenCL.image.get.size.2d(%opencl.image2d_t addrspace(1)* %img)
+  %height = extractvalue [3 x i32] %size, 1
+  ret i32 %height
+}
+; Height of a 3D image: element 1 of the [3 x i32] value returned by the size intrinsic.
+define i32 @__clc_get_image_height_3d(
+                          %opencl.image3d_t addrspace(1)* nocapture %img) #0 {
+  %size = tail call [3 x i32] @llvm.OpenCL.image.get.size.3d(%opencl.image3d_t addrspace(1)* %img)
+  %height = extractvalue [3 x i32] %size, 1
+  ret i32 %height
+}
+
+; Depth of a 3D image: element 2 of the [3 x i32] value returned by the size intrinsic.
+define i32 @__clc_get_image_depth_3d(
+                          %opencl.image3d_t addrspace(1)* nocapture %img) #0 {
+  %size = tail call [3 x i32] @llvm.OpenCL.image.get.size.3d(%opencl.image3d_t addrspace(1)* %img)
+  %depth = extractvalue [3 x i32] %size, 2
+  ret i32 %depth
+}
+
+; Channel data type of a 2D image: element 0 of the [2 x i32] value returned by the format intrinsic.
+define i32 @__clc_get_image_channel_data_type_2d(
+                          %opencl.image2d_t addrspace(1)* nocapture %img) #0 {
+  %format = tail call [2 x i32] @llvm.OpenCL.image.get.format.2d(%opencl.image2d_t addrspace(1)* %img)
+  %data_type = extractvalue [2 x i32] %format, 0
+  ret i32 %data_type
+}
+; Channel data type of a 3D image: element 0 of the [2 x i32] value returned by the format intrinsic.
+define i32 @__clc_get_image_channel_data_type_3d(
+                          %opencl.image3d_t addrspace(1)* nocapture %img) #0 {
+  %format = tail call [2 x i32] @llvm.OpenCL.image.get.format.3d(%opencl.image3d_t addrspace(1)* %img)
+  %data_type = extractvalue [2 x i32] %format, 0
+  ret i32 %data_type
+}
+
+; Channel order of a 2D image: element 1 of the [2 x i32] value returned by the format intrinsic.
+define i32 @__clc_get_image_channel_order_2d(
+                          %opencl.image2d_t addrspace(1)* nocapture %img) #0 {
+  %format = tail call [2 x i32] @llvm.OpenCL.image.get.format.2d(%opencl.image2d_t addrspace(1)* %img)
+  %order = extractvalue [2 x i32] %format, 1
+  ret i32 %order
+}
+; Channel order of a 3D image: element 1 of the [2 x i32] value returned by the format intrinsic.
+define i32 @__clc_get_image_channel_order_3d(
+                          %opencl.image3d_t addrspace(1)* nocapture %img) #0 {
+  %format = tail call [2 x i32] @llvm.OpenCL.image.get.format.3d(%opencl.image3d_t addrspace(1)* %img)
+  %order = extractvalue [2 x i32] %format, 1
+  ret i32 %order
+}
+
+attributes #0 = { nounwind readnone alwaysinline }
diff --git a/libclc/amdgpu/lib/image/get_image_channel_data_type.cl b/libclc/amdgpu/lib/image/get_image_channel_data_type.cl
new file mode 100644
index 00000000000..2a2478f73a7
--- /dev/null
+++ b/libclc/amdgpu/lib/image/get_image_channel_data_type.cl
@@ -0,0 +1,13 @@
+#include <clc/clc.h>
+
+_CLC_DECL int __clc_get_image_channel_data_type_2d(image2d_t);
+_CLC_DECL int __clc_get_image_channel_data_type_3d(image3d_t);
+/* Public overloads: each forwards to the __clc_* helper declared above for its image type (helpers are defined in image/get_image_attributes_impl.ll). */
+_CLC_OVERLOAD _CLC_DEF int
+get_image_channel_data_type(image2d_t image) {
+  return __clc_get_image_channel_data_type_2d(image);
+}
+_CLC_OVERLOAD _CLC_DEF int
+get_image_channel_data_type(image3d_t image) {
+  return __clc_get_image_channel_data_type_3d(image);
+}
diff --git a/libclc/amdgpu/lib/image/get_image_channel_order.cl b/libclc/amdgpu/lib/image/get_image_channel_order.cl
new file mode 100644
index 00000000000..91e9b89e17e
--- /dev/null
+++ b/libclc/amdgpu/lib/image/get_image_channel_order.cl
@@ -0,0 +1,13 @@
+#include <clc/clc.h>
+
+_CLC_DECL int __clc_get_image_channel_order_2d(image2d_t);
+_CLC_DECL int __clc_get_image_channel_order_3d(image3d_t);
+/* Public overloads: each forwards to the __clc_* helper declared above for its image type (helpers are defined in image/get_image_attributes_impl.ll). */
+_CLC_OVERLOAD _CLC_DEF int
+get_image_channel_order(image2d_t image) {
+  return __clc_get_image_channel_order_2d(image);
+}
+_CLC_OVERLOAD _CLC_DEF int
+get_image_channel_order(image3d_t image) {
+  return __clc_get_image_channel_order_3d(image);
+}
diff --git a/libclc/amdgpu/lib/image/get_image_depth.cl b/libclc/amdgpu/lib/image/get_image_depth.cl
new file mode 100644
index 00000000000..18646458e9a
--- /dev/null
+++ b/libclc/amdgpu/lib/image/get_image_depth.cl
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+_CLC_DECL int __clc_get_image_depth_3d(image3d_t);
+/* Only a 3D overload is provided; it forwards to the helper declared above (defined in image/get_image_attributes_impl.ll). */
+_CLC_OVERLOAD _CLC_DEF int
+get_image_depth(image3d_t image) {
+	return __clc_get_image_depth_3d(image);
+}
diff --git a/libclc/amdgpu/lib/image/get_image_height.cl b/libclc/amdgpu/lib/image/get_image_height.cl
new file mode 100644
index 00000000000..80b364090c3
--- /dev/null
+++ b/libclc/amdgpu/lib/image/get_image_height.cl
@@ -0,0 +1,13 @@
+#include <clc/clc.h>
+
+_CLC_DECL int __clc_get_image_height_2d(image2d_t);
+_CLC_DECL int __clc_get_image_height_3d(image3d_t);
+/* Public overloads: each forwards to the __clc_* helper declared above for its image type (helpers are defined in image/get_image_attributes_impl.ll). */
+_CLC_OVERLOAD _CLC_DEF int
+get_image_height(image2d_t image) {
+  return __clc_get_image_height_2d(image);
+}
+_CLC_OVERLOAD _CLC_DEF int
+get_image_height(image3d_t image) {
+  return __clc_get_image_height_3d(image);
+}
diff --git a/libclc/amdgpu/lib/image/get_image_width.cl b/libclc/amdgpu/lib/image/get_image_width.cl
new file mode 100644
index 00000000000..29e4e9468ba
--- /dev/null
+++ b/libclc/amdgpu/lib/image/get_image_width.cl
@@ -0,0 +1,13 @@
+#include <clc/clc.h>
+
+_CLC_DECL int __clc_get_image_width_2d(image2d_t);
+_CLC_DECL int __clc_get_image_width_3d(image3d_t);
+/* Public overloads: each forwards to the __clc_* helper declared above for its image type (helpers are defined in image/get_image_attributes_impl.ll). */
+_CLC_OVERLOAD _CLC_DEF int
+get_image_width(image2d_t image) {
+  return __clc_get_image_width_2d(image);
+}
+_CLC_OVERLOAD _CLC_DEF int
+get_image_width(image3d_t image) {
+  return __clc_get_image_width_3d(image);
+}
diff --git a/libclc/amdgpu/lib/image/read_image_impl.ll b/libclc/amdgpu/lib/image/read_image_impl.ll
new file mode 100644
index 00000000000..229a2526c37
--- /dev/null
+++ b/libclc/amdgpu/lib/image/read_image_impl.ll
@@ -0,0 +1,46 @@
+%opencl.image2d_t = type opaque
+
+declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32,
+                                   i32, i32, i32) readnone
+declare i32 @llvm.OpenCL.image.get.resource.id.2d(
+  %opencl.image2d_t addrspace(1)*) nounwind readnone
+declare i32 @llvm.OpenCL.sampler.get.resource.id(i32) readnone
+
+; Widens a float2 to a float4: lanes 0 and 1 are copied from %v, lanes 2 and 3
+; are 0.0. Shuffle indices 0-1 select from %v, indices 2-3 select from the
+; all-zero second operand.
+define <4 x float> @__clc_v4f_from_v2f(<2 x float> %v) alwaysinline {
+  %widened = shufflevector <2 x float> %v,
+                           <2 x float> zeroinitializer,
+                           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %widened
+}
+
+; Samples 2D image %img at %coord through the R600 texture intrinsic and returns the raw <4 x float> texel.
+define <4 x float> @__clc_read_imagef_tex(
+    %opencl.image2d_t addrspace(1)* nocapture %img,
+    i32 %sampler, <2 x float> %coord) alwaysinline {
+entry:
+  %coord_v4 = call <4 x float> @__clc_v4f_from_v2f(<2 x float> %coord)
+  %smp_id = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %sampler)
+  %img_id = call i32 @llvm.OpenCL.image.get.resource.id.2d(%opencl.image2d_t addrspace(1)* %img)
+  %tex_id = add i32 %img_id, 2    ; First 2 IDs are reserved.
+  ; Bit 0 of the sampler value selects normalized coordinates.
+  %coord_norm = and i32 %sampler, 1
+  %is_norm = icmp eq i32 %coord_norm, 1
+  br i1 %is_norm, label %NormCoord, label %UnnormCoord
+NormCoord:
+  %data.norm = call <4 x float> @llvm.R600.tex(
+      <4 x float> %coord_v4,
+      i32 0, i32 0, i32 0,        ; Offset.
+      i32 %tex_id, i32 %smp_id,   ; Same per-image texture id as the unnormalized path.
+      i32 1, i32 1, i32 1, i32 1) ; Normalized coords.
+  ret <4 x float> %data.norm
+UnnormCoord:
+  %data.unnorm = call <4 x float> @llvm.R600.tex(
+      <4 x float> %coord_v4,
+      i32 0, i32 0, i32 0,        ; Offset.
+      i32 %tex_id, i32 %smp_id,
+      i32 0, i32 0, i32 0, i32 0) ; Unnormalized coords.
+  ret <4 x float> %data.unnorm
+}
diff --git a/libclc/amdgpu/lib/image/read_imagef.cl b/libclc/amdgpu/lib/image/read_imagef.cl
new file mode 100644
index 00000000000..af80adad189
--- /dev/null
+++ b/libclc/amdgpu/lib/image/read_imagef.cl
@@ -0,0 +1,14 @@
+#include <clc/clc.h>
+
+_CLC_DECL float4 __clc_read_imagef_tex(image2d_t, sampler_t, float2);
+
+/* Integer-coordinate overload: converts coord to float2 and samples through __clc_read_imagef_tex. */
+_CLC_OVERLOAD _CLC_DEF float4 read_imagef(image2d_t image, sampler_t sampler,
+                                          int2 coord) {
+  return __clc_read_imagef_tex(image, sampler, (float2)(coord.x, coord.y));
+}
+/* Float-coordinate overload: forwards all arguments unchanged to __clc_read_imagef_tex. */
+_CLC_OVERLOAD _CLC_DEF float4 read_imagef(image2d_t image, sampler_t sampler,
+                                          float2 coord) {
+  return __clc_read_imagef_tex(image, sampler, coord);
+}
diff --git a/libclc/amdgpu/lib/image/read_imagei.cl b/libclc/amdgpu/lib/image/read_imagei.cl
new file mode 100644
index 00000000000..b973aae94a0
--- /dev/null
+++ b/libclc/amdgpu/lib/image/read_imagei.cl
@@ -0,0 +1,23 @@
+#include <clc/clc.h>
+
+_CLC_DECL float4 __clc_read_imagef_tex(image2d_t, sampler_t, float2);
+
+/* Returns the bits of a float4 viewed as an int4: no numeric conversion is
+ * performed, one union member is written and the other is read back. */
+int4 __clc_reinterpret_v4f_to_v4i(float4 v) {
+  union { float4 fval; int4 ival; } bits;
+  bits.fval = v;
+  return bits.ival;
+}
+
+/* Integer-coordinate overload: samples via the float texture path, then reinterprets the texel bits as int4. */
+_CLC_OVERLOAD _CLC_DEF int4 read_imagei(image2d_t image, sampler_t sampler,
+                                        int2 coord) {
+  float4 texel = __clc_read_imagef_tex(image, sampler, (float2)(coord.x, coord.y));
+  return __clc_reinterpret_v4f_to_v4i(texel);
+}
+_CLC_OVERLOAD _CLC_DEF int4 read_imagei(image2d_t image, sampler_t sampler,
+                                        float2 coord) {
+  float4 texel = __clc_read_imagef_tex(image, sampler, coord); /* bits reinterpreted below, not converted */
+  return __clc_reinterpret_v4f_to_v4i(texel);
+}
diff --git a/libclc/amdgpu/lib/image/read_imageui.cl b/libclc/amdgpu/lib/image/read_imageui.cl
new file mode 100644
index 00000000000..ec9836e7ec2
--- /dev/null
+++ b/libclc/amdgpu/lib/image/read_imageui.cl
@@ -0,0 +1,23 @@
+#include <clc/clc.h>
+
+_CLC_DECL float4 __clc_read_imagef_tex(image2d_t, sampler_t, float2);
+
+/* Returns the bits of a float4 viewed as a uint4: no numeric conversion is
+ * performed, one union member is written and the other is read back. */
+uint4 __clc_reinterpret_v4f_to_v4ui(float4 v) {
+  union { float4 fval; uint4 uval; } bits;
+  bits.fval = v;
+  return bits.uval;
+}
+
+/* Integer-coordinate overload: samples via the float texture path, then reinterprets the texel bits as uint4. */
+_CLC_OVERLOAD _CLC_DEF uint4 read_imageui(image2d_t image, sampler_t sampler,
+                                          int2 coord) {
+  float4 texel = __clc_read_imagef_tex(image, sampler, (float2)(coord.x, coord.y));
+  return __clc_reinterpret_v4f_to_v4ui(texel);
+}
+_CLC_OVERLOAD _CLC_DEF uint4 read_imageui(image2d_t image, sampler_t sampler,
+                                          float2 coord) {
+  float4 texel = __clc_read_imagef_tex(image, sampler, coord); /* bits reinterpreted below, not converted */
+  return __clc_reinterpret_v4f_to_v4ui(texel);
+}
diff --git a/libclc/amdgpu/lib/image/write_image_impl.ll b/libclc/amdgpu/lib/image/write_image_impl.ll
new file mode 100644
index 00000000000..265f5d6045e
--- /dev/null
+++ b/libclc/amdgpu/lib/image/write_image_impl.ll
@@ -0,0 +1,52 @@
+%opencl.image2d_t = type opaque
+%opencl.image3d_t = type opaque
+
+declare i32 @llvm.OpenCL.image.get.resource.id.2d(
+  %opencl.image2d_t addrspace(1)*) nounwind readnone
+declare i32 @llvm.OpenCL.image.get.resource.id.3d(
+  %opencl.image3d_t addrspace(1)*) nounwind readnone
+
+declare void @llvm.r600.rat.store.typed(<4 x i32> %color, <4 x i32> %coord, i32 %rat_id)
+
+; Stores %color at texel %coord of 2D image %img through the typed RAT store intrinsic.
+define void @__clc_write_imageui_2d(
+    %opencl.image2d_t addrspace(1)* nocapture %img,
+    <2 x i32> %coord, <4 x i32> %color) #0 {
+  ; Widen the int2 coordinate to int4; lanes 2 and 3 come from the zero vector.
+  %coord.v4 = shufflevector <2 x i32> %coord,
+                            <2 x i32> zeroinitializer,
+                            <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+  ; The RAT id is the image resource id plus one.
+  %res_id = call i32 @llvm.OpenCL.image.get.resource.id.2d(
+      %opencl.image2d_t addrspace(1)* %img)
+  %rat_slot = add i32 %res_id, 1
+
+  ; Issue the typed store.
+  call void @llvm.r600.rat.store.typed(
+      <4 x i32> %color,
+      <4 x i32> %coord.v4,
+      i32 %rat_slot)
+  ret void
+}
+
+; Signed-integer variant: the colour is already <4 x i32>, so this forwards
+; every argument unchanged to the unsigned implementation.
+define void @__clc_write_imagei_2d(
+    %opencl.image2d_t addrspace(1)* nocapture %img,
+    <2 x i32> %coord, <4 x i32> %color) #0 {
+  call void @__clc_write_imageui_2d(%opencl.image2d_t addrspace(1)* nocapture %img, <2 x i32> %coord, <4 x i32> %color)
+  ret void
+}
+
+; Float variant: bitcasts the colour to <4 x i32> and forwards to the unsigned implementation.
+define void @__clc_write_imagef_2d(
+    %opencl.image2d_t addrspace(1)* nocapture %img,
+    <2 x i32> %coord, <4 x float> %color) #0 {
+  %color.bits = bitcast <4 x float> %color to <4 x i32>
+  call void @__clc_write_imageui_2d(%opencl.image2d_t addrspace(1)* nocapture %img,
+                                    <2 x i32> %coord, <4 x i32> %color.bits)
+  ret void
+}
+
+attributes #0 = { alwaysinline }
diff --git a/libclc/amdgpu/lib/image/write_imagef.cl b/libclc/amdgpu/lib/image/write_imagef.cl
new file mode 100644
index 00000000000..4483fcf68db
--- /dev/null
+++ b/libclc/amdgpu/lib/image/write_imagef.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+_CLC_DECL void __clc_write_imagef_2d(image2d_t image, int2 coord, float4 color);
+/* Public 2D overload: forwards all arguments unchanged to the helper declared above (defined in image/write_image_impl.ll). */
+_CLC_OVERLOAD _CLC_DEF void
+write_imagef(image2d_t image, int2 coord, float4 color)
+{
+  __clc_write_imagef_2d(image, coord, color);
+}
diff --git a/libclc/amdgpu/lib/image/write_imagei.cl b/libclc/amdgpu/lib/image/write_imagei.cl
new file mode 100644
index 00000000000..394a223d0a8
--- /dev/null
+++ b/libclc/amdgpu/lib/image/write_imagei.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+_CLC_DECL void __clc_write_imagei_2d(image2d_t image, int2 coord, int4 color);
+/* Public 2D overload: forwards all arguments unchanged to the helper declared above (defined in image/write_image_impl.ll). */
+_CLC_OVERLOAD _CLC_DEF void
+write_imagei(image2d_t image, int2 coord, int4 color)
+{
+  __clc_write_imagei_2d(image, coord, color);
+}
diff --git a/libclc/amdgpu/lib/image/write_imageui.cl b/libclc/amdgpu/lib/image/write_imageui.cl
new file mode 100644
index 00000000000..91344de8a1d
--- /dev/null
+++ b/libclc/amdgpu/lib/image/write_imageui.cl
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+_CLC_DECL void __clc_write_imageui_2d(image2d_t image, int2 coord, uint4 color);
+/* Public 2D overload: forwards all arguments unchanged to the helper declared above (defined in image/write_image_impl.ll). */
+_CLC_OVERLOAD _CLC_DEF void
+write_imageui(image2d_t image, int2 coord, uint4 color)
+{
+  __clc_write_imageui_2d(image, coord, color);
+}
diff --git a/libclc/amdgpu/lib/math/ldexp.cl b/libclc/amdgpu/lib/math/ldexp.cl
new file mode 100644
index 00000000000..80439ce3a43
--- /dev/null
+++ b/libclc/amdgpu/lib/math/ldexp.cl
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "../../../generic/lib/clcmacro.h"
+// Select the float implementation: __builtin_amdgpu_ldexpf when __HAS_LDEXPF__ is defined, otherwise __clc_ldexp.
+#ifdef __HAS_LDEXPF__
+#define BUILTINF __builtin_amdgpu_ldexpf
+#else
+#include "math/clc_ldexp.h"
+#define BUILTINF __clc_ldexp
+#endif
+
+// This defines all the ldexp(floatN, intN) variants on top of BUILTINF.
+_CLC_DEFINE_BINARY_BUILTIN(float, ldexp, BUILTINF, float, int);
+
+#ifdef cl_khr_fp64
+  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+  // This defines all the ldexp(doubleN, intN) variants; double always uses the builtin.
+  _CLC_DEFINE_BINARY_BUILTIN(double, ldexp, __builtin_amdgpu_ldexp, double, int);
+#endif
+
+// This defines all the ldexp(GENTYPE, int) variants from the shared generic body.
+#define __CLC_BODY <../../../generic/lib/math/ldexp.inc>
+#include <clc/math/gentype.inc>
+
+#undef BUILTINF
diff --git a/libclc/amdgpu/lib/math/nextafter.cl b/libclc/amdgpu/lib/math/nextafter.cl
new file mode 100644
index 00000000000..4611c81ae91
--- /dev/null
+++ b/libclc/amdgpu/lib/math/nextafter.cl
@@ -0,0 +1,4 @@
+#include <clc/clc.h>
+#include "../../../generic/lib/clcmacro.h"
+/* Defines the nextafter(floatN, floatN) overloads on top of __clc_nextafter; the macro header is included by the same path ldexp.cl and sqrt.cl use. */
+_CLC_DEFINE_BINARY_BUILTIN(float, nextafter, __clc_nextafter, float, float)
diff --git a/libclc/amdgpu/lib/math/sqrt.cl b/libclc/amdgpu/lib/math/sqrt.cl
new file mode 100644
index 00000000000..3e5b17c5db0
--- /dev/null
+++ b/libclc/amdgpu/lib/math/sqrt.cl
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+#include "../../../generic/lib/clcmacro.h"
+#include "math/clc_sqrt.h"
+// Float overloads of sqrt are generated by the macro and forward to __clc_sqrt.
+_CLC_DEFINE_UNARY_BUILTIN(float, sqrt, __clc_sqrt, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Scalar double sqrt built from __builtin_amdgpu_rsq (presumably 1/sqrt - confirm) plus fma refinement.
+_CLC_OVERLOAD _CLC_DEF double sqrt(double x) {
+  // Inputs below 2^-767 are scaled up by 2^256 (exp0 = 0x100) before the estimate is taken.
+  uint vcc = x < 0x1p-767;
+  uint exp0 = vcc ? 0x100 : 0;
+  unsigned exp1 = vcc ? 0xffffff80 : 0;
+  // NOTE(review): 0xffffff80 becomes -128 once converted to ldexp's int exponent, i.e. it undoes sqrt(2^256).
+  double v01 = ldexp(x, exp0);
+  double v23 = __builtin_amdgpu_rsq(v01);
+  double v45 = v01 * v23;
+  v23 = v23 * 0.5;
+  // fma refinement; if rsq is 1/sqrt, v45 tracks sqrt(v01) and v23 tracks 0.5/sqrt(v01). Statement order matters.
+  double v67 = fma(-v23, v45, 0.5);
+  v45 = fma(v45, v67, v45);
+  double v89 = fma(-v45, v45, v01);
+  v23 = fma(v23, v67, v23);
+  v45 = fma(v89, v23, v45);
+  v67 = fma(-v45, v45, v01);
+  v23 = fma(v67, v23, v45);
+  // Undo the input scaling; +inf and zero skip the refined value and return v01 directly.
+  v23 = ldexp(v23, exp1);
+  return ((x == __builtin_inf()) || (x == 0.0)) ? v01 : v23;
+}
+// Vector double overloads of sqrt are generated by this macro (presumably from the scalar double sqrt - confirm in clcmacro.h).
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sqrt, double);
+
+#endif
diff --git a/libclc/amdgpu/lib/synchronization/barrier.cl b/libclc/amdgpu/lib/synchronization/barrier.cl
new file mode 100644
index 00000000000..6f2900b06ee
--- /dev/null
+++ b/libclc/amdgpu/lib/synchronization/barrier.cl
@@ -0,0 +1,10 @@
+
+#include <clc/clc.h>
+/* Returns CLK_LOCAL_MEM_FENCE as a callable symbol. NOTE(review): presumably for callers outside OpenCL C (e.g. .ll code) - confirm. */
+_CLC_DEF int __clc_clk_local_mem_fence() {
+  return CLK_LOCAL_MEM_FENCE;
+}
+/* Returns CLK_GLOBAL_MEM_FENCE as a callable symbol. NOTE(review): presumably for callers outside OpenCL C (e.g. .ll code) - confirm. */
+_CLC_DEF int __clc_clk_global_mem_fence() {
+  return CLK_GLOBAL_MEM_FENCE;
+}
diff --git a/libclc/amdgpu/lib/workitem/get_global_size.ll b/libclc/amdgpu/lib/workitem/get_global_size.ll
new file mode 100644
index 00000000000..ac2d08d8ee1
--- /dev/null
+++ b/libclc/amdgpu/lib/workitem/get_global_size.ll
@@ -0,0 +1,18 @@
+declare i32 @llvm.r600.read.global.size.x() nounwind readnone
+declare i32 @llvm.r600.read.global.size.y() nounwind readnone
+declare i32 @llvm.r600.read.global.size.z() nounwind readnone
+; get_global_size(dim): global size along dimension 0, 1 or 2; 1 for any other dim, as OpenCL requires.
+define i32 @get_global_size(i32 %dim) nounwind readnone alwaysinline {
+  switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim]
+x_dim:
+  %x = call i32 @llvm.r600.read.global.size.x() nounwind readnone
+  ret i32 %x
+y_dim:
+  %y = call i32 @llvm.r600.read.global.size.y() nounwind readnone
+  ret i32 %y
+z_dim:
+  %z = call i32 @llvm.r600.read.global.size.z() nounwind readnone
+  ret i32 %z
+default:
+  ret i32 1                       ; Out-of-range dimindx: sizes report 1, not 0.
+}
diff --git a/libclc/amdgpu/lib/workitem/get_group_id.ll b/libclc/amdgpu/lib/workitem/get_group_id.ll
new file mode 100644
index 00000000000..0dc86e5edfe
--- /dev/null
+++ b/libclc/amdgpu/lib/workitem/get_group_id.ll
@@ -0,0 +1,18 @@
+declare i32 @llvm.r600.read.tgid.x() nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() nounwind readnone
+declare i32 @llvm.r600.read.tgid.z() nounwind readnone
+; get_group_id(dim): work-group id along dimension 0, 1 or 2; 0 for any other dim.
+define i32 @get_group_id(i32 %dim) nounwind readnone alwaysinline {
+  switch i32 %dim, label %out_of_range [i32 0, label %dim0 i32 1, label %dim1 i32 2, label %dim2]
+dim0:
+  %tgid.x = call i32 @llvm.r600.read.tgid.x() nounwind readnone
+  ret i32 %tgid.x
+dim1:
+  %tgid.y = call i32 @llvm.r600.read.tgid.y() nounwind readnone
+  ret i32 %tgid.y
+dim2:
+  %tgid.z = call i32 @llvm.r600.read.tgid.z() nounwind readnone
+  ret i32 %tgid.z
+out_of_range:
+  ret i32 0
+}
diff --git a/libclc/amdgpu/lib/workitem/get_local_id.ll b/libclc/amdgpu/lib/workitem/get_local_id.ll
new file mode 100644
index 00000000000..ac5522a7822
--- /dev/null
+++ b/libclc/amdgpu/lib/workitem/get_local_id.ll
@@ -0,0 +1,18 @@
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() nounwind readnone
+declare i32 @llvm.r600.read.tidig.z() nounwind readnone
+; get_local_id(dim): work-item id within its group along dimension 0, 1 or 2; 0 for any other dim.
+define i32 @get_local_id(i32 %dim) nounwind readnone alwaysinline {
+  switch i32 %dim, label %out_of_range [i32 0, label %dim0 i32 1, label %dim1 i32 2, label %dim2]
+dim0:
+  %tid.x = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  ret i32 %tid.x
+dim1:
+  %tid.y = call i32 @llvm.r600.read.tidig.y() nounwind readnone
+  ret i32 %tid.y
+dim2:
+  %tid.z = call i32 @llvm.r600.read.tidig.z() nounwind readnone
+  ret i32 %tid.z
+out_of_range:
+  ret i32 0
+}
diff --git a/libclc/amdgpu/lib/workitem/get_local_size.ll b/libclc/amdgpu/lib/workitem/get_local_size.ll
new file mode 100644
index 00000000000..0a98de683ae
--- /dev/null
+++ b/libclc/amdgpu/lib/workitem/get_local_size.ll
@@ -0,0 +1,18 @@
+declare i32 @llvm.r600.read.local.size.x() nounwind readnone
+declare i32 @llvm.r600.read.local.size.y() nounwind readnone
+declare i32 @llvm.r600.read.local.size.z() nounwind readnone
+; get_local_size(dim): work-group size along dimension 0, 1 or 2; 1 for any other dim, as OpenCL requires.
+define i32 @get_local_size(i32 %dim) nounwind readnone alwaysinline {
+  switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim]
+x_dim:
+  %x = call i32 @llvm.r600.read.local.size.x() nounwind readnone
+  ret i32 %x
+y_dim:
+  %y = call i32 @llvm.r600.read.local.size.y() nounwind readnone
+  ret i32 %y
+z_dim:
+  %z = call i32 @llvm.r600.read.local.size.z() nounwind readnone
+  ret i32 %z
+default:
+  ret i32 1                       ; Out-of-range dimindx: sizes report 1, not 0.
+}
diff --git a/libclc/amdgpu/lib/workitem/get_num_groups.ll b/libclc/amdgpu/lib/workitem/get_num_groups.ll
new file mode 100644
index 00000000000..a708f422c27
--- /dev/null
+++ b/libclc/amdgpu/lib/workitem/get_num_groups.ll
@@ -0,0 +1,18 @@
+declare i32 @llvm.r600.read.ngroups.x() nounwind readnone
+declare i32 @llvm.r600.read.ngroups.y() nounwind readnone
+declare i32 @llvm.r600.read.ngroups.z() nounwind readnone
+; get_num_groups(dim): number of work-groups along dimension 0, 1 or 2; 1 for any other dim, as OpenCL requires.
+define i32 @get_num_groups(i32 %dim) nounwind readnone alwaysinline {
+  switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim]
+x_dim:
+  %x = call i32 @llvm.r600.read.ngroups.x() nounwind readnone
+  ret i32 %x
+y_dim:
+  %y = call i32 @llvm.r600.read.ngroups.y() nounwind readnone
+  ret i32 %y
+z_dim:
+  %z = call i32 @llvm.r600.read.ngroups.z() nounwind readnone
+  ret i32 %z
+default:
+  ret i32 1                       ; Out-of-range dimindx: counts report 1, not 0.
+}
diff --git a/libclc/amdgpu/lib/workitem/get_work_dim.ll b/libclc/amdgpu/lib/workitem/get_work_dim.ll
new file mode 100644
index 00000000000..1f86b5e05f5
--- /dev/null
+++ b/libclc/amdgpu/lib/workitem/get_work_dim.ll
@@ -0,0 +1,8 @@
+declare i32 @llvm.AMDGPU.read.workdim() nounwind readnone
+; get_work_dim(): returns the workdim intrinsic's value; !range !0 tells the optimizer it lies in [1, 4).
+define i32 @get_work_dim() nounwind readnone alwaysinline {
+  %work_dim = call i32 @llvm.AMDGPU.read.workdim() nounwind readnone, !range !0
+  ret i32 %work_dim
+}
+
+!0 = !{ i32 1, i32 4 }