summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/include/llvm/Target/TargetLowering.h10
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp3
-rw-r--r--llvm/lib/Target/R600/AMDGPUISelLowering.cpp23
-rw-r--r--llvm/lib/Target/R600/AMDGPUISelLowering.h3
-rw-r--r--llvm/test/CodeGen/R600/no-shrink-extloads.ll191
-rw-r--r--llvm/test/CodeGen/R600/store.ll37
6 files changed, 252 insertions, 15 deletions
diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h
index c0f2395eca6..17cfe9d913b 100644
--- a/llvm/include/llvm/Target/TargetLowering.h
+++ b/llvm/include/llvm/Target/TargetLowering.h
@@ -753,6 +753,16 @@ public:
/// reduce runtime.
virtual bool ShouldShrinkFPConstant(EVT) const { return true; }
+ // Return true if it is profitable to reduce the given load node to a smaller
+ // type.
+ //
+ // e.g. (i16 (trunc (i32 (load x))) -> i16 load x should be performed
+ virtual bool shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtTy,
+ EVT NewVT) const {
+ return true;
+ }
+
/// When splitting a value of the specified type into parts, does the Lo
/// or Hi part come first? This usually follows the endianness, except
/// for ppcf128, where the Hi part always comes first.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 38a06618a8c..ed3d06cc920 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6036,6 +6036,9 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
return SDValue();
+ if (!TLI.shouldReduceLoadWidth(LN0, ExtType, ExtVT))
+ return SDValue();
+
EVT PtrType = N0.getOperand(1).getValueType();
if (PtrType == MVT::Untyped || PtrType.isExtended())
diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
index 35371798c18..0e34b4625c8 100644
--- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -432,6 +432,29 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}
+bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
+ ISD::LoadExtType,
+ EVT NewVT) const {
+
+ unsigned NewSize = NewVT.getStoreSizeInBits();
+
+ // If we are reducing to a 32-bit load, this is always better.
+ if (NewSize == 32)
+ return true;
+
+ EVT OldVT = N->getValueType(0);
+ unsigned OldSize = OldVT.getStoreSizeInBits();
+
+ // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
+ // extloads, so doing one requires using a buffer_load. In cases where we
+ // still couldn't use a scalar load, using the wider load shouldn't really
+ // hurt anything.
+
+ // If the old size already had to be an extload, there's no harm in continuing
+ // to reduce the width.
+ return (OldSize < 32);
+}
+
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
EVT CastTy) const {
if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.h b/llvm/lib/Target/R600/AMDGPUISelLowering.h
index 36b4ee6b1d6..7386eaea73d 100644
--- a/llvm/lib/Target/R600/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/R600/AMDGPUISelLowering.h
@@ -124,6 +124,9 @@ public:
bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
bool ShouldShrinkFPConstant(EVT VT) const override;
+ bool shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtType,
+ EVT ExtVT) const override;
bool isLoadBitCastBeneficial(EVT, EVT) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
diff --git a/llvm/test/CodeGen/R600/no-shrink-extloads.ll b/llvm/test/CodeGen/R600/no-shrink-extloads.ll
new file mode 100644
index 00000000000..135d22d3036
--- /dev/null
+++ b/llvm/test/CodeGen/R600/no-shrink-extloads.ll
@@ -0,0 +1,191 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; Make sure we don't turn the 32-bit argument load into a 16-bit
+; load. There aren't extending scalar lods, so that would require
+; using a buffer_load instruction.
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
+; SI: s_load_dword s
+; SI: buffer_store_short v
+define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
+ %trunc = trunc i32 %arg to i16
+ store i16 %trunc, i16 addrspace(1)* %out
+ ret void
+}
+
+; It should be OK (and probably performance neutral) to reduce this,
+; but we don't know if the load is uniform yet.
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
+; SI: buffer_load_dword v
+; SI: buffer_store_short v
+define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i16 addrspace(1)* %out, i32 %tid
+ %load = load i32 addrspace(1)* %gep.in
+ %trunc = trunc i32 %load to i16
+ store i16 %trunc, i16 addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
+ %trunc = trunc i32 %arg to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
+ %load = load i32 addrspace(1)* %gep.in
+ %trunc = trunc i32 %load to i8
+ store i8 %trunc, i8 addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
+ %trunc = trunc i32 %arg to i1
+ store i1 %trunc, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i1 addrspace(1)* %out, i32 %tid
+ %load = load i32 addrspace(1)* %gep.in
+ %trunc = trunc i32 %load to i1
+ store i1 %trunc, i1 addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
+; SI: s_load_dword s
+; SI: buffer_store_dword v
+define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+ %trunc = trunc i64 %arg to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
+; SI: buffer_load_dword v
+; SI: buffer_store_dword v
+define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+ %load = load i64 addrspace(1)* %gep.in
+ %trunc = trunc i64 %load to i32
+ store i32 %trunc, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
+; SI: s_load_dword s
+; SI: buffer_store_dword v
+define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+ %srl = lshr i64 %arg, 32
+ %trunc = trunc i64 %srl to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
+; SI: buffer_load_dword v
+; SI: buffer_store_dword v
+define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+ %load = load i64 addrspace(1)* %gep.in
+ %srl = lshr i64 %load, 32
+ %trunc = trunc i64 %srl to i32
+ store i32 %trunc, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; Might as well reduce to 8-bit loads.
+; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
+ %trunc = trunc i16 %arg to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
+; SI: buffer_load_ubyte v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i16 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
+ %load = load i16 addrspace(1)* %gep.in
+ %trunc = trunc i16 %load to i8
+ store i8 %trunc, i8 addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+ %srl = lshr i64 %arg, 32
+ %trunc = trunc i64 %srl to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
+ %load = load i64 addrspace(1)* %gep.in
+ %srl = lshr i64 %load, 32
+ %trunc = trunc i64 %srl to i8
+ store i8 %trunc, i8 addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+ %trunc = trunc i64 %arg to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i8 addrspace(1)* %out, i32 %tid
+ %load = load i64 addrspace(1)* %gep.in
+ %trunc = trunc i64 %load to i8
+ store i8 %trunc, i8 addrspace(1)* %gep.out
+ ret void
+}
diff --git a/llvm/test/CodeGen/R600/store.ll b/llvm/test/CodeGen/R600/store.ll
index 713ecd67d7c..b082dd80435 100644
--- a/llvm/test/CodeGen/R600/store.ll
+++ b/llvm/test/CodeGen/R600/store.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=CM-CHECK --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI-CHECK -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG-CHECK -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM-CHECK -check-prefix=FUNC %s
;===------------------------------------------------------------------------===;
; Global Address Space
@@ -17,16 +17,18 @@ entry:
; i8 store
; EG-CHECK-LABEL: {{^}}store_i8:
; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
-; EG-CHECK: VTX_READ_8 [[VAL:T[0-9]\.X]], [[VAL]]
+
; IG 0: Get the byte index and truncate the value
-; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
-; EG-CHECK-NEXT: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
+; EG-CHECK: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG-CHECK: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
; EG-CHECK-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+
+
; IG 1: Truncate the calculated the shift amount for the mask
-; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
-; EG-CHECK-NEXT: 3
+
; IG 2: Shift the value and the mask
-; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
+; EG-CHECK: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
; EG-CHECK-NEXT: 255
; IG 3: Initialize the Y and Z channels to zero
@@ -46,16 +48,21 @@ entry:
; i16 store
; EG-CHECK-LABEL: {{^}}store_i16:
; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
-; EG-CHECK: VTX_READ_16 [[VAL:T[0-9]\.X]], [[VAL]]
+
; IG 0: Get the byte index and truncate the value
-; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
-; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
+
+
+; EG-CHECK: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG-CHECK-NEXT: 3(4.203895e-45),
+
+; EG-CHECK: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
+
; EG-CHECK-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; IG 1: Truncate the calculated the shift amount for the mask
-; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
-; EG-CHECK: 3
+
; IG 2: Shift the value and the mask
-; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
+; EG-CHECK: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
; EG-CHECK-NEXT: 65535
; IG 3: Initialize the Y and Z channels to zero
OpenPOWER on IntegriCloud