summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAhmed Bougacha <ahmed.bougacha@gmail.com>2015-09-10 01:42:28 +0000
committerAhmed Bougacha <ahmed.bougacha@gmail.com>2015-09-10 01:42:28 +0000
commitb8886b517d417f14951e8d0dc6f00aef85638c34 (patch)
treeab61734f33f7c9fdf409e9b3ba8bbf6881b3e3db
parent80f766a032fca529ddcb78d952ee882536223b3b (diff)
downloadbcm5719-llvm-b8886b517d417f14951e8d0dc6f00aef85638c34.tar.gz
bcm5719-llvm-b8886b517d417f14951e8d0dc6f00aef85638c34.zip
[AArch64] Support selecting STNP.
We could go through the load/store optimizer and match STNP where we would have matched a nontemporal-annotated STP, but that's not reliable enough as an opportunistic optimization. Instead, we can guarantee emitting STNP, by matching them at ISel. Since there are no single-input nontemporal stores, we have to resort to some high-bits-extracting trickery to generate an STNP from a plain store. Also, we need to support another, LDP/STP-specific addressing mode, base + signed scaled 7-bit immediate offset. For now, only match the base. Let's make it smart separately. Part of PR24086. llvm-svn: 247231
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp33
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrFormats.td6
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td39
-rw-r--r--llvm/test/CodeGen/AArch64/nontemporal.ll192
4 files changed, 270 insertions, 0 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index d93e59ccf77..77896af196d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -77,6 +77,21 @@ public:
bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
return SelectShiftedRegister(N, true, Reg, Shift);
}
+ bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
+ }
bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 1, Base, OffImm);
}
@@ -164,6 +179,8 @@ public:
private:
bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
SDValue &Shift);
+ bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &OffImm);
bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
@@ -606,6 +623,22 @@ static bool isWorthFoldingADDlow(SDValue N) {
return true;
}
+/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit
+/// immediate" address. The "Size" argument is the size in bytes of the memory
+/// reference, which determines the scale.
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size,
+ SDValue &Base,
+ SDValue &OffImm) {
+ SDLoc dl(N);
+ // Base only. The address will be materialized into a register before
+ // the memory is accessed.
+ // add x0, Xbase, #offset
+ // stp x1, x2, [x0]
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+ return true;
+}
+
/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 0903f320601..d644f264eb9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -248,6 +248,12 @@ def simm7s16 : Operand<i32> {
let PrintMethod = "printImmScale<16>";
}
+def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>;
+def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>;
+def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
+def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
+def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
+
class AsmImmRange<int Low, int High> : AsmOperandClass {
let Name = "Imm" # Low # "_" # High;
let DiagnosticType = "InvalidImm" # Low # "_" # High;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d3c244977ec..5f01debf4ce 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5825,6 +5825,45 @@ def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
(URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+// Patterns for nontemporal/no-allocate stores.
+// We have to resort to tricks to turn a single-input store into a store pair,
+// because there is no single-input nontemporal store, only STNP.
+let Predicates = [IsLE] in {
+let AddedComplexity = 15 in {
+class NTStore128Pat<ValueType VT> :
+ Pat<(nontemporalstore (VT FPR128:$Rt),
+ (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+ (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub),
+ (CPYi64 FPR128:$Rt, (i64 1)),
+ GPR64sp:$Rn, simm7s8:$offset)>;
+
+def : NTStore128Pat<v2i64>;
+def : NTStore128Pat<v4i32>;
+def : NTStore128Pat<v8i16>;
+def : NTStore128Pat<v16i8>;
+
+class NTStore64Pat<ValueType VT> :
+ Pat<(nontemporalstore (VT FPR64:$Rt),
+ (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
+ (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub),
+ (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
+ GPR64sp:$Rn, simm7s4:$offset)>;
+
+// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64?
+def : NTStore64Pat<v1f64>;
+def : NTStore64Pat<v1i64>;
+def : NTStore64Pat<v2i32>;
+def : NTStore64Pat<v4i16>;
+def : NTStore64Pat<v8i8>;
+
+def : Pat<(nontemporalstore GPR64:$Rt,
+ (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
+ (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+ (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32),
+ GPR64sp:$Rn, simm7s4:$offset)>;
+} // AddedComplexity=15
+} // Predicates = [IsLE]
+
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll
new file mode 100644
index 00000000000..6db05cb4877
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/nontemporal.ll
@@ -0,0 +1,192 @@
+; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false | FileCheck %s
+
+define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {
+; CHECK-LABEL: test_stnp_v4i64:
+; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #16
+; CHECK-NEXT: mov d[[HI1:[0-9]+]], v1[1]
+; CHECK-NEXT: mov d[[HI0:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp d1, d[[HI1]], [x[[PTR]]]
+; CHECK-NEXT: stnp d0, d[[HI0]], [x0]
+; CHECK-NEXT: ret
+ store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v4i32(<4 x i32>* %p, <4 x i32> %v) #0 {
+; CHECK-LABEL: test_stnp_v4i32:
+; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp d0, d[[HI]], [x0]
+; CHECK-NEXT: ret
+ store <4 x i32> %v, <4 x i32>* %p, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v8i16(<8 x i16>* %p, <8 x i16> %v) #0 {
+; CHECK-LABEL: test_stnp_v8i16:
+; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp d0, d[[HI]], [x0]
+; CHECK-NEXT: ret
+ store <8 x i16> %v, <8 x i16>* %p, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {
+; CHECK-LABEL: test_stnp_v16i8:
+; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp d0, d[[HI]], [x0]
+; CHECK-NEXT: ret
+ store <16 x i8> %v, <16 x i8>* %p, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v2i32(<2 x i32>* %p, <2 x i32> %v) #0 {
+; CHECK-LABEL: test_stnp_v2i32:
+; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp s0, s[[HI]], [x0]
+; CHECK-NEXT: ret
+ store <2 x i32> %v, <2 x i32>* %p, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v4i16(<4 x i16>* %p, <4 x i16> %v) #0 {
+; CHECK-LABEL: test_stnp_v4i16:
+; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp s0, s[[HI]], [x0]
+; CHECK-NEXT: ret
+ store <4 x i16> %v, <4 x i16>* %p, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v8i8(<8 x i8>* %p, <8 x i8> %v) #0 {
+; CHECK-LABEL: test_stnp_v8i8:
+; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp s0, s[[HI]], [x0]
+; CHECK-NEXT: ret
+ store <8 x i8> %v, <8 x i8>* %p, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v2f64(<2 x double>* %p, <2 x double> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f64:
+; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp d0, d[[HI]], [x0]
+; CHECK-NEXT: ret
+ store <2 x double> %v, <2 x double>* %p, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v4f32(<4 x float>* %p, <4 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v4f32:
+; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp d0, d[[HI]], [x0]
+; CHECK-NEXT: ret
+ store <4 x float> %v, <4 x float>* %p, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v2f32(<2 x float>* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32:
+; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp s0, s[[HI]], [x0]
+; CHECK-NEXT: ret
+ store <2 x float> %v, <2 x float>* %p, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v1f64(<1 x double>* %p, <1 x double> %v) #0 {
+; CHECK-LABEL: test_stnp_v1f64:
+; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp s0, s[[HI]], [x0]
+; CHECK-NEXT: ret
+ store <1 x double> %v, <1 x double>* %p, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v1i64(<1 x i64>* %p, <1 x i64> %v) #0 {
+; CHECK-LABEL: test_stnp_v1i64:
+; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp s0, s[[HI]], [x0]
+; CHECK-NEXT: ret
+ store <1 x i64> %v, <1 x i64>* %p, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_i64(i64* %p, i64 %v) #0 {
+; CHECK-LABEL: test_stnp_i64:
+; CHECK-NEXT: ubfx x[[HI:[0-9]+]], x1, #0, #32
+; CHECK-NEXT: stnp w1, w[[HI]], [x0]
+; CHECK-NEXT: ret
+ store i64 %v, i64* %p, align 1, !nontemporal !0
+ ret void
+}
+
+
+define void @test_stnp_v2f64_offset(<2 x double>* %p, <2 x double> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f64_offset:
+; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #16
+; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]]
+; CHECK-NEXT: ret
+ %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 1
+ store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v2f64_offset_neg(<2 x double>* %p, <2 x double> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f64_offset_neg:
+; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #16
+; CHECK-NEXT: mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp d0, d[[HI]], [x[[PTR]]]
+; CHECK-NEXT: ret
+ %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 -1
+ store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v2f32_offset(<2 x float>* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32_offset:
+; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #8
+; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp s0, s[[HI]], [x[[PTR]]]
+; CHECK-NEXT: ret
+ %tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 1
+ store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v2f32_offset_neg(<2 x float>* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32_offset_neg:
+; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #8
+; CHECK-NEXT: mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT: stnp s0, s[[HI]], [x[[PTR]]]
+; CHECK-NEXT: ret
+ %tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 -1
+ store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_i64_offset(i64* %p, i64 %v) #0 {
+; CHECK-LABEL: test_stnp_i64_offset:
+; CHECK-NEXT: add x[[PTR:[0-9]+]], x0, #8
+; CHECK-NEXT: ubfx x[[HI:[0-9]+]], x1, #0, #32
+; CHECK-NEXT: stnp w1, w[[HI]], [x[[PTR]]]
+; CHECK-NEXT: ret
+ %tmp0 = getelementptr i64, i64* %p, i32 1
+ store i64 %v, i64* %tmp0, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_i64_offset_neg(i64* %p, i64 %v) #0 {
+; CHECK-LABEL: test_stnp_i64_offset_neg:
+; CHECK-NEXT: sub x[[PTR:[0-9]+]], x0, #8
+; CHECK-NEXT: ubfx x[[HI:[0-9]+]], x1, #0, #32
+; CHECK-NEXT: stnp w1, w[[HI]], [x[[PTR]]]
+; CHECK-NEXT: ret
+ %tmp0 = getelementptr i64, i64* %p, i32 -1
+ store i64 %v, i64* %tmp0, align 1, !nontemporal !0
+ ret void
+}
+
+!0 = !{ i32 1 }
+
+attributes #0 = { nounwind }
OpenPOWER on IntegriCloud