diff options
author | Ahmed Bougacha <ahmed.bougacha@gmail.com> | 2015-01-05 17:10:26 +0000 |
---|---|---|
committer | Ahmed Bougacha <ahmed.bougacha@gmail.com> | 2015-01-05 17:10:26 +0000 |
commit | d54c448d34873283806ebd42a81573dbf6977e83 (patch) | |
tree | 0164545d8126b26989424d10ae642ded5200b62c /llvm | |
parent | 4ae1f67ccfbb559139ea0015f2fe1750d9e2dfac (diff) | |
download | bcm5719-llvm-d54c448d34873283806ebd42a81573dbf6977e83.tar.gz bcm5719-llvm-d54c448d34873283806ebd42a81573dbf6977e83.zip |
[AArch64] Improve codegen of store lane instructions by avoiding GPR usage.
We used to generate code similar to:
umov.b w8, v0[2]
strb w8, [x0, x1]
because the STR*ro* patterns were preferred over the ST1* patterns.
Instead, we can avoid going through GPRs, and generate:
add x8, x0, x1
st1.b { v0 }[2], [x8]
This patch increases the AddedComplexity of the ST1* lane-store patterns (St1Lane128Pat and St1Lane64Pat) from 15 to 19 to achieve that.
rdar://16372710
Differential Revision: http://reviews.llvm.org/D6202
llvm-svn: 225183
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/arm64-st1.ll | 108 |
2 files changed, 106 insertions, 6 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index cae02d0a32e..46292f86e34 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4856,7 +4856,7 @@ defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; -let AddedComplexity = 15 in +let AddedComplexity = 19 in class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex, ValueType VTy, ValueType STy, Instruction ST1> : Pat<(scalar_store @@ -4872,7 +4872,7 @@ def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>; def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>; def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>; -let AddedComplexity = 15 in +let AddedComplexity = 19 in class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex, ValueType VTy, ValueType STy, Instruction ST1> : Pat<(scalar_store diff --git a/llvm/test/CodeGen/AArch64/arm64-st1.ll b/llvm/test/CodeGen/AArch64/arm64-st1.ll index a4818bd8850..76d52f44b48 100644 --- a/llvm/test/CodeGen/AArch64/arm64-st1.ll +++ b/llvm/test/CodeGen/AArch64/arm64-st1.ll @@ -8,10 +8,20 @@ define void @st1lane_16b(<16 x i8> %A, i8* %D) { ret void } +define void @st1lane_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_16b +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.b { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i8* %D, i64 %offset + %tmp = extractelement <16 x i8> %A, i32 1 + store i8 %tmp, i8* %ptr + ret void +} + define void @st1lane0_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) { ; CHECK-LABEL: st1lane0_ro_16b -; CHECK: umov.b w[[WREG:[0-9]+]], v0[0] -; CHECK: strb w[[WREG]], [x0, x1] +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.b { v0 }[0], [x[[XREG]]] %ptr = getelementptr i8* %D, i64 %offset %tmp = extractelement <16 x i8> 
%A, i32 0 store i8 %tmp, i8* %ptr @@ -26,6 +36,16 @@ define void @st1lane_8h(<8 x i16> %A, i16* %D) { ret void } +define void @st1lane_ro_8h(<8 x i16> %A, i16* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_8h +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.h { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i16* %D, i64 %offset + %tmp = extractelement <8 x i16> %A, i32 1 + store i16 %tmp, i16* %ptr + ret void +} + define void @st1lane0_ro_8h(<8 x i16> %A, i16* %D, i64 %offset) { ; CHECK-LABEL: st1lane0_ro_8h ; CHECK: str h0, [x0, x1, lsl #1] @@ -43,6 +63,16 @@ define void @st1lane_4s(<4 x i32> %A, i32* %D) { ret void } +define void @st1lane_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_4s +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.s { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i32* %D, i64 %offset + %tmp = extractelement <4 x i32> %A, i32 1 + store i32 %tmp, i32* %ptr + ret void +} + define void @st1lane0_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) { ; CHECK-LABEL: st1lane0_ro_4s ; CHECK: str s0, [x0, x1, lsl #2] @@ -60,6 +90,16 @@ define void @st1lane_4s_float(<4 x float> %A, float* %D) { ret void } +define void @st1lane_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_4s_float +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.s { v0 }[1], [x[[XREG]]] + %ptr = getelementptr float* %D, i64 %offset + %tmp = extractelement <4 x float> %A, i32 1 + store float %tmp, float* %ptr + ret void +} + define void @st1lane0_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) { ; CHECK-LABEL: st1lane0_ro_4s_float ; CHECK: str s0, [x0, x1, lsl #2] @@ -77,6 +117,16 @@ define void @st1lane_2d(<2 x i64> %A, i64* %D) { ret void } +define void @st1lane_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_2d +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.d { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i64* %D, i64 %offset + %tmp = extractelement <2 x i64> %A, i32 1 + store i64 %tmp, i64* %ptr 
+ ret void +} + define void @st1lane0_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) { ; CHECK-LABEL: st1lane0_ro_2d ; CHECK: str d0, [x0, x1, lsl #3] @@ -94,6 +144,16 @@ define void @st1lane_2d_double(<2 x double> %A, double* %D) { ret void } +define void @st1lane_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_2d_double +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.d { v0 }[1], [x[[XREG]]] + %ptr = getelementptr double* %D, i64 %offset + %tmp = extractelement <2 x double> %A, i32 1 + store double %tmp, double* %ptr + ret void +} + define void @st1lane0_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) { ; CHECK-LABEL: st1lane0_ro_2d_double ; CHECK: str d0, [x0, x1, lsl #3] @@ -111,10 +171,20 @@ define void @st1lane_8b(<8 x i8> %A, i8* %D) { ret void } +define void @st1lane_ro_8b(<8 x i8> %A, i8* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_8b +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.b { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i8* %D, i64 %offset + %tmp = extractelement <8 x i8> %A, i32 1 + store i8 %tmp, i8* %ptr + ret void +} + define void @st1lane0_ro_8b(<8 x i8> %A, i8* %D, i64 %offset) { ; CHECK-LABEL: st1lane0_ro_8b -; CHECK: umov.b w[[WREG:[0-9]+]], v0[0] -; CHECK: strb w[[WREG]], [x0, x1] +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.b { v0 }[0], [x[[XREG]]] %ptr = getelementptr i8* %D, i64 %offset %tmp = extractelement <8 x i8> %A, i32 0 store i8 %tmp, i8* %ptr @@ -129,6 +199,16 @@ define void @st1lane_4h(<4 x i16> %A, i16* %D) { ret void } +define void @st1lane_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_4h +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.h { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i16* %D, i64 %offset + %tmp = extractelement <4 x i16> %A, i32 1 + store i16 %tmp, i16* %ptr + ret void +} + define void @st1lane0_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) { ; CHECK-LABEL: st1lane0_ro_4h ; CHECK: str h0, [x0, x1, lsl #1] @@ -146,6 +226,16 @@ 
define void @st1lane_2s(<2 x i32> %A, i32* %D) { ret void } +define void @st1lane_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_2s +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.s { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i32* %D, i64 %offset + %tmp = extractelement <2 x i32> %A, i32 1 + store i32 %tmp, i32* %ptr + ret void +} + define void @st1lane0_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) { ; CHECK-LABEL: st1lane0_ro_2s ; CHECK: str s0, [x0, x1, lsl #2] @@ -163,6 +253,16 @@ define void @st1lane_2s_float(<2 x float> %A, float* %D) { ret void } +define void @st1lane_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_2s_float +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.s { v0 }[1], [x[[XREG]]] + %ptr = getelementptr float* %D, i64 %offset + %tmp = extractelement <2 x float> %A, i32 1 + store float %tmp, float* %ptr + ret void +} + define void @st1lane0_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) { ; CHECK-LABEL: st1lane0_ro_2s_float ; CHECK: str s0, [x0, x1, lsl #2] |