diff options
author | Sanjay Patel <spatel@rotateright.com> | 2015-02-17 20:08:21 +0000 |
---|---|---|
committer | Sanjay Patel <spatel@rotateright.com> | 2015-02-17 20:08:21 +0000 |
commit | b811c1d6a57beae697be440144b9f8b8b33e0605 (patch) | |
tree | 249a0045df268a18224ea5052220b26ab54084b5 /llvm/lib/Target | |
parent | 127b6c3ba7fef9c0b39dac4c343d14eeba2c53d2 (diff) | |
download | bcm5719-llvm-b811c1d6a57beae697be440144b9f8b8b33e0605.tar.gz bcm5719-llvm-b811c1d6a57beae697be440144b9f8b8b33e0605.zip |
prevent folding a scalar FP load into a packed logical FP instruction (PR22371)
Change the memory operands in sse12_fp_packed_scalar_logical_alias from scalars to vectors.
That's what the hardware packed logical FP instructions define: 128-bit memory operands.
There are no scalar versions of these instructions...because this is x86.
Generating the wrong code (folding a scalar load into a 128-bit load) is still possible
using the peephole optimization pass and the load folding tables. We won't completely
solve this bug until we either fix the lowering in fabs/fneg/fcopysign and any other
places where scalar FP logic is created or fix the load folding in foldMemoryOperandImpl()
to make sure it isn't changing the size of the load.
Differential Revision: http://reviews.llvm.org/D7474
llvm-svn: 229531
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- | llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 19 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.cpp | 33 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 14 |
3 files changed, 50 insertions, 16 deletions
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index c385b1962ef..1f4b49f7099 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -366,6 +366,15 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; +// These are needed to match a scalar load that is used in a vector-only +// math instruction such as the FP logical ops: andps, andnps, orps, xorps. +// The memory operand is required to be a 128-bit load, so it must be converted +// from a vector to a scalar. +def loadf32_128 : PatFrag<(ops node:$ptr), + (f32 (vector_extract (loadv4f32 node:$ptr), (iPTR 0)))>; +def loadf64_128 : PatFrag<(ops node:$ptr), + (f64 (vector_extract (loadv2f64 node:$ptr), (iPTR 0)))>; + // Like 'store', but always requires 128-bit vector alignment. def alignedstore : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ @@ -457,6 +466,16 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; +// These are needed to match a scalar memop that is used in a vector-only +// math instruction such as the FP logical ops: andps, andnps, orps, xorps. +// The memory operand is required to be a 128-bit load, so it must be converted +// from a vector to a scalar. +def memopfsf32_128 : PatFrag<(ops node:$ptr), + (f32 (vector_extract (memopv4f32 node:$ptr), (iPTR 0)))>; +def memopfsf64_128 : PatFrag<(ops node:$ptr), + (f64 (vector_extract (memopv2f64 node:$ptr), (iPTR 0)))>; + + // SSSE3 uses MMX registers for some instructions. They aren't aligned on a // 16-byte boundary. // FIXME: 8 byte alignment for mmx reads is not required diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 57a078ef95c..f2f11c0004e 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -933,6 +933,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 }, { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, + + // FIXME: We should not be folding Fs* scalar loads into vector + // instructions because the vector instructions require vector-sized + // loads. Lowering should create vector-sized instructions (the Fv* + // variants below) to allow load folding. { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 }, { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 }, { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 }, @@ -941,6 +946,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 }, { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 }, { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 }, + + { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 }, + { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 }, + { X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 }, + { X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 }, + { X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 }, + { X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 }, + { X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 }, + { X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 }, { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, @@ -1142,14 +1156,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 }, { X86::VDPPDrri, X86::VDPPDrmi, 0 }, { X86::VDPPSrri, X86::VDPPSrmi, 0 }, - { X86::VFsANDNPDrr, X86::VFsANDNPDrm, 0 }, - { X86::VFsANDNPSrr, X86::VFsANDNPSrm, 0 }, - { X86::VFsANDPDrr, X86::VFsANDPDrm, 0 }, - { X86::VFsANDPSrr, X86::VFsANDPSrm, 0 }, - { X86::VFsORPDrr, X86::VFsORPDrm, 0 }, - { X86::VFsORPSrr, X86::VFsORPSrm, 0 }, - { X86::VFsXORPDrr, X86::VFsXORPDrm, 0 }, - { X86::VFsXORPSrr, X86::VFsXORPSrm, 0 }, + // Do not fold VFs* loads because there are no scalar load variants for + // these instructions. When folded, the load is required to be 128-bits, so + // the load size would not match. + { X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 }, + { X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 }, + { X86::VFvANDPDrr, X86::VFvANDPDrm, 0 }, + { X86::VFvANDPSrr, X86::VFvANDPSrm, 0 }, + { X86::VFvORPDrr, X86::VFvORPDrm, 0 }, + { X86::VFvORPSrr, X86::VFvORPSrm, 0 }, + { X86::VFvXORPDrr, X86::VFvXORPDrm, 0 }, + { X86::VFvXORPSrr, X86::VFvXORPSrm, 0 }, { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 84c67c46021..9d2c83001d5 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -2874,21 +2874,19 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, multiclass sse12_fp_packed_scalar_logical_alias< bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - FR32, f32, f128mem, loadf32, SSEPackedSingle, itins, 0>, - PS, VEX_4V; + FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>, + PS, VEX_4V; defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - FR64, f64, f128mem, loadf64, SSEPackedDouble, itins, 0>, - PD, VEX_4V; + FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>, + PD, VEX_4V; let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, - f32, f128mem, memopfsf32, SSEPackedSingle, itins>, - PS; + f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS; defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, - f64, f128mem, memopfsf64, SSEPackedDouble, itins>, - PD; + f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD; } } |