summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp | 11
-rw-r--r-- llvm/test/CodeGen/X86/haddsub.ll | 111
-rw-r--r-- llvm/test/CodeGen/X86/phaddsub-extract.ll | 30
3 files changed, 91 insertions, 61 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 660f81b831c..1d28ccf973e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19037,13 +19037,12 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
// equivalent, so extract the 256/512-bit source op to 128-bit if we can.
- // This is free if we're extracting from the bottom lane: ymm/zmm -> xmm.
- if (NumEltsPerLane <= LExtIndex)
- return Op;
-
SDLoc DL(Op);
- if (BitWidth == 256 || BitWidth == 512)
- X = extract128BitVector(X, 0, DAG, DL);
+ if (BitWidth == 256 || BitWidth == 512) {
+ unsigned LaneIdx = LExtIndex / NumEltsPerLane;
+ X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
+ LExtIndex %= NumEltsPerLane;
+ }
// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll
index 0126249a20f..8f042590613 100644
--- a/llvm/test/CodeGen/X86/haddsub.ll
+++ b/llvm/test/CodeGen/X86/haddsub.ll
@@ -1004,14 +1004,22 @@ define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
-; AVX-LABEL: extract_extract67_v8f32_fadd_f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%x0 = extractelement <8 x float> %x, i32 6
%x1 = extractelement <8 x float> %x, i32 7
%x01 = fadd float %x0, %x1
@@ -1098,14 +1106,22 @@ define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
-; AVX-LABEL: extract_extract67_v8f32_fadd_f32_commute:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%x0 = extractelement <8 x float> %x, i32 6
%x1 = extractelement <8 x float> %x, i32 7
%x01 = fadd float %x1, %x0
@@ -1158,13 +1174,20 @@ define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
-; AVX-LABEL: extract_extract23_v4f64_fadd_f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%x0 = extractelement <4 x double> %x, i32 2
%x1 = extractelement <4 x double> %x, i32 3
%x01 = fadd double %x0, %x1
@@ -1217,13 +1240,20 @@ define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
-; AVX-LABEL: extract_extract23_v4f64_fadd_f64_commute:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%x0 = extractelement <4 x double> %x, i32 2
%x1 = extractelement <4 x double> %x, i32 3
%x01 = fadd double %x1, %x0
@@ -1310,13 +1340,20 @@ define float @extract_extract45_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-FAST-NEXT: hsubps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
-; AVX-LABEL: extract_extract45_v8f32_fsub_f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%x0 = extractelement <8 x float> %x, i32 4
%x1 = extractelement <8 x float> %x, i32 5
%x01 = fsub float %x0, %x1
diff --git a/llvm/test/CodeGen/X86/phaddsub-extract.ll b/llvm/test/CodeGen/X86/phaddsub-extract.ll
index f862e5eed45..7851bcd8122 100644
--- a/llvm/test/CodeGen/X86/phaddsub-extract.ll
+++ b/llvm/test/CodeGen/X86/phaddsub-extract.ll
@@ -878,9 +878,8 @@ define i16 @extract_extract89_v16i16_add_i16(<16 x i16> %x) {
; AVX1-FAST-LABEL: extract_extract89_v16i16_add_i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %ecx
-; AVX1-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1-FAST-NEXT: addl %ecx, %eax
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
@@ -898,9 +897,8 @@ define i16 @extract_extract89_v16i16_add_i16(<16 x i16> %x) {
; AVX2-FAST-LABEL: extract_extract89_v16i16_add_i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FAST-NEXT: vmovd %xmm0, %ecx
-; AVX2-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-FAST-NEXT: addl %ecx, %eax
+; AVX2-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
; AVX2-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
@@ -918,9 +916,8 @@ define i16 @extract_extract89_v16i16_add_i16(<16 x i16> %x) {
; AVX512-FAST-LABEL: extract_extract89_v16i16_add_i16:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-FAST-NEXT: vmovd %xmm0, %ecx
-; AVX512-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX512-FAST-NEXT: addl %ecx, %eax
+; AVX512-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX512-FAST-NEXT: vmovd %xmm0, %eax
; AVX512-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
@@ -1035,9 +1032,8 @@ define i16 @extract_extract89_v16i16_add_i16_commute(<16 x i16> %x) {
; AVX1-FAST-LABEL: extract_extract89_v16i16_add_i16_commute:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %ecx
-; AVX1-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1-FAST-NEXT: addl %ecx, %eax
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
@@ -1055,9 +1051,8 @@ define i16 @extract_extract89_v16i16_add_i16_commute(<16 x i16> %x) {
; AVX2-FAST-LABEL: extract_extract89_v16i16_add_i16_commute:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FAST-NEXT: vmovd %xmm0, %ecx
-; AVX2-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-FAST-NEXT: addl %ecx, %eax
+; AVX2-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
; AVX2-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
@@ -1075,9 +1070,8 @@ define i16 @extract_extract89_v16i16_add_i16_commute(<16 x i16> %x) {
; AVX512-FAST-LABEL: extract_extract89_v16i16_add_i16_commute:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-FAST-NEXT: vmovd %xmm0, %ecx
-; AVX512-FAST-NEXT: vpextrw $1, %xmm0, %eax
-; AVX512-FAST-NEXT: addl %ecx, %eax
+; AVX512-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX512-FAST-NEXT: vmovd %xmm0, %eax
; AVX512-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
OpenPOWER on IntegriCloud