From f83cccf917c1c0db45bfaa6728793eb106dda8af Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Mon, 20 May 2019 09:54:06 +0000
Subject: Match types of accumulator and result for llvm.experimental.vector.reduce.fadd/fmul

The scalar start/accumulator value of the fadd and fmul reductions should
match the result type of the reduction, as well as the element type of the
input vector. Although this was not explicitly specified in the LangRef, it
was taken for granted in code implementing the reductions. The patch also
fixes the LangRef by adding this constraint.

Reviewed By: aemerson, nikic

Differential Revision: https://reviews.llvm.org/D60260

llvm-svn: 361133
---
 llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll | 64 ++++++++++++------------
 1 file changed, 32 insertions(+), 32 deletions(-)

(limited to 'llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll')

diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
index 54513af0804..3f72450d977 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
@@ -47,7 +47,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
   ret float %1
 }

@@ -101,7 +101,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
   ret float %1
 }

@@ -169,7 +169,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %1
 }

@@ -246,7 +246,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
   ret float %1
 }

@@ -291,7 +291,7 @@ define float @test_v2f32_zero(<2 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
   ret float %1
 }

@@ -346,7 +346,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
   ret float %1
 }

@@ -415,7 +415,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
   ret float %1
 }

@@ -493,7 +493,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
   ret float %1
 }

@@ -538,7 +538,7 @@ define float @test_v2f32_undef(<2 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0)
   ret float %1
 }

@@ -593,7 +593,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0)
   ret float %1
 }

@@ -662,7 +662,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0)
   ret float %1
 }

@@ -740,7 +740,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
+  %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0)
   ret float %1
 }

@@ -778,7 +778,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
 ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
   ret double %1
 }

@@ -825,7 +825,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %1
 }

@@ -879,7 +879,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
   ret double %1
 }

@@ -944,7 +944,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
   ret double %1
 }

@@ -983,7 +983,7 @@ define double @test_v2f64_zero(<2 x double> %a0) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
   ret double %1
 }

@@ -1031,7 +1031,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
   ret double %1
 }

@@ -1086,7 +1086,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
   ret double %1
 }

@@ -1151,7 +1151,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
   ret double %1
 }

@@ -1190,7 +1190,7 @@ define double @test_v2f64_undef(<2 x double> %a0) {
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
   ret double %1
 }

@@ -1238,7 +1238,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
   ret double %1
 }

@@ -1293,7 +1293,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
   ret double %1
 }

@@ -1358,16 +1358,16 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
-  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
+  %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
   ret double %1
 }

-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>)

-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v16f64(double, <16 x double>)
-- 
cgit v1.2.3
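As a quick illustration of the constraint described in the commit message, here is a minimal standalone IR sketch, not part of the patch; it uses the post-patch intrinsic naming seen in the declarations above, and the function name @reduce_example and the zero start value are purely illustrative:

; Sketch only: the accumulator operand (float 0.0), the result, and the
; element type of <4 x float> are all 'float', and the intrinsic name now
; carries a single scalar type suffix (.f32) plus the vector type (.v4f32).
declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)

define float @reduce_example(<4 x float> %v) {
  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %v)
  ret float %r
}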